"""Train Random Forest and LSTM regressors on a used-car CSV dataset.

For each of three targets (mileage, price, manufacturing year) the script
fits a tuned Random Forest pipeline and a small stacked LSTM network, prints
hold-out metrics, and saves the fitted models to the working directory.
"""

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Year that 'Car_Age' is measured against when the caller does not override it.
REFERENCE_YEAR = 2024

CATEGORICAL_COLS = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
NUMERICAL_COLS = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

# Features that are a direct function of a target and must not be used to
# predict it. 'Car_Age' is computed as reference_year - Year in
# preprocess_data(), so it reveals 'Original_Year' exactly.
_LEAKY_FEATURES = {'Original_Year': ('Car_Age',)}


def _feature_columns(target):
    """Return ``(categorical, numerical)`` feature names usable for *target*."""
    leaky = _LEAKY_FEATURES.get(target, ())
    categorical = [col for col in CATEGORICAL_COLS if col not in leaky]
    numerical = [col for col in NUMERICAL_COLS if col not in leaky]
    return categorical, numerical


# Load Dataset
def load_data(file_path):
    """Read the CSV at *file_path* and drop columns whose name starts with 'Unnamed'.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the dataset.

    Returns:
        A DataFrame without the 'Unnamed...' columns.
    """
    df = pd.read_csv(file_path)
    # Drop unnecessary index column if present
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df


# Preprocessing
def preprocess_data(df, reference_year=REFERENCE_YEAR):
    """Derive age features, impute missing values and remove outliers.

    The input frame is left untouched; all work happens on a copy.

    Steps:
        1. Keep the raw year as 'Original_Year', add
           'Car_Age' = ``reference_year - Year`` and drop 'Year'.
        2. Fill missing 'Engine CC' / 'Power' with the column median and
           missing 'Seats' with the column mode.
        3. Drop rows where 'Mileage Km/L' or 'Price' is missing.
        4. Drop rows whose 'Kilometers_Driven' lies outside
           ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    Args:
        df: Raw dataset as returned by ``load_data``.
        reference_year: Year used to compute 'Car_Age' (defaults to 2024).

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()

    df['Original_Year'] = df['Year']
    df['Car_Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year'])

    # Handle missing values
    df['Engine CC'] = df['Engine CC'].fillna(df['Engine CC'].median())
    df['Power'] = df['Power'].fillna(df['Power'].median())
    df['Seats'] = df['Seats'].fillna(df['Seats'].mode()[0])

    # Remove rows with missing target variable
    df = df.dropna(subset=['Mileage Km/L', 'Price'])

    # Remove outliers in 'Kilometers_Driven' (1.5 * IQR rule)
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    in_range = df['Kilometers_Driven'].between(lower_bound, upper_bound)
    return df[in_range]


# Random Forest Model
def train_rf_model(df, target, model_name):
    """Tune, evaluate and save a Random Forest pipeline for *target*.

    Numerical features are standardised and categorical features are one-hot
    encoded inside the pipeline, so the saved artefact accepts a raw feature
    DataFrame. Hyperparameters are chosen with 5-fold grid search on the
    training split; MAE, RMSE and R^2 are printed for the 20% hold-out split.

    Args:
        df: Preprocessed dataset (see ``preprocess_data``).
        target: Name of the column to predict.
        model_name: Prefix of the output file ``'{model_name}_rf.pkl'``.

    Returns:
        The best fitted pipeline found by the grid search.
    """
    categorical_cols, numerical_cols = _feature_columns(target)
    X = df[categorical_cols + numerical_cols]
    y = df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ColumnTransformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # handle_unknown='ignore' keeps prediction from raising when a
            # rare category is absent from a training fold.
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
             categorical_cols),
        ]
    )

    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ])

    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None],
    }
    grid_search = GridSearchCV(
        rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Test predictions
    y_pred = best_model.predict(X_test)

    # Evaluation. RMSE is computed via sqrt(MSE) because the ``squared``
    # keyword of mean_squared_error is deprecated/removed in newer
    # scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Random Forest Model Performance for {target}:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")

    # Save the model
    model_file = f'{model_name}_rf.pkl'
    joblib.dump(best_model, model_file)
    print(f"Random Forest model saved as '{model_file}'")
    return best_model


# LSTM Model
def train_lstm_model(df, target, model_name):
    """Train, evaluate and save a two-layer LSTM regressor for *target*.

    Categorical features are label-encoded and numerical features are
    standardised with statistics from the training split only. Each row is
    fed to the network as a sequence of length one.

    Two files are written: the network as ``'{model_name}_lstm.h5'`` and the
    fitted label encoders, scaler and feature order as
    ``'{model_name}_lstm_preprocessing.pkl'`` (needed to prepare inputs at
    inference time).

    Args:
        df: Preprocessed dataset (see ``preprocess_data``).
        target: Name of the column to predict.
        model_name: Prefix of the output files.

    Returns:
        The trained Keras model.
    """
    categorical_cols, numerical_cols = _feature_columns(target)
    feature_cols = categorical_cols + numerical_cols
    X = df[feature_cols].copy()
    y = df[target].to_numpy(dtype='float32')

    # Encode categorical variables
    label_encoders = {}
    for col in categorical_cols:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        label_encoders[col] = encoder

    # Split data before scaling so test rows do not influence the scaler.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    scaler.fit(X_train[numerical_cols])

    def to_lstm_input(frame):
        """Build a (samples, 1, features) float32 array from *frame*."""
        encoded = frame[categorical_cols].to_numpy(dtype='float32')
        scaled = scaler.transform(frame[numerical_cols]).astype('float32')
        # Reshape data for LSTM (samples, timesteps, features)
        return np.expand_dims(np.hstack([encoded, scaled]), axis=1)

    X_train = to_lstm_input(X_train)
    X_test = to_lstm_input(X_test)

    # Build LSTM model
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]),
             activation='tanh', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='tanh'),
        Dropout(0.2),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32,
              validation_split=0.2, verbose=1)

    # Test evaluation
    loss, mae = model.evaluate(X_test, y_test, verbose=1)
    print(f"LSTM Model Performance for {target}:")
    print(f"MAE: {mae:.2f}")

    # Save the model
    model.save(f'{model_name}_lstm.h5')
    print(f"LSTM model saved as '{model_name}_lstm.h5'")

    # Save the fitted preprocessing objects; without them the saved network
    # cannot be fed correctly encoded/scaled inputs later.
    preprocessing_file = f'{model_name}_lstm_preprocessing.pkl'
    joblib.dump(
        {
            'feature_columns': feature_cols,
            'label_encoders': label_encoders,
            'scaler': scaler,
        },
        preprocessing_file,
    )
    print(f"LSTM preprocessing saved as '{preprocessing_file}'")
    return model


# Main Function
def main(file_path='data.csv'):
    """Load the dataset, preprocess it and train every model.

    Args:
        file_path: Location of the CSV dataset (defaults to ``'data.csv'``).
    """
    df = load_data(file_path)
    print("Dataset loaded.")

    df = preprocess_data(df)
    print("Data preprocessing complete.")

    # (progress message, target column, output file prefix)
    training_jobs = (
        ("Training mileage prediction models...", 'Mileage Km/L', 'mileage_predictor'),
        ("Training price prediction models...", 'Price', 'price_predictor'),
        ("Training year prediction models...", 'Original_Year', 'year_predictor'),
    )
    for message, target, model_name in training_jobs:
        print(message)
        train_rf_model(df, target=target, model_name=model_name)
        train_lstm_model(df, target=target, model_name=model_name)


if __name__ == "__main__":
    main()