diff options
author | Biswakalyan Bhuyan <biswa@surgot.in> | 2024-11-30 23:14:55 +0530 |
---|---|---|
committer | Biswakalyan Bhuyan <biswa@surgot.in> | 2024-11-30 23:14:55 +0530 |
commit | 3b3a6547649f75066f45ea2a7e1c46e34d2e26a9 (patch) | |
tree | fa1e228ab570d315ea5e8c3bf5361f1be7df8e4b /main.py | |
parent | 54c94edb7a222c7c5585ee18648ce809e5d4ad0e (diff) | |
download | autopredict-master.tar.gz autopredict-master.tar.bz2 autopredict-master.zip |
Diffstat (limited to 'main.py')
-rw-r--r-- | main.py | 89 |
1 file changed, 67 insertions, 22 deletions
# Load Dataset
def load_data(file_path):
    """Read the dataset CSV and drop any auto-generated index columns.

    Parameters
    ----------
    file_path : str or file-like
        Path (or buffer) handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with every column whose name begins with
        ``Unnamed`` removed (these are leftover index columns that
        pandas writes when a frame is saved with its index).
    """
    frame = pd.read_csv(file_path)
    # Keep only real feature columns; 'Unnamed: 0'-style columns are
    # artifacts of a prior to_csv(index=True).
    wanted = [col for col in frame.columns if not col.startswith('Unnamed')]
    return frame[wanted]
# LSTM Model
def train_lstm_model(df, target, model_name):
    """Train an LSTM regressor for ``target`` and save it to ``{model_name}_lstm.h5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed dataset; must contain the columns listed in
        ``categorical_cols``/``numerical_cols`` below plus the drop list.
    target : str
        Name of the column to predict (e.g. 'Price').
    model_name : str
        Basename for the saved Keras model file.

    Side effects: prints test-set MAE and writes ``{model_name}_lstm.h5``.

    NOTE(review): the scaler and label encoders are fit on the FULL
    dataset before the train/test split, so test-set statistics leak
    into preprocessing — consider fitting on the training split only.
    NOTE(review): the fitted encoders/scaler are not persisted, so the
    saved model cannot be applied to new raw rows without refitting.
    """
    # Drop identifier/target-adjacent columns; the remaining frame is the
    # feature matrix.
    X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
    y = df[target]

    # Encode categorical variables as integer codes (LSTM input must be numeric).
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Reshape data for LSTM (samples, timesteps, features); a single
    # timestep is used since each row is an independent observation.
    X = np.expand_dims(X.values, axis=1)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Build LSTM model: two stacked LSTM layers with dropout, linear output.
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]),
             activation='tanh', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='tanh'),
        Dropout(0.2),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train the model (fix: the History object returned by fit() was
    # previously bound to an unused local; it is not needed here).
    model.fit(X_train, y_train, epochs=50, batch_size=32,
              validation_split=0.2, verbose=1)

    # Test evaluation; only MAE is reported, so discard the loss value.
    _, mae = model.evaluate(X_test, y_test, verbose=1)
    print(f"LSTM Model Performance for {target}:")
    print(f"MAE: {mae:.2f}")

    # Save the model
    model.save(f'{model_name}_lstm.h5')
    print(f"LSTM model saved as '{model_name}_lstm.h5'")