From 3b3a6547649f75066f45ea2a7e1c46e34d2e26a9 Mon Sep 17 00:00:00 2001 From: Biswakalyan Bhuyan Date: Sat, 30 Nov 2024 23:14:55 +0530 Subject: Added LSTM for neural networking --- .~lock.predicted_data.csv# | 1 + edit.py | 55 ++++++++++++++++++++++++++++ main.py | 89 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 123 insertions(+), 22 deletions(-) create mode 100644 .~lock.predicted_data.csv# create mode 100644 edit.py diff --git a/.~lock.predicted_data.csv# b/.~lock.predicted_data.csv# new file mode 100644 index 0000000..9c6d033 --- /dev/null +++ b/.~lock.predicted_data.csv# @@ -0,0 +1 @@ +,surgot,zeus,28.11.2024 00:11,file:///home/surgot/.config/libreoffice/4; \ No newline at end of file diff --git a/edit.py b/edit.py new file mode 100644 index 0000000..d2bf57a --- /dev/null +++ b/edit.py @@ -0,0 +1,55 @@ +import joblib +import pandas as pd + +# Load the models +mileage_model = joblib.load('mileage_predictor.pkl') +price_model = joblib.load('price_predictor.pkl') +year_model = joblib.load('year_predictor.pkl') + +# Required columns +REQUIRED_COLUMNS = [ + 'Name', 'Manufacturer', 'Location', 'Year', 'Kilometers_Driven', + 'Fuel_Type', 'Transmission', 'Owner_Type', 'Engine CC', 'Power', 'Seats' +] + +# Prepare input data for prediction +def prepare_input(df): + # Select only necessary columns + df = df[REQUIRED_COLUMNS] + + # Add 'Car_Age' and drop 'Year' + df['Car_Age'] = 2024 - df['Year'] + df.drop(columns=['Year'], inplace=True) + return df + +# Make predictions +def predict_from_csv(input_csv, output_csv): + # Load the input CSV file + data = pd.read_csv(input_csv) + + # Ensure the required columns exist + if not all(col in data.columns for col in REQUIRED_COLUMNS): + raise ValueError(f"The input CSV must contain these columns: {REQUIRED_COLUMNS}") + + # Prepare the input data + prepared_data = prepare_input(data.copy()) + + # Perform predictions + data['Predicted_Mileage'] = mileage_model.predict(prepared_data) + data['Predicted_Price'] = price_model.predict(prepared_data) + data['Predicted_Year'] = year_model.predict(prepared_data).astype(int) + + # Format numeric predictions to two decimal places + data['Predicted_Mileage'] = data['Predicted_Mileage'].map(lambda x: round(x, 2)) + data['Predicted_Price'] = data['Predicted_Price'].map(lambda x: round(x, 2)) + + # Save results to a new CSV file + data.to_csv(output_csv, index=False) + print(f"Predictions saved to {output_csv}") + +# Input and output CSV file paths +input_csv = 'data.csv' # Change to your input CSV file name +output_csv = 'predicted_data.csv' # Change to your desired output file name + +# Run the prediction +predict_from_csv(input_csv, output_csv) diff --git a/main.py b/main.py index b0d6673..c6c1aed 100644 --- a/main.py +++ b/main.py @@ -1,26 +1,25 @@ import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, GridSearchCV -from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score import joblib +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import LSTM, Dense, Dropout # Load Dataset def load_data(file_path): df = pd.read_csv(file_path) - # Drop unnecessary index column if present - df = df.loc[:, ~df.columns.str.contains('^Unnamed')] + df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # Drop unnecessary index column if present return df # Preprocessing def preprocess_data(df): - # Save the 'Year' column before calculating car age df['Original_Year'] = df['Year'] - - # Calculate Car Age df['Car_Age'] = 2024 - df['Year'] df.drop(columns=['Year'], inplace=True) @@ -39,12 +38,10 @@ def preprocess_data(df): upper_bound = q3 + 1.5 * iqr df = df[(df['Kilometers_Driven'] >= lower_bound) & (df['Kilometers_Driven'] <= upper_bound)] - # Return processed dataframe return df -# Train Model -def train_model(df, target, model_name): - # Features and target +# Random Forest Model +def train_rf_model(df, target, model_name): X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year']) y = df[target] @@ -55,12 +52,13 @@ def train_model(df, target, model_name): categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location'] numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age'] - preprocessor = ColumnTransformer( transformers=[ ('num', StandardScaler(), numerical_cols), + preprocessor = ColumnTransformer( + transformers=[ + ('num', StandardScaler(), numerical_cols), ('cat', OneHotEncoder(drop='first'), categorical_cols) ] ) - # Random Forest Regressor pipeline rf_pipeline = Pipeline([ ('preprocessor', preprocessor), ('regressor', RandomForestRegressor(random_state=42)) @@ -75,22 +73,65 @@ def train_model(df, target, model_name): grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error') grid_search.fit(X_train, y_train) - # Best model best_model = grid_search.best_estimator_ # Test predictions y_pred = best_model.predict(X_test) # Evaluation - print(f"Model Performance for {target}:") + print(f"Random Forest Model Performance for {target}:") print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}") print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}") print(f"R^2: {r2_score(y_test, y_pred):.2f}") # Save the model - model_file = f'{model_name}.pkl' + model_file = f'{model_name}_rf.pkl' joblib.dump(best_model, model_file) - print(f"Model saved as '{model_file}'") + print(f"Random Forest model saved as '{model_file}'") + +# LSTM Model +def train_lstm_model(df, target, model_name): + X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year']) + y = df[target] + + # Encode categorical variables + categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location'] + numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age'] + + for col in categorical_cols: + le = LabelEncoder() + X[col] = le.fit_transform(X[col]) + + scaler = StandardScaler() + X[numerical_cols] = scaler.fit_transform(X[numerical_cols]) + + # Reshape data for LSTM (samples, timesteps, features) + X = np.expand_dims(X.values, axis=1) + + # Split data + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Build LSTM model + model = Sequential([ + LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='tanh', return_sequences=True), + Dropout(0.2), + LSTM(32, activation='tanh'), + Dropout(0.2), + Dense(1) + ]) + model.compile(optimizer='adam', loss='mse', metrics=['mae']) + + # Train the model + history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1) + + # Test evaluation + loss, mae = model.evaluate(X_test, y_test, verbose=1) + print(f"LSTM Model Performance for {target}:") + print(f"MAE: {mae:.2f}") + + # Save the model + model.save(f'{model_name}_lstm.h5') + print(f"LSTM model saved as '{model_name}_lstm.h5'") # Main Function def main(): @@ -101,14 +142,18 @@ def main(): df = preprocess_data(df) print("Data preprocessing complete.") - print("Training mileage prediction model...") - train_model(df, target='Mileage Km/L', model_name='mileage_predictor') + print("Training mileage prediction models...") + train_rf_model(df, target='Mileage Km/L', model_name='mileage_predictor') + train_lstm_model(df, target='Mileage Km/L', model_name='mileage_predictor') - print("Training price prediction model...") - train_model(df, target='Price', model_name='price_predictor') + print("Training price prediction models...") + train_rf_model(df, target='Price', model_name='price_predictor') + train_lstm_model(df, target='Price', model_name='price_predictor') - print("Training year prediction model...") - train_model(df, target='Original_Year', model_name='year_predictor') + print("Training year prediction models...") + train_rf_model(df, target='Original_Year', model_name='year_predictor') + train_lstm_model(df, target='Original_Year', model_name='year_predictor') if __name__ == "__main__": main() + -- cgit v1.2.3-59-g8ed1b