From 3b3a6547649f75066f45ea2a7e1c46e34d2e26a9 Mon Sep 17 00:00:00 2001
From: Biswakalyan Bhuyan <biswa@surgot.in>
Date: Sat, 30 Nov 2024 23:14:55 +0530
Subject: Added LSTM for neural networking

---
 .~lock.predicted_data.csv# |  1 +
 edit.py                    | 55 ++++++++++++++++++++++++++++
 main.py                    | 89 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 123 insertions(+), 22 deletions(-)
 create mode 100644 .~lock.predicted_data.csv#
 create mode 100644 edit.py

diff --git a/.~lock.predicted_data.csv# b/.~lock.predicted_data.csv#
new file mode 100644
index 0000000..9c6d033
--- /dev/null
+++ b/.~lock.predicted_data.csv#
@@ -0,0 +1 @@
+,surgot,zeus,28.11.2024 00:11,file:///home/surgot/.config/libreoffice/4;
\ No newline at end of file
diff --git a/edit.py b/edit.py
new file mode 100644
index 0000000..d2bf57a
--- /dev/null
+++ b/edit.py
@@ -0,0 +1,73 @@
+"""Batch-predict mileage, price and year for the cars listed in a CSV file.
+
+Loads the Random Forest pipelines saved by main.py and appends their
+predictions to the input rows.
+"""
+import joblib
+import pandas as pd
+
+# Load the models. main.py saves the Random Forest pipelines with an
+# '_rf' suffix, so the same file names must be used here.
+mileage_model = joblib.load('mileage_predictor_rf.pkl')
+price_model = joblib.load('price_predictor_rf.pkl')
+year_model = joblib.load('year_predictor_rf.pkl')
+
+# Columns the input CSV must provide
+REQUIRED_COLUMNS = [
+    'Name', 'Manufacturer', 'Location', 'Year', 'Kilometers_Driven',
+    'Fuel_Type', 'Transmission', 'Owner_Type', 'Engine CC', 'Power', 'Seats'
+]
+
+# Reference year for 'Car_Age'; must match the value used in main.py
+REFERENCE_YEAR = 2024
+
+
+# Prepare input data for prediction
+def prepare_input(df):
+    """Return a new frame with the model features derived from ``df``.
+
+    Keeps only REQUIRED_COLUMNS, adds 'Car_Age' and drops 'Year'. The
+    caller's frame is left untouched.
+    """
+    # .copy() makes the selection independent of ``df`` so the column
+    # assignment below cannot trigger pandas' SettingWithCopyWarning.
+    features = df[REQUIRED_COLUMNS].copy()
+    features['Car_Age'] = REFERENCE_YEAR - features['Year']
+    return features.drop(columns=['Year'])
+
+
+# Make predictions
+def predict_from_csv(input_csv, output_csv):
+    """Read ``input_csv``, add prediction columns and write ``output_csv``.
+
+    Raises:
+        ValueError: if the input CSV lacks any of REQUIRED_COLUMNS.
+    """
+    data = pd.read_csv(input_csv)
+
+    # Ensure the required columns exist
+    missing = [col for col in REQUIRED_COLUMNS if col not in data.columns]
+    if missing:
+        raise ValueError(
+            f"The input CSV must contain these columns: {REQUIRED_COLUMNS} "
+            f"(missing: {missing})"
+        )
+
+    prepared_data = prepare_input(data)
+
+    # Mileage and price are reported with two decimal places
+    data['Predicted_Mileage'] = mileage_model.predict(prepared_data).round(2)
+    data['Predicted_Price'] = price_model.predict(prepared_data).round(2)
+    # Round before casting: a bare astype(int) would truncate 2015.9 to 2015
+    data['Predicted_Year'] = year_model.predict(prepared_data).round().astype(int)
+
+    data.to_csv(output_csv, index=False)
+    print(f"Predictions saved to {output_csv}")
+
+
+# Input and output CSV file paths
+input_csv = 'data.csv'  # Change to your input CSV file name
+output_csv = 'predicted_data.csv'  # Change to your desired output file name
+
+# Run the prediction
+predict_from_csv(input_csv, output_csv)
diff --git a/main.py b/main.py
index b0d6673..c6c1aed 100644
--- a/main.py
+++ b/main.py
@@ -1,26 +1,25 @@
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split, GridSearchCV
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 import joblib
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Dropout
 
 # Load Dataset
 def load_data(file_path):
     df = pd.read_csv(file_path)
-    # Drop unnecessary index column if present
-    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop every column whose name starts with 'Unnamed' (e.g. a stray index column), if present
     return df
 
 # Preprocessing
 def preprocess_data(df):
-    # Save the 'Year' column before calculating car age
     df['Original_Year'] = df['Year']
-
-    # Calculate Car Age
     df['Car_Age'] = 2024 - df['Year']
     df.drop(columns=['Year'], inplace=True)
 
@@ -39,12 +38,10 @@ def preprocess_data(df):
     upper_bound = q3 + 1.5 * iqr
     df = df[(df['Kilometers_Driven'] >= lower_bound) & (df['Kilometers_Driven'] <= upper_bound)]
 
-    # Return processed dataframe
     return df
 
-# Train Model
-def train_model(df, target, model_name):
-    # Features and target
+# Random Forest Model
+def train_rf_model(df, target, model_name):
     X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
     y = df[target]
 
@@ -55,12 +52,13 @@ def train_model(df, target, model_name):
     categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
     numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
 
-    preprocessor = ColumnTransformer( transformers=[ ('num', StandardScaler(), numerical_cols),
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ('num', StandardScaler(), numerical_cols),
             ('cat', OneHotEncoder(drop='first'), categorical_cols)
         ]
     )
 
-    # Random Forest Regressor pipeline
     rf_pipeline = Pipeline([
         ('preprocessor', preprocessor),
         ('regressor', RandomForestRegressor(random_state=42))
@@ -75,22 +73,65 @@ def train_model(df, target, model_name):
     grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
     grid_search.fit(X_train, y_train)
 
-    # Best model
     best_model = grid_search.best_estimator_
 
     # Test predictions
     y_pred = best_model.predict(X_test)
 
     # Evaluation
-    print(f"Model Performance for {target}:")
+    print(f"Random Forest Model Performance for {target}:")
     print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
     print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}")
     print(f"R^2: {r2_score(y_test, y_pred):.2f}")
 
     # Save the model
-    model_file = f'{model_name}.pkl'
+    model_file = f'{model_name}_rf.pkl'
     joblib.dump(best_model, model_file)
-    print(f"Model saved as '{model_file}'")
+    print(f"Random Forest model saved as '{model_file}'")
+
+# LSTM Model
+def train_lstm_model(df, target, model_name):
+    """Train, evaluate and save a two-layer LSTM regressor for ``target``.
+
+    Each row is fed to the network as a one-timestep sequence. The test MAE
+    is printed and the model is saved as '{model_name}_lstm.h5'.
+    """
+    X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
+    y = df[target]
+
+    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
+    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
+
+    # Integer-encode categories on the full frame so no test label is unseen
+    for col in categorical_cols:
+        X[col] = LabelEncoder().fit_transform(X[col])
+
+    # Split before scaling so test rows never influence the scaler statistics
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    X_train, X_test = X_train.copy(), X_test.copy()
+    scaler = StandardScaler()
+    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
+    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
+
+    # Reshape data for LSTM (samples, timesteps, features)
+    X_train = np.expand_dims(X_train.values, axis=1)
+    X_test = np.expand_dims(X_test.values, axis=1)
+
+    model = Sequential([
+        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='tanh', return_sequences=True),
+        Dropout(0.2),
+        LSTM(32, activation='tanh'),
+        Dropout(0.2),
+        Dense(1)
+    ])
+    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
+    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)
+
+    _, mae = model.evaluate(X_test, y_test, verbose=1)
+    print(f"LSTM Model Performance for {target}:")
+    print(f"MAE: {mae:.2f}")
+    model.save(f'{model_name}_lstm.h5')
+    print(f"LSTM model saved as '{model_name}_lstm.h5'")
 
 # Main Function
 def main():
@@ -101,14 +142,18 @@ def main():
     df = preprocess_data(df)
     print("Data preprocessing complete.")
 
-    print("Training mileage prediction model...")
-    train_model(df, target='Mileage Km/L', model_name='mileage_predictor')
+    print("Training mileage prediction models...")
+    train_rf_model(df, target='Mileage Km/L', model_name='mileage_predictor')
+    train_lstm_model(df, target='Mileage Km/L', model_name='mileage_predictor')
 
-    print("Training price prediction model...")
-    train_model(df, target='Price', model_name='price_predictor')
+    print("Training price prediction models...")
+    train_rf_model(df, target='Price', model_name='price_predictor')
+    train_lstm_model(df, target='Price', model_name='price_predictor')
 
-    print("Training year prediction model...")
-    train_model(df, target='Original_Year', model_name='year_predictor')
+    print("Training year prediction models...")
+    train_rf_model(df, target='Original_Year', model_name='year_predictor')
+    train_lstm_model(df, target='Original_Year', model_name='year_predictor')
 
 if __name__ == "__main__":
     main()
+
-- 
cgit v1.2.3-59-g8ed1b