# Repository-viewer page residue (navigation: summary | refs | log | blame | commit | diff | stats)
# path: root/main.py
# blob: c6c1aed1917043e84633dc7193f6f847d90fbe96 (plain) (tree)
1
2
3
4
5
6
7
8
9


                                                                  
                                                                             




                                                                             


                                                        



                               
                                                                                                    



                        
                                    








                                                                      
                                                    







                                                                                                

             

                                           
                                                                           
                  







                                                                                             


                                                      



                                                                  













                                                                                               





                                            
                                                           




                                                                           
                                       
                                       












































                                                                                                             









                                                                


                                                                               
 


                                                                      
 


                                                                             
 

                          
 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load Dataset
def load_data(file_path):
    """Read a CSV file and return it without any 'Unnamed...' columns.

    Args:
        file_path: Path (or file-like object) handed straight to
            ``pd.read_csv``.

    Returns:
        A DataFrame holding every column whose label does not start with
        ``'Unnamed'``.
    """
    frame = pd.read_csv(file_path)
    # Labels beginning with 'Unnamed' are presumably leftover index columns
    # from an earlier export; keep everything else.
    is_leftover = frame.columns.str.startswith('Unnamed')
    return frame.loc[:, ~is_leftover]

# Preprocessing
def preprocess_data(df, reference_year=2024):
    """Clean the raw car dataset and derive the age feature.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Copy ``Year`` to ``Original_Year``, add
         ``Car_Age = reference_year - Year`` and drop ``Year``.
      2. Fill missing ``Engine CC`` / ``Power`` with the column median and
         missing ``Seats`` with the most frequent value.
      3. Drop rows where ``Mileage Km/L`` or ``Price`` is missing.
      4. Drop rows whose ``Kilometers_Driven`` lies outside
         ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Args:
        df: DataFrame containing at least the columns ``Year``,
            ``Engine CC``, ``Power``, ``Seats``, ``Mileage Km/L``, ``Price``
            and ``Kilometers_Driven``.
        reference_year: Year that ``Car_Age`` is measured from. Defaults to
            2024, the value that was previously hard-coded.

    Returns:
        A new, filtered DataFrame. The original row index is preserved, so
        it may contain gaps.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    df['Original_Year'] = df['Year']
    df['Car_Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year'])

    # Handle missing values
    for col in ('Engine CC', 'Power'):
        df[col] = df[col].fillna(df[col].median())

    # mode() is empty when every value is NaN; skip the fill instead of crashing.
    seat_modes = df['Seats'].mode()
    if not seat_modes.empty:
        df['Seats'] = df['Seats'].fillna(seat_modes.iloc[0])

    # Remove rows with missing target variable
    df = df.dropna(subset=['Mileage Km/L', 'Price'])

    # Remove outliers in 'Kilometers_Driven' (1.5 * IQR rule, bounds inclusive)
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    within_bounds = df['Kilometers_Driven'].between(lower_bound, upper_bound)

    return df[within_bounds]

# Random Forest Model
def train_rf_model(df, target, model_name):
    """Tune, evaluate and save a random-forest regressor for ``target``.

    A preprocessing + ``RandomForestRegressor`` pipeline is grid-searched
    with 5-fold CV on an 80 % training split, scored on the held-out 20 %,
    and the best pipeline is written to ``'{model_name}_rf.pkl'`` with
    joblib.

    Args:
        df: Preprocessed DataFrame containing the feature columns listed
            below plus ``Mileage Km/L``, ``Price``, ``Name`` and
            ``Original_Year``.
        target: Name of the column to predict.
        model_name: Prefix of the output file name.

    Returns:
        The best fitted pipeline found by the grid search.
    """
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    if target == 'Original_Year':
        # Car_Age is computed as a constant minus the year in preprocess_data,
        # so keeping it would let the model read the target off directly.
        numerical_cols.remove('Car_Age')

    # Columns not named in the transformer below are dropped by its default
    # remainder='drop', so leftover columns in X are harmless.
    X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
    y = df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # ColumnTransformer for preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # handle_unknown='ignore' keeps a category that is absent from a
            # training fold from raising at transform time; combining it with
            # `drop` requires scikit-learn >= 1.0.
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
        ]
    )

    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])

    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None]
    }

    grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Test predictions
    y_pred = best_model.predict(X_test)

    # Evaluation
    # RMSE is taken as sqrt(MSE): the `squared=False` shortcut was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Random Forest Model Performance for {target}:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")

    # Save the model
    model_file = f'{model_name}_rf.pkl'
    joblib.dump(best_model, model_file)
    print(f"Random Forest model saved as '{model_file}'")

    return best_model

# LSTM Model
def train_lstm_model(df, target, model_name):
    """Train, evaluate and save a small stacked-LSTM regressor for ``target``.

    Categorical columns are label-encoded and numerical columns are
    standardised. Each row is fed to the network as a sequence of length 1.
    The trained model is saved to ``'{model_name}_lstm.h5'``.

    Args:
        df: Preprocessed DataFrame containing the feature columns listed
            below plus the ``target`` column.
        target: Name of the column to predict.
        model_name: Prefix of the output file name.

    Returns:
        The trained Keras model.
    """
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    if target == 'Original_Year':
        # Car_Age is computed as a constant minus the year in preprocess_data,
        # so keeping it would let the model read the target off directly.
        numerical_cols.remove('Car_Age')

    # Use only the declared features, in the DataFrame's own column order, so
    # an unrelated (possibly non-numeric) column cannot end up in the input.
    wanted = set(categorical_cols) | set(numerical_cols)
    feature_cols = [col for col in df.columns if col in wanted]
    X = df[feature_cols].copy()
    y = df[target].to_numpy(dtype='float32')

    # Encode categorical variables as integer codes
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Split data before scaling so test rows do not influence the scaler.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # Reshape data for LSTM (samples, timesteps, features) with one timestep
    X_train = np.expand_dims(X_train.to_numpy(dtype='float32'), axis=1)
    X_test = np.expand_dims(X_test.to_numpy(dtype='float32'), axis=1)

    # Build LSTM model
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='tanh', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='tanh'),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

    # Test evaluation
    _loss, mae = model.evaluate(X_test, y_test, verbose=1)
    print(f"LSTM Model Performance for {target}:")
    print(f"MAE: {mae:.2f}")

    # Save the model
    model.save(f'{model_name}_lstm.h5')
    print(f"LSTM model saved as '{model_name}_lstm.h5'")

    return model

# Main Function
def main(file_path='data.csv'):
    """Run the whole workflow: load, preprocess, then train every model.

    For each of the three targets (mileage, price, year) a random-forest
    model and an LSTM model are trained one after the other.

    Args:
        file_path: Path of the CSV dataset. Defaults to ``'data.csv'``, the
            value that was previously hard-coded.
    """
    df = load_data(file_path)
    print("Dataset loaded.")

    df = preprocess_data(df)
    print("Data preprocessing complete.")

    # (label used in the progress message, target column, output file prefix)
    tasks = (
        ('mileage', 'Mileage Km/L', 'mileage_predictor'),
        ('price', 'Price', 'price_predictor'),
        ('year', 'Original_Year', 'year_predictor'),
    )
    for label, target, model_name in tasks:
        print(f"Training {label} prediction models...")
        train_rf_model(df, target=target, model_name=model_name)
        train_lstm_model(df, target=target, model_name=model_name)

# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()