summary refs log tree commit diff stats
path: root/main.py
diff options
context:
space:
mode:
Diffstat (limited to 'main.py')
-rw-r--r-- main.py 89
1 files changed, 67 insertions, 22 deletions
diff --git a/main.py b/main.py
index b0d6673..c6c1aed 100644
--- a/main.py
+++ b/main.py
@@ -1,26 +1,25 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
-from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import LSTM, Dense, Dropout
# Load Dataset
def load_data(file_path):
df = pd.read_csv(file_path)
- # Drop unnecessary index column if present
- df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+ df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # Drop unnecessary index column if present
return df
# Preprocessing
def preprocess_data(df):
- # Save the 'Year' column before calculating car age
df['Original_Year'] = df['Year']
-
- # Calculate Car Age
df['Car_Age'] = 2024 - df['Year']
df.drop(columns=['Year'], inplace=True)
@@ -39,12 +38,10 @@ def preprocess_data(df):
upper_bound = q3 + 1.5 * iqr
df = df[(df['Kilometers_Driven'] >= lower_bound) & (df['Kilometers_Driven'] <= upper_bound)]
- # Return processed dataframe
return df
-# Train Model
-def train_model(df, target, model_name):
- # Features and target
+# Random Forest Model
+def train_rf_model(df, target, model_name):
X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
y = df[target]
@@ -55,12 +52,13 @@ def train_model(df, target, model_name):
categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
- preprocessor = ColumnTransformer( transformers=[ ('num', StandardScaler(), numerical_cols),
+ preprocessor = ColumnTransformer(
+ transformers=[
+ ('num', StandardScaler(), numerical_cols),
('cat', OneHotEncoder(drop='first'), categorical_cols)
]
)
- # Random Forest Regressor pipeline
rf_pipeline = Pipeline([
('preprocessor', preprocessor),
('regressor', RandomForestRegressor(random_state=42))
@@ -75,22 +73,65 @@ def train_model(df, target, model_name):
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
- # Best model
best_model = grid_search.best_estimator_
# Test predictions
y_pred = best_model.predict(X_test)
# Evaluation
- print(f"Model Performance for {target}:")
+ print(f"Random Forest Model Performance for {target}:")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}")
print(f"R^2: {r2_score(y_test, y_pred):.2f}")
# Save the model
- model_file = f'{model_name}.pkl'
+ model_file = f'{model_name}_rf.pkl'
joblib.dump(best_model, model_file)
- print(f"Model saved as '{model_file}'")
+ print(f"Random Forest model saved as '{model_file}'")
+
+# LSTM Model
+def train_lstm_model(df, target, model_name):
+ X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
+ y = df[target]
+
+ # Encode categorical variables
+ categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
+ numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
+
+ for col in categorical_cols:
+ le = LabelEncoder()
+ X[col] = le.fit_transform(X[col])
+
+ scaler = StandardScaler()
+ X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
+
+ # Reshape data for LSTM (samples, timesteps, features)
+ X = np.expand_dims(X.values, axis=1)
+
+ # Split data
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ # Build LSTM model
+ model = Sequential([
+ LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='tanh', return_sequences=True),
+ Dropout(0.2),
+ LSTM(32, activation='tanh'),
+ Dropout(0.2),
+ Dense(1)
+ ])
+ model.compile(optimizer='adam', loss='mse', metrics=['mae'])
+
+ # Train the model
+ history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)
+
+ # Test evaluation
+ loss, mae = model.evaluate(X_test, y_test, verbose=1)
+ print(f"LSTM Model Performance for {target}:")
+ print(f"MAE: {mae:.2f}")
+
+ # Save the model
+ model.save(f'{model_name}_lstm.h5')
+ print(f"LSTM model saved as '{model_name}_lstm.h5'")
# Main Function
def main():
@@ -101,14 +142,18 @@ def main():
df = preprocess_data(df)
print("Data preprocessing complete.")
- print("Training mileage prediction model...")
- train_model(df, target='Mileage Km/L', model_name='mileage_predictor')
+ print("Training mileage prediction models...")
+ train_rf_model(df, target='Mileage Km/L', model_name='mileage_predictor')
+ train_lstm_model(df, target='Mileage Km/L', model_name='mileage_predictor')
- print("Training price prediction model...")
- train_model(df, target='Price', model_name='price_predictor')
+ print("Training price prediction models...")
+ train_rf_model(df, target='Price', model_name='price_predictor')
+ train_lstm_model(df, target='Price', model_name='price_predictor')
- print("Training year prediction model...")
- train_model(df, target='Original_Year', model_name='year_predictor')
+ print("Training year prediction models...")
+ train_rf_model(df, target='Original_Year', model_name='year_predictor')
+ train_lstm_model(df, target='Original_Year', model_name='year_predictor')
if __name__ == "__main__":
main()
+