1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
# Load Dataset
def load_data(file_path):
    """Read the dataset CSV at *file_path*.

    Any stray ``Unnamed`` columns (leftover row-index columns written by a
    previous ``to_csv`` without ``index=False``) are discarded.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The loaded DataFrame without ``Unnamed*`` columns.
    """
    frame = pd.read_csv(file_path)
    wanted = [col for col in frame.columns if not col.startswith('Unnamed')]
    return frame[wanted]
# Preprocessing
def preprocess_data(df, reference_year=2024):
    """Clean the raw used-car DataFrame.

    Steps:
      * Derive ``Car_Age`` from ``Year`` (original year kept as
        ``Original_Year``); the ``Year`` column is then dropped.
      * Impute missing ``Engine CC`` / ``Power`` with their medians and
        ``Seats`` with its mode.
      * Drop rows missing either target (``Mileage Km/L`` or ``Price``).
      * Remove ``Kilometers_Driven`` outliers via the 1.5 * IQR rule.

    Args:
        df: Raw dataset as returned by ``load_data``.
        reference_year: Year used to compute car age. Defaults to 2024,
            matching the original hard-coded behaviour.

    Returns:
        A new, cleaned DataFrame. The input ``df`` is NOT modified
        (the original implementation mutated the caller's frame in place).
    """
    # Work on a copy so the caller's DataFrame is left untouched and to
    # avoid chained-assignment warnings on downstream column writes.
    df = df.copy()
    df['Original_Year'] = df['Year']
    df['Car_Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year'])
    # Handle missing values
    df['Engine CC'] = df['Engine CC'].fillna(df['Engine CC'].median())
    df['Power'] = df['Power'].fillna(df['Power'].median())
    df['Seats'] = df['Seats'].fillna(df['Seats'].mode()[0])
    # Remove rows with missing target variable
    df = df.dropna(subset=['Mileage Km/L', 'Price'])
    # Remove outliers in 'Kilometers_Driven' using the 1.5 * IQR fence.
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[(df['Kilometers_Driven'] >= lower_bound) & (df['Kilometers_Driven'] <= upper_bound)]
    return df
# Random Forest Model
def train_rf_model(df, target, model_name):
    """Train, tune, evaluate and persist a RandomForest regressor.

    Builds a preprocessing + RandomForest pipeline, grid-searches over
    ``n_estimators`` / ``max_depth`` with 5-fold CV, prints MAE/RMSE/R^2
    on a held-out 20% split, and saves the best pipeline to
    ``{model_name}_rf.pkl``.

    Args:
        df: Preprocessed DataFrame (output of ``preprocess_data``).
        target: Name of the column to predict (e.g. 'Price').
        model_name: Stem used for the saved model filename.
    """
    # Both potential targets (and leakage-prone columns) are dropped from
    # the feature matrix regardless of which one is being predicted.
    X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
    y = df[target]
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # ColumnTransformer for preprocessing
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # handle_unknown='ignore' prevents a crash when a CV fold or the
            # held-out split contains a category absent from the fitting data
            # (the original encoder raised on unseen categories).
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
        ]
    )
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42))
    ])
    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None]
    }
    grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    # Test predictions
    y_pred = best_model.predict(X_test)
    # Evaluation. RMSE is computed via np.sqrt because the `squared` kwarg of
    # mean_squared_error was deprecated in sklearn 1.4 and removed in 1.6.
    print(f"Random Forest Model Performance for {target}:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")
    # Save the model
    model_file = f'{model_name}_rf.pkl'
    joblib.dump(best_model, model_file)
    print(f"Random Forest model saved as '{model_file}'")
# LSTM Model
def train_lstm_model(df, target, model_name):
    """Train, evaluate and persist an LSTM regressor for *target*.

    Encodes categoricals with LabelEncoder, standardizes numericals,
    reshapes to (samples, 1, features) for the LSTM layers, trains for
    50 epochs, prints test MAE, and saves to ``{model_name}_lstm.h5``.

    Args:
        df: Preprocessed DataFrame (output of ``preprocess_data``).
        target: Name of the column to predict.
        model_name: Stem used for the saved model filename.
    """
    X = df.drop(columns=['Mileage Km/L', 'Price', 'Name', 'Original_Year'])
    y = df[target]
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']
    # Label-encode on the full column so train and test share one mapping
    # and unseen-category errors cannot occur at transform time.
    for col in categorical_cols:
        X[col] = LabelEncoder().fit_transform(X[col])
    # Split BEFORE scaling: the original fitted the scaler on the entire
    # dataset, leaking test-set statistics (mean/std) into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
    # Reshape data for LSTM (samples, timesteps=1, features)
    X_train = np.expand_dims(X_train.values, axis=1)
    X_test = np.expand_dims(X_test.values, axis=1)
    # Build LSTM model
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='tanh', return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='tanh'),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    # Train the model
    history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)
    # Test evaluation
    loss, mae = model.evaluate(X_test, y_test, verbose=1)
    print(f"LSTM Model Performance for {target}:")
    print(f"MAE: {mae:.2f}")
    # Save the model
    model.save(f'{model_name}_lstm.h5')
    print(f"LSTM model saved as '{model_name}_lstm.h5'")
# Main Function
def main():
    """Entry point: load the dataset, preprocess it, then train one
    Random Forest and one LSTM model for each prediction target."""
    file_path = 'data.csv'  # Update with your dataset file path
    df = load_data(file_path)
    print("Dataset loaded.")
    df = preprocess_data(df)
    print("Data preprocessing complete.")
    # (label for progress message, target column, saved-model name stem)
    jobs = [
        ('mileage', 'Mileage Km/L', 'mileage_predictor'),
        ('price', 'Price', 'price_predictor'),
        ('year', 'Original_Year', 'year_predictor'),
    ]
    for label, target, model_name in jobs:
        print(f"Training {label} prediction models...")
        train_rf_model(df, target=target, model_name=model_name)
        train_lstm_model(df, target=target, model_name=model_name)


if __name__ == "__main__":
    main()
|