import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
# Load Dataset
def load_data(file_path):
    """Read the dataset from CSV and discard leftover index columns.

    Args:
        file_path: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A DataFrame without any column whose name starts with ``Unnamed``
        (the name pandas gives to a header-less column, typically a saved
        index).
    """
    raw = pd.read_csv(file_path)
    # Boolean mask over the column labels: True for "Unnamed: 0"-style names.
    is_unnamed = raw.columns.str.contains('^Unnamed')
    return raw.loc[:, ~is_unnamed]
# Preprocessing
def preprocess_data(df, reference_year=2024):
    """Clean the raw listings frame and derive the modelling features.

    Steps, in order:

    1. Copy ``Year`` into ``Original_Year`` and replace ``Year`` with
       ``Car_Age`` (``reference_year - Year``).
    2. Fill missing ``Engine CC`` and ``Power`` with the column median and
       missing ``Seats`` with the most frequent value.
    3. Drop rows that lack ``Mileage Km/L`` or ``Price``.
    4. Drop rows whose ``Kilometers_Driven`` falls outside the
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` fences.

    Args:
        df: Raw DataFrame containing the columns named above. It is left
            untouched; all work happens on a copy.
        reference_year: Year that car age is measured against. Defaults to
            2024, the value this script has always used.

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    df = df.copy()

    # Keep the raw year around before it is turned into an age.
    df['Original_Year'] = df['Year']
    df['Car_Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year'])

    # Handle missing values.
    # NOTE(review): these fill values are computed over every row, including
    # rows that train_model later holds out as the test split.
    df['Engine CC'] = df['Engine CC'].fillna(df['Engine CC'].median())
    df['Power'] = df['Power'].fillna(df['Power'].median())
    seat_modes = df['Seats'].mode()
    # mode() is empty when the column has no non-null value; in that case
    # there is nothing sensible to fill with, so leave the column alone.
    if not seat_modes.empty:
        df['Seats'] = df['Seats'].fillna(seat_modes[0])

    # Remove rows with a missing target variable.
    df = df.dropna(subset=['Mileage Km/L', 'Price'])

    # Remove outliers in 'Kilometers_Driven' using the 1.5 * IQR rule.
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    within_fences = df['Kilometers_Driven'].between(lower_bound, upper_bound)
    return df[within_fences]
# Train Model
def train_model(df, target, model_name):
    """Tune, evaluate and save a random-forest regressor for one target.

    A scaling/one-hot preprocessing step plus ``RandomForestRegressor`` is
    tuned with a 5-fold grid search on an 80 % training split, scored on the
    remaining 20 % (MAE, RMSE and R^2 are printed), and the best pipeline is
    written to ``<model_name>.pkl`` with joblib.

    Args:
        df: Preprocessed DataFrame holding the feature columns listed in the
            body and the ``target`` column.
        target: Name of the column to predict.
        model_name: File stem of the saved model (``.pkl`` is appended).

    Returns:
        The fitted best pipeline selected by the grid search.
    """
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    # Never feed the target, or a column computed directly from it, to the
    # model: preprocess_data derives 'Car_Age' from the year, so using it to
    # predict 'Original_Year' would leak the answer.
    excluded = {target}
    if target == 'Original_Year':
        excluded.add('Car_Age')
    numerical_cols = [col for col in numerical_cols if col not in excluded]
    categorical_cols = [col for col in categorical_cols if col not in excluded]

    # Features and target. Only the listed columns are used; everything else
    # in the frame (names, other targets) is ignored.
    X = df[numerical_cols + categorical_cols]
    y = df[target]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # handle_unknown='ignore' keeps a category that is absent from a training
    # fold from raising at transform time; it is then encoded as all zeros.
    # Combining it with drop= needs scikit-learn 1.0 or newer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'),
             categorical_cols),
        ]
    )

    # Random Forest Regressor pipeline
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ])

    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None],
    }
    grid_search = GridSearchCV(
        rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Best model, refit on the whole training split by GridSearchCV.
    best_model = grid_search.best_estimator_

    # Evaluate on the held-out split.
    y_pred = best_model.predict(X_test)
    # Take the square root explicitly: the squared=False shortcut has been
    # deprecated and removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Model Performance for {target}:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")

    # Save the model
    model_file = f'{model_name}.pkl'
    joblib.dump(best_model, model_file)
    print(f"Model saved as '{model_file}'")
    return best_model
# Main Function
def main(file_path='data.csv'):
    """Run the whole pipeline: load, preprocess, then train one model per target.

    Args:
        file_path: CSV dataset to load. Defaults to ``'data.csv'`` so existing
            no-argument calls behave as before.
    """
    df = load_data(file_path)
    print("Dataset loaded.")
    df = preprocess_data(df)
    print("Data preprocessing complete.")

    # (progress message, target column, saved-model file stem)
    training_jobs = (
        ("Training mileage prediction model...", 'Mileage Km/L', 'mileage_predictor'),
        ("Training price prediction model...", 'Price', 'price_predictor'),
        ("Training year prediction model...", 'Original_Year', 'year_predictor'),
    )
    for message, target, model_name in training_jobs:
        print(message)
        train_model(df, target=target, model_name=model_name)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()