diff options
author | Biswakalyan Bhuyan <biswa@surgot.in> | 2024-11-27 16:58:22 +0530 |
---|---|---|
committer | Biswakalyan Bhuyan <biswa@surgot.in> | 2024-11-27 16:58:22 +0530 |
commit | 9f56376d64bf3307b18cd67d0fe8aaf5a6860f11 (patch) | |
tree | 515f39ebb936cabe9f5598c6fd784d64dd7eab88 /main.py | |
download | autopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.tar.gz autopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.tar.bz2 autopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.zip |
Data Prediction of Cars
Diffstat (limited to 'main.py')
-rw-r--r-- | main.py | 105 |
1 files changed, 105 insertions, 0 deletions
"""Train a random-forest regressor that predicts car mileage (Km/L) from a CSV."""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


# Load Dataset
def load_data(file_path):
    """Read the CSV at *file_path* and drop any column whose name starts with 'Unnamed'.

    Such columns are what pandas produces for a header-less index column
    that was written out alongside the data.
    """
    df = pd.read_csv(file_path)
    # Drop unnecessary index column if present
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df


# Preprocessing
def preprocess_data(df, reference_year=2024):
    """Return a cleaned copy of *df* ready for training.

    Steps, in order:
      1. Replace 'Year' with 'Car_Age' (``reference_year - Year``).
      2. Fill missing 'Engine CC' and 'Power' with the column median and
         missing 'Seats' with the column mode.
      3. Drop rows whose target 'Mileage Km/L' is missing.
      4. Drop rows whose 'Kilometers_Driven' lies outside the 1.5 * IQR fences.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: Raw dataframe containing at least the columns named above.
        reference_year: Year the car age is measured against. Defaults to
            2024, the value that was previously hard-coded.

    Returns:
        A new dataframe with the transformations applied.
    """
    # Work on a copy so the caller's frame is left untouched and pandas
    # does not emit chained-assignment warnings on the writes below.
    df = df.copy()

    # Calculate Car Age
    df['Car_Age'] = reference_year - df['Year']
    df = df.drop(columns=['Year'])

    # Handle missing values
    df['Engine CC'] = df['Engine CC'].fillna(df['Engine CC'].median())
    df['Power'] = df['Power'].fillna(df['Power'].median())
    df['Seats'] = df['Seats'].fillna(df['Seats'].mode()[0])

    # Remove rows with missing target variable
    df = df.dropna(subset=['Mileage Km/L'])

    # Remove outliers in 'Kilometers Driven' using the 1.5 * IQR rule
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[df['Kilometers_Driven'].between(lower_bound, upper_bound)]

    # Return processed dataframe
    return df


# Train Model
def train_model(df, model_path='model.pkl'):
    """Fit, evaluate and save a random-forest pipeline predicting 'Mileage Km/L'.

    The data is split 80/20, a scaler + one-hot preprocessing step feeds a
    ``RandomForestRegressor``, and ``n_estimators`` / ``max_depth`` are tuned
    with 5-fold grid search. MAE, RMSE and R^2 on the held-out split are
    printed, then the best pipeline is written to *model_path* with joblib.

    Args:
        df: Preprocessed dataframe (see ``preprocess_data``).
        model_path: Destination file for the fitted pipeline. Defaults to
            'model.pkl', the path that was previously hard-coded.
    """
    # Features and target
    X = df.drop(columns=['Mileage Km/L', 'Name', 'Price'])
    y = df['Mileage Km/L']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ColumnTransformer for preprocessing
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    # NOTE(review): the encoder raises on categories not seen during fit;
    # consider handle_unknown='ignore' if rare categories appear in the data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ]
    )

    # Random Forest Regressor pipeline
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ])

    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None],
    }

    grid_search = GridSearchCV(
        rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Best model
    best_model = grid_search.best_estimator_

    # Test predictions
    y_pred = best_model.predict(X_test)

    # Evaluation
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Model Performance:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")

    # Save the model and report the path that was actually written
    joblib.dump(best_model, model_path)
    print(f"Model saved as '{model_path}'")


# Main Function
def main():
    """Load 'data.csv', preprocess it, and train/save the mileage model."""
    file_path = 'data.csv'  # Update with your dataset file path
    df = load_data(file_path)
    print("Dataset loaded.")

    df = preprocess_data(df)
    print("Data preprocessing complete.")

    train_model(df)


if __name__ == "__main__":
    main()