summaryrefslogtreecommitdiffstats
path: root/main.py
diff options
context:
space:
mode:
authorLibravatarLibravatar Biswakalyan Bhuyan <biswa@surgot.in> 2024-11-27 16:58:22 +0530
committerLibravatarLibravatar Biswakalyan Bhuyan <biswa@surgot.in> 2024-11-27 16:58:22 +0530
commit9f56376d64bf3307b18cd67d0fe8aaf5a6860f11 (patch)
tree515f39ebb936cabe9f5598c6fd784d64dd7eab88 /main.py
downloadautopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.tar.gz
autopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.tar.bz2
autopredict-9f56376d64bf3307b18cd67d0fe8aaf5a6860f11.zip
Data Prediction of Cars
Diffstat (limited to 'main.py')
-rw-r--r--main.py105
1 files changed, 105 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d01d421
--- /dev/null
+++ b/main.py
@@ -0,0 +1,105 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+import joblib
+
+# Load Dataset
def load_data(file_path):
    """Read the CSV at *file_path* and discard pandas' auto-named columns.

    Columns whose label starts with ``Unnamed`` (what ``read_csv`` assigns
    to a header-less column, such as a saved index) are removed.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            dataset.

    Returns:
        DataFrame holding every column except the ``Unnamed*`` ones.
    """
    frame = pd.read_csv(file_path)

    # True for each column label beginning with "Unnamed".
    is_unnamed = frame.columns.str.contains('^Unnamed')
    keep_mask = ~is_unnamed

    return frame.loc[:, keep_mask]
+
+# Preprocessing
def preprocess_data(df, current_year=2024):
    """Clean the raw car dataset and derive the ``Car_Age`` feature.

    Steps, in order:
      1. Replace ``Year`` with ``Car_Age = current_year - Year``.
      2. Fill missing ``Engine CC`` / ``Power`` with the column median and
         missing ``Seats`` with the most frequent value.
      3. Drop rows whose target ``Mileage Km/L`` is missing.
      4. Drop rows whose ``Kilometers_Driven`` lies outside
         ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df: DataFrame containing at least the columns ``Year``,
            ``Engine CC``, ``Power``, ``Seats``, ``Mileage Km/L`` and
            ``Kilometers_Driven``.
        current_year: Reference year used to compute the car age.
            Defaults to 2024.

    Returns:
        A cleaned copy of *df* without ``Year`` and with ``Car_Age`` added.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame is never left half-processed
    # (and so calling this function twice on the same frame is safe).
    df = df.copy()

    # Calculate car age and retire the raw year column.
    df['Car_Age'] = current_year - df['Year']
    df = df.drop(columns=['Year'])

    # Impute numeric gaps with the median of each column.
    for column in ('Engine CC', 'Power'):
        df[column] = df[column].fillna(df[column].median())

    # mode() is empty when every value is missing; skip the fill rather
    # than fail on the [0] lookup.
    seat_modes = df['Seats'].mode()
    if not seat_modes.empty:
        df['Seats'] = df['Seats'].fillna(seat_modes.iloc[0])

    # Rows without a target value cannot be used for training.
    df = df.dropna(subset=['Mileage Km/L'])

    # Remove outliers in 'Kilometers_Driven' using the 1.5 * IQR rule.
    q1, q3 = df['Kilometers_Driven'].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    in_range = (
        (df['Kilometers_Driven'] >= lower_bound)
        & (df['Kilometers_Driven'] <= upper_bound)
    )

    return df[in_range]
+
+# Train Model
def train_model(df, model_path='model.pkl'):
    """Tune, evaluate and persist a random-forest mileage regressor.

    The target is ``Mileage Km/L``; ``Name`` and ``Price`` are excluded
    from the features. The data is split 80/20 (``random_state=42``), a
    pipeline of scaling / one-hot encoding followed by a
    ``RandomForestRegressor`` is tuned with a 5-fold grid search scored on
    negative mean squared error, and the best pipeline is evaluated on the
    held-out split. MAE, RMSE and R^2 are printed, and the fitted pipeline
    is written with ``joblib.dump``.

    Args:
        df: Preprocessed DataFrame containing ``Mileage Km/L``, ``Name``,
            ``Price`` and the feature columns listed in
            ``categorical_cols`` / ``numerical_cols`` below.
        model_path: Destination file for the serialized pipeline.
            Defaults to ``'model.pkl'``.

    Returns:
        The best fitted pipeline found by the grid search.

    Raises:
        KeyError: If ``Mileage Km/L``, ``Name`` or ``Price`` is missing
            from *df*.
    """
    # Features and target
    X = df.drop(columns=['Mileage Km/L', 'Name', 'Price'])
    y = df['Mileage Km/L']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # ColumnTransformer for preprocessing
    categorical_cols = ['Fuel_Type', 'Transmission', 'Owner_Type', 'Location']
    numerical_cols = ['Kilometers_Driven', 'Engine CC', 'Power', 'Seats', 'Car_Age']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # handle_unknown='ignore': a category that only shows up in a
            # validation fold or in the test split is encoded as all zeros
            # instead of raising during transform.
            (
                'cat',
                OneHotEncoder(drop='first', handle_unknown='ignore'),
                categorical_cols,
            ),
        ]
    )

    # Random Forest Regressor pipeline
    rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ])

    # Hyperparameter tuning
    param_grid = {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [10, 20, None],
    }

    grid_search = GridSearchCV(
        rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Best model
    best_model = grid_search.best_estimator_

    # Test predictions
    y_pred = best_model.predict(X_test)

    # Evaluation. RMSE is computed as sqrt(MSE) because the ``squared``
    # keyword of mean_squared_error was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Model Performance:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R^2: {r2_score(y_test, y_pred):.2f}")

    # Save the model and report the path that was actually written.
    joblib.dump(best_model, model_path)
    print(f"Model saved as '{model_path}'")

    return best_model
+
+# Main Function
def main(file_path='data.csv'):
    """Run the whole workflow: load the CSV, preprocess it, train the model.

    Args:
        file_path: Path of the dataset CSV. Defaults to ``'data.csv'``
            relative to the current working directory.
    """
    df = load_data(file_path)
    print("Dataset loaded.")

    df = preprocess_data(df)
    print("Data preprocessing complete.")

    train_model(df)


if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default
    # dataset path; with no argument the default 'data.csv' is used.
    main(*sys.argv[1:2])