A Statistical Exploration and Predictive Modeling of Ford Car Attributes

Spring 2024 Data Science Project

Yuvraj Delada, Jaskaran Gill, Anchita Shukla

Member 1: Anchita Shukla, Contribution: 100%

Member 2: Yuvraj Delada, Contribution: 100%

Member 3: Jaskaran Gill, Contribution: 100%

"We, all team members, agree together that the above information is true, and we are confident about our contributions to this submitted project/final tutorial." - Anchita Shukla, Yuvraj Delada, Jaskaran Gill 5/7/2024


We all explored different datasets and decided which one would be best, as well as what exploratory analyses we should perform and how we should clean the data. We also wrote the steps together for the Data Curation part.

Anchita explored the t-test, I explored ANOVA, and Jaskaran did the Chi-squared test, and we each wrote up our respective tests. For the initial regression, Jaskaran and I wrote the code while Anchita figured out a better way to do the regression and provided insights into what was happening. I wrote the Ridge regression, while Anchita and Jaskaran each used the data to answer the questions proposed in the introduction. We all worked on the insights and conclusions, as well as the introduction.

Introduction

Ford has been a leading car brand in America for over a century and is known for its high-quality vehicles. The United States is Ford's largest market, as wholesales to U.S. dealerships reached 1.7 million vehicles in 2021. Additionally, according to a Statista study, in 2022 Ford overtook Toyota as the leading car brand in the United States based on vehicle sales, delivering about 1.8 million units to U.S. customers. The dataset we interpret in this analysis contains information about used Ford car prices and the various factors that may influence them. It comes from Kaggle, a data science platform with over 276,000 high-quality public datasets.

We were interested in this data for several reasons. From a business perspective, analyzing the factors that influence used car prices can help identify which variables are the key drivers of pricing in the used car market. For Ford, understanding the price dynamics in this market can inform decisions about consumer preferences and help identify trends. A predictive model built from this dataset would be valuable to both buyers and sellers of used cars. The dataset has 17,966 observations and 7 predictor variables for the response variable, price. The numerical explanatory variables are production year (year), number of miles traveled (mileage), annual tax in dollars (tax), miles per gallon (mpg), and the car's engine size in liters (engineSize). The categorical explanatory variables are the car's transmission type (transmission) and fuel type (fuelType).


Using our data, we explored associations between car models and transmission types, the influence of fuel type on mileage performance, and variations in price points across different fuel types. Building upon what we uncovered through these exploratory processes, we created a predictive model to answer questions that can benefit both consumers and Ford, informing both parties' decision making. The critical questions we aim to answer are:

  • How do specific car statistics influence the price of a Ford vehicle?
  • How well does the model predict the prices of Ford vehicles in different price ranges?

We aim to provide insight into what influences the pricing of vehicles from the best-selling car company in America, which can help Ford in product development, pricing strategies, and even marketing efforts. Analyzing how the predictions behave across different price ranges is useful for consumers who may view this model as a guide: it is important to know how well the model performs in the price range they are shopping in.

Sources:

  • Scikit-Learn GridSearchCV (open-source ML library for Python):

    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

  • Chi-Square Test:

    https://www.geeksforgeeks.org/ml-chi-square-test-for-feature-selection/

  • T-Test:

    https://www.geeksforgeeks.org/t-test/

  • One-Way ANOVA:

    https://www.geeksforgeeks.org/one-way-anova/

  • Data Curation:

    https://www.techtarget.com/searchbusinessanalytics/definition/data-curation

In [1]:
#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, ttest_ind, f_oneway
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, r2_score

Data Curation

  1. We imported the dataset as a DataFrame in Pandas.

    https://www.kaggle.com/datasets/adhurimquku/ford-car-price-prediction/data

  2. Using info(), we can see that all columns already have their desired data types, so there is no need to convert anything to numeric values.
  3. All the columns have the same non-null count, which is an indication that there are no missing values. We confirm this with code later.
  4. The 'year' column indicates the year in which the car was produced. Raw calendar years (roughly 1996-2020 in this data) are not very meaningful numbers on their own, so to properly quantify how old these cars are, we convert the production year into the car's age. We can then use the age to draw meaningful conclusions.
  5. Removed duplicate data. There were duplicate rows, as the row count dropped from 17,966 (the non-null count shown above) to 17,812; the summary table further below shows 17,811 because one additional outlier row is removed.
  6. Checked for any missing data. This confirms that there are indeed no missing values and that we can proceed to exploring the data.
  7. Reset the indices after cleaning, as the indices no longer corresponded one-to-one to their rows.
In [2]:
# read in the ford csv
df = pd.read_csv("ford.csv")

#check the info of the DataFrame for any abnormalities
print(df.info())

#change the 'year' column to something more meaningful,'age'.
df['age_of_car'] = 2024 - df['year']
del df['year'] #no need for 'year' anymore

#get rid of duplicate data
df.drop_duplicates(inplace=True)

# this block of code checks for any missing data
missing_values = df.isnull().sum()
if missing_values.any():
    print("columns with missing vals:")
    print(missing_values[missing_values > 0])
else:
    print("none")

#reset the index to be properly 0-indexed after some cleaning
df.reset_index(drop=True, inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  int64  
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   tax           17966 non-null  int64  
 7   mpg           17966 non-null  float64
 8   engineSize    17966 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB
None
none
In [3]:
#we see a negative age (-36, i.e. a listed production year of 2060), which does not
#make sense! Run this alone first to see the negative value.
df.describe()
df[df.age_of_car == -36]

#drop the outlier
df.drop(df.query("age_of_car == -36").index, inplace = True)
df.describe()
Out[3]:
               price        mileage           tax           mpg    engineSize    age_of_car
count   17811.000000   17811.000000  17811.000000  17811.000000  17811.000000  17811.000000
mean    12269.880523   23379.381955    113.309865     57.909545      1.350620      7.140026
std      4736.220719   19418.128363     62.032540     10.132348      0.432593      2.026478
min       495.000000       1.000000      0.000000     20.800000      0.000000      4.000000
25%      8999.000000   10000.000000     30.000000     52.300000      1.000000      6.000000
50%     11289.000000   18274.000000    145.000000     58.900000      1.200000      7.000000
75%     15295.000000   31092.500000    145.000000     65.700000      1.500000      8.000000
max     54995.000000  177644.000000    580.000000    201.800000      5.000000     28.000000

Exploratory Data Analysis With Visualizations

1. Chi-Squared Test

$H_{0}$: There is no association between the car model and the transmission type

$H_{a}$: There is an association between the car model and the transmission type

Contingency Table Creation: The first step involves creating a contingency table that shows the frequency distribution of the two variables. This table is essential for the Chi-Squared test, as it provides the observed frequencies in each category.

Chi-Squared Test Computation: Using chi2_contingency, we compute the Chi-Squared statistic and the p-value. The p-value helps us determine whether to reject the null hypothesis (no association between the variables).

Visualization: We plot a bar chart to visually inspect the distribution of transmission types across different car models. This helps in understanding the data intuitively.

In [4]:
# Chi-Squared Test
contingency_table = pd.crosstab(df['model'], df['transmission'])
_, p_val, _, _ = chi2_contingency(contingency_table)
print("P-value for Chi-Squared Test:", p_val)
contingency_table.plot(kind='bar', figsize=(10, 6))
plt.title("Model vs. Transmission Type")
plt.xlabel("Model")
plt.ylabel("Count")
plt.show()
P-value for Chi-Squared Test: 0.0

Chi-Squared Test Conclusion:

This test investigates the association between two categorical variables, which in this case are the car model and transmission type. With a p-value of effectively 0, we reject the null hypothesis, indicating that a significant association exists between the car model and the transmission type.

2. Two-Sample T-Test

$H_{0}$: The mean mpg is not different across petrol and diesel cars.

$H_{a}$: The mean mpg is significantly different across petrol and diesel cars.

Data Selection: Extract MPG data for each group (petrol and diesel cars).

T-Test Execution: Perform the two-sample T-test to compare the means of the two groups. The result is expressed through a p-value.

Visualization: Histograms are plotted to show the distribution of MPG values for both fuel types, highlighting the central tendency and dispersion.

In [5]:
# Two Sample T-Test
petrol_mpg = df[df['fuelType'] == 'Petrol']['mpg']
diesel_mpg = df[df['fuelType'] == 'Diesel']['mpg']
_, p_val = ttest_ind(petrol_mpg, diesel_mpg)
print("P-value for T Test:", p_val)
plt.figure(figsize=(8, 6))
plt.hist([petrol_mpg, diesel_mpg], bins=10, label=['Petrol', 'Diesel'], density=True)
plt.title("MPG Distribution for Petrol and Diesel Cars")
plt.xlabel("MPG")
plt.ylabel("Density")
plt.legend()
plt.show()
P-value for T Test: 0.0

Two-Sample T-Test Conclusion:

This test compares the mean miles per gallon between two groups, petrol and diesel cars, to see if there is a significant difference between them. Given a p-value of effectively 0 again, we reject the null hypothesis, indicating that the mean mpg significantly differs between petrol and diesel cars.

3. One-Way ANOVA Test

$H_{0}$: There is no significant difference in mean prices of cars among different fuel types.

$H_{a}$: At least one fuel type has a significantly different mean price than the other fuel types.

Grouping Data: First, group data by fuelType and collect price data for each group.

ANOVA Test: Perform the one-way ANOVA to see if there are significant differences in the mean prices among the different fuel types.

Visualization: A boxplot shows price distributions across fuel types, highlighting median, quartiles, and outliers.

In [6]:
#One-way ANOVA
unique_fuels = df['fuelType'].unique() # ['Petrol', 'Diesel', 'Hybrid', 'Electric', 'Other']
fuel_type_groups = df.groupby('fuelType')['price']
_, p_val = f_oneway(*[group for name, group in fuel_type_groups])
print("P-value for ANOVA Test:", p_val)
plt.figure(figsize=(10, 6))
sns.boxplot(x='fuelType', y='price', data=df, hue='fuelType', palette='Set2', dodge=False)
plt.title("Price Distribution across Different Fuel Types")
plt.xlabel("Fuel Type")
plt.ylabel("Price")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
P-value for ANOVA Test: 1.9977378926368497e-179

One-Way ANOVA Test Conclusion:

This test compares means across multiple groups; in this case, it assesses whether car prices vary significantly across different fuel types. With a p-value close to 0, we reject the null hypothesis and conclude that there is a significant difference in car prices among the different fuel types.

Overall Conclusion:

Each of the statistical tests applied results in rejecting the null hypothesis, due to p-values at or near zero, leading us to conclude that there are significant differences or associations in each case.

  • There is a significant association between the car model and transmission type.
  • There is a significant difference in the mean mpg between petrol and diesel cars.
  • There is a significant difference in the mean prices among cars of different fuel types.

These conclusions reinforce the idea that crucial characteristics of a car, such as the model, fuel type, and transmission type, play a vital role in outcomes such as fuel efficiency and car pricing.

Primary Analysis and Visualizations

Given the nature of the questions that we want to answer, as well as the format of the dataset, regression analysis will be used. The main goal is to predict Ford car prices based on various features of previously listed Ford vehicles, which addresses the first question proposed in the introduction. The target variable (car price) is continuous, and we want to understand it in relation to one or more predictor variables, making regression analysis suitable.

We start our regression by identifying the target variable and the features whose influence on it we want to inspect. After splitting our data into training and test sets, we use one-hot encoding on the categorical features so that the regression can handle categorical values. We then train our model.

In [7]:
# Splitting features and target variable
X = df.drop(columns=['price'])
y = df['price']

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing categorical features
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

onehot = OneHotEncoder(handle_unknown='ignore')

# Preprocessing pipeline for categorical variables
categorical_transformer = Pipeline(steps=[
    ('onehot', onehot)
])

# Column Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
    ])

# Append regression model to preprocessing pipeline
regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', LinearRegression())])

# Train the model
regression_model.fit(X_train, y_train)

# Make predictions
y_pred = regression_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
Mean Absolute Error: 2681.0750883750725
In [8]:
# Make predictions on the remaining data
y_remaining_pred = regression_model.predict(X)

# Assess model accuracy
mae_remaining = mean_absolute_error(y, y_remaining_pred)
r2_remaining = r2_score(y, y_remaining_pred)
print("Mean Absolute Error on Remaining Data:", mae_remaining)
print("R-squared on Remaining Data:", r2_remaining)

# Create visualizations
# Scatter plot of actual vs. predicted prices
plt.figure(figsize=(10, 6))
plt.scatter(y, y_remaining_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Actual vs. Predicted Prices')
plt.grid(True)
plt.show()

# Distribution of residuals
residuals = y - y_remaining_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, bins=30, kde=True)
plt.xlabel('Residuals ($)')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.grid(True)
plt.show()
Mean Absolute Error on Remaining Data: 2669.9070307127663
R-squared on Remaining Data: 0.4440272329154157

As we can see, the model did not perform very well: the mean absolute error is around 2,700 dollars, a large fraction of the average car price (about 12,270 dollars), and the R^2 is relatively low, with only about 44% of the variability in the data explained by the model. Part of the problem is that the ColumnTransformer above lists only the categorical columns, and its default remainder='drop' silently discards the numerical features, so this first model never sees mileage, age, or engine size. To properly answer the questions proposed in this analysis, we must improve the model. The following code applies another form of regression, Ridge regression, with the numerical features scaled and included, and reports statistics that show how much the model improved.
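If we wanted to keep the plain linear regression, one simple option (a sketch, not something run in this notebook, assuming the X_train/X_test split and categorical_cols from the previous cells are reused) would be to tell the ColumnTransformer to pass the remaining numerical columns through instead of dropping them:

# Sketch (not run above): keep the numerical features in the baseline linear model
# by passing through any columns the categorical encoder does not handle.
baseline_with_numeric = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
        remainder='passthrough')),  # numerical columns are no longer silently dropped
    ('regressor', LinearRegression())
])
baseline_with_numeric.fit(X_train, y_train)
print("MAE with numeric features:", mean_absolute_error(y_test, baseline_with_numeric.predict(X_test)))

Instead, the next cell moves to Ridge regression with the numerical features scaled and included, which also regularizes the many one-hot encoded columns.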

In [9]:
# Separating numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Setting up preprocessing pipeline for numerical variables
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine all of it into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Updated regression model, using the preprocessor we just made and Ridge
ridge_model_improved = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('regressor', Ridge())])

# Define a range of alpha values for Ridge Regression to try
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Grid search for hyperparameter tuning, allows us to find the best params
grid_search = GridSearchCV(ridge_model_improved, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Train the model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Ridge Regression Model:")
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
plt.xlabel('Actual Price ($)')
plt.ylabel('Predicted Price ($)')
plt.title('Actual vs. Predicted Prices (Ridge Regression)')
plt.grid(True)
plt.show()
Best Ridge Regression Model:
Mean Absolute Error: 1351.0499581605309
R-squared: 0.8578125364945244

Now we see that the best Ridge model yields a lower MAE and a much better R^2, with about 86% of the variance in the data explained by the model. To determine which features influence the predicted price the most, we can run the following code, which inspects the model's coefficients.

In [10]:
# Retrieve coefficients of all features (the numerical columns come first in the
# ColumnTransformer, followed by the one-hot encoded categorical columns)
coefficients = best_model.named_steps['regressor'].coef_

# Get feature names. Note: the one-hot encoder expands each categorical column
# into many dummy columns, so the zip() below truncates; the numerical labels are
# correct, but the last three printed lines are really the coefficients of the
# first three one-hot columns rather than of the whole categorical feature.
# In recent scikit-learn versions, full names are available via
# best_model.named_steps['preprocessor'].get_feature_names_out().
feature_names = numerical_cols + categorical_cols

# Create a dictionary mapping feature names to coefficients
feature_coefficients = dict(zip(feature_names, coefficients))

# Print feature coefficients
for feature, coefficient in feature_coefficients.items():
    print(f"{feature}: {coefficient}")
mileage: -1214.1141925518411
tax: -47.87943202212063
mpg: -733.7412275620932
engineSize: 1234.0538223049728
age_of_car: -2263.0816456289454
model: -3984.0314459804003
transmission: -2825.4352273861523
fuelType: -2359.862612889144

We can judge the statistics of the car to evaluate the first question we proposed. We see mileage, mpg, engineSize, and age_of_car and their respective coefficients; the largest in absolute value are age_of_car and engineSize, so we inspect them. Because the numerical features were standardized before fitting, each coefficient describes the effect of a one-standard-deviation change rather than a one-unit change: a one-standard-deviation increase in the age of the car (about two years) is associated with a decrease of roughly 2,263 dollars in price, holding all other factors constant, and a one-standard-deviation increase in engine size (about 0.43 liters) is associated with an increase of roughly 1,234 dollars. Both directions make intuitive sense and are valuable insights for both consumers and Ford.
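Because those coefficients are on the standardized scale, a quick way to recover approximate per-unit effects (dollars per additional year of age, dollars per extra liter of engine size) is to divide each numerical coefficient by the standard deviation the scaler learned for that feature. A minimal sketch, assuming the fitted best_model and numerical_cols from the cells above:

# Sketch: convert standardized Ridge coefficients back to per-unit effects.
# Assumes best_model and numerical_cols are defined as in the cells above.
fitted_preprocessor = best_model.named_steps['preprocessor']
scaler = fitted_preprocessor.named_transformers_['num'].named_steps['scaler']

# The numerical columns come first in the ColumnTransformer, so their
# coefficients occupy the first len(numerical_cols) positions.
num_coefs = best_model.named_steps['regressor'].coef_[:len(numerical_cols)]

for name, coef, std in zip(numerical_cols, num_coefs, scaler.scale_):
    print(f"{name}: {coef / std:.2f} dollars per one-unit increase")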

In [11]:
# Define price ranges
price_ranges = [
    (0, 10000),
    (10000, 20000),
    (20000, 30000),
    (30000, 40000),
    (40000, 50000),
    (50000, np.inf)
]

# Evaluate model performance for each price range
for low, high in price_ranges:
    mask = (y_test >= low) & (y_test < high)
    y_test_range = y_test[mask]
    y_pred_range = y_pred[mask]

    if len(y_test_range) > 0:
        mae_range = mean_absolute_error(y_test_range, y_pred_range)
        r2_range = r2_score(y_test_range, y_pred_range)

        print(f"Price Range: ${low} - ${high}")
        print(f"Number of Samples: {len(y_test_range)}")
        print(f"Mean Absolute Error: {mae_range:.2f}")
        print(f"R-squared: {r2_range:.2f}")
        print("---")
    else:
        print(f"No samples in the price range: ${low} - ${high}")
        print("---")
Price Range: $0 - $10000
Number of Samples: 1288
Mean Absolute Error: 1341.15
R-squared: 0.01
---
Price Range: $10000 - $20000
Number of Samples: 2091
Mean Absolute Error: 1189.84
R-squared: 0.73
---
Price Range: $20000 - $30000
Number of Samples: 163
Mean Absolute Error: 2872.99
R-squared: -0.88
---
Price Range: $30000 - $40000
Number of Samples: 19
Mean Absolute Error: 6460.98
R-squared: -7.18
---
Price Range: $40000 - $50000
Number of Samples: 2
Mean Absolute Error: 3691.65
R-squared: -53.71
---
No samples in the price range: $50000 - $inf
---

Judging by the per-range results, the model performs best in the 10,000-20,000 dollar range (R^2 of 0.73 and the lowest MAE) and still produces small absolute errors below 10,000 dollars, while its predictions degrade sharply above 20,000 dollars, where the test set contains very few samples. Note that R^2 computed within a narrow price band is naturally lower because there is little variance left to explain, so MAE is the more informative metric here. For consumers, this suggests the model is a more reliable guide when shopping for lower-priced Fords than for the more expensive ones.
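As a supplementary check (a sketch, not part of the results above), each band's error can be put on a common scale by dividing its MAE by the band's mean actual price, reusing y_test, y_pred, and price_ranges from the previous cell:

# Sketch: relative MAE per price band, which is easier to compare across bands
# than raw MAE or within-band R^2. Reuses y_test, y_pred, and price_ranges.
for low, high in price_ranges:
    mask = (y_test >= low) & (y_test < high)
    if mask.sum() == 0:
        continue
    mae_band = mean_absolute_error(y_test[mask], y_pred[mask])
    print(f"${low}-${high}: relative MAE = {mae_band / y_test[mask].mean():.1%} (n={mask.sum()})")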

Insights and Conclusions:

The comprehensive analysis of Ford car attributes has illuminated several important factors that influence the prices of used Ford vehicles, leveraging a rich dataset to develop predictive models with practical implications. This project has bridged the gap between theoretical data science techniques and real-world applications, providing clear insights into automotive market dynamics.

For an uninformed reader, the project serves as a detailed introduction to how data science can be applied in understanding and predicting car prices based on historical data. The explanations of statistical tests and modeling processes, alongside the use of visual aids, help make complex concepts easier to understand and engaging. By detailing each step from data curation to model evaluation, the project ensures that readers without a prior background can grasp the significance of each analysis phase and understand how these contribute to the final model's predictions.

For those already familiar with the topic, the project offers a deeper dive into specific data science applications within the automotive industry. The analysis of different fuel types, transmission systems, and car models with respect to price provides comparative insight that could enhance an experienced reader's understanding of market trends. Additionally, the exploration of various statistical tests and regression models offers a practical demonstration of improving model performance on real-world datasets.

Overall, the project not only informs but also equips readers with knowledge about advanced data analysis techniques applied in a meaningful context. It highlights the critical role of data-driven decision-making in business strategies and consumer awareness, making a compelling case for the power of analytics in transforming industries. This balance of educational and practical insights ensures that all readers, irrespective of their prior knowledge, come away with both a foundational understanding and deeper insights into data science applications in the automotive sector.