# Import necessary libraries
import pandas as pd
import numpy as np

# Load the customer purchase dataset (path is relative to the working directory)
data = pd.read_csv('customer_purchase.csv')

# Handle missing values using mean imputation.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0
# DataFrame.mean() raises TypeError when the frame holds a non-numeric column
# (e.g. strings), whereas older versions silently skipped such columns.
# Non-numeric columns are therefore left un-imputed.
data.fillna(data.mean(numeric_only=True), inplace=True)

import matplotlib.pyplot as plt

# Scatter plot: customer Age on the x-axis against Purchase on the y-axis
plt.scatter(x=data['Age'], y=data['Purchase'])
# Title and both axis labels are applied in one call on the current axes
plt.gca().set(title='Age vs Purchase', xlabel='Age', ylabel='Purchase')
plt.show()

# Normalize the Age and Income columns using Min-Max scaling:
# scaled = (value - min) / (max - min), stored as '<column>_scaled'.
for column in ('Age', 'Income'):
    column_min = data[column].min()
    column_range = data[column].max() - column_min
    # A constant column has zero range; dividing by it would turn every row
    # into NaN (0 / 0), which LinearRegression cannot be fitted on. Divide by
    # 1 instead so such a column scales to 0.0 (the convention sklearn's
    # MinMaxScaler follows for zero-range features).
    divisor = column_range if column_range != 0 else 1
    data[column + '_scaled'] = (data[column] - column_min) / divisor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# --- Simple linear regression: Purchase modelled from scaled Age only ---

# Target vector and single-column feature matrix (list selector keeps it 2-D)
y = data['Purchase']
X = data[['Age_scaled']]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
model_simple = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_simple = model_simple.predict(X_test)

# Report the fitted coefficient and the test-set metrics
print("Simple Linear Regression:")
print("Coefficients:", model_simple.coef_)
print("R-squared:", r2_score(y_true=y_test, y_pred=y_pred_simple))
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_simple))

# --- Multiple linear regression: Purchase modelled from scaled Age and Income ---

# Target vector and two-column feature matrix
y = data['Purchase']
X = data[['Age_scaled', 'Income_scaled']]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
model_multiple = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_multiple = model_multiple.predict(X_test)

# Report the fitted parameters and the test-set metrics
print("\nMultiple Linear Regression:")
print("Intercept:", model_multiple.intercept_)
print("Coefficients:", model_multiple.coef_)
print("R-squared:", r2_score(y_true=y_test, y_pred=y_pred_multiple))
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_multiple))

# NOTE(review): this rebuilds the same two-feature matrix, target and 80/20
# split (seed 42) that the multiple-regression section created earlier;
# presumably so a following section starts from fresh variables - confirm.
y = data['Purchase']
X = data[['Age_scaled', 'Income_scaled']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)