import pandas as pd
import numpy as np
import seaborn as sns

# Load the tips dataset and keep three example features: two numeric
# (total_bill, size) and one categorical (day).
data = sns.load_dataset("tips")
three_features = ['total_bill', 'size', 'day']
three_feature_data = pd.DataFrame(data[three_features])
three_feature_data.iloc[[193, 90, 25, 26, 190], :]
|     | total_bill | size | day  |
|-----|------------|------|------|
| 193 | 15.48      | 2    | Thur |
| 90  | 28.97      | 2    | Fri  |
| 25  | 17.81      | 4    | Sat  |
| 26  | 13.37      | 2    | Sat  |
| 190 | 15.69      | 2    | Sun  |
# One-hot encode the categorical "day" column into one indicator column per day
# (newer pandas versions return booleans rather than 0/1; either works for regression).
dummies = pd.get_dummies(data['day'])
dummies.iloc[[193, 90, 25, 26, 190], :]
|     | Thur | Fri | Sat | Sun |
|-----|------|-----|-----|-----|
| 193 | 1    | 0   | 0   | 0   |
| 90  | 0    | 1   | 0   | 0   |
| 25  | 0    | 0   | 1   | 0   |
| 26  | 0    | 0   | 1   | 0   |
| 190 | 0    | 0   | 0   | 1   |
# Attach the indicator columns to the original three features, side by side (axis=1).
data_w_dummies = pd.concat([three_feature_data, dummies], axis=1)
data_w_dummies.iloc[[193, 90, 25, 26, 190], :]
|     | total_bill | size | day  | Thur | Fri | Sat | Sun |
|-----|------------|------|------|------|-----|-----|-----|
| 193 | 15.48      | 2    | Thur | 1    | 0   | 0   | 0   |
| 90  | 28.97      | 2    | Fri  | 0    | 1   | 0   | 0   |
| 25  | 17.81      | 4    | Sat  | 0    | 0   | 1   | 0   |
| 26  | 13.37      | 2    | Sat  | 0    | 0   | 1   | 0   |
| 190 | 15.69      | 2    | Sun  | 0    | 0   | 0   | 1   |
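As an aside (not used by the code below), the same indicator columns can be produced with scikit-learn's OneHotEncoder, which has the advantage that it can sit inside a Pipeline like the ones used later in this notebook. A minimal sketch, assuming a recent scikit-learn version:

from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the "day" column and produce one 0/1 column per category.
ohe = OneHotEncoder()
day_indicator_matrix = ohe.fit_transform(data[["day"]])   # sparse matrix, one column per day
ohe.get_feature_names_out()                               # names like 'day_Thur', 'day_Fri', ...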
from sklearn.linear_model import LinearRegression

# Fit a model on total_bill, size, and the four day indicators.
# fit_intercept=False because the indicators sum to 1 in every row,
# so they already play the role of an intercept term.
f_with_day = LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_dummies[["total_bill", "size", "Thur",
                               "Fri", "Sat", "Sun"]], data["tip"])
LinearRegression(fit_intercept=False)
# Predicted tip for a $50 bill, party of 3, on a Thursday.
f_with_day.predict([[50, 3, 1, 0, 0, 0]])
array([5.87937107])
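The learned weights can also be inspected directly: with no separate intercept, the four dummy weights act as per-day intercepts while total_bill and size each get one shared slope. A quick sketch (the exact values depend on the fit and are not reproduced here):

# Coefficients in feature order: total_bill, size, Thur, Fri, Sat, Sun.
f_with_day.coef_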
# Load the mpg dataset, shorten the horsepower column name, and drop rows
# with missing values.
vehicle_data = sns.load_dataset("mpg")
vehicle_data = vehicle_data.rename(columns={"horsepower": "hp"})
vehicle_data = vehicle_data.dropna()
vehicle_data
|     | mpg  | cylinders | displacement | hp    | weight | acceleration | model_year | origin | name                      |
|-----|------|-----------|--------------|-------|--------|--------------|------------|--------|---------------------------|
| 0   | 18.0 | 8         | 307.0        | 130.0 | 3504   | 12.0         | 70         | usa    | chevrolet chevelle malibu |
| 1   | 15.0 | 8         | 350.0        | 165.0 | 3693   | 11.5         | 70         | usa    | buick skylark 320         |
| 2   | 18.0 | 8         | 318.0        | 150.0 | 3436   | 11.0         | 70         | usa    | plymouth satellite        |
| 3   | 16.0 | 8         | 304.0        | 150.0 | 3433   | 12.0         | 70         | usa    | amc rebel sst             |
| 4   | 17.0 | 8         | 302.0        | 140.0 | 3449   | 10.5         | 70         | usa    | ford torino               |
| ... | ...  | ...       | ...          | ...   | ...    | ...          | ...        | ...    | ...                       |
| 393 | 27.0 | 4         | 140.0        | 86.0  | 2790   | 15.6         | 82         | usa    | ford mustang gl           |
| 394 | 44.0 | 4         | 97.0         | 52.0  | 2130   | 24.6         | 82         | europe | vw pickup                 |
| 395 | 32.0 | 4         | 135.0        | 84.0  | 2295   | 11.6         | 82         | usa    | dodge rampage             |
| 396 | 28.0 | 4         | 120.0        | 79.0  | 2625   | 18.6         | 82         | usa    | ford ranger               |
| 397 | 31.0 | 4         | 119.0        | 82.0  | 2720   | 19.4         | 82         | usa    | chevy s-10                |

392 rows × 9 columns
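As a quick check on the dropna() call above, one can count the missing values per column in the raw dataset; in seaborn's mpg data the gaps are expected to sit in the horsepower column, which is why 392 rows survive. A small sketch, shown only as an aside:

# Count NaNs per column in the raw dataset before any rows were dropped.
sns.load_dataset("mpg").isna().sum()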
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

def get_MSE_for_degree_k_model(k):
    # Expand hp into polynomial features of degree k, then fit a linear
    # regression on those features.
    pipelined_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree=k)),
        ('josh_regression', LinearRegression(fit_intercept=True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    # Training MSE: error measured on the same data used to fit the model.
    return mean_squared_error(vehicle_data["mpg"],
                              pipelined_model.predict(vehicle_data[["hp"]]))
# Training MSE for polynomial degrees 0 through 6.
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
MSEs_and_k
|   | k | MSE       |
|---|---|-----------|
| 0 | 0 | 60.762738 |
| 1 | 1 | 23.943663 |
| 2 | 2 | 18.984769 |
| 3 | 3 | 18.944990 |
| 4 | 4 | 18.876333 |
| 5 | 5 | 18.426969 |
| 6 | 6 | 18.241505 |
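The table shows training MSE dropping sharply from k = 0 to k = 1 and then improving only marginally. A minimal sketch of plotting MSE against k with the same plotly interface used below (the figure name and trace settings here are illustrative, not part of the original code):

import plotly.graph_objects as go

# Line plot of training MSE as a function of polynomial degree k.
mse_fig = go.Figure()
mse_fig.add_trace(go.Scatter(x=MSEs_and_k["k"], y=MSEs_and_k["MSE"],
                             mode="lines+markers", name=""))
mse_fig.update_layout(xaxis_title="polynomial degree k",
                      yaxis_title="training MSE",
                      showlegend=False)
mse_fig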
import plotly.graph_objects as go

def plot_degree_k_model(k):
    # Fit the same degree-k pipeline as above.
    pipelined_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree=k)),
        ('josh_regression', LinearRegression(fit_intercept=True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

    # Scatter the raw data, then overlay the fitted curve on a grid of hp values.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=vehicle_data['hp'], y=vehicle_data['mpg'],
                             mode="markers", name=""))
    x_range = np.linspace(45, 210, 100)
    fig.add_trace(go.Scatter(x=x_range, y=pipelined_model.predict(x_range.reshape(-1, 1)),
                             mode="lines", name=""))
    fig.update_layout(font_size=20,
                      xaxis_title="hp",
                      yaxis_title="mpg",
                      margin=dict(l=50, r=50, b=0, t=1),
                      showlegend=False)
    return fig

plot_degree_k_model(2)
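The same helper can be reused to compare fits of other degrees, for example the degree-6 model, which had the lowest training MSE in the table above:

# Visualize the degree-6 polynomial fit for comparison with the degree-2 fit.
plot_degree_k_model(6)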