import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(42)
#plt.rcParams['figure.figsize'] = (12, 9)
sns.set()
#sns.set_context('talk')
tips_df = sns.load_dataset("tips")
tips_df
 | total_bill | tip | sex | smoker | day | time | size
---|---|---|---|---|---|---|---
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
three_features = ['total_bill', 'size', 'day']
three_feature_df = pd.DataFrame(tips_df[three_features])
random_rows = [193, 90, 25, 26, 190]
three_feature_df.iloc[random_rows, :]
 | total_bill | size | day
---|---|---|---
193 | 15.48 | 2 | Thur |
90 | 28.97 | 2 | Fri |
25 | 17.81 | 4 | Sat |
26 | 13.37 | 2 | Sat |
190 | 15.69 | 2 | Sun |
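Why bother encoding at all? Linear models need numeric inputs, so fitting directly on the raw day strings fails. A minimal sketch of what happens if we try (the exact error text varies by sklearn version):

from sklearn.linear_model import LinearRegression

# sklearn can't convert the "day" strings to floats, so fit raises an error
try:
    LinearRegression().fit(three_feature_df, tips_df['tip'])
except ValueError as err:
    print(err)  # e.g. "could not convert string to float: 'Sun'"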
See the [sklearn OneHotEncoder documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).
from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
oh_enc.fit(tips_df[['day']])
oh_enc
OneHotEncoder()
dummies = oh_enc.transform(tips_df[['day']])
dummies
<244x4 sparse matrix of type '<class 'numpy.float64'>' with 244 stored elements in Compressed Sparse Row format>
Sparse matrices are a lightweight way to store matrices that are mostly zeros: only the nonzero entries (and their positions) are kept.
(Why would this be useful for one-hot encoded data?)
dummies.toarray().shape # convert to a regular (dense) NumPy array
(244, 4)
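To make the savings concrete, we can compare the bytes used by the CSR representation with its dense equivalent. A quick sketch (the exact byte counts are illustrative):

# CSR keeps only the nonzero values plus their column indices and row offsets
sparse_bytes = dummies.data.nbytes + dummies.indices.nbytes + dummies.indptr.nbytes
dense_bytes = dummies.toarray().nbytes  # stores every cell, zeros included
print(f"sparse: {sparse_bytes} bytes, dense: {dense_bytes} bytes")

With only four day columns the gap is modest, but for a categorical feature with hundreds of levels the dense array grows with the number of levels, while the sparse one stays proportional to the number of rows.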
Let's compare our one-hot encoded features to the original day feature.
random_rows
[193, 90, 25, 26, 190]
dummies.toarray()[random_rows,:]
array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])
tips_df.loc[random_rows,:]
 | total_bill | tip | sex | smoker | day | time | size
---|---|---|---|---|---|---|---
193 | 15.48 | 2.02 | Male | Yes | Thur | Lunch | 2 |
90 | 28.97 | 3.00 | Male | Yes | Fri | Dinner | 2 |
25 | 17.81 | 2.34 | Male | No | Sat | Dinner | 4 |
26 | 13.37 | 2.00 | Male | No | Sat | Dinner | 2 |
190 | 15.69 | 1.50 | Male | Yes | Sun | Dinner | 2 |
Note that the days of the week come out "out of order": sklearn doesn't "know" that Thursday comes before Friday, so it sorts the categories alphabetically.
We can use .get_feature_names_out() to get sklearn's one-hot encoding order ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)):
oh_enc.get_feature_names_out()
array(['day_Fri', 'day_Sat', 'day_Sun', 'day_Thur'], dtype=object)
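The fitted categories (and hence the column order) are stored in categories_. If we wanted chronological columns instead, we could pass the order explicitly via the categories argument; a sketch (the variable names here are ours):

oh_enc.categories_  # [array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object)]

# Passing an explicit category order yields chronologically ordered columns
ordered_enc = OneHotEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']])
ordered_enc.fit(tips_df[['day']])
ordered_enc.get_feature_names_out()
# array(['day_Thur', 'day_Fri', 'day_Sat', 'day_Sun'], dtype=object)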
from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
oh_enc.fit(tips_df[['day']])
ohe_data = oh_enc.transform(tips_df[['day']]).toarray()
data_w_ohe = (
    tips_df[three_features]
    .join(pd.DataFrame(ohe_data,
                       columns=oh_enc.get_feature_names_out(),
                       index=tips_df.index))
)
data_w_ohe = data_w_ohe.drop(columns=["day"])  # why must we drop the original "day" column before calling fit below?
data_w_ohe.loc[random_rows,:]
 | total_bill | size | day_Fri | day_Sat | day_Sun | day_Thur
---|---|---|---|---|---|---
193 | 15.48 | 2 | 0.0 | 0.0 | 0.0 | 1.0 |
90 | 28.97 | 2 | 1.0 | 0.0 | 0.0 | 0.0 |
25 | 17.81 | 4 | 0.0 | 1.0 | 0.0 | 0.0 |
26 | 13.37 | 2 | 0.0 | 1.0 | 0.0 | 0.0 |
190 | 15.69 | 2 | 0.0 | 0.0 | 1.0 | 0.0 |
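As an aside, pandas can produce a similar table in one line with pd.get_dummies. Inside an sklearn workflow, OneHotEncoder is still preferable, since the fitted encoder remembers the category order and can transform new data consistently. A sketch:

# One-line pandas alternative (column order may differ from the encoder's)
pd.get_dummies(tips_df[three_features], columns=['day'])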
Now fitting the model with one-hot encodings:
from sklearn.linear_model import LinearRegression
f_with_day = LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_ohe, tips_df["tip"])
LinearRegression(fit_intercept=False)
# total_bill, size, day_Fri, day_Sat, day_Sun, day_Thur
f_with_day.predict([[50, 3, 1, 0, 0, 0]])
/srv/conda/envs/notebook/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
array([5.9568643])
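The UserWarning appears because the model was fitted on a DataFrame with named columns, but predict received a plain list. Passing a DataFrame with matching column names gives the same prediction without the warning; a sketch:

query = pd.DataFrame([[50, 3, 1, 0, 0, 0]], columns=data_w_ohe.columns)
f_with_day.predict(query)  # array([5.9568643]), no warning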
f_with_day.intercept_
0.0
f_with_day.coef_
array([0.09299361, 0.18713231, 0.74578683, 0.62112858, 0.73228865, 0.66829361])
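Because we set fit_intercept=False, the four day dummies play the role of day-specific intercepts (which is why intercept_ is 0). Labeling each coefficient with its feature name makes the fit easier to read; a sketch:

pd.Series(f_with_day.coef_, index=data_w_ohe.columns)
# total_bill    0.092994
# size          0.187132
# day_Fri       0.745787
# day_Sat       0.621129
# day_Sun       0.732289
# day_Thur      0.668294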
The code below, which was used to generate the lecture slide plots, uses two out-of-scope syntax concepts.
vehicle_data = sns.load_dataset("mpg")
vehicle_data = vehicle_data.rename(columns = {"horsepower": "hp"})
vehicle_data = vehicle_data.dropna()
vehicle_data
 | mpg | cylinders | displacement | hp | weight | acceleration | model_year | origin | name
---|---|---|---|---|---|---|---|---|---
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 |
392 rows × 9 columns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
def get_MSE_for_degree_k_model(k):
    # transform hp into [1, hp, hp^2, ..., hp^k], then fit a linear model
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree=k)),
        ('regression', LinearRegression(fit_intercept=True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    return mean_squared_error(pipelined_model.predict(vehicle_data[["hp"]]),
                              vehicle_data["mpg"])
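To see what the poly_transform step hands to the regression, here is PolynomialFeatures applied to a tiny made-up input; the first column is the bias term. A sketch:

demo = np.array([[1.0], [2.0], [3.0]])
PolynomialFeatures(degree=2).fit_transform(demo)
# array([[1., 1., 1.],
#        [1., 2., 4.],
#        [1., 3., 9.]])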
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
MSEs_and_k.set_index("k")
k | MSE
---|---
0 | 60.762738 |
1 | 23.943663 |
2 | 18.984769 |
3 | 18.944990 |
4 | 18.876333 |
5 | 18.426969 |
6 | 18.241505 |
def plot_degree_k_model(k, MSEs_and_k, axs):
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree=k)),
        ('regression', LinearRegression(fit_intercept=True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

    # place panel k in a 2x3 grid
    row = k // 3
    col = k % 3
    ax = axs[row, col]
    sns.scatterplot(data=vehicle_data, x='hp', y='mpg', ax=ax)

    x_range = np.linspace(45, 210, 100).reshape(-1, 1)
    ax.plot(x_range, pipelined_model.predict(pd.DataFrame(x_range, columns=['hp'])),
            c='orange', linewidth=2)

    ax.set_ylim((0, 50))
    mse_str = f"MSE: {MSEs_and_k.loc[k, 'MSE']:.4}\norder: {k}"
    ax.text(150, 40, mse_str, dict(size=16))
fig = plt.figure(figsize=(12, 6))
axs = fig.subplots(nrows=2, ncols=3)
for k in range(6):
    plot_degree_k_model(k, MSEs_and_k, axs)
fig.tight_layout()
fig.savefig('higherorder')
import plotly.graph_objects as go
def plot_degree_k_model(k):
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree=k)),
        ('regression', LinearRegression(fit_intercept=True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=vehicle_data['hp'], y=vehicle_data['mpg'],
                             mode="markers", name=""))
    x_range = np.linspace(45, 210, 100)
    fig.add_trace(go.Scatter(x=x_range, y=pipelined_model.predict(x_range.reshape(-1, 1)),
                             mode="lines", name=""))
    fig.update_layout(font_size=20,
                      xaxis_title="hp",
                      yaxis_title="mpg",
                      margin=dict(l=50, r=50, b=0, t=1),
                      showlegend=False)
    return fig
plot_degree_k_model(2)
/srv/conda/envs/notebook/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
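As with the earlier predict call, this warning comes from handing the fitted pipeline a bare NumPy array. Wrapping the prediction grid in a DataFrame with an hp column, as the matplotlib version above does, would silence it; a sketch of the changed line inside plot_degree_k_model:

y_pred = pipelined_model.predict(pd.DataFrame(x_range, columns=['hp']))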