Lecture 14 Notebook¶

Data 100, Spring 2023

Acknowledgments Page

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(42)

#plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
#sns.set_context('talk')
In [2]:
tips_df = sns.load_dataset("tips")
tips_df
Out[2]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

In [3]:
three_features = ['total_bill', 'size', 'day']

three_feature_df = pd.DataFrame(tips_df[three_features])
random_rows = [193, 90, 25, 26, 190]

three_feature_df.iloc[random_rows, :]
Out[3]:
total_bill size day
193 15.48 2 Thur
90 28.97 2 Fri
25 17.81 4 Sat
26 13.37 2 Sat
190 15.69 2 Sun

sklearn OneHotEncoder documentation

In [4]:
from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder()
In [5]:
oh_enc.fit(tips_df[['day']])
oh_enc
Out[5]:
OneHotEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
OneHotEncoder()
In [6]:
dummies = oh_enc.transform(tips_df[['day']])
dummies
Out[6]:
<244x4 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

Sparse matrices are lightweight solutions to storing matrices with many zero elements.
(Why would this be useful for one-hot encoded data?)

In [7]:
dummies.toarray().shape # transform to regular NumPy array
Out[7]:
(244, 4)



Let's compare our one-hot encoded features to the original day feature.

In [8]:
random_rows
Out[8]:
[193, 90, 25, 26, 190]
In [9]:
dummies.toarray()[random_rows,:]
Out[9]:
array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])
In [10]:
tips_df.loc[random_rows,:]
Out[10]:
total_bill tip sex smoker day time size
193 15.48 2.02 Male Yes Thur Lunch 2
90 28.97 3.00 Male Yes Fri Dinner 2
25 17.81 2.34 Male No Sat Dinner 4
26 13.37 2.00 Male No Sat Dinner 2
190 15.69 1.50 Male Yes Sun Dinner 2

Note the days of week are "out of order," because sklearn doesn't "know" that Thursday comes before Friday, etc.

We can use .get_feature_names_out() to get sklearn's one-hot encoding order (documentation):

In [11]:
oh_enc.get_feature_names_out()
Out[11]:
array(['day_Fri', 'day_Sat', 'day_Sun', 'day_Thur'], dtype=object)

Putting it all together¶

In [12]:
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder()
oh_enc.fit(tips_df[['day']])

ohe_data = oh_enc.transform(tips_df[['day']]).toarray()
data_w_ohe = (tips_df[three_features]
              .join(
                  pd.DataFrame(ohe_data, columns=oh_enc.get_feature_names_out(), index=tips_df.index)))
data_w_ohe = data_w_ohe.drop(columns=["day"]) # why do we need to do this before calling fit?
data_w_ohe.loc[random_rows,:]
Out[12]:
total_bill size day_Fri day_Sat day_Sun day_Thur
193 15.48 2 0.0 0.0 0.0 1.0
90 28.97 2 1.0 0.0 0.0 0.0
25 17.81 4 0.0 1.0 0.0 0.0
26 13.37 2 0.0 1.0 0.0 0.0
190 15.69 2 0.0 0.0 1.0 0.0



Now fitting the model with one-hot encodings:

In [14]:
from sklearn.linear_model import LinearRegression
f_with_day = LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_ohe, tips_df["tip"])
Out[14]:
LinearRegression(fit_intercept=False)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression(fit_intercept=False)
In [15]:
# total_bill, size, day_Fri, day_Sat, day_Sun, day_Thur
f_with_day.predict([[50, 3, 1, 0, 0, 0]])
/srv/conda/envs/notebook/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
Out[15]:
array([5.9568643])
In [16]:
f_with_day.intercept_
Out[16]:
0.0
In [17]:
f_with_day.coef_
Out[17]:
array([0.09299361, 0.18713231, 0.74578683, 0.62112858, 0.73228865,
       0.66829361])

High Order Polynomial Example¶

The code below used to generate the lecture slide plots uses two out of scope syntax concepts:

  • The sklearn Pipeline class (documentation)
  • The sklearn PolynomialFeatures transformer (documentatoin)
In [18]:
vehicle_data = sns.load_dataset("mpg")
vehicle_data = vehicle_data.rename(columns = {"horsepower": "hp"})
vehicle_data = vehicle_data.dropna()
In [19]:
vehicle_data
Out[19]:
mpg cylinders displacement hp weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
... ... ... ... ... ... ... ... ... ...
393 27.0 4 140.0 86.0 2790 15.6 82 usa ford mustang gl
394 44.0 4 97.0 52.0 2130 24.6 82 europe vw pickup
395 32.0 4 135.0 84.0 2295 11.6 82 usa dodge rampage
396 28.0 4 120.0 79.0 2625 18.6 82 usa ford ranger
397 31.0 4 119.0 82.0 2720 19.4 82 usa chevy s-10

392 rows × 9 columns

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

def get_MSE_for_degree_k_model(k):
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))    
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    return mean_squared_error(pipelined_model.predict(vehicle_data[["hp"]]), vehicle_data["mpg"])
In [21]:
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
MSEs_and_k.set_index("k")
Out[21]:
MSE
k
0 60.762738
1 23.943663
2 18.984769
3 18.944990
4 18.876333
5 18.426969
6 18.241505
In [22]:
def plot_degree_k_model(k, MSEs_and_k, axs):
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))    
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    
    row = k // 3
    col = k % 3
    ax = axs[row, col]
    
    sns.scatterplot(data=vehicle_data, x='hp', y='mpg', ax=ax)
    
    x_range = np.linspace(45, 210, 100).reshape(-1, 1)
    ax.plot(x_range, pipelined_model.predict(pd.DataFrame(x_range, columns=['hp'])), c='orange', linewidth=2)
    
    ax.set_ylim((0, 50))
    mse_str = f"MSE: {MSEs_and_k.loc[k, 'MSE']:.4}\norder: {k}"
    ax.text(150, 40, mse_str, dict(size=16))

fig = plt.figure(figsize=(12, 6))
axs = fig.subplots(nrows=2, ncols=3)

for k in range(6):
    plot_degree_k_model(k, MSEs_and_k, axs)
fig.tight_layout()

fig.savefig('higherorder')
In [23]:
import plotly.graph_objects as go


def plot_degree_k_model(k):
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))    
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=vehicle_data['hp'], y = vehicle_data['mpg'], 
                        mode = "markers", name = ""))
    
    x_range = np.linspace(45, 210, 100)

    fig.add_trace(go.Scatter(x=x_range, y = pipelined_model.predict(x_range.reshape(-1, 1)), 
                         mode = "lines", name = ""))
    
    fig.update_layout(font_size = 20,
                  xaxis_title = "hp",
                  yaxis_title = "mpg",
                  margin=dict(l=50, r=50, b=0, t=1),
                  showlegend = False)
    return fig
In [24]:
plot_degree_k_model(2)
/srv/conda/envs/notebook/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning:

X does not have valid feature names, but PolynomialFeatures was fitted with feature names

In [ ]:
 
In [ ]: