In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
data = sns.load_dataset("tips")
In [3]:
three_features = ['total_bill', 'size', 'day']

three_feature_data = pd.DataFrame(data[three_features])

three_feature_data.iloc[[193, 90, 25, 26, 190], :]
Out[3]:
     total_bill  size   day
193       15.48     2  Thur
90        28.97     2   Fri
25        17.81     4   Sat
26        13.37     2   Sat
190       15.69     2   Sun
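
The 'day' column is categorical (string-valued), so it cannot be passed to LinearRegression directly; that is what motivates the one-hot encoding in the next cell. As a quick check (a sketch; this cell was not run as part of the original notebook):

In [ ]:
# 'day' has a categorical dtype; LinearRegression needs numeric inputs.
three_feature_data.dtypes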
In [4]:
dummies = pd.get_dummies(data['day'])
dummies.iloc[[193, 90, 25, 26, 190], :]
Out[4]:
     Thur  Fri  Sat  Sun
193     1    0    0    0
90      0    1    0    0
25      0    0    1    0
26      0    0    1    0
190     0    0    0    1
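
pd.get_dummies is one way to build these indicator columns. Scikit-learn's OneHotEncoder produces the same kind of encoding and slots directly into an sklearn Pipeline; the cell below is an unexecuted sketch of that alternative.

In [ ]:
# Sketch: one-hot encode 'day' with scikit-learn instead of pandas.
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
day_onehot = encoder.fit_transform(data[["day"]]).toarray()  # dense 0/1 matrix
encoder.categories_  # day labels, in the column order of day_onehot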
In [5]:
data_w_dummies = pd.concat([three_feature_data, dummies], axis=1)

data_w_dummies.iloc[[193, 90, 25, 26, 190], :]
Out[5]:
     total_bill  size   day  Thur  Fri  Sat  Sun
193       15.48     2  Thur     1    0    0    0
90        28.97     2   Fri     0    1    0    0
25        17.81     4   Sat     0    0    1    0
26        13.37     2   Sat     0    0    1    0
190       15.69     2   Sun     0    0    0    1
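
For reference, the selection and concatenation above can also be done in a single call by passing the whole frame to pd.get_dummies with columns=["day"]; note that this variant drops the original 'day' column and prefixes the new ones (day_Thur, ..., day_Sun) rather than keeping 'day' alongside the indicators as above.

In [ ]:
# Sketch: encode and merge in one step (replaces 'day' with prefixed dummy columns).
pd.get_dummies(three_feature_data, columns=["day"])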
In [6]:
from sklearn.linear_model import LinearRegression
f_with_day = LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_dummies[["total_bill", "size", "Thur",
                               "Fri", "Sat", "Sun"]], data["tip"])
Out[6]:
LinearRegression(fit_intercept=False)
In [7]:
# Predict the tip for a $50 bill, party of 3, on a Thursday
f_with_day.predict([[50, 3, 1, 0, 0, 0]])
Out[7]:
array([5.87937107])
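
Because the model was fit with fit_intercept=False, the four dummy coefficients act as day-specific intercepts, while the first two coefficients weight total_bill and size. A quick way to see this (sketch; output not shown) is to inspect the fitted coefficients:

In [ ]:
# Coefficients in the order the features were passed to fit:
# total_bill, size, Thur, Fri, Sat, Sun
f_with_day.coef_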

High-Order Polynomial Example

In [8]:
vehicle_data = sns.load_dataset("mpg")
vehicle_data = vehicle_data.rename(columns = {"horsepower": "hp"})
vehicle_data = vehicle_data.dropna()
In [9]:
vehicle_data
Out[9]:
      mpg  cylinders  displacement     hp  weight  acceleration  model_year  origin  name
0    18.0          8         307.0  130.0    3504          12.0          70     usa  chevrolet chevelle malibu
1    15.0          8         350.0  165.0    3693          11.5          70     usa  buick skylark 320
2    18.0          8         318.0  150.0    3436          11.0          70     usa  plymouth satellite
3    16.0          8         304.0  150.0    3433          12.0          70     usa  amc rebel sst
4    17.0          8         302.0  140.0    3449          10.5          70     usa  ford torino
..    ...        ...           ...    ...     ...           ...         ...     ...  ...
393  27.0          4         140.0   86.0    2790          15.6          82     usa  ford mustang gl
394  44.0          4          97.0   52.0    2130          24.6          82  europe  vw pickup
395  32.0          4         135.0   84.0    2295          11.6          82     usa  dodge rampage
396  28.0          4         120.0   79.0    2625          18.6          82     usa  ford ranger
397  31.0          4         119.0   82.0    2720          19.4          82     usa  chevy s-10

392 rows × 9 columns

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

def get_MSE_for_degree_k_model(k):
    # Pipeline: expand hp into polynomial features of degree k, then fit OLS.
    pipelined_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree = k)),
        ('josh_regression', LinearRegression(fit_intercept = True))    
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    # Training MSE: the model is evaluated on the same data it was fit to.
    return mean_squared_error(vehicle_data["mpg"],
                              pipelined_model.predict(vehicle_data[["hp"]]))
In [11]:
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
MSEs_and_k
Out[11]:
   k        MSE
0  0  60.762738
1  1  23.943663
2  2  18.984769
3  3  18.944990
4  4  18.876333
5  5  18.426969
6  6  18.241505
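
The MSE in the table above is a training error: each model is evaluated on the same data it was fit to, and since the degree-k model contains every lower-degree model as a special case, this error can only stay flat or shrink as the degree grows. The unexecuted sketch below shows one way to check generalization with a held-out split (the helper name, split size, and random seed are my own, not part of the original notebook).

In [ ]:
# Sketch: compare degree-k models on a held-out test set instead of the training data.
from sklearn.model_selection import train_test_split

train, test = train_test_split(vehicle_data, test_size=0.25, random_state=42)

def get_test_MSE_for_degree_k_model(k):
    pipelined_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree = k)),
        ('josh_regression', LinearRegression(fit_intercept = True))
    ])
    pipelined_model.fit(train[["hp"]], train["mpg"])
    return mean_squared_error(test["mpg"], pipelined_model.predict(test[["hp"]]))

[get_test_MSE_for_degree_k_model(k) for k in ks]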
In [12]:
import plotly.graph_objects as go


def plot_degree_k_model(k):
    # Fit a degree-k polynomial model of mpg on hp.
    pipelined_model = Pipeline([
        ('josh_transform', PolynomialFeatures(degree = k)),
        ('josh_regression', LinearRegression(fit_intercept = True))    
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])
    
    fig = go.Figure()

    # Scatter plot of the raw data.
    fig.add_trace(go.Scatter(x=vehicle_data['hp'], y = vehicle_data['mpg'], 
                        mode = "markers", name = ""))
    
    # Overlay the fitted polynomial curve across the observed hp range.
    x_range = np.linspace(45, 210, 100)

    fig.add_trace(go.Scatter(x=x_range, y = pipelined_model.predict(x_range.reshape(-1, 1)), 
                         mode = "lines", name = ""))
    
    fig.update_layout(font_size = 20,
                  xaxis_title = "hp",
                  yaxis_title = "mpg",
                  margin=dict(l=50, r=50, b=0, t=1),
                  showlegend = False)
    return fig
In [13]:
plot_degree_k_model(2)
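
As a companion to the fitted-curve plot, the short sketch below (not executed here) plots the training MSE from the table above against the polynomial degree, using the same plotly conventions.

In [ ]:
# Sketch: training MSE as a function of polynomial degree k.
fig = go.Figure()
fig.add_trace(go.Scatter(x=MSEs_and_k["k"], y=MSEs_and_k["MSE"],
                         mode="lines+markers", name=""))
fig.update_layout(font_size=20, xaxis_title="degree k", yaxis_title="training MSE",
                  showlegend=False)
fig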