import numpy as np
import pandas as pd


import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');

/opt/conda/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning:

The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.


from sklearn.linear_model import LinearRegression


from seaborn import load_dataset
data = load_dataset("mpg")
data


# Discard every row that has a missing value in any column, keeping an
# independent copy so later edits do not touch the loaded dataset.
data = data.dropna(axis=0, how="any").copy()


# Shuffle all rows (frac = 1 samples the whole frame); the fixed seed makes
# the ordering, and therefore the split below, reproducible.
shuffled_data = data.sample(frac = 1, random_state = 42)
shuffled_data


# Row position separating the first 90% of the shuffled rows (training)
# from the remaining rows (test).
split_point = int(shuffled_data.shape[0] * 0.90)
split_point

352


# Positional split of the shuffled frame: rows before split_point are the
# training set, rows from split_point onward are the test set.
tr = shuffled_data.iloc[:split_point]
te = shuffled_data.iloc[split_point:]


len(tr), len(te)

(352, 40)


tr.head()


te.head()


len(tr) + len(te) == len(data)

True


# Alternative shuffle: seed NumPy's global RNG, then permute the row
# positions 0..len(data)-1.
np.random.seed(100)
shuffled_indices = np.random.permutation(np.arange(len(data)))
shuffled_indices

array([124, 140, 276, 252, 326, 136, 369, 132, 387, 174, 225, 356, 257,
       239, 231, 267,   7, 129, 258, 234,  43, 190, 227, 368,  75, 149,
       201, 288,  78, 163, 347, 284, 152,   1, 246, 213,  21, 110, 161,
        69,  56, 198, 160, 134,  97, 195, 255,  98,  54, 118, 361,  18,
       311,  64, 272, 295, 298, 127, 191,   5, 103, 377, 266, 346,  90,
       385, 188, 293,  96,  46,  50, 282, 248, 120, 233, 209, 187,  27,
       235, 338, 328, 352, 372, 304, 308,   6, 153, 219, 279, 121,   3,
        20, 125, 166, 307, 309,  60,  84, 342,  80, 147, 133,  31, 345,
        45,  47, 260, 150, 391,  59, 334,  23,  88, 332,  15,  33, 171,
       355, 169, 265, 386, 241, 249, 178, 362,  19,  26, 297,  35, 157,
        39, 244, 375,  10, 199, 184, 208, 367,  65, 259, 285,  41, 378,
       203, 104, 128, 216, 151, 142, 158,  40, 217,  32,  48, 327, 197,
       123, 173, 204,  61,  71, 305, 330, 126, 115, 271,  85, 159, 164,
        52, 321, 154, 205, 315,  29, 358, 139, 302, 319, 162, 111, 296,
       177, 371, 300, 175, 331, 324, 281, 339,  62, 247,  99, 269, 112,
        37, 189, 206, 374,  83, 373, 314,  51, 263, 341, 370,  42, 357,
       229, 236, 318, 179,  87, 268,  55,  22, 379, 313, 101,  11, 291,
       108, 376, 194,  25, 117,  81, 366, 275, 242, 230,  82, 292, 156,
       278, 333, 329,  74, 224, 254, 218, 306, 145, 320, 130, 113, 349,
       351, 287, 388, 353, 210,  28, 344, 221,  24, 168, 148, 380, 322,
        77, 340,  34, 144, 182, 232,  73, 301,  57, 365,  44,  92, 146,
       200, 264, 138, 223, 220,  67, 109,  12,  16,  89, 337, 243, 382,
       286, 222, 107, 186, 116, 122, 165, 262, 277,   9, 253, 196, 310,
       384, 250, 256, 102, 289,  76, 119, 237, 114,  95, 170, 381,  94,
       214,  38, 261,  36, 299, 180, 176, 360, 215, 207, 212, 325,  70,
       131, 185, 335,   0, 348,  68, 383,  17,  30, 106,  13,  72, 273,
       202, 192, 274, 172, 294, 167, 303, 238, 312, 283, 181,  63, 105,
         2, 336, 251, 183, 270, 317, 193,  49, 135, 389,  91,   4, 100,
       211, 245, 141, 364, 155,  86,  93, 137,  58, 316, 363, 228, 143,
       390, 240, 290,  14, 226,  66,  53, 354, 350,  79, 343, 359, 323,
       280,   8])


data.iloc[shuffled_indices].head()


# Same split as before, but selecting rows through the permuted positions:
# the first split_point positions form the training set, the rest the test set.
tr = data.iloc[shuffled_indices[:split_point]]
te = data.iloc[shuffled_indices[split_point:]]


len(tr), len(te)

(352, 40)


from sklearn.model_selection import train_test_split


# Let scikit-learn do the shuffling and splitting: 10% of the rows are held
# out as the test set, with a fixed seed for reproducibility.
tr, te = train_test_split(data, test_size = 0.1, random_state = 83)


len(tr), len(te)

(352, 40)


tr.head()


te.head()


def basic_design_matrix(df):
    """Return the quantitative feature columns of ``df`` as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns "cylinders", "displacement",
        "horsepower", "weight", "acceleration" and "model_year".

    Returns
    -------
    pandas.DataFrame
        The six feature columns, in that order, with the index of ``df``.
        The result is an explicit copy, so callers may add or overwrite
        columns on it freely.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``df``.
    """
    feature_columns = ["cylinders", "displacement",
                       "horsepower", "weight", "acceleration", "model_year"]
    # Explicit copy: downstream design-matrix builders assign new columns to
    # the returned frame, which pandas would otherwise flag with a
    # SettingWithCopyWarning because the frame was derived from ``df``.
    return df[feature_columns].copy()

basic_design_matrix(tr)


from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with scikit-learn's default settings.
model = LinearRegression()


# Fit on the training rows only: basic quantitative features -> mpg.
model.fit(basic_design_matrix(tr), tr['mpg'])

LinearRegression()


def rmse(y, yhat):
    """Root-mean-squared error between observed ``y`` and predicted ``yhat``.

    Both arguments are combined element-wise, so they must be arrays (or
    Series) that subtract cleanly from one another.
    """
    residuals = y - yhat
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


# Error of the fitted model on the very rows it was trained on.
Y = tr['mpg']
Y_hat = model.predict(basic_design_matrix(tr))
print("Training Error (RMSE):", rmse(Y, Y_hat))

Training Error (RMSE): 3.374582699942459


models = {"quant": (basic_design_matrix, LinearRegression())}


def dispcyl_design_matrix(df):
    """Basic quantitative features plus a displacement-per-cylinder column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``basic_design_matrix``.

    Returns
    -------
    pandas.DataFrame
        The basic design matrix with one extra column,
        'displacement/cylinder' (displacement divided by cylinders).
    """
    # Work on an explicit copy so that adding a column never writes into,
    # or warns about writing into, a frame derived from the caller's ``df``.
    X = basic_design_matrix(df).copy()
    X['displacement/cylinder'] = X['displacement'] / X['cylinders']
    return X

dispcyl_design_matrix(tr)


# Refit a fresh linear model on the design matrix that includes the
# displacement/cylinder feature.
model = LinearRegression()
model.fit(dispcyl_design_matrix(tr), tr['mpg'])

# Register the feature function together with a new, unfitted estimator
# (not the fitted ``model`` above).
models['quant+dc'] = (dispcyl_design_matrix, LinearRegression())


# Training error of the model fitted above.
Y_hat = model.predict(dispcyl_design_matrix(tr))
Y = tr['mpg']
print("Training Error (RMSE):", rmse(Y, Y_hat))

Training Error (RMSE): 3.033309344625912


from sklearn.model_selection import KFold
from sklearn.base import clone

def cross_validate_rmse(phi_function, model):
    """Mean validation RMSE from 5-fold cross-validation on the global ``tr``.

    Parameters
    ----------
    phi_function : callable
        Maps a slice of ``tr`` to a design matrix.
    model : estimator
        Estimator to evaluate; it is cloned, so the argument itself is
        never fitted here.

    Returns
    -------
    float
        Average of the per-fold RMSE values on the 'mpg' column.
    """
    estimator = clone(model)
    # Fixed seed so every model is scored on the same five folds.
    splitter = KFold(n_splits = 5, random_state = 100, shuffle = True)
    targets = tr['mpg']

    fold_errors = []
    for fit_rows, held_out_rows in splitter.split(tr):
        fit_features = phi_function(tr.iloc[fit_rows, :])
        fit_targets = targets.iloc[fit_rows]
        held_out_features = phi_function(tr.iloc[held_out_rows, :])
        held_out_targets = targets.iloc[held_out_rows]

        estimator.fit(fit_features, fit_targets)
        predictions = estimator.predict(held_out_features)
        fold_errors.append(rmse(held_out_targets, predictions))

    return np.mean(fold_errors)


cross_validate_rmse(dispcyl_design_matrix, LinearRegression())

3.1113159765045717


def compare_models(models):
    """Build a bar chart comparing training and cross-validation RMSE.

    Parameters
    ----------
    models : dict
        Maps a display name to a ``(transformation, model)`` pair, where
        ``transformation`` builds a design matrix from a frame and
        ``model`` is an estimator.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar trace of training RMSE and one of CV RMSE, with the model
        names on the x axis.
    """
    def _training_error(transformation, model):
        # Fit a clone on all of ``tr`` and score it on the same rows.
        fitted = clone(model)
        fitted.fit(transformation(tr), tr['mpg'])
        return rmse(tr['mpg'], fitted.predict(transformation(tr)))

    training_rmse = [
        _training_error(transformation, model)
        for transformation, model in models.values()
    ]

    validation_rmse = []
    for transformation, model in models.values():
        validation_rmse.append(cross_validate_rmse(transformation, model))

    names = list(models.keys())
    training_bars = go.Bar(x = names, y = training_rmse, name="Training RMSE")
    validation_bars = go.Bar(x = names, y = validation_rmse, name="CV RMSE")
    return go.Figure([training_bars, validation_bars])


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


data['origin'].value_counts()

usa       245
japan      79
europe     68
Name: origin, dtype: int64


from sklearn.preprocessing import OneHotEncoder
# One-hot encoder for the categorical 'origin' column. It is fitted on the
# full ``data`` frame (not just ``tr``) to learn the origin categories.
oh_enc = OneHotEncoder()
oh_enc.fit(data[['origin']])

def origin_design_matrix(df):
    """Disp/cyl design matrix plus one-hot columns for the 'origin' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``dispcyl_design_matrix`` that also has an
        'origin' column.

    Returns
    -------
    pandas.DataFrame
        The disp/cyl design matrix joined (on the index) with one indicator
        column per origin category, named ``x0_<category>``.
    """
    X = dispcyl_design_matrix(df)
    # Build the "x0_<category>" names directly from the fitted categories.
    # These are the names this notebook's columns already use (e.g.
    # "x0_japan"); deriving them here avoids OneHotEncoder.get_feature_names(),
    # which newer scikit-learn releases no longer provide.
    ohe_names = [f"x0_{category}" for category in oh_enc.categories_[0]]
    encoded = oh_enc.transform(df[['origin']])
    # toarray() yields a plain ndarray rather than the np.matrix that
    # todense() returns.
    ohe_cols = pd.DataFrame(encoded.toarray(),
                            columns = ohe_names,
                            index = df.index)
    return X.join(ohe_cols)

models['quant+dc+o'] = (origin_design_matrix, LinearRegression())

origin_design_matrix(tr)


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


tr['name'].str.split().explode().value_counts().head(20)

ford          44
chevrolet     37
plymouth      28
(sw)          27
dodge         26
amc           26
toyota        22
custom        17
datsun        17
buick         16
volkswagen    14
pontiac       14
brougham      10
oldsmobile    10
honda         10
mercury       10
rabbit        10
corolla        9
mazda          9
corona         7
Name: name, dtype: int64


# The 20 most frequent whitespace-separated tokens in the training car names.
# Despite the variable name, some of them are not brands (e.g. "(sw)", "custom").
brands = tr['name'].str.split().explode().value_counts().head(20).index

def brands_design_matrix(df):
    """Origin design matrix plus one 0/1 column per token in ``brands``.

    Each added column is 1.0 where the car's 'name' contains that token as
    a literal substring (no regex) and 0.0 otherwise.
    """
    design = origin_design_matrix(df)
    car_names = df['name']
    indicator_columns = {
        brand: car_names.str.contains(brand, regex = False).astype(float)
        for brand in brands
    }
    # assign() adds the columns in dictionary (i.e. ``brands``) order.
    return design.assign(**indicator_columns)

models['quant+dc+o+b'] = (brands_design_matrix, LinearRegression())

brands_design_matrix(tr)


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


from sklearn.linear_model import Ridge


Ridge?


from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder for the car names; its vocabulary is learned from the
# training names only.
vectorizer = CountVectorizer()
vectorizer.fit(tr['name'])

CountVectorizer()


def name_design_matrix(df):
    """Origin design matrix plus token-count columns for the car names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``origin_design_matrix`` that also has a 'name'
        column.

    Returns
    -------
    pandas.DataFrame
        The origin design matrix with one extra column per vocabulary token
        of the fitted ``vectorizer``, holding that token's count in each
        car name.
    """
    X = origin_design_matrix(df)
    # get_feature_names() was removed in newer scikit-learn releases in
    # favour of get_feature_names_out(); support both so the notebook runs
    # on either. list() keeps the column key a plain list in both cases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())
    X[feature_names] = vectorizer.transform(df['name']).toarray()
    return X

name_design_matrix(tr)


cross_validate_rmse(name_design_matrix, LinearRegression())

1856664698.7350745


from sklearn.preprocessing import StandardScaler
# Columns to standardize before regularized regression.
quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]
scaler = StandardScaler()
# Fit the scaler on the training rows only. basic_design_matrix selects the
# same six columns again, so the inner column selection is redundant.
scaler.fit(basic_design_matrix(tr[quantitative_features]))

StandardScaler()


def name_design_matrix_std(df):
    """Name design matrix with the quantitative columns standardized.

    The columns listed in ``quantitative_features`` are replaced by the
    output of the already-fitted ``scaler``; all other columns are left
    as they are.
    """
    design = name_design_matrix(df)
    standardized = scaler.transform(design[quantitative_features])
    design[quantitative_features] = standardized
    return design

name_design_matrix_std(tr)


cross_validate_rmse(name_design_matrix_std, Ridge(alpha = .5))

3.1168076796253716


models['quant+dc+o+n-Ridge.5'] = (name_design_matrix_std,  Ridge(alpha = .5))


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


models['quant+dc+o+n-Ridge100'] = (name_design_matrix_std, Ridge(alpha = 100))


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


# Sweep the Ridge regularization strength and record, for each alpha, the
# training RMSE and the 5-fold cross-validation RMSE.
alphas = np.linspace(.5, 3.5, 20)
cv_values = []
train_values = []
# The standardized design matrix does not depend on alpha, so build it once
# instead of twice per iteration.
X_tr_std = name_design_matrix_std(tr)
for alpha in alphas:
    model = Ridge(alpha = alpha)
    model.fit(X_tr_std, tr['mpg'])
    train_values.append(rmse(tr['mpg'], model.predict(X_tr_std)))

    # cross_validate_rmse clones the estimator, so the fit above does not
    # leak into the validation estimate.
    validation_rmse = cross_validate_rmse(name_design_matrix_std, model)
    cv_values.append(validation_rmse)


# Plot training and cross-validation RMSE against the regularization strength.
fig = go.Figure()
fig.add_trace(go.Scatter(x = alphas, y = train_values, mode="lines+markers", name="Train"))
fig.add_trace(go.Scatter(x = alphas, y = cv_values, mode="lines+markers", name="CV"))
# The y axis carries both the Train and the CV curve, so label it with the
# metric itself rather than "CV RMSE".
fig.update_layout(xaxis_title=r"$\alpha$", yaxis_title="RMSE")


models['quant+dc+o+n-Ridge1.75'] = (name_design_matrix_std,  Ridge(alpha = 1.75))


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


from sklearn.linear_model import RidgeCV


models['quant+dc+o+n-RidgeCV'] = (name_design_matrix_std, RidgeCV())


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


from sklearn.linear_model import Lasso, LassoCV


models['quant+dc+o+n-LassoCV'] = (name_design_matrix_std,  LassoCV())


fig = compare_models(models)
fig.update_yaxes(range = [0, 4], title = "RMSE")


# Fit Ridge and Lasso (each choosing its own alpha by cross-validation) on
# the same standardized design matrix and keep the coefficient vectors.
model = RidgeCV()
ridge_coef = model.fit(name_design_matrix_std(tr), tr['mpg']).coef_

model = LassoCV()
lasso_coef = model.fit(name_design_matrix_std(tr), tr['mpg']).coef_


ff.create_distplot([ridge_coef, lasso_coef], ["Ridge", "Lasso"], bin_size = 0.1)


name_design_matrix_std(tr).columns[lasso_coef > 0]

Index(['displacement', 'model_year', 'x0_japan', 'datsun', 'diesel',
       'plymouth', 'pontiac', 'vw'],
      dtype='object')

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
...	...	...	...	...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
79	26.0	4	96.0	69.0	2189	18.0	72	europe	renault 12 (sw)
276	21.6	4	121.0	115.0	2795	15.7	78	europe	saab 99gle
248	36.1	4	91.0	60.0	1800	16.4	78	japan	honda civic cvcc
56	26.0	4	91.0	70.0	1955	20.5	71	usa	plymouth cricket
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
...	...	...	...	...	...	...	...	...	...
72	15.0	8	304.0	150.0	3892	12.5	72	usa	amc matador (sw)
107	18.0	6	232.0	100.0	2789	15.0	73	usa	amc gremlin
272	23.8	4	151.0	85.0	2855	17.6	78	usa	oldsmobile starfire sx
352	29.9	4	98.0	65.0	2380	20.7	81	usa	ford escort 2h
103	11.0	8	400.0	150.0	4997	14.0	73	usa	chevrolet impala

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
79	26.0	4	96.0	69.0	2189	18.0	72	europe	renault 12 (sw)
276	21.6	4	121.0	115.0	2795	15.7	78	europe	saab 99gle
248	36.1	4	91.0	60.0	1800	16.4	78	japan	honda civic cvcc
56	26.0	4	91.0	70.0	1955	20.5	71	usa	plymouth cricket
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
245	36.1	4	98.0	66.0	1800	14.4	78	usa	ford fiesta
55	27.0	4	97.0	60.0	1834	19.0	71	europe	volkswagen model 111
51	30.0	4	79.0	70.0	2074	19.5	71	europe	peugeot 304
176	19.0	6	232.0	90.0	3211	17.0	75	usa	amc pacer
191	22.0	6	225.0	100.0	3233	15.4	76	usa	plymouth valiant

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
125	20.0	6	198.0	95.0	3102	16.5	74	usa	plymouth duster
142	26.0	4	79.0	67.0	1963	15.5	74	europe	volkswagen dasher
278	31.5	4	89.0	71.0	1990	14.9	78	europe	volkswagen scirocco
254	20.2	6	200.0	85.0	2965	15.8	78	usa	ford fairmont (auto)
328	30.0	4	146.0	67.0	3250	21.8	80	europe	mercedes-benz 240d

Lecture 17

Train Test Split and Cross Validation

Imports

The Data

Train Test Split

Using Pandas Operations

Shuffling with NumPy

Using scikit-learn

Building a Basic Model

Cross-Validation

Keeping Track of All the Models

More Feature Transformations

Regularization

Ridge Regression

The Regularization Hyperparameter

Ridge Regression in scikit-learn

Lasso Regression

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
6	14.0	8	454.0	220.0	4354	9.0	70	usa	chevrolet impala
352	29.9	4	98.0	65.0	2380	20.7	81	usa	ford escort 2h
47	19.0	6	250.0	100.0	3282	15.0	71	usa	pontiac firebird
39	14.0	8	400.0	175.0	4464	11.5	71	usa	pontiac catalina brougham
304	37.3	4	91.0	69.0	2130	14.7	79	europe	fiat strada custom

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
87	13.0	8	350.0	145.0	3988	13.0	73	usa	chevrolet malibu
279	29.5	4	98.0	68.0	2135	16.6	78	japan	honda accord lx
319	31.3	4	120.0	75.0	2542	17.5	80	japan	mazda 626
173	24.0	4	119.0	97.0	2545	17.0	75	japan	datsun 710
148	26.0	4	116.0	75.0	2246	14.0	74	europe	fiat 124 tc

	cylinders	displacement	horsepower	weight	acceleration	model_year	displacement/cylinder	x0_europe	x0_japan	x0_usa	...	volare	volkswagen	volvo	vw	wagon	woody	xe	yorker	zephyr	zx
6	1.452785	2.426007	2.916388	1.580639	-2.320642	-1.611836	56.750000	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0
352	-0.874316	-0.933437	-1.034011	-0.720703	1.862364	1.351615	24.500000	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0
47	0.289235	0.500933	-0.141986	0.330873	-0.175511	-1.342431	41.666667	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0
39	1.452785	1.916429	1.769498	1.708880	-1.426838	-1.342431	50.000000	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0
304	-0.874316	-0.999493	-0.932065	-1.012159	-0.282767	0.812806	22.750000	1.0	0.0	0.0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
394	-0.874316	-0.942873	-1.365335	-1.012159	3.256700	1.621020	24.250000	1.0	0.0	0.0	...	0	0	0	1	0	0	0	0	0	0
258	0.289235	0.321637	-0.014553	0.445124	0.110507	0.543401	38.500000	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0
297	-0.292540	-0.131322	-0.728174	0.619998	1.647851	0.812806	36.600000	1.0	0.0	0.0	...	0	0	0	0	0	0	0	0	0	0
23	-0.874316	-0.716394	0.189338	-0.890913	-1.069316	-1.611836	30.250000	1.0	0.0	0.0	...	0	0	0	0	0	0	0	0	0	0
83	-0.874316	-0.933437	-0.651714	-0.972521	-0.175511	-1.073026	24.500000	0.0	0.0	1.0	...	0	0	0	0	0	0	0	0	0	0