import numpy as np
import pandas as pd


import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
# Configure cufflinks once for the session: offline mode, sharing disabled, 'ggplot' theme.
cf.set_config_file(offline=True, sharing=False, theme='ggplot');

/opt/conda/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning:

The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score


from seaborn import load_dataset
# Load seaborn's "mpg" example dataset.
data = load_dataset("mpg")
data  # bare expression; presumably rendered as the cell's output in a notebook


# Hold out 25% of the rows as a test split; random_state pins the shuffle so the split is reproducible.
tr, te = train_test_split(data, test_size=0.25, random_state=83)


# Registry of fitted pipelines, keyed by a short name built from feature initials.
models = {}


# Numeric predictor columns, in the order they are added to successive models.
quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]


# Fit one linear model per prefix of the feature list: the first model uses
# only "cylinders", the last one uses every quantitative feature.
for n_features in range(1, len(quantitative_features) + 1):
    # The features to include in this model
    features = quantitative_features[:n_features]
    # The model's name is the first letter of each feature, e.g. "c,d,h"
    name = ",".join(feature[0] for feature in features)
    # Select the columns, impute missing values, then fit least squares
    model = Pipeline([
        ("SelectColumns", ColumnTransformer([
            ("keep", "passthrough", features),
        ])),
        ("Imputation", SimpleImputer()),
        ("LinearModel", LinearRegression())
    ])
    # The whole training frame is passed as X; the "SelectColumns" step lists
    # only `features`, so those are the columns handed to the regressor.
    model.fit(tr, tr['mpg'])
    # Save the fitted model under its name
    models[name] = model


models.keys()

dict_keys(['c', 'c,d', 'c,d,h', 'c,d,h,w', 'c,d,h,w,a', 'c,d,h,w,a,m'])


from sklearn.model_selection import cross_val_score


def rmse_score(model, X, y):
    """Return the root-mean-squared error of ``model.predict(X)`` against ``y``.

    The ``(model, X, y)`` argument order is what lets this function be passed
    as the ``scoring=`` callable of ``cross_val_score`` elsewhere in this file.
    """
    residuals = y - model.predict(X)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


# Per-fold RMSE of the model stored under 'c', using 5-fold cross-validation on the training split.
cross_val_score(models['c'], tr, tr['mpg'], scoring=rmse_score, cv=5)

array([5.34219804, 4.31960704, 4.17949277, 4.89156037, 5.65044712])


# Average the fold scores into a single cross-validation RMSE for the 'c' model.
np.mean(cross_val_score(models['c'], tr, tr['mpg'], scoring=rmse_score, cv=5))

4.876661068256957


def compare_models(models):
    """Build a grouped bar chart of training, CV and test RMSE per model.

    ``models`` maps a display name to an already-fitted estimator. Errors are
    computed against the module-level ``tr`` / ``te`` splits with ``mpg`` as
    the target. Returns the plotly figure.
    """
    names = list(models.keys())
    estimators = list(models.values())

    def mean_cv_rmse(estimator):
        # Average RMSE over 5 cross-validation folds of the training split
        fold_scores = cross_val_score(estimator, tr, tr['mpg'], scoring=rmse_score, cv=5)
        return np.mean(fold_scores)

    # Error on the data the models were fitted on
    training_rmse = [rmse_score(estimator, tr, tr['mpg']) for estimator in estimators]
    # Cross-validation error
    validation_rmse = [mean_cv_rmse(estimator) for estimator in estimators]
    # Test error, shown for illustration only (don't do this!)
    test_rmse = [rmse_score(estimator, te, te['mpg']) for estimator in estimators]

    bars = [
        go.Bar(x = names, y = training_rmse, name="Training RMSE"),
        go.Bar(x = names, y = validation_rmse, name="CV RMSE"),
        go.Bar(x = names, y = test_rmse, name="Test RMSE", opacity=.3),
    ]
    fig = go.Figure(bars)
    fig.update_yaxes(title="RMSE")
    return fig


# Chart training / CV / test RMSE for the models fitted so far.
compare_models(models)


# Count the training rows in each `origin` category.
tr['origin'].value_counts()

usa       188
japan      56
europe     54
Name: origin, dtype: int64


from sklearn.preprocessing import OneHotEncoder


# Extend the all-quantitative model with a one-hot encoding of `origin`.
column_selector = ColumnTransformer([
    ("keep", "passthrough", quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
])
model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression()),
])


# Register the fitted pipeline under the feature initials plus "o" for origin.
name = ",".join(feature[0] for feature in quantitative_features) + ",o"
models[name] = model.fit(tr, tr['mpg'])


from sklearn.feature_extraction.text import CountVectorizer


# Add bag-of-words counts of the vehicle `name` on top of the quantitative
# features and the one-hot encoded `origin`.
column_selector = ColumnTransformer([
    ("keep", "passthrough", quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression()),
])


# Register the fitted pipeline: feature initials, then "o" (origin) and "n" (name).
name = ",".join(feature[0] for feature in quantitative_features) + ",o,n"
models[name] = model.fit(tr, tr['mpg'])


compare_models(models)


# Bar chart of how many coefficients each fitted model's final step learned,
# drawn on a log-scaled y axis.
model_names = list(models.keys())
coefficient_counts = [len(pipeline["LinearModel"].coef_) for pipeline in models.values()]
fig = go.Figure(data=[go.Bar(x = model_names, y = coefficient_counts)])
fig.update_yaxes(title="Number of Features",type="log")


from sklearn.linear_model import Ridge


# Show the documentation for Ridge; help() works in plain Python as well as in IPython/Jupyter.
help(Ridge)


# Quantitative + origin + name features, fitted with a ridge regressor (alpha=0.5).
column_selector = ColumnTransformer([
    ("keep", "passthrough", quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
ridge_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", Ridge(alpha=0.5)),
])


# Fit, register and compare against the earlier models.
models["Ridge(alpha=0.5)"] = ridge_model.fit(tr, tr['mpg'])
compare_models(models)


from sklearn.preprocessing import StandardScaler


# Ridge (alpha=0.5) again, but with the quantitative columns passed through
# StandardScaler inside the column transformer.
# An alternative left unused here: a separate StandardScaler(with_mean=False)
# step placed after imputation.
column_selector = ColumnTransformer([
    ("keep", StandardScaler(), quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
ridge_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", Ridge(alpha=0.5)),
])


# Fit, register and compare against the earlier models.
models["RidgeN(alpha=0.5)"] = ridge_model.fit(tr, tr['mpg'])
compare_models(models)


# Scaled-feature ridge pipeline with a larger penalty (alpha=10).
column_selector = ColumnTransformer([
    ("keep", StandardScaler(), quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
ridge_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", Ridge(alpha=10)),
])


# Fit, register and compare against the earlier models.
models["RidgeN(alpha=10)"] = ridge_model.fit(tr, tr['mpg'])
compare_models(models)


# Scaled-feature ridge pipeline; its alpha is overwritten on every sweep iteration below.
column_selector = ColumnTransformer([
    ("keep", StandardScaler(), quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
ridge_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", Ridge(alpha=10)),
])

# Sweep 30 evenly spaced alphas in [0.5, 20], recording CV, train and test RMSE for each.
alphas = np.linspace(0.5, 20, 30)
cv_values, train_values, test_values = [], [], []
for alpha in alphas:
    ridge_model.set_params(LinearModel__alpha=alpha)
    # Mean RMSE over 5 cross-validation folds of the training split
    fold_scores = cross_val_score(ridge_model, tr, tr['mpg'], scoring=rmse_score, cv=5)
    cv_values.append(np.mean(fold_scores))
    # Refit on the whole training split before measuring train / test error
    ridge_model.fit(tr, tr['mpg'])
    train_values.append(rmse_score(ridge_model, tr, tr['mpg']))
    test_values.append(rmse_score(ridge_model, te, te['mpg']))


# Plot train, CV and test RMSE as functions of the ridge penalty alpha.
fig = go.Figure()
fig.add_trace(go.Scatter(x = alphas, y = train_values, mode="lines+markers", name="Train"))
fig.add_trace(go.Scatter(x = alphas, y = cv_values, mode="lines+markers", name="CV"))
fig.add_trace(go.Scatter(x = alphas, y = test_values, mode="lines+markers", name="Test"))
# The y axis carries all three curves, so it is labelled with the metric
# alone; the legend tells Train / CV / Test apart.
fig.update_layout(xaxis_title=r"$\alpha$", yaxis_title="RMSE")


# Take the alpha with the lowest mean CV RMSE from the sweep, refit the ridge
# pipeline with it on the training split, and add it to the comparison.
best_alpha = alphas[np.argmin(cv_values)]
models["RidgeN(alpha_best)"] = ridge_model.set_params(LinearModel__alpha=best_alpha).fit(tr, tr['mpg'])
compare_models(models)


from sklearn.linear_model import RidgeCV


# Candidate penalties handed to RidgeCV: 30 evenly spaced values in [0.5, 3].
alphas = np.linspace(0.5, 3, 30)

# Same scaled-feature pipeline, with RidgeCV choosing among `alphas` during fit.
column_selector = ColumnTransformer([
    ("keep", StandardScaler(), quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
ridge_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", RidgeCV(alphas=alphas)),
])


# Fit, register and compare against the earlier models.
models["RidgeCV"] = ridge_model.fit(tr, tr['mpg'])
compare_models(models)


from sklearn.linear_model import Lasso, LassoCV


# Scaled-feature pipeline ending in LassoCV, which is configured with cv=3.
column_selector = ColumnTransformer([
    ("keep", StandardScaler(), quantitative_features),
    ("origin_encoder", OneHotEncoder(), ["origin"]),
    ("text", CountVectorizer(), "name"),
])
lasso_model = Pipeline([
    ("SelectColumns", column_selector),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LassoCV(cv=3)),
])


# Fit, register and compare against the earlier models.
models["LassoCV"] = lasso_model.fit(tr, tr['mpg'])
compare_models(models)


# Compare the distributions of the fitted Lasso and Ridge coefficients.
lasso_coefficients = models['LassoCV']["LinearModel"].coef_
ridge_coefficients = models['RidgeCV']["LinearModel"].coef_
ff.create_distplot(
    [lasso_coefficients, ridge_coefficients],
    ["Lasso", "Ridge"],
    bin_size=0.1,
)


def _output_feature_names(transformer):
    """Return a fitted transformer's output feature names as a list.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out()``. The new method is preferred
    when it exists, with a fallback to the legacy one on older releases.
    """
    getter = getattr(transformer, "get_feature_names_out", None)
    if getter is None:
        getter = transformer.get_feature_names
    return list(getter())


# Rebuild the column names of the design matrix seen by the Lasso step, in the
# order the ColumnTransformer lists its transformers: the quantitative columns,
# then the one-hot `origin` columns, then the `name` vocabulary.
# NOTE: the one-hot prefix depends on the scikit-learn version ("x0_..." from
# the legacy method; presumably "origin_..." from get_feature_names_out()).
ct = models['LassoCV']['SelectColumns']
feature_names = (
    quantitative_features +
    _output_feature_names(ct.named_transformers_['origin_encoder']) +
    _output_feature_names(ct.named_transformers_['text'])
)
feature_names = np.array(feature_names)
print(feature_names)

['cylinders' 'displacement' 'horsepower' 'weight' 'acceleration'
 'model_year' 'x0_europe' 'x0_japan' 'x0_usa' '10' '100' '100ls' '111'
 '1131' '12' '1200' '1200d' '124' '128' '12tl' '1300' '144ea' '1500' '18i'
 '1900' '200' '2000' '2002' '200sx' '210' '225' '2300' '240d' '244dl'
 '245' '280s' '300d' '304' '310' '320i' '340' '350' '4000' '411' '4w'
 '500' '5000' '504' '505s' '510' '610' '626' '710' '810' '88' '99gle'
 '99le' 'accord' 'air' 'ambassador' 'amc' 'aries' 'aspen' 'audi' 'auto'
 'beetle' 'bel' 'benz' 'bmw' 'brougham' 'buick' 'c10' 'cadillac' 'camaro'
 'capri' 'caprice' 'carina' 'carlo' 'catalina' 'cavalier' 'celica'
 'century' 'challenger' 'champ' 'chevelle' 'chevette' 'chevroelt'
 'chevrolet' 'chevy' 'chrysler' 'ciera' 'citation' 'civic' 'classic'
 'cobra' 'colt' 'concord' 'concours' 'corolla' 'corona' 'coronet' 'cougar'
 'country' 'coupe' 'cressida' 'cuda' 'custom' 'cutlass' 'cvcc' 'd100'
 'd200' 'dart' 'dasher' 'datsun' 'delta' 'deluxe' 'diesel' 'diplomat' 'dl'
 'dodge' 'door' 'dpl' 'duster' 'eldorado' 'electra' 'escort' 'estate'
 'f250' 'fairmont' 'fiat' 'firebird' 'ford' 'fox' 'fury' 'futura'
 'galaxie' 'ghia' 'gl' 'glc' 'gran' 'granada' 'grand' 'gremlin' 'gs' 'gt'
 'gtl' 'gx' 'hardtop' 'hatchback' 'hi' 'honda' 'horizon' 'hornet' 'ii'
 'iii' 'impala' 'isuzu' 'j2000' 'jetta' 'landau' 'lebaron' 'lecar'
 'lesabre' 'liftback' 'limited' 'lj' 'ls' 'ltd' 'luxus' 'lx' 'lynx'
 'magnum' 'malibu' 'man' 'mark' 'marquis' 'matador' 'maverick' 'maxda'
 'mazda' 'mercedes' 'mercury' 'model' 'monaco' 'monarch' 'monte' 'monza'
 'mpg' 'mustang' 'new' 'newport' 'nissan' 'nova' 'oldsmobile' 'omega'
 'omni' 'opel' 'pacer' 'peugeot' 'phoenix' 'pickup' 'pinto' 'pl510'
 'plymouth' 'pontiac' 'prelude' 'premier' 'prix' 'rabbit' 'rampage'
 'ranger' 'rebel' 'regal' 'regis' 'reliant' 'renault' 'royal' 'royale'
 'rx' 'rx3' 'saab' 'safari' 'salon' 'sapporo' 'satellite' 'se' 'sebring'
 'sedan' 'seville' 'sj' 'skyhawk' 'skylark' 'special' 'spirit' 'sport'
 'sportabout' 'squire' 'sst' 'st' 'stanza' 'strada' 'subaru' 'suburb'
 'sunbird' 'super' 'sw' 'sx' 'tc' 'tc3' 'tercel' 'torino' 'town' 'toyota'
 'toyouta' 'turbo' 'type' 'v6' 'v8' 'valiant' 'vega' 'ventura' 'volare'
 'volkswagen' 'volvo' 'vw' 'wagon' 'x1' 'xe' 'yorker' 'zephyr']


# Boolean mask of the features whose Lasso coefficient is not (numerically) zero.
lasso_coefficients = models['LassoCV']["LinearModel"].coef_
kept = ~np.isclose(lasso_coefficients, 0)
feature_names[kept]

array(['cylinders', 'displacement', 'weight', 'acceleration',
       'model_year', 'x0_usa', '1200', '1500', '210', 'amc', 'civic',
       'colt', 'corolla', 'custom', 'datsun', 'diesel', 'ford',
       'hatchback', 'honda', 'horizon', 'ii', 'maxda', 'plymouth',
       'pontiac', 'rabbit', 'renault', 'rx', 'rx3', 'sw', 'tercel', 'vw'],
      dtype='<U12')

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
...	...	...	...	...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10

Lecture 16, Part 2 – Data 100, Fall 2020¶

Regularization¶

Imports¶

The Data¶

Building a Few Basic Models¶

`cross_val_score`¶

Visualizing the Train/CV/Test RMSE¶

Adding the Text Features¶

Adding the Origin¶

Adding the Vehicle Name¶

Regularization¶

The Regularization Hyper-parameter¶

The Regularization Function¶

Ridge Regression¶

Lasso Regression¶

Normalizing the Features¶

Ridge Regression in scikit-learn¶

Cross Validation to Tune Regularization Parameter¶

Lasso in scikit-learn¶

Lecture 16, Part 2 – Data 100, Fall 2020¶

Regularization¶

Imports¶

The Data¶

Building a Few Basic Models¶

cross_val_score¶

Visualizing the Train/CV/Test RMSE¶

Adding the Text Features¶

Adding the Origin¶

Adding the Vehicle Name¶

Regularization¶

The Regularization Hyper-parameter¶

The Regularization Function¶

Ridge Regression¶

Lasso Regression¶

Normalizing the Features¶

Ridge Regression in scikit-learn¶

Cross Validation to Tune Regularization Parameter¶

Lasso in scikit-learn¶

`cross_val_score`¶