import numpy as np
import pandas as pd


import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');

/opt/conda/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning:

The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.


from sklearn.linear_model import LinearRegression


from seaborn import load_dataset
data = load_dataset("mpg")
data


shuffled_data = data.sample(frac=1., random_state=42)
shuffled_data


split_point = int(shuffled_data.shape[0]*0.90)
split_point

358


tr = shuffled_data.iloc[:split_point]
te = shuffled_data.iloc[split_point:]

tr

te


len(tr) + len(te) == len(data)

True


from sklearn.model_selection import train_test_split


tr, te = train_test_split(data, test_size=0.1, random_state=83)

tr

te


ff.create_distplot([tr['mpg'], te['mpg']], ['train mpg', 'test mpg'])


def phi(df):
    return df[["cylinders", "displacement"]]


from sklearn.linear_model import LinearRegression
model = LinearRegression()


model.fit(phi(tr), tr['mpg'])

LinearRegression()


def rmse(y, yhat):
    return np.sqrt(np.mean((y - yhat)**2))


Y_hat = model.predict(phi(tr))
Y = tr['mpg']
print("Training Error (RMSE):", rmse(Y, Y_hat))

Training Error (RMSE): 4.589009902639974


Y_hat = model.predict(phi(te))
Y = te['mpg']
print("Test Error (RMSE):", rmse(Y, Y_hat))

Test Error (RMSE): 5.0340093039091744


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

model = Pipeline([
    ("SelectColumns", ColumnTransformer([("keep", "passthrough", ["cylinders", "displacement"])])),
    ("LinearModel", LinearRegression())
])

model.fit(tr, tr['mpg']);


models = {"c+d": model}


from sklearn.preprocessing import FunctionTransformer

def compute_volume(X):
    return np.expand_dims(X[:,1] / X[:,0]  , axis=1)

volume_transformer = FunctionTransformer(compute_volume, validate=True)


model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", ["cylinders", "displacement"]),
        ("cyl_vol", volume_transformer, ["cylinders", "displacement"])])),
    ("LinearModel", LinearRegression())
])


model.fit(tr, tr['mpg']);


Y_hat = model.predict(tr)
Y = tr['mpg']
print("Training Error (RMSE):", rmse(Y, Y_hat))

Training Error (RMSE): 4.318799994795117


models["c+d+d/c"] = model


quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration"]
model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
        ("cyl_vol", volume_transformer, ["cylinders", "displacement"])])),
    ("LinearModel", LinearRegression())
])


try:
    model.fit(tr, tr['mpg'])
except ValueError as err:
    print(err)

Input contains NaN, infinity or a value too large for dtype('float64').


from sklearn.impute import SimpleImputer
model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
        ("cyl_vol", volume_transformer, ["cylinders", "displacement"])])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression())
])


model.fit(tr, tr['mpg']);


models['c+d+d/c+h+w+a'] = model


Y_hat = model.predict(tr)
Y = tr['mpg']
print("Training Error (RMSE):", rmse(Y, Y_hat))

Training Error (RMSE): 4.025034093127813


from sklearn.model_selection import KFold
from sklearn.base import clone

def cross_validate_rmse(model):
    model = clone(model)
    five_fold = KFold(n_splits=5)
    rmse_values = []
    for tr_ind, va_ind in five_fold.split(tr):
        model.fit(tr.iloc[tr_ind,:], tr['mpg'].iloc[tr_ind])
        rmse_values.append(rmse(tr['mpg'].iloc[va_ind], model.predict(tr.iloc[va_ind,:])))
    return np.mean(rmse_values)


cross_validate_rmse(model)

4.05180814386984


def compare_models(models):
    # Compute the training error for each model
    training_rmse = [rmse(tr['mpg'], model.predict(tr)) for model in models.values()]
    # Compute the cross validation error for each model
    validation_rmse = [cross_validate_rmse(model) for model in models.values()]
    # Compute the test error for each model (don't do this!)
    test_rmse = [rmse(te['mpg'], model.predict(te)) for model in models.values()]
    names = list(models.keys())
    fig = go.Figure([
        go.Bar(x = names, y = training_rmse, name="Training RMSE"),
        go.Bar(x = names, y = validation_rmse, name="CV RMSE"),
        go.Bar(x = names, y = test_rmse, name="Test RMSE", opacity=.3)])
    return fig


fig = compare_models(models)
fig.update_yaxes(range=[2,5.1], title="RMSE")


quantitative_features = ["cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year"]
model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
        ("cyl_vol", volume_transformer, ["cylinders", "displacement"])
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression())
])


model.fit(tr, tr['mpg'])
models['c+d+d/c+h+w+a+y'] = model


fig = compare_models(models)
fig.update_yaxes(range=[2,5.1], title="RMSE")


from sklearn.feature_extraction.text import CountVectorizer
model = Pipeline([
    ("SelectColumns", ColumnTransformer([
        ("keep", "passthrough", quantitative_features),
        ("cyl_vol", volume_transformer, ["cylinders", "displacement"]),
        ("text", CountVectorizer(), "name")
    ])),
    ("Imputation", SimpleImputer()),
    ("LinearModel", LinearRegression())
])


model.fit(tr, tr['mpg'])
models['c+d+d/c+h+w+a+y+n'] = model


fig = compare_models(models)
fig.update_yaxes(range=[0,5.1], title="RMSE")


best_model = clone(models['c+d+d/c+h+w+a+y'])


best_model.fit(data, data['mpg']);


rmse(best_model.predict(te), te['mpg'])

2.8167549672284298

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
...	...	...	...	...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
198	33.0	4	91.0	53.0	1795	17.4	76	japan	honda civic
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
33	19.0	6	232.0	100.0	2634	13.0	71	usa	amc gremlin
208	13.0	8	318.0	150.0	3940	13.2	76	usa	plymouth volare premier v8
93	14.0	8	318.0	150.0	4237	14.5	73	usa	plymouth fury gran sedan
...	...	...	...	...	...	...	...	...	...
71	19.0	3	70.0	97.0	2330	13.5	72	japan	mazda rx2 coupe
106	12.0	8	350.0	180.0	4499	12.5	73	usa	oldsmobile vista cruiser
270	21.1	4	134.0	95.0	2515	14.8	78	japan	toyota celica gt liftback
348	37.7	4	89.0	62.0	2050	17.3	81	japan	toyota tercel
102	26.0	4	97.0	46.0	1950	21.0	73	europe	volkswagen super beetle

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
198	33.0	4	91.0	53.0	1795	17.4	76	japan	honda civic
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
33	19.0	6	232.0	100.0	2634	13.0	71	usa	amc gremlin
208	13.0	8	318.0	150.0	3940	13.2	76	usa	plymouth volare premier v8
93	14.0	8	318.0	150.0	4237	14.5	73	usa	plymouth fury gran sedan
...	...	...	...	...	...	...	...	...	...
391	36.0	4	135.0	84.0	2370	13.0	82	usa	dodge charger 2.2
134	16.0	6	258.0	110.0	3632	18.0	74	usa	amc matador
306	28.8	6	173.0	115.0	2595	11.3	79	usa	chevrolet citation
381	36.0	4	107.0	75.0	2205	14.5	82	japan	honda accord
319	31.3	4	120.0	75.0	2542	17.5	80	japan	mazda 626

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
243	21.5	3	80.0	110.0	2720	13.5	77	japan	mazda rx-4
54	35.0	4	72.0	69.0	1613	18.0	71	japan	datsun 1200
50	28.0	4	116.0	90.0	2123	14.0	71	europe	opel 1900
174	18.0	6	171.0	97.0	2984	14.5	75	usa	ford pinto
189	15.5	8	304.0	120.0	3962	13.9	76	usa	amc matador
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
187	17.5	8	305.0	140.0	4215	13.0	76	usa	chevrolet chevelle malibu classic
169	20.0	6	232.0	100.0	2914	16.0	75	usa	amc gremlin
58	25.0	4	97.5	80.0	2126	17.0	72	usa	dodge colt hardtop
48	18.0	6	250.0	88.0	3139	14.5	71	usa	ford mustang
344	39.0	4	86.0	64.0	1875	16.4	81	usa	plymouth champ
235	26.0	4	97.0	75.0	2265	18.2	77	japan	toyota corolla liftback
252	19.2	6	231.0	105.0	3535	19.2	78	usa	pontiac phoenix lj
21	24.0	4	107.0	90.0	2430	14.5	70	europe	audi 100 ls
313	28.0	4	151.0	90.0	2678	16.5	80	usa	chevrolet citation
160	17.0	6	231.0	110.0	3907	21.0	75	usa	buick century
276	21.6	4	121.0	115.0	2795	15.7	78	europe	saab 99gle
191	22.0	6	225.0	100.0	3233	15.4	76	usa	plymouth valiant
293	31.9	4	89.0	71.0	1925	14.0	79	europe	vw rabbit custom
343	39.1	4	79.0	58.0	1755	16.9	81	japan	toyota starlet
257	19.4	6	232.0	90.0	3210	17.2	78	usa	amc concord
308	33.5	4	151.0	90.0	2556	13.2	79	usa	pontiac phoenix
149	24.0	4	120.0	97.0	2489	15.0	74	japan	honda civic
130	26.0	4	122.0	80.0	2451	16.5	74	usa	ford pinto
151	31.0	4	79.0	67.0	2000	16.0	74	europe	fiat x1.9
359	28.1	4	141.0	80.0	3230	20.4	81	europe	peugeot 505s turbo diesel
99	18.0	6	232.0	100.0	2945	16.0	73	usa	amc hornet
372	27.0	4	151.0	90.0	2735	18.0	82	usa	pontiac phoenix
87	13.0	8	350.0	145.0	3988	13.0	73	usa	chevrolet malibu
330	40.9	4	85.0	NaN	1835	17.3	80	europe	renault lecar deluxe
214	13.0	8	302.0	130.0	3870	15.0	76	usa	ford f108
121	15.0	8	318.0	150.0	3399	11.0	73	usa	dodge dart custom
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10
20	25.0	4	110.0	87.0	2672	17.5	70	europe	peugeot 504
188	16.0	8	318.0	150.0	4190	13.0	76	usa	dodge coronet brougham
71	19.0	3	70.0	97.0	2330	13.5	72	japan	mazda rx2 coupe
106	12.0	8	350.0	180.0	4499	12.5	73	usa	oldsmobile vista cruiser
270	21.1	4	134.0	95.0	2515	14.8	78	japan	toyota celica gt liftback
348	37.7	4	89.0	62.0	2050	17.3	81	japan	toyota tercel
102	26.0	4	97.0	46.0	1950	21.0	73	europe	volkswagen super beetle

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
143	26.0	4	97.0	78.0	2300	14.5	74	europe	opel manta
391	36.0	4	135.0	84.0	2370	13.0	82	usa	dodge charger 2.2
182	28.0	4	107.0	86.0	2464	15.5	76	europe	fiat 131
63	14.0	8	400.0	175.0	4385	12.0	72	usa	pontiac catalina
52	30.0	4	88.0	76.0	2065	14.5	71	europe	fiat 124b
...	...	...	...	...	...	...	...	...	...
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
256	20.5	6	225.0	100.0	3430	17.2	78	usa	plymouth volare
295	35.7	4	98.0	80.0	1915	14.4	79	usa	dodge colt hatchback custom
23	26.0	4	121.0	113.0	2234	12.5	70	europe	bmw 2002
82	23.0	4	120.0	97.0	2506	14.5	72	japan	toyouta corona mark ii (sw)

Lecture 16, Part 1 – Data 100, Fall 2020¶

Train Test Split and Cross Validation¶

Imports¶

The Data¶

Train Test Split¶

Using Pandas Operations¶

Using SKLearn¶

Quick Visualization¶

Building A Basic Model¶

Don't try this at home!¶

SKLearn Pipelines¶

Keeping track of all the models.¶

More Feature Transformations¶

Adding More Features¶

Cross Validation¶

Going too Far?¶

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
232	16.0	8	351.0	149.0	4335	14.5	77	usa	ford thunderbird
365	20.2	6	200.0	88.0	3060	17.1	81	usa	ford granada gl
225	17.5	6	250.0	110.0	3520	16.4	77	usa	chevrolet concours
277	16.2	6	163.0	133.0	3410	15.8	78	europe	peugeot 604sl
115	15.0	8	350.0	145.0	4082	13.0	73	usa	chevrolet monte carlo s
314	26.4	4	140.0	88.0	2870	18.1	80	usa	ford fairmont
219	25.5	4	122.0	96.0	2300	15.5	77	usa	plymouth arrow gs
33	19.0	6	232.0	100.0	2634	13.0	71	usa	amc gremlin
312	37.2	4	86.0	65.0	2019	16.4	80	japan	datsun 310
306	28.8	6	173.0	115.0	2595	11.3	79	usa	chevrolet citation
280	21.5	6	231.0	115.0	3245	15.4	79	usa	pontiac lemans v6
97	18.0	6	225.0	105.0	3121	16.5	73	usa	plymouth valiant
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
100	18.0	6	250.0	88.0	3021	16.5	73	usa	ford maverick
29	27.0	4	97.0	88.0	2130	14.5	71	japan	datsun pl510
300	23.9	8	260.0	90.0	3420	22.2	79	usa	oldsmobile cutlass salon brougham
281	19.8	6	200.0	85.0	2990	18.2	79	usa	mercury zephyr 6
252	19.2	6	231.0	105.0	3535	19.2	78	usa	pontiac phoenix lj
388	26.0	4	156.0	92.0	2585	14.5	82	usa	chrysler lebaron medallion
376	37.0	4	91.0	68.0	2025	18.2	82	japan	mazda glc custom l
278	31.5	4	89.0	71.0	1990	14.9	78	europe	volkswagen scirocco
101	23.0	6	198.0	95.0	2904	16.0	73	usa	plymouth duster
76	18.0	4	121.0	112.0	2933	14.5	72	europe	volvo 145e (sw)
343	39.1	4	79.0	58.0	1755	16.9	81	japan	toyota starlet
362	24.2	6	146.0	120.0	2930	13.8	81	japan	datsun 810 maxima
327	36.4	5	121.0	67.0	2950	19.9	80	europe	audi 5000s (diesel)
170	23.0	4	140.0	78.0	2592	18.5	75	usa	pontiac astro
50	28.0	4	116.0	90.0	2123	14.0	71	europe	opel 1900
246	32.8	4	78.0	52.0	1985	19.4	78	japan	mazda glc deluxe
32	25.0	4	98.0	NaN	2046	19.0	71	usa	ford pinto
89	15.0	8	318.0	150.0	3777	12.5	73	usa	dodge coronet custom
230	15.5	8	350.0	170.0	4165	11.4	77	usa	chevrolet monte carlo landau
313	28.0	4	151.0	90.0	2678	16.5	80	usa	chevrolet citation
86	14.0	8	304.0	150.0	3672	11.5	73	usa	amc matador
46	22.0	4	140.0	72.0	2408	19.0	71	usa	chevrolet vega (sw)
364	26.6	8	350.0	105.0	3725	19.0	81	usa	oldsmobile cutlass ls
272	23.8	4	151.0	85.0	2855	17.6	78	usa	oldsmobile starfire sx
171	24.0	4	134.0	96.0	2702	13.5	75	japan	toyota corona
71	19.0	3	70.0	97.0	2330	13.5	72	japan	mazda rx2 coupe
231	15.5	8	400.0	190.0	4325	12.2	77	usa	chrysler cordoba