Missing Values, Categorical Features, and Text

In this notebook, we discuss:

  1. how to deal with missing values
  2. how to encode categorical features,
  3. and how to encode text features.

In the process, we will work through feature engineering to construct a model that predicts vehicle efficiency.

The Data

For this notebook, we will use the seaborn mpg data set which describes the fuel mileage (measured in miles per gallon or mpg) of various cars along with characteristics of those cars. Our goal will be to build a model that can predict the fuel mileage of a car based on the characteristics of that car.

Quantitative Continuous Features

This data set has several quantitative continuous features that we can use to build our first model. However, even for quantitative continuous features, we may want to do some additional feature engineering. Things to consider are:

  1. transforming features with non-linear functions (log, exp, sine, polynomials)
  2. constructing products or ratios of features
  3. dealing with missing values

Missing Values

We can use the Pandas DataFrame.isna function to find rows with missing values:
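 
A minimal sketch of that check, assuming the data has been loaded into a DataFrame named `mpg` (the name and loading call are illustrative):

```python
import seaborn as sns

# Load the seaborn mpg data set (the name `mpg` is assumed throughout these sketches).
mpg = sns.load_dataset("mpg")

# Select the rows that have at least one missing value
# (in this data set, only horsepower contains NaNs).
mpg[mpg.isna().any(axis=1)]
```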

There are many ways to deal with missing values. A common strategy is to substitute the mean. Because missing values can actually be useful signal, it is often a good idea to include a feature indicating that the value was missing.
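 
As an illustration, a feature function along these lines might mean-impute horsepower and add a missingness indicator (the function and column names below are placeholders, not the notebook's exact code):

```python
import pandas as pd

quantitative_features = ["cylinders", "displacement", "horsepower",
                         "weight", "acceleration", "model_year"]

def basic_design_matrix(df):
    X = df[quantitative_features].copy()
    # Missingness itself can carry signal, so record it as a feature ...
    X["hp_missing"] = X["horsepower"].isna().astype(float)
    # ... and then substitute the mean for the missing values.
    X["horsepower"] = X["horsepower"].fillna(X["horsepower"].mean())
    return X
```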

Using our feature function, we can fit our first model to the transformed data:
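 
For example, with an ordinary least squares model and the sketch above:

```python
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(basic_design_matrix(mpg), mpg["mpg"])
```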

Keeping Track of Progress

Because we are going to be building multiple models with different feature functions, it is important to have a standard way to track each of the models.

The following function takes a model prediction function, the name of the model, and the dictionary of models that we have already constructed. It then evaluates the new model on the data, plots how it performs relative to the previous models, and shows the $Y$ vs. $\hat{Y}$ scatter plot.

In addition, it updates the dictionary of models to include the new model for future plotting.
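 
A simplified sketch of such a bookkeeping function (the notebook's real version also produces the plots; the names `rmse`, `evaluate_model`, and the `models` dictionary are assumptions):

```python
import numpy as np

def rmse(y, y_hat):
    """Root mean squared error of predictions y_hat against targets y."""
    return np.sqrt(np.mean((y - y_hat) ** 2))

def evaluate_model(predict_fn, name, models):
    """Evaluate a prediction function, store its RMSE, and return the predictions."""
    y_hat = predict_fn(mpg)
    models[name] = rmse(mpg["mpg"], y_hat)
    print(f"{name}: RMSE = {models[name]:.2f}")
    return y_hat
```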

Stable Feature Functions

Unfortunately, the feature function we just implemented applies a different transformation depending on what input we provide. Specifically, if the horsepower is missing when we go to make a prediction, we will substitute a different mean than the one used when we fit our model. Furthermore, if we only want predictions on a few records and the horsepower is missing from those records, then the feature function will be unable to substitute a meaningful value.

For example, if we were to get new records that look like the following:
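 
A purely hypothetical illustration of such records (the values here are made up, with horsepower missing everywhere):

```python
import numpy as np

new_records = pd.DataFrame({
    "cylinders": [4, 6],
    "displacement": [120.0, 250.0],
    "horsepower": [np.nan, np.nan],   # horsepower missing in every record
    "weight": [2600, 3500],
    "acceleration": [15.5, 13.0],
    "model_year": [82, 81],
})
```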

The feature function would be unable to substitute the mean, since none of these records has a horsepower value.

We can fix this by computing the mean on the original data and using that mean on any new data.
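 
One way to make the feature function stable, sketched under the same assumed names as above:

```python
# Compute the mean once on the original (training) data ...
hp_mean = mpg["horsepower"].mean()

def stable_design_matrix(df):
    X = df[quantitative_features].copy()
    X["hp_missing"] = X["horsepower"].isna().astype(float)
    # ... and reuse that same mean on any new data.
    X["horsepower"] = X["horsepower"].fillna(hp_mean)
    return X
```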

Scikit-learn Model Imputer

Because these kinds of transformations are fairly common, scikit-learn has built-in transformations for data imputation. These transformations follow a common fit-and-transform pattern: you first fit the transformation to your data, and then you can transform that data and any future data using the same fitted transformation.
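 
For example, with scikit-learn's SimpleImputer (a minimal sketch):

```python
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
imputer.fit(mpg[["horsepower"]])        # learn the mean from the training data

# The same learned mean is then applied to this or any future data.
imputed_hp = imputer.transform(mpg[["horsepower"]])
```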

Applying Domain Knowledge

The displacement of an engine is defined as the product of the volume of each cylinder and the number of cylinders. However, not all cylinders fire at the same time (at least in a functioning engine), so the fuel economy might be more closely related to the volume of any one cylinder.

Cylinders from https://gifimage.net/piston-gif-3/

We can use this "domain knowledge" to compute a new feature encoding the volume per cylinder by taking the ratio of displacement and cylinders.

Fitting and evaluating our model again, we see a reduction in prediction error (RMSE).

Encoding Categorical Data

The origin column in this data set is categorical (nominal) data taking on a fixed set of possible values.

To use this kind of data in a model, we need to transform it into a vector encoding that treats each distinct value as a separate dimension. This is called one-hot encoding or dummy encoding.

One-Hot Encoding (Dummy Encoding)

One-hot encoding, sometimes also called dummy encoding, is a simple mechanism to encode categorical data as real numbers such that the magnitude of each dimension is meaningful. Suppose a feature can take on $k$ distinct values (e.g., $k=50$ for the 50 states in the United States). A new feature (dimension) is created for each distinct value. For each record, all of the new features are set to zero except the one corresponding to the value in the original feature.

The term one-hot encoding comes from digital circuits, where a categorical state is encoded by a particular "hot" wire:

Dummy Encoding in Pandas

We can construct a one-hot (dummy) encoding of the origin column using the pandas get_dummies function:
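 
For example:

```python
pd.get_dummies(mpg[["origin"]])
# Produces one column per category (origin_europe, origin_japan, origin_usa),
# with exactly one "hot" entry per row.
```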

Using get_dummies, we can build a new feature function that extends our previous features with the additional dummy-encoding columns.

We fit a new model with the origin feature encoding:

Unfortunately, the above feature function is not stable. For example, if we are given a single vehicle to make a prediction, the model will fail:

To see why this fails, look at the feature transformation for a single row:
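 
A quick way to see the problem (illustrative):

```python
one_row = mpg.head(1)
pd.get_dummies(one_row[["origin"]])
# Only the dummy column for that row's own origin value is created;
# the columns for the other categories are missing, so the design matrix
# the model expects no longer lines up.
```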

The dummy columns are not created for the other categories.

There are a couple of solutions: we could maintain a list of dummy columns and always add these columns, or we could use a library function designed to solve this problem. The second option is much easier.

Scikit-learn One-hot Encoder

The scikit-learn library has a wide range of feature transformations and a framework for composing them into reusable (stable) pipelines. Let's first look at a basic OneHotEncoder transformation.

We then fit that instance to some data. This is where we would determine the specific values that a categorical feature can take:

Once we fit the transformation, we can then use it to transform new data:

We can also inspect the categories of the one-hot encoder:
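 
Putting these steps together (a minimal sketch):

```python
from sklearn.preprocessing import OneHotEncoder

oh_enc = OneHotEncoder()
oh_enc.fit(mpg[["origin"]])     # learn the full set of possible categories

# Transform data (the result is a sparse matrix; toarray() makes it dense).
oh_enc.transform(mpg[["origin"]].head()).toarray()

# Inspect the learned categories.
oh_enc.categories_
```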

We can update our feature function to use the one-hot encoder instead.
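 
One way the updated feature function might look, reusing the fitted encoder and the stable feature function sketched earlier:

```python
def design_matrix_with_origin(df):
    X = stable_design_matrix(df)
    origin_cols = pd.DataFrame(
        oh_enc.transform(df[["origin"]]).toarray(),
        columns=oh_enc.categories_[0],
        index=df.index,
    )
    return pd.concat([X, origin_cols], axis=1)
```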

Encoding Text Features

The only remaining feature to encode is the vehicle name. Is there potentially signal in the vehicle name?

Encoding text can be challenging. Capturing the semantics and grammar of language in mathematical (vector) representations is an active area of research, and state-of-the-art techniques often rely on neural networks trained on large collections of text. In this class, we will focus on basic text encoding techniques that are still widely used. If you are interested in learning more, check out BERT Explained: A Complete Guide with Theory and Tutorial.

Here we present two widely used representations of text:

  1. the bag-of-words encoding
  2. the n-gram encoding

Both of these encoding strategies are related to one-hot encoding: a dummy feature is created for every word (or sequence of words), and multiple dummy features can have counts greater than zero.

The Bag-of-Words Encoding

The bag-of-words encoding is a widely used standard representation for text and appears in many of the popular text clustering algorithms. The following is a simple illustration of the bag-of-words encoding:

Notice

  1. Stop words are often removed. Stop words are words like is and about that in isolation contain very little information about the meaning of the sentence. Here is a good list of stop words in many languages.
  2. Word order information is lost. Nonetheless, the vector still suggests that the sentence is about fun, machines, and learning. Though there are many possible readings: learning machines have fun learning, or learning about machines is fun learning, ...
  3. Capitalization and punctuation are typically removed. However, emoji symbols may be worth preserving.
  4. A sparse encoding is necessary to represent the bag-of-words efficiently. There are millions of possible words (including terminology, names, and misspellings), so storing an explicit 0 for every word absent from a record would be inefficient.

Professor Gonzalez is an "artist"

When professor Gonzalez was a graduate student at Carnegie Mellon University, he and several other computer scientists created the following art piece on display in the Gates Center:

Is this art or science?

Notice

  1. The unordered collection of words in the bag.
  2. The stop words on the floor.
  3. The missing broom. The original sculpture had a broom attached but the janitor got confused ....

Bag-of-words in Scikit-learn

We can use scikit-learn to construct a bag-of-words representation of text:
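 
A small illustration with made-up sentences, using CountVectorizer (the example text is ours, and get_feature_names_out assumes a recent scikit-learn):

```python
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "this is a sentence about learning machines",
    "machines have fun learning",
]

bow = CountVectorizer(stop_words="english")   # drop common English stop words
counts = bow.fit_transform(corpus)            # sparse document-term matrix

bow.get_feature_names_out()   # the learned vocabulary
counts.toarray()              # dense view of the word counts
```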

The N-Gram Encoding

The n-gram encoding is a generalization of the bag-of-words encoding designed to capture information about word ordering. Consider the following passage of text:

The book was not well written but I did enjoy it.

If we re-arrange the words, we can also write:

The book was well written but I did not enjoy it.

These two sentences contain exactly the same words, so a bag-of-words encoding cannot distinguish them even though their meanings differ. Local word order can clearly be important when making decisions about text. The n-gram encoding captures local word order by defining counts over sliding windows of $n$ consecutive words. In the following example, a bi-gram ($n=2$) encoding is constructed:

The above bi-grams would be encoded in the following sparse vector:

Notice that the n-gram captures key pieces of sentiment information: "well written" and "not enjoy".
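 
A sketch of the same idea with CountVectorizer, applied to the two sentences above:

```python
passages = [
    "The book was not well written but I did enjoy it.",
    "The book was well written but I did not enjoy it.",
]

bigram = CountVectorizer(ngram_range=(2, 2))
counts = bigram.fit_transform(passages)

bigram.get_feature_names_out()
# The two passages have identical word counts, but their bi-grams differ:
# 'not well' / 'did enjoy' in the first versus 'did not' / 'not enjoy' in the second.
```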

N-grams are often used for other types of sequence data beyond text. For example, n-grams can be used to encode genomic data, protein sequences, and click logs.

N-Gram Issues

  1. Maintaining the dictionary of possible n-grams can be very costly. There are hashing-based approximations that closely approximate the n-gram encoding without the need to maintain the dictionary of all possible n-grams.
  2. As the size $n$ of the n-grams increases, the chance of observing any particular n-gram more than once decreases, limiting their value as features.

Applying Text Encoding

We can add the text encoding features to our feature function:
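 
A possible sketch (the helper names are ours): fit a CountVectorizer on the vehicle names once, then append the densified counts to the previous features.

```python
name_vec = CountVectorizer()
name_vec.fit(mpg["name"])     # learn the vocabulary of vehicle names

def design_matrix_with_text(df):
    X = design_matrix_with_origin(df)
    name_counts = pd.DataFrame(
        name_vec.transform(df["name"]).toarray(),   # dense here for simplicity
        columns=name_vec.get_feature_names_out(),
        index=df.index,
    )
    return pd.concat([X, name_counts], axis=1)
```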

Quick Reflection

Notice that as we added more features we were able to improve the accuracy of our model. This is not always a good thing and we will see the problems associated with this in a future lecture.

It is also worth noting that our feature functions each depended on the last, and in some cases we were converting sparse features to dense features. There is a better way to manage feature pipelines using the scikit-learn pipeline module.

Success!!!!!