%load_ext sql

There's a new jupysql version available (0.10.2), you're running 0.8.0. To upgrade: pip install jupysql --upgrade

%%sql
sqlite:///data/basic_examples.db

%%sql
-- Show every row and column of the Dish table.
SELECT * FROM Dish;

%%sql
-- Count the dishes of each type, keeping only the types whose
-- most expensive dish costs less than 8.
SELECT
    type,
    COUNT(*)
FROM Dish
GROUP BY type
HAVING MAX(cost) < 8;

%%sql
-- WHERE filters individual rows before grouping, HAVING filters whole groups afterwards.
-- Standard SQL spells equality with a single = (the double form is a SQLite-only alias).
SELECT type, MAX(name)
FROM DishDietary
WHERE notes = 'gf'
GROUP BY type
HAVING MAX(cost) <= 7;

%%sql
sqlite:///data/imdbmini.db

%%sql
-- Random sample of ten titles, so the rows differ on every run.
SELECT *
FROM Title
ORDER BY RANDOM()
LIMIT 10;

%%sql
-- The percent wildcard matches any run of characters (including none).
-- String literals are single-quoted. SQLite treats double quotes as identifier
-- quoting and only falls back to a string when no such column exists.
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE '%Star Wars%';

%%sql
-- The underscore wildcard matches exactly one character (here the part number).
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE 'Harry Potter and the Deathly Hallows: Part _';

%%sql
-- runtimeMinutes is stored as TEXT, so CAST is needed to get an integer value.
SELECT primaryTitle, CAST(runtimeMinutes AS INT)
FROM Title
LIMIT 10;

%%sql
-- Label each sampled title by era. CASE yields the first branch whose condition holds
-- and falls through to ELSE otherwise. Branch labels are single-quoted string literals.
SELECT titleType, startYear,
CASE WHEN startYear < 1950 THEN 'old'
     WHEN startYear < 2000 THEN 'mid-aged'
     ELSE 'new'
     END AS movie_age
FROM Title
ORDER BY RANDOM()
LIMIT 10;

%%sql
sqlite:///data/basic_examples.db

%%sql
-- Full contents of table s.
SELECT *
FROM s;

%%sql
-- Full contents of table t.
SELECT *
FROM t;

%%sql
-- Inner join written with the explicit INNER keyword.
-- Only ids present in both tables survive.
SELECT
    s.id,
    name,
    breed
FROM s
INNER JOIN t
    ON s.id = t.id;

%%sql
-- A bare JOIN is an inner join, so this returns the same rows as the cell above.
SELECT
    s.id,
    name,
    breed
FROM s
JOIN t
    ON s.id = t.id;

%%sql
-- Cross join pairs every row of s with every row of t.
SELECT *
FROM s
CROSS JOIN t;

%%sql
-- Listing the tables separated by a comma produces the same cross join.
SELECT *
FROM
    s,
    t;

%%sql
-- Keeping only the pairs whose ids match gives the same rows as an inner join on id.
SELECT *
FROM s
CROSS JOIN t
WHERE s.id = t.id;

%%sql
-- Left join keeps every row of s and fills the t columns with NULL where no id matches.
SELECT *
FROM s
LEFT JOIN t
    ON s.id = t.id;

%%sql
-- Right join keeps every row of t and fills the s columns with NULL where no id matches.
SELECT *
FROM s
RIGHT JOIN t
    ON s.id = t.id;

%%sql
-- Full join keeps the unmatched rows from both sides.
SELECT *
FROM s
FULL JOIN t
    ON s.id = t.id;

%%sql
sqlite:///data/imdbmini.db

%%sql
-- Alias both tables with AS so the join condition can say which tconst it means.
SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON T.tconst = R.tconst;

%%sql
-- Same join with the AS keyword left out. A table alias may directly follow the table name.
SELECT primaryTitle, averageRating
FROM Title T INNER JOIN Rating R
ON T.tconst = R.tconst;

%%sql
-- NOTE(review) tconst exists in both Title and Rating, so these unqualified references
-- are ambiguous and SQLite rejects the query with an ambiguous column name error.
-- Presumably kept as a deliberate failing example. Confirm before changing it.
SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON tconst = tconst;

%%sql
-- Qualifying tconst with the table aliases removes the ambiguity.
SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON T.tconst = R.tconst;

%%sql
sqlite:///data/imdbmini.db

%%sql
-- List every table together with the CREATE TABLE statement that defines it.
SELECT tbl_name, sql
FROM sqlite_master
WHERE type = 'table';

%%sql
-- Look up a handful of titles by exact name.
-- The names are string literals, so they are single-quoted.
SELECT *
FROM Title
WHERE primaryTitle IN ('Ginny & Georgia', 'What If...?', 'Succession', 'Veep', 'Tenet')
LIMIT 10;

%%sql
-- Cleaned view of Title with numeric runtime and year.
-- runtimeMinutes and startYear are TEXT columns, so they are CAST to integers.
-- The time and year names in WHERE refer to the aliases defined in the SELECT list,
-- which SQLite permits.
SELECT tconst AS id,
  primaryTitle AS title,
  titleType,
  CAST(runtimeMinutes AS int) AS time,
  CAST(startYear AS int) AS year
FROM Title
WHERE time > 0      -- drop rows whose runtime is missing or not a positive number
      AND year > 0; -- drop rows whose start year is missing

%%sql
-- Same cleaned view, restricted to movies whose genre list mentions Action.
SELECT tconst AS id,
  primaryTitle AS title,
  titleType,
  CAST(runtimeMinutes AS int) AS time,
  CAST(startYear AS int) AS year
FROM Title
WHERE time > 0     -- drop rows whose runtime is missing or not a positive number
      AND year > 0 -- drop rows whose start year is missing
      AND titleType = 'movie'
      AND genres LIKE '%Action%';

%%sql action_movies_sql <<
-- Action movies joined with their ratings. The result set is stored in the
-- Python variable action_movies_sql instead of being displayed.
-- Every column is qualified with its table alias so the two-table join is unambiguous.
SELECT T.tconst AS id,
  T.primaryTitle AS title,
  T.titleType,
  CAST(T.runtimeMinutes AS int) AS time,
  CAST(T.startYear AS int) AS year,
  CAST(R.averageRating AS float) AS rating
FROM Title AS T INNER JOIN Rating AS R ON T.tconst = R.tconst
WHERE time > 0     -- drop rows whose runtime is missing or not a positive number
      AND year > 0 -- drop rows whose start year is missing
      AND T.titleType = 'movie'
      AND T.genres LIKE '%Action%';

# Display the result set captured by the `%%sql action_movies_sql <<` cell.
action_movies_sql

# Convert the SQL result set to a pandas DataFrame for plotting and modeling,
# then display it.
action_movies_df = action_movies_sql.DataFrame()
action_movies_df

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as lm

# Rating against release year (trailing semicolons suppress the notebook's
# textual echo of the returned Axes object).
sns.lineplot(data=action_movies_df, x="year", y="rating");

# Rating against runtime on the raw scale.
sns.scatterplot(data=action_movies_df, x="time", y="rating");

# Apply transformations to linearize the data
sns.scatterplot(x=np.log(action_movies_df["time"]), y=action_movies_df["rating"]**3);

# Design matrix is a one-column DataFrame named "time" (log runtime);
# the response is the cubed rating.
X, Y = np.log(action_movies_df[["time"]]), action_movies_df["rating"]**3

model = lm.LinearRegression()
model.fit(X, Y)

# Evenly spaced log-runtime values from 4 to 5.5 for drawing the fitted line.
xs = np.linspace(4, 5.5)
# Predict from a DataFrame that carries the same "time" column name used in fit().
# Passing a bare ndarray makes scikit-learn warn that X has no valid feature names.
plt.plot(xs, model.predict(pd.DataFrame({"time": xs})), c="tab:red")
sns.scatterplot(x=np.log(action_movies_df["time"]), y=action_movies_df["rating"]**3);

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

name	type	cost
ravioli	entree	10
ramen	entree	7
taco	entree	7
edamame	appetizer	4
fries	appetizer	4
potsticker	appetizer	4
ice cream	dessert	5

type	MAX(name)
appetizer	fries
entree	taco

tconst	titleType	primaryTitle	originalTitle	startYear	endYear	runtimeMinutes	genres
1485796	movie	The Greatest Showman	The Greatest Showman	2017	None	105	Biography,Drama,Musical
96163	movie	The Vanishing	Spoorloos	1988	None	107	Mystery,Thriller
185937	movie	The Blair Witch Project	The Blair Witch Project	1999	None	81	Horror,Mystery
117631	movie	Shine	Shine	1996	None	105	Biography,Drama,Music
1233381	movie	Three Monkeys	Üç Maymun	2008	None	109	Drama
831884	movie	Reservation Road	Reservation Road	2007	None	102	Crime,Drama,Thriller
11318602	tvMiniSeries	Don't F**k with Cats: Hunting an Internet Killer	Don't F**k with Cats: Hunting an Internet Killer	2019	2019	187	Crime,Documentary
473488	movie	A Guide to Recognizing Your Saints	A Guide to Recognizing Your Saints	2006	None	100	Crime,Drama
10342730	movie	Spiral	Spiral: From the Book of Saw	2021	None	93	Crime,Horror,Mystery
421073	movie	Street Kings	Street Kings	2008	None	109	Action,Crime,Drama

titleType	primaryTitle
movie	Star Wars: Episode IV - A New Hope
movie	Star Wars: Episode V - The Empire Strikes Back
movie	Star Wars: Episode VI - Return of the Jedi
movie	Star Wars: Episode I - The Phantom Menace
movie	Star Wars: Episode II - Attack of the Clones
movie	Star Wars: Episode III - Revenge of the Sith
tvSeries	Star Wars: Clone Wars
tvSeries	Star Wars: The Clone Wars
movie	Star Wars: The Clone Wars
movie	Star Wars: Episode VII - The Force Awakens

primaryTitle	CAST(runtimeMinutes AS INT)
A Trip to the Moon	13
The Birth of a Nation	195
The Cabinet of Dr. Caligari	76
The Kid	68
Nosferatu	94
Sherlock Jr.	45
Battleship Potemkin	75
The Gold Rush	95
Metropolis	153
The General	67

Lecture 21 – Data 100, Fall 2023¶

Loading the Data¶

Filtering Groups Using `HAVING`¶

EDA in SQL¶

Matching Text Using `LIKE`¶

Converting Data Types Using `CAST`¶

Applying Conditions With `CASE`¶

Joining Tables¶

Inner Join¶

Cross Join¶

Left Outer Join¶

Right Outer Join¶

Full Outer Join¶

Aliasing in Joins¶

IMDB Case Study¶

titleType	primaryTitle
movie	Harry Potter and the Deathly Hallows: Part 1
movie	Harry Potter and the Deathly Hallows: Part 2

titleType	startYear	movie_age
movie	2005	new
movie	2019	new
movie	2010	new
movie	1992	mid-aged
movie	2002	new
movie	2015	new
movie	2012	new
movie	1953	mid-aged
movie	2015	new
movie	2013	new

id	name	id_1	breed
0	Apricot	1	persian
0	Apricot	2	ragdoll
0	Apricot	4	bengal
0	Apricot	5	persian
1	Boots	1	persian
1	Boots	2	ragdoll
1	Boots	4	bengal
1	Boots	5	persian
2	Cally	1	persian
2	Cally	2	ragdoll

tbl_name	sql
Title	CREATE TABLE "Title" ( "tconst" INTEGER, "titleType" TEXT, "primaryTitle" TEXT, "originalTitle" TEXT, "isAdult" TEXT, "startYear" TEXT, "endYear" TEXT, "runtimeMinutes" TEXT, "genres" TEXT )
Name	CREATE TABLE "Name" ( "nconst" INTEGER, "primaryName" TEXT, "birthYear" TEXT, "deathYear" TEXT, "primaryProfession" TEXT )
Role	CREATE TABLE "Role" ( tconst INTEGER, ordering TEXT, nconst INTEGER, category TEXT, job TEXT, characters TEXT )
Rating	CREATE TABLE "Rating" ( tconst INTEGER, averageRating TEXT, numVotes TEXT )

tconst	titleType	primaryTitle	originalTitle	startYear	endYear	runtimeMinutes	genres
10168312	tvSeries	What If...?	What If...?	2021	None	None	Action,Adventure,Animation
10813940	tvSeries	Ginny & Georgia	Ginny & Georgia	2021	None	None	Comedy,Drama
1759761	tvSeries	Veep	Veep	2012	2019	28	Comedy
6723592	movie	Tenet	Tenet	2020	None	150	Action,Sci-Fi,Thriller
7660850	tvSeries	Succession	Succession	2018	None	60	Drama

id	title	titleType	time	year
417	A Trip to the Moon	short	13	1902
4972	The Birth of a Nation	movie	195	1915
10323	The Cabinet of Dr. Caligari	movie	76	1920
12349	The Kid	movie	68	1921
13442	Nosferatu	movie	94	1922
15324	Sherlock Jr.	movie	45	1924
15648	Battleship Potemkin	movie	75	1925
15864	The Gold Rush	movie	95	1925
17136	Metropolis	movie	153	1927
17925	The General	movie	67	1926

id	title	titleType	time	year
15324	Sherlock Jr.	movie	45	1924
17925	The General	movie	67	1926
23427	Scarface	movie	93	1932
29843	The Adventures of Robin Hood	movie	102	1938
40506	Key Largo	movie	100	1948
40724	Red River	movie	133	1948
42041	White Heat	movie	114	1949
46534	The War of the Worlds	movie	85	1953
47034	Godzilla	movie	96	1954
47478	Seven Samurai	movie	207	1954

id	title	titleType	time	year	rating
15324	Sherlock Jr.	movie	45	1924	8.2
17925	The General	movie	67	1926	8.1
23427	Scarface	movie	93	1932	7.8
29843	The Adventures of Robin Hood	movie	102	1938	7.9
40506	Key Largo	movie	100	1948	7.8
40724	Red River	movie	133	1948	7.8
42041	White Heat	movie	114	1949	8.1
46534	The War of the Worlds	movie	85	1953	7.1
47034	Godzilla	movie	96	1954	7.6
47478	Seven Samurai	movie	207	1954	8.6

	id	title	titleType	time	year	rating
0	15324	Sherlock Jr.	movie	45	1924	8.2
1	17925	The General	movie	67	1926	8.1
2	23427	Scarface	movie	93	1932	7.8
3	29843	The Adventures of Robin Hood	movie	102	1938	7.9
4	40506	Key Largo	movie	100	1948	7.8
...	...	...	...	...	...	...
1715	9243946	El Camino: A Breaking Bad Movie	movie	122	2019	7.3
1716	9252468	Mosul	movie	86	2019	7.2
1717	9376612	Shang-Chi and the Legend of the Ten Rings	movie	132	2021	7.9
1718	9691136	Shadow in the Cloud	movie	83	2020	4.9
1719	9777666	The Tomorrow War	movie	138	2021	6.6

Lecture 21 – Data 100, Fall 2023¶

Loading the Data¶

Filtering Groups Using HAVING¶

EDA in SQL¶

Matching Text Using LIKE¶

Converting Data Types Using CAST¶

Applying Conditions With CASE¶

Joining Tables¶

Inner Join¶

Cross Join¶

Left Outer Join¶

Right Outer Join¶

Full Outer Join¶

Aliasing in Joins¶

IMDB Case Study¶

Filtering Groups Using `HAVING`¶

Matching Text Using `LIKE`¶

Converting Data Types Using `CAST`¶

Applying Conditions With `CASE`¶