pip install jupysql --upgrade

Requirement already satisfied: jupysql in /srv/conda/envs/notebook/lib/python3.11/site-packages (0.10.15)
Requirement already satisfied: prettytable>=3.12.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (3.12.0)
Requirement already satisfied: sqlalchemy in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (2.0.16)
Requirement already satisfied: sqlparse in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.5.1)
Requirement already satisfied: ipython-genutils>=0.1.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.2.0)
Requirement already satisfied: jinja2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (3.1.3)
Requirement already satisfied: sqlglot>=11.3.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (25.24.0)
Requirement already satisfied: jupysql-plugin>=0.4.2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.4.5)
Requirement already satisfied: ploomber-core>=0.2.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.2.25)
Requirement already satisfied: pyyaml in /srv/conda/envs/notebook/lib/python3.11/site-packages (from ploomber-core>=0.2.7->jupysql) (6.0.1)
Requirement already satisfied: posthog in /srv/conda/envs/notebook/lib/python3.11/site-packages (from ploomber-core>=0.2.7->jupysql) (3.6.5)
Requirement already satisfied: wcwidth in /srv/conda/envs/notebook/lib/python3.11/site-packages (from prettytable>=3.12.0->jupysql) (0.2.13)
Requirement already satisfied: MarkupSafe>=2.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jinja2->jupysql) (2.1.5)
Requirement already satisfied: typing-extensions>=4.2.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from sqlalchemy->jupysql) (4.12.2)
Requirement already satisfied: greenlet!=0.4.17 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from sqlalchemy->jupysql) (3.0.3)
Requirement already satisfied: requests<3.0,>=2.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.32.3)
Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (1.16.0)
Requirement already satisfied: monotonic>=1.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (1.5)
Requirement already satisfied: backoff>=1.10.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.2.1)
Requirement already satisfied: python-dateutil>2.1 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.9.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (3.6)
Requirement already satisfied: urllib3<3,>=1.21.1 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (1.26.19)
Requirement already satisfied: certifi>=2017.4.17 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (2024.8.30)
Note: you may need to restart the kernel to use updated packages.

%load_ext sql

import duckdb
conn = duckdb.connect()
conn.query("INSTALL sqlite")
%config SqlMagic.displaylimit = 100

%sql duckdb:///data/basic_examples.db --alias duckdb

import os
if os.path.exists("/home/jovyan/shared/sql/imdb_duck.db"):
    imdbpath = "duckdb:////home/jovyan/shared/sql/imdb_duck.db"
elif os.path.exists("data/imdb_duck.db"):
    imdbpath =  "duckdb:///data/imdb_duck.db"
else:
    import gdown
    url = 'https://drive.google.com/uc?id=10tKOHGLt9QoOgq5Ii-FhxpB9lDSQgl1O'
    output_path = 'data/imdb_duck.db'
    gdown.download(url, output_path, quiet=False)
    imdbpath = "duckdb:///data/imdb_duck.db"
print(imdbpath)

duckdb:////home/jovyan/shared/sql/imdb_duck.db

from sqlalchemy import create_engine
imdb_engine = create_engine(imdbpath, connect_args={'read_only': True})
%sql imdb_engine --alias imdb

%%sql duckdb
SELECT *
FROM Dragon
ORDER BY cute DESC;

%%sql duckdb
SELECT *
FROM Dragon
LIMIT 2;

%%sql duckdb
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;

%%sql duckdb
SELECT *
FROM Dragon
ORDER BY RANDOM() 
LIMIT 2

%%sql duckdb
SELECT * 
FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);

%%sql duckdb
SELECT *
FROM Dish;

%%sql duckdb
SELECT type
FROM Dish;

%%sql duckdb
SELECT type
FROM Dish
GROUP BY type;

%%sql duckdb
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;

%%sql duckdb
SELECT type, 
       SUM(cost), 
       MIN(cost),
       MAX(name)
FROM Dish
GROUP BY type;

%%sql duckdb
SELECT year, COUNT(cute)
FROM Dragon
GROUP BY year;

%%sql duckdb
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;

%%sql duckdb
    
SELECT *
FROM Dish;

%%sql duckdb
    
SELECT type, COUNT(*)
FROM Dish
GROUP BY type
HAVING MAX(cost) < 8;

%%sql duckdb
    
SELECT type, COUNT(*)
FROM Dish
WHERE cost < 8
GROUP BY type

%%sql duckdb
    
SELECT type, MAX(name)
FROM DishDietary
WHERE notes == 'gf'
GROUP BY type
HAVING MAX(cost) <= 7;

%%sql imdb
SELECT setseed(0.42); -- Setting the random number seed

SELECT *
FROM Title
ORDER BY RANDOM()
LIMIT 10;

%%sql imdb 
    
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE '%Star Wars%'

%%sql imdb 
    
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle SIMILAR TO '.*Star Wars.*'

%%sql imdb
    
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE 'Harry Potter and the Deathly Hallows: Part _'

%%sql imdb
    
SELECT primaryTitle, CAST(runtimeMinutes AS REAL)
FROM Title
LIMIT 10;

%%sql imdb

SELECT setseed(0.42); -- Setting the random number seed
    
SELECT titleType, startYear,
CASE WHEN startYear < '1950' THEN 'old'
     WHEN startYear < '2000' THEN 'mid-aged'
     ELSE 'new'
     END AS movie_age
FROM Title
ORDER BY RANDOM()
LIMIT 10;

%%sql duckdb
SELECT * FROM s;

%%sql duckdb
SELECT * FROM t;

%%sql duckdb
SELECT s.id, name, breed
FROM s INNER JOIN t ON s.id = t.id;

%%sql duckdb
SELECT s.id, name, breed
FROM s JOIN t ON s.id = t.id;

%%sql duckdb
SELECT *
FROM s CROSS JOIN t;

%%sql duckdb
SELECT *
FROM s, t

%%sql duckdb
    
SELECT *
FROM s CROSS JOIN t
WHERE s.id = t.id;

%%sql duckdb
    
SELECT *
FROM s LEFT JOIN t ON s.id = t.id;

%%sql duckdb
    
SELECT *
FROM s RIGHT JOIN t ON s.id = t.id;

%%sql duckdb
    
SELECT *
FROM s FULL JOIN t ON s.id = t.id;

%%sql imdb
    
SELECT primaryTitle, averageRating, genres, 
FROM Title AS T INNER JOIN Rating AS R
    ON T.tconst = R.tconst
WHERE numVotes > 100 
ORDER BY averageRating DESC
LIMIT 10;

%%sql imdb 
    
SELECT primaryTitle, averageRating
FROM Title T INNER JOIN Rating R
    ON T.tconst = R.tconst
WHERE numVotes > 100 
ORDER BY averageRating DESC
LIMIT 10;

SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON tconst = tconst;

%%sql imdb

SELECT tbl_name, sql FROM sqlite_master WHERE type='table'

%%sql imdb 
    
SELECT *
FROM Title
WHERE primaryTitle IN ('Ginny & Georgia', 'What If...?', 'Succession', 'Veep', 'Tenet')
LIMIT 50;

%%sql imdb 
    
SELECT 
    tconst AS id,
    primaryTitle AS title, 
    titleType,
    runtimeMinutes,
    startYear
FROM Title
WHERE 
    runtimeMinutes > 0 AND runtimeMinutes < 300 -- get rid of outliers and missing values --
    AND startYear > 0     -- get rid of missing values --
LIMIT 10

%%sql imdb
    
SELECT 
    T.tconst AS id,
    primaryTitle AS title, 
    titleType,
    runtimeMinutes,
    startYear,
    genres,
    averageRating, 
    numVotes
FROM Title T, Rating R
WHERE 
    T.tconst = R.tconst 
    AND runtimeMinutes > 0 AND runtimeMinutes < 300 -- get rid of outliers and missing values --
    AND startYear > 0     -- get rid of missing values --
    AND titleType = 'movie' -- get movies
    AND numVotes >= 1000
    AND genres LIKE '%Action%'
ORDER BY averageRating DESC
LIMIT 10

WITH table_name as ( 
    SELECT ...
)
SELECT ... 
FROM table_name, ...
...

%%sql
WITH good_action_movies AS (
    SELECT *
    FROM Title T JOIN Rating R ON T.tconst = R.tconst  
    WHERE genres LIKE '%Action%' AND averageRating > 7 AND numVotes > 5000
),
prolific_actors AS (
    SELECT N.nconst, primaryName, COUNT(*) as numRoles
    FROM Name N JOIN Principal P ON N.nconst = P.nconst
    WHERE category = 'actor'
    GROUP BY N.nconst, primaryName
)
SELECT primaryTitle, primaryName, numRoles, ROUND(averageRating) AS rating
FROM good_action_movies m, prolific_actors a, principal p
WHERE p.tconst = m.tconst AND p.nconst = a.nconst
ORDER BY rating DESC, numRoles DESC
LIMIT 10

%sql imdb
%config SqlMagic.autopandas = True

%%sql action_movies_df << 
SELECT 
    T.tconst AS id,
    primaryTitle AS title, 
    titleType,
    runtimeMinutes AS time,
    startYear AS year,
    genres,
    averageRating AS rating, 
    numVotes
FROM Title T, Rating R
WHERE 
    T.tconst = R.tconst 
    AND time > 0 AND time < 300 -- get rid of outliers and missing values --
    AND year > 0     -- get rid of missing values --
    AND titleType = 'movie' -- get movies
    AND numVotes >= 1000
    AND genres LIKE '%Action%'
ORDER BY startYear

action_movies_df

import plotly.express as px
px.scatter(action_movies_df, x = "year", y="rating", opacity=0.5)

%%sql action_grouped_df <<

WITH action_movies AS (
    SELECT 
        T.tconst AS id,
        primaryTitle AS title, 
        titleType,
        runtimeMinutes AS time,
        startYear AS year,
        genres,
        averageRating AS rating, 
        numVotes
    FROM Title T, Rating R
    WHERE 
        T.tconst = R.tconst 
        AND time > 0 AND time < 300 -- get rid of outliers and missing values --
        AND year > 0     -- get rid of missing values --
        AND titleType = 'movie' -- get movies
        AND numVotes >= 1000
    AND genres LIKE '%Action%'
)
SELECT 
    year, 
    median(rating) AS median_rating,
    quantile_cont(rating, .25) AS lower_q,
    quantile_cont(rating, .75) AS upper_q,
FROM action_movies
GROUP BY year
ORDER BY year

import plotly.express as px
action_grouped_df['error_p'] = action_grouped_df['upper_q'] - action_grouped_df['median_rating']
action_grouped_df['error_m'] = action_grouped_df['median_rating'] - action_grouped_df['lower_q'] 
fig = px.line(action_grouped_df, x="year", y="median_rating", 
              error_y="error_p", error_y_minus="error_m")
fig

px.scatter(action_movies_df, x="time", y="rating", opacity=0.1)

px.density_heatmap(action_movies_df, x="time", y="rating",nbinsx=100, nbinsy=100)

%%sql action_grouped_df << 
WITH action_movies AS (
    SELECT 
        T.tconst AS id,
        primaryTitle AS title, 
        titleType,
        runtimeMinutes AS time,
        startYear AS year,
        genres,
        averageRating AS rating, 
        numVotes
    FROM Title T, Rating R
    WHERE 
        T.tconst = R.tconst 
        AND time > 0 AND time < 300 -- get rid of outliers and missing values --
        AND year > 0     -- get rid of missing values --
        AND titleType = 'movie' -- get movies
        AND numVotes >= 1000
    AND genres LIKE '%Action%'
)
SELECT 
    ceil(time / 10) * 10 as time_desc, 
    median(rating) AS median_rating,
    quantile_cont(rating, .25) AS lower_q,
    quantile_cont(rating, .75) AS upper_q,
FROM action_movies
GROUP BY time_desc
ORDER BY time_desc

import plotly.express as px
action_grouped_df['error_p'] = action_grouped_df['upper_q'] - action_grouped_df['median_rating']
action_grouped_df['error_m'] = action_grouped_df['median_rating'] - action_grouped_df['lower_q'] 
fig = px.line(action_grouped_df, x="time_desc", y="median_rating", 
              error_y="error_p", error_y_minus="error_m")
fig

import sklearn.linear_model as lm
import numpy as np
X, Y = action_movies_df["time"].to_numpy(), action_movies_df["rating"].to_numpy()

model = lm.LinearRegression()
model.fit(X[..., np.newaxis], Y)

xs = np.linspace(X.min(),X.max())
yhats = model.predict(xs[:, np.newaxis])

fig = px.scatter(x=X, y=Y, opacity=0.1)
fig.add_scatter(x=xs, y=yhats)

px.scatter(x=X, y=Y, opacity=0.1, trendline="ols")

name	year	cute
hiccup	2010	10
dragon 2	2019	0
drogon	2011	-100

name	year	cute
hiccup	2010	10
drogon	2011	-100

name	year	cute
drogon	2011	-100
dragon 2	2019	0

name	year	cute
hiccup	2010	10
drogon	2011	-100

name	type	cost
ravioli	entree	10
ramen	entree	7
taco	entree	7
edamame	appetizer	4
fries	appetizer	4
potsticker	appetizer	4
ice cream	dessert	5

Lecture 21 – Data 100, Fall 2024¶

Loading the Data¶

More on Basic Queries¶

Ordering data using `ORDER BY`¶

Restricting output with `LIMIT` and `OFFSET`¶

Sampling from a Database¶

Grouping Data with `GROUP BY`¶

Filtering Groups Using `HAVING`¶

EDA in SQL¶

Matching Text Using `LIKE`¶

Converting Data Types Using `CAST`¶

Applying Conditions With `CASE`¶

Joining Tables¶

Inner Join¶

Cross Join¶

Left Outer Join¶

Right Outer Join¶

Full Outer Join¶

Aliasing in Joins¶

IMDB Case Study¶

Using CTEs to do some complex analysis¶

How do ratings vary through the years?¶

How does the runtime of an action movie influence the rating it receives?¶

tconst	titleType	primaryTitle	originaTitle	isAdult	startYear	endYear	runtimeMinutes	genres
tt3247860	movie	Super Awesome!	Super Awesome!	False	2015	None	89	Comedy,Musical
tt4094808	movie	A Little Girl's Dream	Yume wa ushi no oisha-san	False	2014	None	86	Biography,Documentary,Drama
tt1316064	movie	Made in China	Made in China	False	2009	None	87	Comedy
tt2518926	movie	Age of Dinosaurs	Age of Dinosaurs	False	2013	None	88	Action,Adventure,Sci-Fi
tt1041794	movie	Le miracle de la foi	Le miracle de la foi	False	2005	None	111	Drama
tt8480486	movie	Çetin Ceviz 2	Çetin Ceviz 2	False	2016	None	114	Comedy
tt5502918	movie	Ek Full Chaar Half	Ek Full Chaar Half	False	1991	None	None	Comedy
tt0392297	movie	Krishna Kausalya	Krishna Kausalya	False	1929	None	None	None
tt3062740	movie	Broken Not Beaten	Broken Not Beaten	False	2003	None	45	Documentary
tt19245950	movie	Squad of Girls	Squad of Girls	False	2022	None	None	Drama,War

titleType	primaryTitle
movie	Star Wars: Episode IV - A New Hope
movie	Star Wars: Episode V - The Empire Strikes Back
movie	Star Wars: Episode VI - Return of the Jedi
movie	Star Wars: Episode I - The Phantom Menace
movie	Star Wars: Episode II - Attack of the Clones
movie	Star Wars: Episode III - Revenge of the Sith
tvMovie	The Star Wars Holiday Special
tvMovie	From Star Wars to Star Wars: The Story of Industrial Light & Magic
tvMovie	Star Wars City
tvMovie	From 'Star Wars' to 'Jedi': The Making of a Saga
tvMovie	The Making of 'Star Wars'
movie	Doraemon: Nobita's Little Star Wars
movie	Saving Star Wars
tvMovie	The Mythology of Star Wars
tvMovie	Star Wars: Feel the Force
movie	The Untitled Star Wars Mockumentary
tvMovie	Star Wars: The Magic & the Mystery
tvMovie	Star Wars: Shortened!
tvMovie	When Star Wars Ruled the World
tvMovie	Robot Chicken: Star Wars
movie	Star Wars: Battle for the Holocrons
movie	Rifftrax: The Star Wars Holiday Special
movie	Star Wars: Rogue Squadron
movie	Untitled Taika Waititi Star Wars Film
movie	Star Wars: New Jedi Order
movie	RiffTrax: Star Wars: The Force Awakens
tvMovie	Star Wars: The Legacy Revealed
tvMovie	Star Wars Tech
tvMovie	Bring Back... Star Wars
tvMovie	Nissan Rogue: Star Wars Rogue One Battle Tested
tvMovie	Walmart: Rogue One: A Star Wars Story
tvMovie	John Boyega Pranks Star Wars Fans with Surprise Photobomb at Celebration: Force For Change
tvMovie	Star Wars Galaxy's Edge: Adventure Awaits
movie	77-Star Wars-17
movie	Untitled Star Wars/Kevin Feige Project
tvMovie	Star Wars: Music by John Williams
movie	Battle Star Wars
movie	RiffTrax: Star Wars: The Last Jedi
movie	Built by Jedi - The Making of Star Wars Jedi: Fallen Order
movie	Untitled Star Wars/Exegol Project
movie	Star Wars: The Clone Wars
movie	Star Wars: Hand of the Empire
tvMovie	Star Wars in Japan Celebration
tvMovie	The Lego Star Wars Holiday Special
movie	How to Watch Star Wars, Part One: The Prequels Are Better Movies Than You Deserve
tvMovie	Hollywood's Master of Myth: Joseph Campbell - The Force Behind Star Wars
movie	Star Wars: New Horizons
tvMovie	Toy Empire: The British Force Behind Star Wars Toys
movie	Conflict: A Star Wars Story
movie	Star Wars: Rogue Squadron
movie	Star Wars: A Droid Story
movie	Doraemon the Movie: Nobita's Little Star Wars 2021
movie	Star Wars: A Galaxy in Darkness
movie	Journey of the Outlaw: A Star Wars Story
movie	Lego Star Wars the Last Hope
movie	Revan: A Star Wars Fan Film
tvMovie	Lego Star Wars Terrifying Tales
movie	Star Wars Inner Demons
movie	Rifftrax: Solo: A Star Wars Story
movie	Star Wars: The Chosen One
movie	Star Wars xxx parody
tvMovie	Robot Chicken: Star Wars III
tvMovie	Von Star Wars bis Harry Potter - John Williams dirigiert die Wiener Philharmoniker
movie	Star Wars Heroes & Villains
tvMovie	Lego Star Wars: The Padawan Menace
tvMovie	Star Wars: ESPN Fantasy Football
movie	Star Wars: Star Warriors
movie	Plageuis: A Star Wars Story
movie	Star Wars Kid: The Rise of the Digital Shadows
movie	Star Wars: Light of the Jedi
movie	Star Wars: The Rise of Ren
movie	Luke Skywalker: A Star Wars Story
tvMovie	Musk vs Bezos: The New Star Wars
tvMovie	Kinect Star Wars: Duel
movie	Star Wars: Star Warriors
movie	Star Wars: The Mystery of the Lockstream Cargo.
movie	Star Wars: Threads of Destiny
tvMovie	Lego Star Wars: The Empire Strikes Out
tvMovie	Lego Star Wars: The Empire Strikes Out
movie	Star Wars: Episode VII - The Force Awakens
movie	Star Wars: Episode VIII - The Last Jedi
movie	Star Wars: Episode IX - The Rise of Skywalker
movie	The Disney Star Wars Trilogy Documentary
movie	The Fans vs Disney Star Wars Special Edition
movie	Star Wars: Shattered Empire
movie	Star Wars: Labyrinth of Evil, Part I
movie	Untitled Dave Filoni Star Wars Project
movie	Star Wars: Dawn of the Jedi
movie	Star Wars: Crimson Empire
movie	Untitled Star Wars
movie	Star Wars: Unlikely Allies
tvMovie	Kinect Star Wars: Girly Vader
movie	Untitled Star Wars/Obaid-Chinoy Project
movie	The Patrick (H) Willems Star Wars Holiday Special
movie	Star Wars: Revenge of the Apprentice
movie	Star Wars: The Fandom Menace
movie	Star Wars: Tremors of the Force
movie	Star Wars: The Force Unleashed II
movie	Plastic Galaxy: The Story of Star Wars Toys
movie	Rogue One: A Star Wars Story

titleType	primaryTitle
movie	Harry Potter and the Deathly Hallows: Part 1
movie	Harry Potter and the Deathly Hallows: Part 2

primaryTitle	CAST(runtimeMinutes AS FLOAT)
Miss Jerry	45.0
The Corbett-Fitzsimmons Fight	100.0
Bohemios	100.0
The Story of the Kelly Gang	70.0
The Prodigal Son	90.0
Robbery Under Arms	None
Hamlet	None
Don Quijote	None
The Fairylogue and Radio-Plays	120.0
A Cultura do Cacau	None

titleType	startYear	movie_age
tvMovie	2012	new
movie	2023	new
movie	2009	new
tvMovie	2016	new
movie	1929	old
movie	2016	new
movie	None	new
tvMovie	1999	mid-aged
tvMovie	2019	new
movie	None	new

id	name	id_1	breed
0	Apricot	1	persian
0	Apricot	2	ragdoll
0	Apricot	4	bengal
0	Apricot	5	persian
1	Boots	1	persian
1	Boots	2	ragdoll
1	Boots	4	bengal
1	Boots	5	persian
2	Cally	1	persian
2	Cally	2	ragdoll
2	Cally	4	bengal
2	Cally	5	persian
4	Eugene	1	persian
4	Eugene	2	ragdoll
4	Eugene	4	bengal
4	Eugene	5	persian

primaryTitle	averageRating	genres
A Maiden's Tale	10.0	Drama,Romance
The Dark Web	10.0	Crime,Thriller
Electric Burma	9.899999618530273	Documentary
Mr. Natwarlal	9.899999618530273	Action
Brother Islands, a Scuba Dive Adventure	9.899999618530273	Documentary
Imamdasta	9.899999618530273	Drama
Rule Number 4	9.899999618530273	None
Aima	9.899999618530273	None
Cuento de Primavera: A Spring Tale	9.899999618530273	Drama,Fantasy,Mystery
Ondolle Love Story	9.800000190734863	Drama,Romance

tbl_name	sql
name	CREATE TABLE "name"(nconst VARCHAR PRIMARY KEY, primaryName VARCHAR, birthYear INTEGER, deathYear INTEGER, primaryProfession VARCHAR, knownForTitles VARCHAR);
principal	CREATE TABLE principal(tconst VARCHAR NOT NULL, ordering INTEGER, nconst VARCHAR NOT NULL, category VARCHAR, job VARCHAR, characters VARCHAR, FOREIGN KEY (tconst) REFERENCES title(tconst), FOREIGN KEY (nconst) REFERENCES name(nconst));
rating	CREATE TABLE rating(tconst VARCHAR NOT NULL, averageRating FLOAT, numVotes INTEGER, FOREIGN KEY (tconst) REFERENCES title(tconst));
title	CREATE TABLE title(tconst VARCHAR PRIMARY KEY, titleType VARCHAR, primaryTitle VARCHAR, originaTitle VARCHAR, isAdult BOOLEAN, startYear INTEGER, endYear INTEGER, runtimeMinutes INTEGER, genres VARCHAR);

tconst	titleType	primaryTitle	originaTitle	isAdult	startYear	endYear	runtimeMinutes	genres
tt8008862	movie	Succession	Succession	False	2018	None	None	Horror
tt8819150	movie	Succession	Succession	False	2018	None	71	Documentary
tt9663946	movie	Tenet	Tenet	False	2020	None	150	Action,Sci-Fi,Thriller
tt0254490	movie	Succession	Keisho sakazuki	False	1992	None	119	Drama
tt4097596	movie	What If...?	What If...?	False	None	None	25	Action,Drama
tt1003027	movie	What If...?	Notre univers impitoyable	False	2008	None	87	Comedy
tt6723592	movie	Tenet	Tenet	False	2020	None	150	Action,Sci-Fi,Thriller

id	title	titleType	runtimeMinutes	startYear
tt0000009	Miss Jerry	movie	45	1894
tt0000147	The Corbett-Fitzsimmons Fight	movie	100	1897
tt0000502	Bohemios	movie	100	1905
tt0000574	The Story of the Kelly Gang	movie	70	1906
tt0000591	The Prodigal Son	movie	90	1907
tt0000679	The Fairylogue and Radio-Plays	movie	120	1908
tt0000941	Locura de amor	movie	45	1909
tt0001184	Don Juan de Serrallonga	movie	58	1910
tt0001285	The Life of Moses	movie	50	1909
tt0001498	The Battle of Trafalgar	movie	51	1911

id	title	titleType	runtimeMinutes	startYear	genres	averageRating	numVotes
tt0259534	Ramayana: The Legend of Prince Rama	movie	135	1993	Action,Adventure,Animation	9.199999809265137	14441
tt0167260	The Lord of the Rings: The Return of the King	movie	201	2003	Action,Adventure,Drama	9.0	1970161
tt29284103	Nene Saroja	movie	155	2023	Action	9.0	1038
tt0468569	The Dark Knight	movie	152	2008	Action,Crime,Drama	9.0	2857548
tt14090476	Sarsenapati Hambirrao	movie	158	2022	Action,Drama,History	8.899999618530273	2877
tt16492678	Demon Slayer: Kimetsu no Yaiba - Tsuzumi Mansion Arc	movie	87	2021	Action,Animation,Fantasy	8.899999618530273	24226
tt0075143	The Message	movie	207	1976	Action,Adventure,Biography	8.899999618530273	10447
tt0120737	The Lord of the Rings: The Fellowship of the Ring	movie	178	2001	Action,Adventure,Drama	8.899999618530273	1998121
tt3189256	Om	movie	150	1995	Action,Crime,Drama	8.899999618530273	3917
tt0237376	Kireedam	movie	124	1989	Action,Drama	8.899999618530273	8490

primaryTitle	primaryName	numRoles	rating
Threat Level Midnight: The Movie	Steve Carell	51	10.0
Threat Level Midnight: The Movie	Steve Carell	51	10.0
Threat Level Midnight: The Movie	Rainn Wilson	36	10.0
Threat Level Midnight: The Movie	Ed Helms	36	10.0
Threat Level Midnight: The Movie	Ed Helms	36	10.0
Threat Level Midnight: The Movie	John Krasinski	28	10.0
Threat Level Midnight: The Movie	Craig Robinson	26	10.0
Threat Level Midnight: The Movie	Ricky Gervais	17	10.0
Threat Level Midnight: The Movie	Stephen Merchant	14	10.0
Threat Level Midnight: The Movie	Creed Bratton	12	10.0

	id	title	titleType	time	year	genres	rating	numVotes
0	tt0004465	The Perils of Pauline	movie	199	1914	Action,Adventure,Drama	6.3	1085
1	tt0004635	The Squaw Man	movie	74	1914	Action,Drama,Romance	5.7	1041
2	tt0006333	20,000 Leagues Under the Sea	movie	85	1916	Action,Adventure,Sci-Fi	6.1	1980
3	tt0011387	The Last of the Mohicans	movie	73	1920	Action,Adventure,Drama	6.7	1439
4	tt0012642	A Sailor-Made Man	movie	47	1921	Action,Adventure,Comedy	6.8	1246
...	...	...	...	...	...	...	...	...
7850	tt29195117	Justice League: Crisis on Infinite Earths - Pa...	movie	93	2024	Action,Adventure,Animation	6.3	3413
7851	tt29515802	S-99	movie	119	2024	Action	8.8	2049
7852	tt29538571	Sixty Minutes	movie	88	2024	Action,Adventure,Drama	5.7	3627
7853	tt29722855	Badland Hunters	movie	107	2024	Action,Adventure,Drama	5.9	8596
7854	tt3359350	Road House	movie	121	2024	Action,Thriller	6.2	48811

Lecture 21 – Data 100, Fall 2024¶

Loading the Data¶

More on Basic Queries¶

Ordering data using ORDER BY¶

Restricting output with LIMIT and OFFSET¶

Sampling from a Database¶

Grouping Data with GROUP BY¶

Filtering Groups Using HAVING¶

EDA in SQL¶

Matching Text Using LIKE¶

Converting Data Types Using CAST¶

Applying Conditions With CASE¶

Joining Tables¶

Inner Join¶

Cross Join¶

Left Outer Join¶

Right Outer Join¶

Full Outer Join¶

Aliasing in Joins¶

IMDB Case Study¶

Using CTEs to do some complex analysis¶

How do ratings vary through the years?¶

How does the runtime of an action movie influence the rating it receives?¶

Ordering data using `ORDER BY`¶

Restricting output with `LIMIT` and `OFFSET`¶

Grouping Data with `GROUP BY`¶

Filtering Groups Using `HAVING`¶

Matching Text Using `LIKE`¶

Converting Data Types Using `CAST`¶

Applying Conditions With `CASE`¶