Loading the Data¶
In this lecture, we'll continue our work with the Dish
table. In the cells below, we connect to the database and query the table.
pip install jupysql --upgrade
Requirement already satisfied: jupysql in /srv/conda/envs/notebook/lib/python3.11/site-packages (0.10.15) Requirement already satisfied: prettytable>=3.12.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (3.12.0) Requirement already satisfied: sqlalchemy in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (2.0.16) Requirement already satisfied: sqlparse in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.5.1) Requirement already satisfied: ipython-genutils>=0.1.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.2.0) Requirement already satisfied: jinja2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (3.1.3) Requirement already satisfied: sqlglot>=11.3.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (25.24.0) Requirement already satisfied: jupysql-plugin>=0.4.2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.4.5) Requirement already satisfied: ploomber-core>=0.2.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jupysql) (0.2.25) Requirement already satisfied: pyyaml in /srv/conda/envs/notebook/lib/python3.11/site-packages (from ploomber-core>=0.2.7->jupysql) (6.0.1) Requirement already satisfied: posthog in /srv/conda/envs/notebook/lib/python3.11/site-packages (from ploomber-core>=0.2.7->jupysql) (3.6.5) Requirement already satisfied: wcwidth in /srv/conda/envs/notebook/lib/python3.11/site-packages (from prettytable>=3.12.0->jupysql) (0.2.13) Requirement already satisfied: MarkupSafe>=2.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from jinja2->jupysql) (2.1.5) Requirement already satisfied: typing-extensions>=4.2.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from sqlalchemy->jupysql) (4.12.2) Requirement already satisfied: greenlet!=0.4.17 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from sqlalchemy->jupysql) (3.0.3) Requirement already satisfied: requests<3.0,>=2.7 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.32.3) Requirement already satisfied: six>=1.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (1.16.0) Requirement already satisfied: monotonic>=1.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (1.5) Requirement already satisfied: backoff>=1.10.0 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.2.1) Requirement already satisfied: python-dateutil>2.1 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from posthog->ploomber-core>=0.2.7->jupysql) (2.9.0) Requirement already satisfied: charset-normalizer<4,>=2 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (3.6) Requirement already satisfied: urllib3<3,>=1.21.1 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (1.26.19) Requirement already satisfied: certifi>=2017.4.17 in /srv/conda/envs/notebook/lib/python3.11/site-packages (from requests<3.0,>=2.7->posthog->ploomber-core>=0.2.7->jupysql) (2024.8.30) Note: you may need to restart the kernel to use updated packages.
%load_ext sql
Add support for reading sqlite
tables using duckdb.
import duckdb
conn = duckdb.connect()
conn.query("INSTALL sqlite")
%config SqlMagic.displaylimit = 100
Loading the SQLite database file using duckdb.
%sql duckdb:///data/basic_examples.db --alias duckdb
Get the large IMDB database file for other examples in this demo.
import os
if os.path.exists("/home/jovyan/shared/sql/imdb_duck.db"):
imdbpath = "duckdb:////home/jovyan/shared/sql/imdb_duck.db"
elif os.path.exists("data/imdb_duck.db"):
imdbpath = "duckdb:///data/imdb_duck.db"
else:
import gdown
url = 'https://drive.google.com/uc?id=10tKOHGLt9QoOgq5Ii-FhxpB9lDSQgl1O'
output_path = 'data/imdb_duck.db'
gdown.download(url, output_path, quiet=False)
imdbpath = "duckdb:///data/imdb_duck.db"
print(imdbpath)
duckdb:////home/jovyan/shared/sql/imdb_duck.db
from sqlalchemy import create_engine
imdb_engine = create_engine(imdbpath, connect_args={'read_only': True})
%sql imdb_engine --alias imdb
More on Basic Queries¶
Ordering data using ORDER BY
¶
%%sql duckdb
SELECT *
FROM Dragon
ORDER BY cute DESC;
name | year | cute |
---|---|---|
hiccup | 2010 | 10 |
dragon 2 | 2019 | 0 |
drogon | 2011 | -100 |
Restricting output with LIMIT
and OFFSET
¶
%%sql duckdb
SELECT *
FROM Dragon
LIMIT 2;
name | year | cute |
---|---|---|
hiccup | 2010 | 10 |
drogon | 2011 | -100 |
%%sql duckdb
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;
name | year | cute |
---|---|---|
drogon | 2011 | -100 |
dragon 2 | 2019 | 0 |
Sampling from a Database¶
What if we wanted a random sample:
%%sql duckdb
SELECT *
FROM Dragon
ORDER BY RANDOM()
LIMIT 2
name | year | cute |
---|---|---|
dragon 2 | 2019 | 0 |
hiccup | 2010 | 10 |
%%sql duckdb
SELECT *
FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);
name | year | cute |
---|---|---|
hiccup | 2010 | 10 |
drogon | 2011 | -100 |
Grouping Data with GROUP BY
¶
%%sql duckdb
SELECT *
FROM Dish;
name | type | cost |
---|---|---|
ravioli | entree | 10 |
ramen | entree | 7 |
taco | entree | 7 |
edamame | appetizer | 4 |
fries | appetizer | 4 |
potsticker | appetizer | 4 |
ice cream | dessert | 5 |
%%sql duckdb
SELECT type
FROM Dish;
type |
---|
entree |
entree |
entree |
appetizer |
appetizer |
appetizer |
dessert |
%%sql duckdb
SELECT type
FROM Dish
GROUP BY type;
type |
---|
entree |
dessert |
appetizer |
%%sql duckdb
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;
type | sum("cost") |
---|---|
entree | 24 |
dessert | 5 |
appetizer | 12 |
%%sql duckdb
SELECT type,
SUM(cost),
MIN(cost),
MAX(name)
FROM Dish
GROUP BY type;
type | sum("cost") | min("cost") | max("name") |
---|---|---|---|
entree | 24 | 7 | taco |
dessert | 5 | 5 | ice cream |
appetizer | 12 | 4 | potsticker |
%%sql duckdb
SELECT year, COUNT(cute)
FROM Dragon
GROUP BY year;
year | count(cute) |
---|---|
2011 | 1 |
2019 | 1 |
2010 | 1 |
%%sql duckdb
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;
year | count_star() |
---|---|
2011 | 1 |
2010 | 1 |
2019 | 1 |
Filtering Groups Using HAVING
¶
%%sql duckdb
SELECT *
FROM Dish;
name | type | cost |
---|---|---|
ravioli | entree | 10 |
ramen | entree | 7 |
taco | entree | 7 |
edamame | appetizer | 4 |
fries | appetizer | 4 |
potsticker | appetizer | 4 |
ice cream | dessert | 5 |
%%sql duckdb
SELECT type, COUNT(*)
FROM Dish
GROUP BY type
HAVING MAX(cost) < 8;
type | count_star() |
---|---|
dessert | 1 |
appetizer | 3 |
%%sql duckdb
SELECT type, COUNT(*)
FROM Dish
WHERE cost < 8
GROUP BY type
type | count_star() |
---|---|
entree | 2 |
appetizer | 3 |
dessert | 1 |
%%sql duckdb
SELECT type, MAX(name)
FROM DishDietary
WHERE notes == 'gf'
GROUP BY type
HAVING MAX(cost) <= 7;
type | max("name") |
---|---|
appetizer | fries |
entree | taco |
EDA in SQL¶
Our typical workflow when working with "big data" is:
- Use SQL to query data from a database
- Use Python (with
pandas
) to analyze this data in detail
We can, however, still perform simple data cleaning and re-structuring using SQL directly. To do so, we'll consider the Title
table from the IMDB dataset.
%%sql imdb
SELECT setseed(0.42); -- Setting the random number seed
SELECT *
FROM Title
ORDER BY RANDOM()
LIMIT 10;
tconst | titleType | primaryTitle | originaTitle | isAdult | startYear | endYear | runtimeMinutes | genres |
---|---|---|---|---|---|---|---|---|
tt3247860 | movie | Super Awesome! | Super Awesome! | False | 2015 | None | 89 | Comedy,Musical |
tt4094808 | movie | A Little Girl's Dream | Yume wa ushi no oisha-san | False | 2014 | None | 86 | Biography,Documentary,Drama |
tt1316064 | movie | Made in China | Made in China | False | 2009 | None | 87 | Comedy |
tt2518926 | movie | Age of Dinosaurs | Age of Dinosaurs | False | 2013 | None | 88 | Action,Adventure,Sci-Fi |
tt1041794 | movie | Le miracle de la foi | Le miracle de la foi | False | 2005 | None | 111 | Drama |
tt8480486 | movie | Çetin Ceviz 2 | Çetin Ceviz 2 | False | 2016 | None | 114 | Comedy |
tt5502918 | movie | Ek Full Chaar Half | Ek Full Chaar Half | False | 1991 | None | None | Comedy |
tt0392297 | movie | Krishna Kausalya | Krishna Kausalya | False | 1929 | None | None | None |
tt3062740 | movie | Broken Not Beaten | Broken Not Beaten | False | 2003 | None | 45 | Documentary |
tt19245950 | movie | Squad of Girls | Squad of Girls | False | 2022 | None | None | Drama,War |
Matching Text Using LIKE
¶
%%sql imdb
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE '%Star Wars%'
titleType | primaryTitle |
---|---|
movie | Star Wars: Episode IV - A New Hope |
movie | Star Wars: Episode V - The Empire Strikes Back |
movie | Star Wars: Episode VI - Return of the Jedi |
movie | Star Wars: Episode I - The Phantom Menace |
movie | Star Wars: Episode II - Attack of the Clones |
movie | Star Wars: Episode III - Revenge of the Sith |
tvMovie | The Star Wars Holiday Special |
tvMovie | From Star Wars to Star Wars: The Story of Industrial Light & Magic |
tvMovie | Star Wars City |
tvMovie | From 'Star Wars' to 'Jedi': The Making of a Saga |
tvMovie | The Making of 'Star Wars' |
movie | Doraemon: Nobita's Little Star Wars |
movie | Saving Star Wars |
tvMovie | The Mythology of Star Wars |
tvMovie | Star Wars: Feel the Force |
movie | The Untitled Star Wars Mockumentary |
tvMovie | Star Wars: The Magic & the Mystery |
tvMovie | Star Wars: Shortened! |
tvMovie | When Star Wars Ruled the World |
tvMovie | Robot Chicken: Star Wars |
movie | Star Wars: Battle for the Holocrons |
movie | Rifftrax: The Star Wars Holiday Special |
movie | Star Wars: Rogue Squadron |
movie | Untitled Taika Waititi Star Wars Film |
movie | Star Wars: New Jedi Order |
movie | RiffTrax: Star Wars: The Force Awakens |
tvMovie | Star Wars: The Legacy Revealed |
tvMovie | Star Wars Tech |
tvMovie | Bring Back... Star Wars |
tvMovie | Nissan Rogue: Star Wars Rogue One Battle Tested |
tvMovie | Walmart: Rogue One: A Star Wars Story |
tvMovie | John Boyega Pranks Star Wars Fans with Surprise Photobomb at Celebration: Force For Change |
tvMovie | Star Wars Galaxy's Edge: Adventure Awaits |
movie | 77-Star Wars-17 |
movie | Untitled Star Wars/Kevin Feige Project |
tvMovie | Star Wars: Music by John Williams |
movie | Battle Star Wars |
movie | RiffTrax: Star Wars: The Last Jedi |
movie | Built by Jedi - The Making of Star Wars Jedi: Fallen Order |
movie | Untitled Star Wars/Exegol Project |
movie | Star Wars: The Clone Wars |
movie | Star Wars: Hand of the Empire |
tvMovie | Star Wars in Japan Celebration |
tvMovie | The Lego Star Wars Holiday Special |
movie | How to Watch Star Wars, Part One: The Prequels Are Better Movies Than You Deserve |
tvMovie | Hollywood's Master of Myth: Joseph Campbell - The Force Behind Star Wars |
movie | Star Wars: New Horizons |
tvMovie | Toy Empire: The British Force Behind Star Wars Toys |
movie | Conflict: A Star Wars Story |
movie | Star Wars: Rogue Squadron |
movie | Star Wars: A Droid Story |
movie | Doraemon the Movie: Nobita's Little Star Wars 2021 |
movie | Star Wars: A Galaxy in Darkness |
movie | Journey of the Outlaw: A Star Wars Story |
movie | Lego Star Wars the Last Hope |
movie | Revan: A Star Wars Fan Film |
tvMovie | Lego Star Wars Terrifying Tales |
movie | Star Wars Inner Demons |
movie | Rifftrax: Solo: A Star Wars Story |
movie | Star Wars: The Chosen One |
movie | Star Wars xxx parody |
tvMovie | Robot Chicken: Star Wars III |
tvMovie | Von Star Wars bis Harry Potter - John Williams dirigiert die Wiener Philharmoniker |
movie | Star Wars Heroes & Villains |
tvMovie | Lego Star Wars: The Padawan Menace |
tvMovie | Star Wars: ESPN Fantasy Football |
movie | Star Wars: Star Warriors |
movie | Plageuis: A Star Wars Story |
movie | Star Wars Kid: The Rise of the Digital Shadows |
movie | Star Wars: Light of the Jedi |
movie | Star Wars: The Rise of Ren |
movie | Luke Skywalker: A Star Wars Story |
tvMovie | Musk vs Bezos: The New Star Wars |
tvMovie | Kinect Star Wars: Duel |
movie | Star Wars: Star Warriors |
movie | Star Wars: The Mystery of the Lockstream Cargo. |
movie | Star Wars: Threads of Destiny |
tvMovie | Lego Star Wars: The Empire Strikes Out |
tvMovie | Lego Star Wars: The Empire Strikes Out |
movie | Star Wars: Episode VII - The Force Awakens |
movie | Star Wars: Episode VIII - The Last Jedi |
movie | Star Wars: Episode IX - The Rise of Skywalker |
movie | The Disney Star Wars Trilogy Documentary |
movie | The Fans vs Disney Star Wars Special Edition |
movie | Star Wars: Shattered Empire |
movie | Star Wars: Labyrinth of Evil, Part I |
movie | Untitled Dave Filoni Star Wars Project |
movie | Star Wars: Dawn of the Jedi |
movie | Star Wars: Crimson Empire |
movie | Untitled Star Wars |
movie | Star Wars: Unlikely Allies |
tvMovie | Kinect Star Wars: Girly Vader |
movie | Untitled Star Wars/Obaid-Chinoy Project |
movie | The Patrick (H) Willems Star Wars Holiday Special |
movie | Star Wars: Revenge of the Apprentice |
movie | Star Wars: The Fandom Menace |
movie | Star Wars: Tremors of the Force |
movie | Star Wars: The Force Unleashed II |
movie | Plastic Galaxy: The Story of Star Wars Toys |
movie | Rogue One: A Star Wars Story |
%%sql imdb
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle SIMILAR TO '.*Star Wars.*'
titleType | primaryTitle |
---|---|
movie | Star Wars: Episode IV - A New Hope |
movie | Star Wars: Episode V - The Empire Strikes Back |
movie | Star Wars: Episode VI - Return of the Jedi |
movie | Star Wars: Episode I - The Phantom Menace |
movie | Star Wars: Episode II - Attack of the Clones |
movie | Star Wars: Episode III - Revenge of the Sith |
tvMovie | The Star Wars Holiday Special |
tvMovie | From Star Wars to Star Wars: The Story of Industrial Light & Magic |
tvMovie | Star Wars City |
tvMovie | From 'Star Wars' to 'Jedi': The Making of a Saga |
tvMovie | The Making of 'Star Wars' |
movie | Doraemon: Nobita's Little Star Wars |
movie | Saving Star Wars |
tvMovie | The Mythology of Star Wars |
tvMovie | Star Wars: Feel the Force |
movie | The Untitled Star Wars Mockumentary |
tvMovie | Star Wars: The Magic & the Mystery |
tvMovie | Star Wars: Shortened! |
tvMovie | When Star Wars Ruled the World |
tvMovie | Robot Chicken: Star Wars |
movie | Star Wars: Battle for the Holocrons |
movie | Rifftrax: The Star Wars Holiday Special |
movie | Star Wars: Rogue Squadron |
movie | Untitled Taika Waititi Star Wars Film |
movie | Star Wars: New Jedi Order |
movie | RiffTrax: Star Wars: The Force Awakens |
tvMovie | Star Wars: The Legacy Revealed |
tvMovie | Star Wars Tech |
tvMovie | Bring Back... Star Wars |
tvMovie | Nissan Rogue: Star Wars Rogue One Battle Tested |
tvMovie | Walmart: Rogue One: A Star Wars Story |
tvMovie | John Boyega Pranks Star Wars Fans with Surprise Photobomb at Celebration: Force For Change |
tvMovie | Star Wars Galaxy's Edge: Adventure Awaits |
movie | 77-Star Wars-17 |
movie | Untitled Star Wars/Kevin Feige Project |
tvMovie | Star Wars: Music by John Williams |
movie | Battle Star Wars |
movie | RiffTrax: Star Wars: The Last Jedi |
movie | Built by Jedi - The Making of Star Wars Jedi: Fallen Order |
movie | Untitled Star Wars/Exegol Project |
movie | Star Wars: The Clone Wars |
movie | Star Wars: Hand of the Empire |
tvMovie | Star Wars in Japan Celebration |
tvMovie | The Lego Star Wars Holiday Special |
movie | How to Watch Star Wars, Part One: The Prequels Are Better Movies Than You Deserve |
tvMovie | Hollywood's Master of Myth: Joseph Campbell - The Force Behind Star Wars |
movie | Star Wars: New Horizons |
tvMovie | Toy Empire: The British Force Behind Star Wars Toys |
movie | Conflict: A Star Wars Story |
movie | Star Wars: Rogue Squadron |
movie | Star Wars: A Droid Story |
movie | Doraemon the Movie: Nobita's Little Star Wars 2021 |
movie | Star Wars: A Galaxy in Darkness |
movie | Journey of the Outlaw: A Star Wars Story |
movie | Lego Star Wars the Last Hope |
movie | Revan: A Star Wars Fan Film |
tvMovie | Lego Star Wars Terrifying Tales |
movie | Star Wars Inner Demons |
movie | Rifftrax: Solo: A Star Wars Story |
movie | Star Wars: The Chosen One |
movie | Star Wars xxx parody |
tvMovie | Robot Chicken: Star Wars III |
tvMovie | Von Star Wars bis Harry Potter - John Williams dirigiert die Wiener Philharmoniker |
movie | Star Wars Heroes & Villains |
tvMovie | Lego Star Wars: The Padawan Menace |
tvMovie | Star Wars: ESPN Fantasy Football |
movie | Star Wars: Star Warriors |
movie | Plageuis: A Star Wars Story |
movie | Star Wars Kid: The Rise of the Digital Shadows |
movie | Star Wars: Light of the Jedi |
movie | Star Wars: The Rise of Ren |
movie | Luke Skywalker: A Star Wars Story |
tvMovie | Musk vs Bezos: The New Star Wars |
tvMovie | Kinect Star Wars: Duel |
movie | Star Wars: Star Warriors |
movie | Star Wars: The Mystery of the Lockstream Cargo. |
movie | Star Wars: Threads of Destiny |
tvMovie | Lego Star Wars: The Empire Strikes Out |
tvMovie | Lego Star Wars: The Empire Strikes Out |
movie | Star Wars: Episode VII - The Force Awakens |
movie | Star Wars: Episode VIII - The Last Jedi |
movie | Star Wars: Episode IX - The Rise of Skywalker |
movie | The Disney Star Wars Trilogy Documentary |
movie | The Fans vs Disney Star Wars Special Edition |
movie | Star Wars: Shattered Empire |
movie | Star Wars: Labyrinth of Evil, Part I |
movie | Untitled Dave Filoni Star Wars Project |
movie | Star Wars: Dawn of the Jedi |
movie | Star Wars: Crimson Empire |
movie | Untitled Star Wars |
movie | Star Wars: Unlikely Allies |
tvMovie | Kinect Star Wars: Girly Vader |
movie | Untitled Star Wars/Obaid-Chinoy Project |
movie | The Patrick (H) Willems Star Wars Holiday Special |
movie | Star Wars: Revenge of the Apprentice |
movie | Star Wars: The Fandom Menace |
movie | Star Wars: Tremors of the Force |
movie | Star Wars: The Force Unleashed II |
movie | Plastic Galaxy: The Story of Star Wars Toys |
movie | Rogue One: A Star Wars Story |
%%sql imdb
SELECT titleType, primaryTitle
FROM Title
WHERE primaryTitle LIKE 'Harry Potter and the Deathly Hallows: Part _'
titleType | primaryTitle |
---|---|
movie | Harry Potter and the Deathly Hallows: Part 1 |
movie | Harry Potter and the Deathly Hallows: Part 2 |
Converting Data Types Using CAST
¶
%%sql imdb
SELECT primaryTitle, CAST(runtimeMinutes AS REAL)
FROM Title
LIMIT 10;
primaryTitle | CAST(runtimeMinutes AS FLOAT) |
---|---|
Miss Jerry | 45.0 |
The Corbett-Fitzsimmons Fight | 100.0 |
Bohemios | 100.0 |
The Story of the Kelly Gang | 70.0 |
The Prodigal Son | 90.0 |
Robbery Under Arms | None |
Hamlet | None |
Don Quijote | None |
The Fairylogue and Radio-Plays | 120.0 |
A Cultura do Cacau | None |
Applying Conditions With CASE
¶
Here, we return a random order so we can see the various movie ages (otherwise, the top few entries happen to all be old movies).
%%sql imdb
SELECT setseed(0.42); -- Setting the random number seed
SELECT titleType, startYear,
CASE WHEN startYear < '1950' THEN 'old'
WHEN startYear < '2000' THEN 'mid-aged'
ELSE 'new'
END AS movie_age
FROM Title
ORDER BY RANDOM()
LIMIT 10;
titleType | startYear | movie_age |
---|---|---|
tvMovie | 2012 | new |
movie | 2023 | new |
movie | 2009 | new |
tvMovie | 2016 | new |
movie | 1929 | old |
movie | 2016 | new |
movie | None | new |
tvMovie | 1999 | mid-aged |
tvMovie | 2019 | new |
movie | None | new |
Joining Tables¶
We combine data from multiple tables by performing a join. We will explore joins using the cats database, which includes two tables: s
and t
.
%%sql duckdb
SELECT * FROM s;
id | name |
---|---|
0 | Apricot |
1 | Boots |
2 | Cally |
4 | Eugene |
%%sql duckdb
SELECT * FROM t;
id | breed |
---|---|
1 | persian |
2 | ragdoll |
4 | bengal |
5 | persian |
Inner Join¶
%%sql duckdb
SELECT s.id, name, breed
FROM s INNER JOIN t ON s.id = t.id;
id | name | breed |
---|---|---|
1 | Boots | persian |
2 | Cally | ragdoll |
4 | Eugene | bengal |
By default, JOIN
ing without specifying a join type will default to an inner join.
%%sql duckdb
SELECT s.id, name, breed
FROM s JOIN t ON s.id = t.id;
id | name | breed |
---|---|---|
1 | Boots | persian |
2 | Cally | ragdoll |
4 | Eugene | bengal |
Cross Join¶
%%sql duckdb
SELECT *
FROM s CROSS JOIN t;
id | name | id_1 | breed |
---|---|---|---|
0 | Apricot | 1 | persian |
0 | Apricot | 2 | ragdoll |
0 | Apricot | 4 | bengal |
0 | Apricot | 5 | persian |
1 | Boots | 1 | persian |
1 | Boots | 2 | ragdoll |
1 | Boots | 4 | bengal |
1 | Boots | 5 | persian |
2 | Cally | 1 | persian |
2 | Cally | 2 | ragdoll |
2 | Cally | 4 | bengal |
2 | Cally | 5 | persian |
4 | Eugene | 1 | persian |
4 | Eugene | 2 | ragdoll |
4 | Eugene | 4 | bengal |
4 | Eugene | 5 | persian |
Equivalently, a cross join can be performed using the following syntax:
%%sql duckdb
SELECT *
FROM s, t
id | name | id_1 | breed |
---|---|---|---|
0 | Apricot | 1 | persian |
0 | Apricot | 2 | ragdoll |
0 | Apricot | 4 | bengal |
0 | Apricot | 5 | persian |
1 | Boots | 1 | persian |
1 | Boots | 2 | ragdoll |
1 | Boots | 4 | bengal |
1 | Boots | 5 | persian |
2 | Cally | 1 | persian |
2 | Cally | 2 | ragdoll |
2 | Cally | 4 | bengal |
2 | Cally | 5 | persian |
4 | Eugene | 1 | persian |
4 | Eugene | 2 | ragdoll |
4 | Eugene | 4 | bengal |
4 | Eugene | 5 | persian |
Conceptually, an inner join is equivalent to a cross join where irrelevant rows are removed.
%%sql duckdb
SELECT *
FROM s CROSS JOIN t
WHERE s.id = t.id;
id | name | id_1 | breed |
---|---|---|---|
1 | Boots | 1 | persian |
2 | Cally | 2 | ragdoll |
4 | Eugene | 4 | bengal |
Left Outer Join¶
%%sql duckdb
SELECT *
FROM s LEFT JOIN t ON s.id = t.id;
id | name | id_1 | breed |
---|---|---|---|
1 | Boots | 1 | persian |
2 | Cally | 2 | ragdoll |
4 | Eugene | 4 | bengal |
0 | Apricot | None | None |
Right Outer Join¶
%%sql duckdb
SELECT *
FROM s RIGHT JOIN t ON s.id = t.id;
id | name | id_1 | breed |
---|---|---|---|
1 | Boots | 1 | persian |
2 | Cally | 2 | ragdoll |
4 | Eugene | 4 | bengal |
None | None | 5 | persian |
Full Outer Join¶
%%sql duckdb
SELECT *
FROM s FULL JOIN t ON s.id = t.id;
id | name | id_1 | breed |
---|---|---|---|
1 | Boots | 1 | persian |
2 | Cally | 2 | ragdoll |
4 | Eugene | 4 | bengal |
None | None | 5 | persian |
0 | Apricot | None | None |
Aliasing in Joins¶
Let's return to the IMDB dataset. Now, we'll consider two tables: Title
and Rating
.
When working with tables that have long names, we often create an alias using the AS
keyword (much like we did with columns in the previous lecture). This makes it easier to reference these tables when performing a join.
%%sql imdb
SELECT primaryTitle, averageRating, genres,
FROM Title AS T INNER JOIN Rating AS R
ON T.tconst = R.tconst
WHERE numVotes > 100
ORDER BY averageRating DESC
LIMIT 10;
primaryTitle | averageRating | genres |
---|---|---|
A Maiden's Tale | 10.0 | Drama,Romance |
The Dark Web | 10.0 | Crime,Thriller |
Electric Burma | 9.899999618530273 | Documentary |
Mr. Natwarlal | 9.899999618530273 | Action |
Brother Islands, a Scuba Dive Adventure | 9.899999618530273 | Documentary |
Imamdasta | 9.899999618530273 | Drama |
Rule Number 4 | 9.899999618530273 | None |
Aima | 9.899999618530273 | None |
Cuento de Primavera: A Spring Tale | 9.899999618530273 | Drama,Fantasy,Mystery |
Ondolle Love Story | 9.800000190734863 | Drama,Romance |
The AS
keyword is optional – omitting it will still create an alias. We typically include AS
to make our queries more readable.
%%sql imdb
SELECT primaryTitle, averageRating
FROM Title T INNER JOIN Rating R
ON T.tconst = R.tconst
WHERE numVotes > 100
ORDER BY averageRating DESC
LIMIT 10;
primaryTitle | averageRating |
---|---|
A Maiden's Tale | 10.0 |
The Dark Web | 10.0 |
Mr. Natwarlal | 9.899999618530273 |
Brother Islands, a Scuba Dive Adventure | 9.899999618530273 |
Imamdasta | 9.899999618530273 |
Rule Number 4 | 9.899999618530273 |
Aima | 9.899999618530273 |
Cuento de Primavera: A Spring Tale | 9.899999618530273 |
Electric Burma | 9.899999618530273 |
Jugalbandhi | 9.800000190734863 |
Referencing columns using the full or aliased table name is important to avoid ambiguity. Suppose the tables we are trying to join both include a column with the same name, like the tconst
columns present in both the Title
and Rating
tables of the IMDB database. If we do not specify which table's column we wish to reference, SQL will not be able to process our query.
In the cell below, it is unclear if we are referring to the tconst
column from the Title
table or the tconst
column from the Rating
table. SQL errors.
SELECT primaryTitle, averageRating
FROM Title AS T INNER JOIN Rating AS R
ON tconst = tconst;
IMDB Case Study¶
Let's perform our first data science task in SQL – we'll explore the IMDB dataset in further detail.
First, let's understand what tables are present in the database.
%%sql imdb
SELECT tbl_name, sql FROM sqlite_master WHERE type='table'
tbl_name | sql |
---|---|
name | CREATE TABLE "name"(nconst VARCHAR PRIMARY KEY, primaryName VARCHAR, birthYear INTEGER, deathYear INTEGER, primaryProfession VARCHAR, knownForTitles VARCHAR); |
principal | CREATE TABLE principal(tconst VARCHAR NOT NULL, ordering INTEGER, nconst VARCHAR NOT NULL, category VARCHAR, job VARCHAR, characters VARCHAR, FOREIGN KEY (tconst) REFERENCES title(tconst), FOREIGN KEY (nconst) REFERENCES name(nconst)); |
rating | CREATE TABLE rating(tconst VARCHAR NOT NULL, averageRating FLOAT, numVotes INTEGER, FOREIGN KEY (tconst) REFERENCES title(tconst)); |
title | CREATE TABLE title(tconst VARCHAR PRIMARY KEY, titleType VARCHAR, primaryTitle VARCHAR, originaTitle VARCHAR, isAdult BOOLEAN, startYear INTEGER, endYear INTEGER, runtimeMinutes INTEGER, genres VARCHAR); |
Let's take a closer look at the Title
table. Below, we preview a few rows of interest.
%%sql imdb
SELECT *
FROM Title
WHERE primaryTitle IN ('Ginny & Georgia', 'What If...?', 'Succession', 'Veep', 'Tenet')
LIMIT 50;
tconst | titleType | primaryTitle | originaTitle | isAdult | startYear | endYear | runtimeMinutes | genres |
---|---|---|---|---|---|---|---|---|
tt8008862 | movie | Succession | Succession | False | 2018 | None | None | Horror |
tt8819150 | movie | Succession | Succession | False | 2018 | None | 71 | Documentary |
tt9663946 | movie | Tenet | Tenet | False | 2020 | None | 150 | Action,Sci-Fi,Thriller |
tt0254490 | movie | Succession | Keisho sakazuki | False | 1992 | None | 119 | Drama |
tt4097596 | movie | What If...? | What If...? | False | None | None | 25 | Action,Drama |
tt1003027 | movie | What If...? | Notre univers impitoyable | False | 2008 | None | 87 | Comedy |
tt6723592 | movie | Tenet | Tenet | False | 2020 | None | 150 | Action,Sci-Fi,Thriller |
There are a few issues with how this data is stored. Can you spot them?
- We see some rows with missing
runtimeMinutes
. This suggests malformed data. We could drop these rows inpandas
using[]
or.dropna()
.
Instead, let's see how we can can do these fixes in SQL.
%%sql imdb
SELECT
tconst AS id,
primaryTitle AS title,
titleType,
runtimeMinutes,
startYear
FROM Title
WHERE
runtimeMinutes > 0 AND runtimeMinutes < 300 -- get rid of outliers and missing values --
AND startYear > 0 -- get rid of missing values --
LIMIT 10
id | title | titleType | runtimeMinutes | startYear |
---|---|---|---|---|
tt0000009 | Miss Jerry | movie | 45 | 1894 |
tt0000147 | The Corbett-Fitzsimmons Fight | movie | 100 | 1897 |
tt0000502 | Bohemios | movie | 100 | 1905 |
tt0000574 | The Story of the Kelly Gang | movie | 70 | 1906 |
tt0000591 | The Prodigal Son | movie | 90 | 1907 |
tt0000679 | The Fairylogue and Radio-Plays | movie | 120 | 1908 |
tt0000941 | Locura de amor | movie | 45 | 1909 |
tt0001184 | Don Juan de Serrallonga | movie | 58 | 1910 |
tt0001285 | The Life of Moses | movie | 50 | 1909 |
tt0001498 | The Battle of Trafalgar | movie | 51 | 1911 |
Perhaps we're interested in only top rated action movies. We can refine our query results futher.
%%sql imdb
SELECT
T.tconst AS id,
primaryTitle AS title,
titleType,
runtimeMinutes,
startYear,
genres,
averageRating,
numVotes
FROM Title T, Rating R
WHERE
T.tconst = R.tconst
AND runtimeMinutes > 0 AND runtimeMinutes < 300 -- get rid of outliers and missing values --
AND startYear > 0 -- get rid of missing values --
AND titleType = 'movie' -- get movies
AND numVotes >= 1000
AND genres LIKE '%Action%'
ORDER BY averageRating DESC
LIMIT 10
id | title | titleType | runtimeMinutes | startYear | genres | averageRating | numVotes |
---|---|---|---|---|---|---|---|
tt0259534 | Ramayana: The Legend of Prince Rama | movie | 135 | 1993 | Action,Adventure,Animation | 9.199999809265137 | 14441 |
tt0167260 | The Lord of the Rings: The Return of the King | movie | 201 | 2003 | Action,Adventure,Drama | 9.0 | 1970161 |
tt29284103 | Nene Saroja | movie | 155 | 2023 | Action | 9.0 | 1038 |
tt0468569 | The Dark Knight | movie | 152 | 2008 | Action,Crime,Drama | 9.0 | 2857548 |
tt14090476 | Sarsenapati Hambirrao | movie | 158 | 2022 | Action,Drama,History | 8.899999618530273 | 2877 |
tt16492678 | Demon Slayer: Kimetsu no Yaiba - Tsuzumi Mansion Arc | movie | 87 | 2021 | Action,Animation,Fantasy | 8.899999618530273 | 24226 |
tt0075143 | The Message | movie | 207 | 1976 | Action,Adventure,Biography | 8.899999618530273 | 10447 |
tt0120737 | The Lord of the Rings: The Fellowship of the Ring | movie | 178 | 2001 | Action,Adventure,Drama | 8.899999618530273 | 1998121 |
tt3189256 | Om | movie | 150 | 1995 | Action,Crime,Drama | 8.899999618530273 | 3917 |
tt0237376 | Kireedam | movie | 124 | 1989 | Action,Drama | 8.899999618530273 | 8490 |
Using CTEs to do some complex analysis¶
Here I will use a Common Table Expression (CTE) to construct a table that I use as an input in another query.
This is done with the syntax:
WITH table_name as (
SELECT ...
)
SELECT ...
FROM table_name, ...
...
%%sql
WITH good_action_movies AS (
SELECT *
FROM Title T JOIN Rating R ON T.tconst = R.tconst
WHERE genres LIKE '%Action%' AND averageRating > 7 AND numVotes > 5000
),
prolific_actors AS (
SELECT N.nconst, primaryName, COUNT(*) as numRoles
FROM Name N JOIN Principal P ON N.nconst = P.nconst
WHERE category = 'actor'
GROUP BY N.nconst, primaryName
)
SELECT primaryTitle, primaryName, numRoles, ROUND(averageRating) AS rating
FROM good_action_movies m, prolific_actors a, principal p
WHERE p.tconst = m.tconst AND p.nconst = a.nconst
ORDER BY rating DESC, numRoles DESC
LIMIT 10
primaryTitle | primaryName | numRoles | rating |
---|---|---|---|
Threat Level Midnight: The Movie | Steve Carell | 51 | 10.0 |
Threat Level Midnight: The Movie | Steve Carell | 51 | 10.0 |
Threat Level Midnight: The Movie | Rainn Wilson | 36 | 10.0 |
Threat Level Midnight: The Movie | Ed Helms | 36 | 10.0 |
Threat Level Midnight: The Movie | Ed Helms | 36 | 10.0 |
Threat Level Midnight: The Movie | John Krasinski | 28 | 10.0 |
Threat Level Midnight: The Movie | Craig Robinson | 26 | 10.0 |
Threat Level Midnight: The Movie | Ricky Gervais | 17 | 10.0 |
Threat Level Midnight: The Movie | Stephen Merchant | 14 | 10.0 |
Threat Level Midnight: The Movie | Creed Bratton | 12 | 10.0 |
How do ratings vary through the years?¶
If we want to know the critic ratings for each of these action movies, we can join Title
with the Rating
table.
In the cell below, we use the <<
operator to store the result of our query as a variable.
%sql imdb
%config SqlMagic.autopandas = True
%%sql action_movies_df <<
SELECT
T.tconst AS id,
primaryTitle AS title,
titleType,
runtimeMinutes AS time,
startYear AS year,
genres,
averageRating AS rating,
numVotes
FROM Title T, Rating R
WHERE
T.tconst = R.tconst
AND time > 0 AND time < 300 -- get rid of outliers and missing values --
AND year > 0 -- get rid of missing values --
AND titleType = 'movie' -- get movies
AND numVotes >= 1000
AND genres LIKE '%Action%'
ORDER BY startYear
action_movies_df
id | title | titleType | time | year | genres | rating | numVotes | |
---|---|---|---|---|---|---|---|---|
0 | tt0004465 | The Perils of Pauline | movie | 199 | 1914 | Action,Adventure,Drama | 6.3 | 1085 |
1 | tt0004635 | The Squaw Man | movie | 74 | 1914 | Action,Drama,Romance | 5.7 | 1041 |
2 | tt0006333 | 20,000 Leagues Under the Sea | movie | 85 | 1916 | Action,Adventure,Sci-Fi | 6.1 | 1980 |
3 | tt0011387 | The Last of the Mohicans | movie | 73 | 1920 | Action,Adventure,Drama | 6.7 | 1439 |
4 | tt0012642 | A Sailor-Made Man | movie | 47 | 1921 | Action,Adventure,Comedy | 6.8 | 1246 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
7850 | tt29195117 | Justice League: Crisis on Infinite Earths - Pa... | movie | 93 | 2024 | Action,Adventure,Animation | 6.3 | 3413 |
7851 | tt29515802 | S-99 | movie | 119 | 2024 | Action | 8.8 | 2049 |
7852 | tt29538571 | Sixty Minutes | movie | 88 | 2024 | Action,Adventure,Drama | 5.7 | 3627 |
7853 | tt29722855 | Badland Hunters | movie | 107 | 2024 | Action,Adventure,Drama | 5.9 | 8596 |
7854 | tt3359350 | Road House | movie | 121 | 2024 | Action,Thriller | 6.2 | 48811 |
7855 rows × 8 columns
import plotly.express as px
px.scatter(action_movies_df, x = "year", y="rating", opacity=0.5)
Using a Common Table Expression (CTE):
%%sql action_grouped_df <<
WITH action_movies AS (
SELECT
T.tconst AS id,
primaryTitle AS title,
titleType,
runtimeMinutes AS time,
startYear AS year,
genres,
averageRating AS rating,
numVotes
FROM Title T, Rating R
WHERE
T.tconst = R.tconst
AND time > 0 AND time < 300 -- get rid of outliers and missing values --
AND year > 0 -- get rid of missing values --
AND titleType = 'movie' -- get movies
AND numVotes >= 1000
AND genres LIKE '%Action%'
)
SELECT
year,
median(rating) AS median_rating,
quantile_cont(rating, .25) AS lower_q,
quantile_cont(rating, .75) AS upper_q,
FROM action_movies
GROUP BY year
ORDER BY year
Let's look at what we got.
import plotly.express as px
action_grouped_df['error_p'] = action_grouped_df['upper_q'] - action_grouped_df['median_rating']
action_grouped_df['error_m'] = action_grouped_df['median_rating'] - action_grouped_df['lower_q']
fig = px.line(action_grouped_df, x="year", y="median_rating",
error_y="error_p", error_y_minus="error_m")
fig
How does the runtime of an action movie influence the rating it receives?¶
px.scatter(action_movies_df, x="time", y="rating", opacity=0.1)
px.density_heatmap(action_movies_df, x="time", y="rating",nbinsx=100, nbinsy=100)
%%sql action_grouped_df <<
WITH action_movies AS (
SELECT
T.tconst AS id,
primaryTitle AS title,
titleType,
runtimeMinutes AS time,
startYear AS year,
genres,
averageRating AS rating,
numVotes
FROM Title T, Rating R
WHERE
T.tconst = R.tconst
AND time > 0 AND time < 300 -- get rid of outliers and missing values --
AND year > 0 -- get rid of missing values --
AND titleType = 'movie' -- get movies
AND numVotes >= 1000
AND genres LIKE '%Action%'
)
SELECT
ceil(time / 10) * 10 as time_desc,
median(rating) AS median_rating,
quantile_cont(rating, .25) AS lower_q,
quantile_cont(rating, .75) AS upper_q,
FROM action_movies
GROUP BY time_desc
ORDER BY time_desc
import plotly.express as px
action_grouped_df['error_p'] = action_grouped_df['upper_q'] - action_grouped_df['median_rating']
action_grouped_df['error_m'] = action_grouped_df['median_rating'] - action_grouped_df['lower_q']
fig = px.line(action_grouped_df, x="time_desc", y="median_rating",
error_y="error_p", error_y_minus="error_m")
fig
Could we use a movie's runtime to predict its rating?
import sklearn.linear_model as lm
import numpy as np
X, Y = action_movies_df["time"].to_numpy(), action_movies_df["rating"].to_numpy()
model = lm.LinearRegression()
model.fit(X[..., np.newaxis], Y)
xs = np.linspace(X.min(),X.max())
yhats = model.predict(xs[:, np.newaxis])
fig = px.scatter(x=X, y=Y, opacity=0.1)
fig.add_scatter(x=xs, y=yhats)
px.scatter(x=X, y=Y, opacity=0.1, trendline="ols")