import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pathlib import Path
from sqlalchemy import create_engine
from ds100_utils import fetch_and_cache
sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
pd.set_option('precision', 2)
import sqlite3
conn = sqlite3.connect('test.db')
conn.executescript("""
DROP TABLE IF EXISTS s;
DROP TABLE IF EXISTS t;
CREATE TABLE s(
id INTEGER PRIMARY KEY,
name TEXT
);
INSERT INTO s VALUES
(0, 'Apricot'),
(1, 'Boots'),
(2, 'Cally'),
(4, 'Eugene');
CREATE TABLE t(
id INTEGER PRIMARY KEY,
breed TEXT
);
INSERT INTO t VALUES
(1, 'persian'),
(2, 'ragdoll'),
(4, 'bengal'),
(5, 'persian');
""");
def print_sql(s):
first, *rest = s.strip().split('\n')
print(f'sql> {first}')
for line in rest:
print(f'...> {line}')
for result in conn.execute(s):
print(result)
print_sql('SELECT * FROM s;')
print_sql('SELECT * FROM t;')
query = '''
SELECT *
FROM s
WHERE id >= 2;
'''
print_sql(query)
#cross join
query = '''
SELECT *
FROM s, t
'''
print_sql(query)
#inner join
query = '''
SELECT *
FROM s JOIN t
ON s.id = t.id
'''
print_sql(query)
#left outer join
query = '''
SELECT *
FROM s LEFT JOIN t
ON s.id = t.id
'''
print_sql(query)
#right inner join unsupported, so we have to switch s and t
query = '''
SELECT *
FROM t LEFT JOIN s
ON s.id = t.id
'''
print_sql(query)
Doing an outer join is possible but is not very concise and requires UNION keyword.
#full outer join with UNION
query = '''
SELECT s.id, s.name, t.id, t.breed
FROM s LEFT JOIN t
ON s.id = t.id
UNION
SELECT s.id, s.name, t.id, t.breed
FROM t LEFT JOIN s
ON s.id = t.id
'''
print_sql(query)
data_directory = './'
# From https://www.imdb.com/interfaces/
fetch_and_cache('https://datasets.imdbws.com/title.basics.tsv.gz', 'titles.tsv.gz', data_directory)
fetch_and_cache('https://datasets.imdbws.com/name.basics.tsv.gz', 'names.tsv.gz', data_directory)
!gunzip -kf {data_directory}/titles.tsv.gz
!gunzip -kf {data_directory}/names.tsv.gz
!ls -lh {data_directory}
# Running the line below will probaly crash your kernel because the names
# dataset is too big to comfortable fit into main memory.
#pd.read_csv(f'{data_directory}/names.tsv', sep='\t')
# Use sqlite3 instead of Python (must be run from the terminal)
# note: To close sqlite3, press ctrl-d
"""
$ sqlite3 imdb.db
sqlite> .mode tabs
sqlite> .import titles.tsv titles
sqlite> .import names.tsv names
""";
While the code above runs, let's explore the data files a bit.
!wc {data_directory}/*.tsv
!head -n5 {data_directory}/titles.tsv
!head -n5 {data_directory}/names.tsv
conn = sqlite3.connect(data_directory + '/imdb.db')
print_sql('SELECT * FROM titles LIMIT 10;')
for exp in conn.execute('SELECT sql FROM sqlite_master;'):
print(exp[0])
# Reminder: Python f strings
who, where = 'John', 'here'
f"{who} is {where}"
get_10_movies = f'''
SELECT primaryTitle AS title,
runtimeMinutes AS time
FROM titles
LIMIT 10;
'''
print_sql(get_10_movies)
get_10_movies_time_as_int = f'''
SELECT primaryTitle AS title,
CAST(runtimeMinutes as int) AS time
FROM titles
LIMIT 10;
'''
print_sql(get_10_movies_time_as_int)
select_action = f'''
SELECT tconst AS id,
primaryTitle AS title,
CAST(runtimeMinutes AS int) AS time,
CAST(startYear AS int) AS year
FROM titles
WHERE genres LIKE '%Action%' AND
year > 1900 AND
isAdult = '0' AND
titleType = 'movie' AND
time > 60 AND time < 180
'''
create_action_table = f'''
DROP TABLE IF EXISTS action_movie;
CREATE TABLE action_movie AS {select_action};
'''
conn.executescript(create_action_table)
print_sql('SELECT * FROM action_movie LIMIT 10')
We can check to see whether this table is small enough to read into pandas:
print_sql('SELECT COUNT(*) from action_movie')
df = pd.read_sql('SELECT * FROM action_movie', conn)
df
df.shape
df.hist('year');
sns.scatterplot(df['year'], df['time']);
sns.kdeplot(df['year'], df['time']);
df['time'].groupby(df['year']).mean().plot();
df['decade'] = (df['year'] // 10) * 10
sns.boxplot('decade', 'time', data=df)
plt.xticks(rotation=45);
pd.read_sql('SELECT * FROM action_movie ORDER BY RANDOM() LIMIT 3', conn)
#selecting 3 random years
three_years = '''
SELECT year FROM action_movie
GROUP BY year
ORDER BY RANDOM()
LIMIT 3
'''
pd.read_sql(three_years, conn)
#selecting all movies from 3 random years
three_years = '''
SELECT year FROM action_movie
GROUP BY year
ORDER BY RANDOM()
LIMIT 3
'''
cluster_sample = f'''
SELECT * FROM action_movie
WHERE year IN ({three_years})
'''
pd.read_sql(f'''
SELECT *
FROM ({cluster_sample})
''', conn)
# Query has two select statements
# Inner select statement creates the table of 3 randomly chosen years.
print(cluster_sample)
#case statement example
pd.read_sql('''
SELECT id, title, time, year,
CASE WHEN time < 70 THEN 'short'
WHEN time < 120 THEN 'medium'
ELSE 'long'
END as length
FROM action_movie
''', conn).sample(10)
#case statement example with a base expression
pd.read_sql('''
SELECT id, title, time, year,
CASE year LIKE '%197%' WHEN True THEN '70s'
ELSE 'not 70s'
END as decade
FROM action_movie
''', conn).sample(10)
select_people = f'''
SELECT primaryName AS name, knownForTitles
FROM names
'''
pd.read_sql(f'{select_people} LIMIT 10', conn)
select_people = f'''
SELECT primaryName AS name,
SUBSTR(knownForTitles, 0, INSTR(knownForTitles, ',')) AS most_popular_id
FROM names
'''
pd.read_sql(f'{select_people} LIMIT 10', conn)
pd.read_sql('SELECT * FROM names LIMIT 10;', conn)
select_actors = f'''
SELECT primaryName AS name,
CAST(birthYear AS int) AS born,
SUBSTR(knownForTitles, 0, INSTR(knownForTitles, ',')) AS movie_id,
CASE WHEN primaryProfession LIKE '%actor%' THEN 'actor'
WHEN primaryProfession LIKE '%actress%' THEN 'actress'
END AS profession
FROM names
WHERE primaryProfession LIKE '%act%' AND
born > 1800
'''
pd.read_sql(f'{select_actors} LIMIT 10', conn)
select = f'''
SELECT name, profession, born, title, year
FROM ({select_actors}) JOIN action_movie
ON movie_id = id;
'''
conn.executescript(f'''
DROP TABLE IF EXISTS action_ppl;
CREATE TABLE action_ppl AS {select};
''')
pd.read_sql('SELECT * FROM action_ppl LIMIT 10', conn)
The table above gives us all the people whose first movie in IMDB's "known for" list is an action movie, along with the name of that movie. We call these people "action people".
There's a lot we can do with this table, i.e. we can find all the people whose top role has the word "Vampire" in it.
vampire_people = '''
SELECT *
FROM action_ppl
WHERE title LIKE "%Vampire%"
'''
pd.read_sql(vampire_people, conn)
Or we can read in our dataframe into pandas and make a plot. Let's see the age of each actor when their best known movie was made.
df = pd.read_sql('SELECT * FROM action_ppl', conn)
df.shape
df['age'] = df['year'] - df['born']
bins = np.linspace(0, 100, 21)
sns.distplot(df[df['profession']=='actor']['age'], bins=bins)
sns.distplot(df[df['profession']=='actress']['age'], bins=bins)
plt.legend(labels=['Actor', 'Actress'])
plt.xticks(bins, rotation=90)
plt.xlim(0, 100)
plt.ylabel('density');
plt.title('Ages of actors in the Action movies they are known for');