import pandas as pd
import plotly.express as px
Last time, we saw how to use the magic %%sql
command to run SQL code in a Jupyter notebook.
%load_ext sql
%%sql
sqlite:///data/lec18_basic_examples.db
%%sql
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;
* sqlite:///data/lec18_basic_examples.db Done.
type | MAX(cost) |
---|---|
appetizer | 4 |
dessert | 5 |
entree | 10 |
Let's see how we can use the sqlalchemy
library.
Unlike the %%sql
magic command, sqlalchemy creates a connection
object that can be used as the second argument to the pandas.read_sql
command. An example is shown below.
import sqlalchemy
# create a SQL Alchemy connection to the database
engine = sqlalchemy.create_engine("sqlite:///data/lec18_basic_examples.db")
connection = engine.connect()
pd.read_sql("""
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;""", connection)
type | MAX(cost) | |
---|---|---|
0 | appetizer | 4 |
1 | dessert | 5 |
2 | entree | 10 |
This workflow is handy since it lets you work with query results using the pandas tools you're used to from DS100 so far.
df = pd.read_sql("""
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;""", connection)
px.bar(df, x = "type", y = "MAX(cost)")
One downside is that syntax errors in your queries become harder to diagnose: when a pd.read_sql
call fails due to a malformed query, the resulting "stack trace" is much longer and more complicated looking. Try uncommenting and running the two cells below for a demonstration.
%%sql
SELECT lightning FROM Dish
* sqlite:///data/lec18_basic_examples.db (sqlite3.OperationalError) no such column: lightning [SQL: SELECT lightning FROM Dish] (Background on this error at: https://sqlalche.me/e/14/e3q8)
#pd.read_sql("""SELECT lightning FROM Dish""", connection)
Now, you can make the default IPython errors much shorter by using the xmode
magic:
%xmode minimal
Exception reporting mode: Minimal
This now leads to much more concise errors, even for pandas:
pd.read_sql("""SELECT lightning FROM Dish""", connection)
OperationalError: no such column: lightning The above exception was the direct cause of the following exception: OperationalError: (sqlite3.OperationalError) no such column: lightning [SQL: SELECT lightning FROM Dish] (Background on this error at: https://sqlalche.me/e/14/e3q8)
%%sql the smart way

While it's good for you to know about SQLAlchemy, in general you can get pretty much everything done via the SQL magic, including clean interactions with pandas. For this, recall from the previous lecture two key features of the syntax: <<
and {var}
, which are your friends:
query = """
SELECT * FROM Dish
LIMIT {limit}
"""
lim = 5
%%sql res <<
{query.format(limit=lim)}
* sqlite:///data/lec18_basic_examples.db Done. Returning data to local variable res
res.sql
'SELECT * FROM Dish\nLIMIT 5'
res.DataFrame()
name | type | cost | |
---|---|---|---|
0 | ravioli | entree | 10 |
1 | pork bun | entree | 7 |
2 | taco | entree | 7 |
3 | edamame | appetizer | 4 |
4 | fries | appetizer | 4 |
Using the <<
syntax to get your SQL output into Python variables, .DataFrame()
to convert results into pandas DataFrames as needed, and the {expression}
syntax to manipulate your queries gives you an extremely powerful toolbox.
So far, we've only used SQL on tiny toy datasets. Let's try using a real-world dataset. Run the cell below. This will probably take some time, as the files from IMDb are quite large.
from os.path import exists
# From https://www.imdb.com/interfaces/
from ds100_utils import fetch_and_cache
data_directory = './data'
fetch_and_cache('https://datasets.imdbws.com/title.basics.tsv.gz', 'titles.tsv.gz', data_directory)
fetch_and_cache('https://datasets.imdbws.com/name.basics.tsv.gz', 'names.tsv.gz', data_directory)
if not exists(f"{data_directory}/titles.tsv"):
!gunzip -kf {data_directory}/titles.tsv.gz
!gunzip -kf {data_directory}/names.tsv.gz
!ls -lh {data_directory}
Using cached version that was downloaded (UTC): Thu Nov 3 06:09:18 2022
Using cached version that was downloaded (UTC): Thu Nov 3 06:09:20 2022
total 3.4G
-rw-r--r-- 1 jovyan jovyan 1.6G Nov 3 06:13 imdb.db
-rw-r--r-- 1 jovyan jovyan 28K Sep 14 22:37 lec18_basic_examples.db
-rw-r--r-- 1 jovyan jovyan 688M Nov 3 06:09 names.tsv
-rw-r--r-- 1 jovyan jovyan 225M Nov 3 06:09 names.tsv.gz
-rw-r--r-- 1 jovyan jovyan 764M Nov 3 06:09 titles.tsv
-rw-r--r-- 1 jovyan jovyan 156M Nov 3 06:09 titles.tsv.gz
These files are too big for pandas to handle, at least on the datahub machines.
# Running the line below will probably crash your kernel because the names
# dataset is too big to comfortably fit into main memory.
#pd.read_csv(f'{data_directory}/names.tsv', sep='\t')
By contrast, SQL can work with them just fine. However, the data is not yet in .db
format. To convert it to .db
format, run the code below. We will not be discussing how this command works in class. It will take a minute or longer to complete. Once it's done, a file called imdb.db
should appear in the data folder.
from os.path import exists
imdb_file_exists = exists('./data/imdb.db')
if not imdb_file_exists:
!(cd data; sqlite3 imdb.db ".mode tabs" ".import titles.tsv titles" ".import names.tsv names") 2> /dev/null
Now that the file exists, we can see what's in the database.
We'll make both an explicit SQLAlchemy connection and use the magic, so we can compare both approaches:
imdb_db = "sqlite:///data/imdb.db"
engine = sqlalchemy.create_engine(imdb_db)
connection = engine.connect()
%sql $imdb_db
%%sql tables <<
SELECT sql FROM sqlite_master WHERE type='table';
* sqlite:///data/imdb.db sqlite:///data/lec18_basic_examples.db Done. Returning data to local variable tables
tables
sql |
---|
CREATE TABLE "titles"( "tconst" TEXT, "titleType" TEXT, "primaryTitle" TEXT, "originalTitle" TEXT, "isAdult" TEXT, "startYear" TEXT, "endYear" TEXT, "runtimeMinutes" TEXT, "genres" TEXT ) |
CREATE TABLE "names"( "nconst" TEXT, "primaryName" TEXT, "birthYear" TEXT, "deathYear" TEXT, "primaryProfession" TEXT, "knownForTitles" TEXT ) |
We can view each individual entry and access its sql
column (by dotted name or with []
):
print(tables[0].sql)
CREATE TABLE "titles"( "tconst" TEXT, "titleType" TEXT, "primaryTitle" TEXT, "originalTitle" TEXT, "isAdult" TEXT, "startYear" TEXT, "endYear" TEXT, "runtimeMinutes" TEXT, "genres" TEXT )
print(tables[1]["sql"])
CREATE TABLE "names"( "nconst" TEXT, "primaryName" TEXT, "birthYear" TEXT, "deathYear" TEXT, "primaryProfession" TEXT, "knownForTitles" TEXT )
get_10_movies = """
SELECT *
FROM titles
LIMIT 10;
"""
%%sql
$get_10_movies
* sqlite:///data/imdb.db sqlite:///data/lec18_basic_examples.db Done.
tconst | titleType | primaryTitle | originalTitle | isAdult | startYear | endYear | runtimeMinutes | genres |
---|---|---|---|---|---|---|---|---|
tt0000001 | short | Carmencita | Carmencita | 0 | 1894 | \N | 1 | Documentary,Short |
tt0000002 | short | Le clown et ses chiens | Le clown et ses chiens | 0 | 1892 | \N | 5 | Animation,Short |
tt0000003 | short | Pauvre Pierrot | Pauvre Pierrot | 0 | 1892 | \N | 4 | Animation,Comedy,Romance |
tt0000004 | short | Un bon bock | Un bon bock | 0 | 1892 | \N | 12 | Animation,Short |
tt0000005 | short | Blacksmith Scene | Blacksmith Scene | 0 | 1893 | \N | 1 | Comedy,Short |
tt0000006 | short | Chinese Opium Den | Chinese Opium Den | 0 | 1894 | \N | 1 | Short |
tt0000007 | short | Corbett and Courtney Before the Kinetograph | Corbett and Courtney Before the Kinetograph | 0 | 1894 | \N | 1 | Short,Sport |
tt0000008 | short | Edison Kinetoscopic Record of a Sneeze | Edison Kinetoscopic Record of a Sneeze | 0 | 1894 | \N | 1 | Documentary,Short |
tt0000009 | movie | Miss Jerry | Miss Jerry | 0 | 1894 | \N | 45 | Romance |
tt0000010 | short | Leaving the Factory | La sortie de l'usine Lumière à Lyon | 0 | 1895 | \N | 1 | Documentary,Short |
Suppose we want a list of action movies. We can do this by finding rows where the titleType is "movie" and the genres column contains "Action". To do this, we can use the LIKE keyword.
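Before running it on the real table, here is how LIKE's % wildcard behaves, sketched with Python's built-in sqlite3 module on a toy table (the table name and values here are made up purely for illustration):

```python
import sqlite3

# Hypothetical in-memory table, not the IMDb data
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE demo (genres TEXT)")
conn.executemany("INSERT INTO demo VALUES (?)",
                 [("Action,Comedy",), ("Drama",), ("Romance,Action",)])

# '%' matches any sequence of characters (including none),
# so '%Action%' matches 'Action' appearing anywhere in the string
rows = conn.execute(
    "SELECT genres FROM demo WHERE genres LIKE '%Action%'").fetchall()
print(rows)  # the first and third rows match
```

Note that in SQLite, LIKE is case-insensitive for ASCII characters by default, so '%action%' would match these rows as well.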
action_movies_query = """
SELECT tconst AS id,
primaryTitle AS title,
runtimeMinutes AS time,
startYear AS year
FROM titles
WHERE titleType = 'movie' AND
genres LIKE '%Action%'"""
%%sql action_movies <<
{action_movies_query}
* sqlite:///data/imdb.db sqlite:///data/lec18_basic_examples.db Done. Returning data to local variable action_movies
action_movies.DataFrame()
id | title | time | year | |
---|---|---|---|---|
0 | tt0000574 | The Story of the Kelly Gang | 70 | 1906 |
1 | tt0002574 | What Happened to Mary | 150 | 1912 |
2 | tt0003545 | Who Will Marry Mary? | \N | 1913 |
3 | tt0003747 | Cameo Kirby | 50 | 1914 |
4 | tt0003897 | The Exploits of Elaine | 220 | 1914 |
... | ... | ... | ... | ... |
42674 | tt9904270 | Get Rid of It | \N | \N |
42675 | tt9904682 | SIUAT | \N | \N |
42676 | tt9905492 | Midnight Reckoning | \N | \N |
42677 | tt9905708 | Résilience | \N | \N |
42678 | tt9907670 | Wanderer in a Business Suit | \N | 1961 |
42679 rows × 4 columns
Before moving forward, let's compare what happens if we run the exact same query via Pandas:
pd.read_sql(action_movies_query, connection)
id | title | time | year | |
---|---|---|---|---|
0 | tt0000574 | The Story of the Kelly Gang | 70 | 1906 |
1 | tt0002574 | What Happened to Mary | 150 | 1912 |
2 | tt0003545 | Who Will Marry Mary? | \N | 1913 |
3 | tt0003747 | Cameo Kirby | 50 | 1914 |
4 | tt0003897 | The Exploits of Elaine | 220 | 1914 |
... | ... | ... | ... | ... |
42674 | tt9904270 | Get Rid of It | \N | \N |
42675 | tt9904682 | SIUAT | \N | \N |
42676 | tt9905492 | Midnight Reckoning | \N | \N |
42677 | tt9905708 | Résilience | \N | \N |
42678 | tt9907670 | Wanderer in a Business Suit | \N | 1961 |
42679 rows × 4 columns
OK! We're in business - both methods produce the same output, so we can trust that they work similarly, and we can pick whichever approach we prefer.
I generally find it more convenient to work with the SQL magic, and simply convert to data frames as needed.
Back to our data...
We see a number of rows containing "\N". These represent values that IMDb was missing. We could drop these rows in pandas using []
notation or query
.
We also note that the time and year columns are currently given in string format, whereas we probably want them in numeric format. Again, we could do this conversion in pandas using pd.to_numeric
.
Lastly, looking at the data, we also see that there are some weird outliers like "The Hazards of Helen" which are 1,428 minutes long. We of course know how to drop these in pandas.
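For comparison, here is a sketch of those three fixes done entirely in pandas, on a small made-up DataFrame (in practice the frame would come from pd.read_sql, and the column names follow the aliases in the query above):

```python
import pandas as pd

# Toy frame mimicking a few rows of the query output
df = pd.DataFrame({
    "title": ["The Story of the Kelly Gang", "Who Will Marry Mary?",
              "The Hazards of Helen"],
    "time": ["70", r"\N", "1428"],
    "year": ["1906", "1913", r"\N"],
})

# 1. Drop rows with IMDb's "\N" missing-value marker using [] notation
df = df[(df["time"] != r"\N") & (df["year"] != r"\N")].copy()

# 2. Convert the string columns to numbers
df["time"] = pd.to_numeric(df["time"])
df["year"] = pd.to_numeric(df["year"])

# 3. Filter out implausible runtimes
df = df[(df["time"] > 60) & (df["time"] < 180)]
print(df)  # only "The Story of the Kelly Gang" survives all three fixes
```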
Instead, let's see how we can do these three fixes in SQL using the CAST keyword.
The CAST
keyword converts a table column to another type. In the code below, we convert runtimeMinutes and startYear to int. Any missing or invalid values are replaced by 0.
%%sql action_movies <<
SELECT tconst AS id,
primaryTitle AS title,
CAST(runtimeMinutes AS int) AS time,
CAST(startYear AS int) AS year
FROM titles
WHERE genres LIKE '%Action%' AND
titleType = 'movie' AND
time > 60 AND time < 180 AND
year > 0
* sqlite:///data/imdb.db sqlite:///data/lec18_basic_examples.db Done. Returning data to local variable action_movies
action_movies = action_movies.DataFrame()
action_movies
id | title | time | year | |
---|---|---|---|---|
0 | tt0000574 | The Story of the Kelly Gang | 70 | 1906 |
1 | tt0002574 | What Happened to Mary | 150 | 1912 |
2 | tt0004223 | The Life of General Villa | 105 | 1914 |
3 | tt0004450 | Die Pagode | 82 | 1917 |
4 | tt0004635 | The Squaw Man | 74 | 1914 |
... | ... | ... | ... | ... |
24385 | tt9900748 | The Robinsons | 110 | 2019 |
24386 | tt9900782 | Kaithi | 145 | 2019 |
24387 | tt9900908 | Useless Handcuffs | 89 | 1969 |
24388 | tt9901162 | The Robinsons | 90 | 2020 |
24389 | tt9904066 | Fox Hunting | 66 | 2019 |
24390 rows × 4 columns
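As a quick sanity check on the claim that CAST turns the "\N" placeholders into 0, here is a sketch using Python's built-in sqlite3 module:

```python
import sqlite3

conn = sqlite3.connect(":memory:")

# A well-formed numeric string casts as expected...
ok = conn.execute(r"SELECT CAST('1906' AS int)").fetchone()[0]

# ...while IMDb's '\N' marker has no numeric prefix, so SQLite returns 0,
# which is why the year > 0 condition filters those rows out
missing = conn.execute(r"SELECT CAST('\N' AS int)").fetchone()[0]
print(ok, missing)
```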
px.histogram(action_movies, x = "year")
action_movies['time'].groupby(action_movies['year']).mean().plot();
action_movies['decade'] = (action_movies['year'] // 10) * 10
px.box(action_movies, x = 'decade', y = 'time', color = "decade")
#plt.xticks(rotation=45);
Or in seaborn:
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x = 'decade', y = 'time', data = action_movies)
plt.xticks(rotation=45);