import pandas as pd
import plotly.express as px


%load_ext sql


%%sql
sqlite:///data/basic_examples.db


%%sql
SELECT * FROM Dish;

 * sqlite:///data/basic_examples.db
Done.


%%sql
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;

 * sqlite:///data/basic_examples.db
Done.


import sqlalchemy
import pandas as pd

# create a SQL Alchemy connection to the database
engine = sqlalchemy.create_engine("sqlite:///data/basic_examples.db")
connection = engine.connect()

# query is a multi-line Python string.
query = """
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;"""

pd.read_sql(query, connection)


query

'\nSELECT type, MAX(cost)\nFROM Dish\nGROUP BY type;'


import plotly.express as px

query = """
SELECT type, MAX(cost)
FROM Dish
GROUP BY type;"""

# save query result to a DataFrame df
df = pd.read_sql(query, connection) 

# then, visualize the data
px.bar(df, x = "type", y = "MAX(cost)")


%%sql
SELECT lightning FROM Dish

 * sqlite:///data/basic_examples.db
(sqlite3.OperationalError) no such column: lightning
[SQL: SELECT lightning FROM Dish]
(Background on this error at: https://sqlalche.me/e/14/e3q8)


#pd.read_sql("""SELECT lightning FROM Dish""", connection)


%xmode minimal

Exception reporting mode: Minimal


# pd.read_sql("""SELECT lightning FROM Dish""", connection)

%%sql PYTHON_VARIABLE <<
BODY OF QUERY


%%sql drag << 
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;

 * sqlite:///data/basic_examples.db
Done.
Returning data to local variable drag


drag


type(drag)

sql.run.ResultSet


query = """
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;
"""


%%sql
$query

 * sqlite:///data/basic_examples.db
Done.


%%sql
$query

 * sqlite:///data/basic_examples.db
Done.


x, y = 2, 3
s = "The result of x + y is {result}"
print(s.format(result=(x+y)))

The result of x + y is 5


query = """
SELECT * FROM Dish
LIMIT {limit}
"""


lim = 5


%%sql res <<
{query.format(limit=lim)}

 * sqlite:///data/basic_examples.db
Done.
Returning data to local variable res


res.sql

'SELECT * FROM Dish\nLIMIT 5'


res.DataFrame()


from os.path import exists

# From https://www.imdb.com/interfaces/
from ds100_utils import fetch_and_cache
data_directory = './data'
fetch_and_cache('https://datasets.imdbws.com/title.basics.tsv.gz', 'titles.tsv.gz', data_directory)
fetch_and_cache('https://datasets.imdbws.com/name.basics.tsv.gz', 'names.tsv.gz', data_directory)
if not exists(f"{data_directory}/titles.tsv"):
    !gunzip -kf {data_directory}/titles.tsv.gz
    !gunzip -kf {data_directory}/names.tsv.gz
    
!ls -lh {data_directory}

Using cached version that was downloaded (UTC): Thu Apr  6 17:41:43 2023
Using cached version that was downloaded (UTC): Thu Apr  6 17:41:47 2023
total 3.5G
-rw-r--r-- 1 jovyan jovyan  52K Apr  6 17:26 basic_examples.db
-rw-r--r-- 1 jovyan jovyan 1.7G Apr  6 17:43 imdb.db
-rw-r--r-- 1 jovyan jovyan 711M Apr  6 17:41 names.tsv
-rw-r--r-- 1 jovyan jovyan 233M Apr  6 17:41 names.tsv.gz
-rw-r--r-- 1 jovyan jovyan 797M Apr  6 17:41 titles.tsv
-rw-r--r-- 1 jovyan jovyan 163M Apr  6 17:41 titles.tsv.gz


# Running the line below will probaly crash your kernel because the names 
# dataset is too big to comfortable fit into main memory. 
#pd.read_csv(f'{data_directory}/names.tsv', sep='\t')


from os.path import exists

imdb_file_exists = exists('./data/imdb.db')
if not imdb_file_exists:
    !(cd data; sqlite3 imdb.db ".mode tabs" ".import titles.tsv titles" ".import names.tsv names") 2> /dev/null


import sqlalchemy
import pandas as pd

imdb_db = "sqlite:///data/imdb.db"
engine = sqlalchemy.create_engine(imdb_db)
connection = engine.connect()


%sql $imdb_db


%%sql tables <<
SELECT sql FROM sqlite_master WHERE type='table';

   sqlite:///data/basic_examples.db
 * sqlite:///data/imdb.db
Done.
Returning data to local variable tables


tables


print(tables[0].sql)

CREATE TABLE "titles"(
  "tconst" TEXT,
  "titleType" TEXT,
  "primaryTitle" TEXT,
  "originalTitle" TEXT,
  "isAdult" TEXT,
  "startYear" TEXT,
  "endYear" TEXT,
  "runtimeMinutes" TEXT,
  "genres" TEXT
)


print(tables[1]["sql"])

CREATE TABLE "names"(
  "nconst" TEXT,
  "primaryName" TEXT,
  "birthYear" TEXT,
  "deathYear" TEXT,
  "primaryProfession" TEXT,
  "knownForTitles" TEXT
)


get_10_movies = """
SELECT *
FROM titles
LIMIT 10;
"""


%%sql
$get_10_movies

   sqlite:///data/basic_examples.db
 * sqlite:///data/imdb.db
Done.


action_movies_query = """
SELECT tconst AS id,
  primaryTitle AS title, 
  runtimeMinutes AS time, 
  startYear AS year
FROM titles
WHERE titleType = 'movie' AND 
      genres LIKE '%Action%'"""


%%sql action_movies << 
{action_movies_query}

   sqlite:///data/basic_examples.db
 * sqlite:///data/imdb.db
Done.
Returning data to local variable action_movies


action_movies.DataFrame()


pd.read_sql(action_movies_query, connection)


%%sql action_movies << 
SELECT tconst AS id,
  primaryTitle AS title, 
  CAST(runtimeMinutes AS int) AS time, 
  CAST(startYear AS int) AS year
FROM titles
WHERE genres LIKE '%Action%' AND
  titleType = 'movie' AND
  time > 60 AND time < 180 AND
  year > 0

   sqlite:///data/basic_examples.db
 * sqlite:///data/imdb.db
Done.
Returning data to local variable action_movies


action_movies = action_movies.DataFrame()
action_movies


px.histogram(action_movies, x = "year")


action_movies['time'].groupby(action_movies['year']).mean().plot();


action_movies['decade'] = (action_movies['year'] // 10) * 10
px.box(action_movies, x = 'decade', y = 'time', color = "decade")
#plt.xticks(rotation=45);


import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x = 'decade', y = 'time', data = action_movies)
plt.xticks(rotation=45);


%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


%%sql
sqlite:///data/basic_examples.db


%%sql
-- Copy exercise into here --

 * sqlite:///data/basic_examples.db
   sqlite:///data/imdb.db
0 rows affected.

[]

name	type	cost
ravioli	entree	10
ramen	entree	7
taco	entree	7
edamame	appetizer	4
fries	appetizer	4
potsticker	appetizer	4
ice cream	dessert	5

name	year	cute
drogon	2011	-100
dragon 2	2019	0

name	year	cute
drogon	2011	-100
dragon 2	2019	0

name	year	cute
drogon	2011	-100
dragon 2	2019	0

	name	type	cost
0	ravioli	entree	10
1	ramen	entree	7
2	taco	entree	7
3	edamame	appetizer	4
4	fries	appetizer	4

SQL continued¶

Introducing sqlalchemy¶

Downsides to `pd.read_sql`¶

Other `%%sql` tricks¶

Pipe SQL output to Python name/variable with `<<`¶

The `$query` syntax for cell magic¶

The `${query}` syntax for cell magic¶

IMDB Example¶

Action Movie EDA¶

The LIKE Keyword and CAST Keyword¶

Visualization¶

JOINs¶

tconst	titleType	primaryTitle	originalTitle	startYear	endYear	runtimeMinutes	genres
tt0000001	short	Carmencita	Carmencita	1894	\N	1	Documentary,Short
tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	1892	\N	5	Animation,Short
tt0000003	short	Pauvre Pierrot	Pauvre Pierrot	1892	\N	4	Animation,Comedy,Romance
tt0000004	short	Un bon bock	Un bon bock	1892	\N	12	Animation,Short
tt0000005	short	Blacksmith Scene	Blacksmith Scene	1893	\N	1	Comedy,Short
tt0000006	short	Chinese Opium Den	Chinese Opium Den	1894	\N	1	Short
tt0000007	short	Corbett and Courtney Before the Kinetograph	Corbett and Courtney Before the Kinetograph	1894	\N	1	Short,Sport
tt0000008	short	Edison Kinetoscopic Record of a Sneeze	Edison Kinetoscopic Record of a Sneeze	1894	\N	1	Documentary,Short
tt0000009	movie	Miss Jerry	Miss Jerry	1894	\N	45	Romance
tt0000010	short	Leaving the Factory	La sortie de l'usine Lumière à Lyon	1895	\N	1	Documentary,Short

	id	title	time	year
0	tt0000574	The Story of the Kelly Gang	70	1906
1	tt0002574	What Happened to Mary	150	1912
2	tt0003545	Who Will Marry Mary?	\N	1913
3	tt0003747	Cameo Kirby	50	1914
4	tt0003897	The Exploits of Elaine	220	1914
...	...	...	...	...
44048	tt9904270	Get Rid of It	\N	\N
44049	tt9904682	SIUAT	\N	\N
44050	tt9905492	Midnight Reckoning	\N	\N
44051	tt9905708	Résilience	\N	\N
44052	tt9907670	Wanderer in a Business Suit	\N	1961

	id	title	time	year
0	tt0000574	The Story of the Kelly Gang	70	1906
1	tt0002574	What Happened to Mary	150	1912
2	tt0004223	The Life of General Villa	105	1914
3	tt0004450	Die Pagode	82	1917
4	tt0004635	The Squaw Man	74	1914
...	...	...	...	...
25054	tt9897764	River Pirates	90	2018
25055	tt9898858	Coffee & Kareem	88	2020
25056	tt9900782	Kaithi	145	2019
25057	tt9900908	Useless Handcuffs	89	1969
25058	tt9904066	Fox Hunting	66	2019

SQL continued¶

Introducing sqlalchemy¶

Downsides to pd.read_sql¶

Other %%sql tricks¶

Pipe SQL output to Python name/variable with <<¶

The $query syntax for cell magic¶

The ${query} syntax for cell magic¶

IMDB Example¶

Action Movie EDA¶

The LIKE Keyword and CAST Keyword¶

Visualization¶

JOINs¶

Downsides to `pd.read_sql`¶

Other `%%sql` tricks¶

Pipe SQL output to Python name/variable with `<<`¶

The `$query` syntax for cell magic¶

The `${query}` syntax for cell magic¶