Lecture 20 – Data 100, Spring 2024¶


Starting Up SQL¶

Before we look at SQL syntax in detail, let's first get ourselves set up to run SQL queries in Jupyter.

Approach #1: SQL Magic¶

1. Load the sql extension.

This provides the %sql line magic and the %%sql cell magic.

In [1]:
%load_ext sql

2. Connect to a database.

Here, we connect to the DuckDB database data/example_duck.db and give the connection the alias duck.

In [2]:
%sql duckdb:///data/example_duck.db --alias duck

If you were connecting to an "enterprise data platform" such as Snowflake or Databricks, you would instead build a SQLAlchemy engine from your credentials and hand it to the magic:

from sqlalchemy import create_engine

# Snowflake: user, password, and account_identifier are credentials you supply.
snow_engine = create_engine(
    f"snowflake://{user}:{password}@{account_identifier}")
%sql snow_engine --alias snow

# Databricks: access_token, server_hostname, http_path, catalog, and schema
# come from your workspace configuration.
db_engine = create_engine(
  url = f"databricks://token:{access_token}@{server_hostname}?" +
        f"http_path={http_path}&catalog={catalog}&schema={schema}"
)
%sql db_engine --alias db

3. Run a simple SQL query.

Note that the %%sql cell magic tells Jupyter to parse the rest of the cell as a SQL command.

In [3]:
%%sql
SELECT * FROM Dragon;
Running query in 'duck'
Out[3]:
name year cute
hiccup 2010 10
drogon 2011 -100
dragon 2 2019 0
puff 2010 100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

The same simple query, this time split across two lines.

In [4]:
%%sql
SELECT *
FROM Dragon;
Running query in 'duck'
Out[4]:
name year cute
hiccup 2010 10
drogon 2011 -100
dragon 2 2019 0
puff 2010 100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Approach #2: pd.read_sql¶

It turns out that pandas has a special-purpose function for running SQL queries: we pass in a query as a string, along with a database connection or engine, and get back a pandas DataFrame. To achieve the same result as we did using cell magic above, we can do the following.

1. Connect to a database

In [5]:
import sqlalchemy 
import pandas as pd

engine = sqlalchemy.create_engine("duckdb:///data/example_duck.db")

2. Run a simple SQL query

In [6]:
query = """
SELECT * 
FROM Dragon;
"""

df = pd.read_sql(query, engine)
df
Out[6]:
name year cute
0 hiccup 2010 10.0
1 drogon 2011 -100.0
2 dragon 2 2019 0.0
3 puff 2010 100.0
4 smaug 2011 NaN
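
pd.read_sql can also bind parameters at execution time, which is safer than formatting values into the query string yourself. A minimal sketch, assuming the engine defined above; the :y placeholder and the params dict are just illustrative:

# Bind the year at execution time rather than pasting it into the string.
param_query = sqlalchemy.text("""
SELECT *
FROM Dragon
WHERE year = :y;
""")

pd.read_sql(param_query, engine, params={"y": 2010})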

Approach "3" -- Duck DB Special¶

Now that we are using DuckDB, we can do something a little surprising:

In [7]:
import seaborn as sns
mpg = sns.load_dataset("mpg")
In [8]:
%%sql
SELECT * FROM mpg
Running query in 'duck'
Out[8]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
15.0 8 429.0 198.0 4341 10.0 70 usa ford galaxie 500
14.0 8 454.0 220.0 4354 9.0 70 usa chevrolet impala
14.0 8 440.0 215.0 4312 8.5 70 usa plymouth fury iii
14.0 8 455.0 225.0 4425 10.0 70 usa pontiac catalina
15.0 8 390.0 190.0 3850 8.5 70 usa amc ambassador dpl
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
Truncated to displaylimit of 10
If you want to see more, please visit displaylimit configuration

That's right: DuckDB can also see the DataFrames in our Python environment, allowing us to manipulate them directly in SQL!
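
As a quick illustration, we can aggregate the in-memory mpg DataFrame from SQL and pull the result back into pandas. A minimal sketch using the duckdb Python API directly, which resolves the name mpg to the local DataFrame via a replacement scan:

import duckdb

# SQL over the in-memory pandas DataFrame `mpg`, returned as a new DataFrame.
duckdb.sql("""
    SELECT origin, AVG(mpg) AS avg_mpg
    FROM mpg
    GROUP BY origin
""").df()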



Tables and Schema¶

A database contains a collection of SQL tables. Let's explore the tables stored in our "toy" database example_duck.db, which we connected to above.

In [9]:
%%sql
SELECT * FROM information_schema.tables
Running query in 'duck'
Out[9]:
table_catalog table_schema table_name table_type self_referencing_column_name reference_generation user_defined_type_catalog user_defined_type_schema user_defined_type_name is_insertable_into is_typed commit_action TABLE_COMMENT
example_duck main assignment BASE TABLE None None None None None YES NO None None
example_duck main dish BASE TABLE None None None None None YES NO None None
example_duck main dragon BASE TABLE None None None None None YES NO None None
example_duck main grade BASE TABLE None None None None None YES NO None None
example_duck main scene BASE TABLE None None None None None YES NO None None
example_duck main student BASE TABLE None None None None None YES NO None None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [10]:
%%sql
SELECT * FROM information_schema.columns
Running query in 'duck'
Out[10]:
table_catalog table_schema table_name column_name ordinal_position column_default is_nullable data_type character_maximum_length character_octet_length numeric_precision numeric_precision_radix numeric_scale datetime_precision interval_type interval_precision character_set_catalog character_set_schema character_set_name collation_catalog collation_schema collation_name domain_catalog domain_schema domain_name udt_catalog udt_schema udt_name scope_catalog scope_schema scope_name maximum_cardinality dtd_identifier is_self_referencing is_identity identity_generation identity_start identity_increment identity_maximum identity_minimum identity_cycle is_generated generation_expression is_updatable COLUMN_COMMENT
example_duck main assignment assignment_id 1 None NO INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main assignment description 2 None YES VARCHAR None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dish name 1 None NO VARCHAR None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dish type 2 None YES VARCHAR None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dish cost 3 None YES INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dragon name 1 None NO VARCHAR None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dragon year 2 None YES INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main dragon cute 3 None YES INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main grade student_id 1 None YES INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
example_duck main grade assignment_id 2 None YES INTEGER None None 32 2 0 None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
Truncated to displaylimit of 10
If you want to see more, please visit displaylimit configuration

Getting Schema information with SQLAlchemy¶

How you list the tables varies across database platforms. For example, the statement:

SELECT * FROM information_schema.columns

only works on Postgres-compatible databases (DuckDB included); SQLite does not support it.

To get the schema for tables in SQLite, we instead need the following:

In [11]:
pd.read_sql("SELECT * FROM sqlite_schema", "sqlite:///data/basic_examples.db")
Out[11]:
type name tbl_name rootpage sql
0 table sqlite_sequence sqlite_sequence 7 CREATE TABLE sqlite_sequence(name,seq)
1 table Dragon Dragon 2 CREATE TABLE Dragon (\n name TEXT PRIMARY K...
2 index sqlite_autoindex_Dragon_1 Dragon 3 None
3 table Dish Dish 4 CREATE TABLE Dish (\n name TEXT PRIMARY KEY...
4 index sqlite_autoindex_Dish_1 Dish 5 None
5 table Scene Scene 6 CREATE TABLE Scene (\n id INTEGER PRIMARY K...

Fortunately, SQLAlchemy has some generic tools that will be helpful regardless of what database platform you use.

In [12]:
from sqlalchemy import inspect
inspector = inspect(engine)
inspector.get_table_names()
Out[12]:
['assignment', 'dish', 'dragon', 'grade', 'scene', 'student']
In [13]:
inspector.get_columns('scene')
Out[13]:
[{'name': 'id',
  'type': INTEGER(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'biome',
  'type': VARCHAR(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'city',
  'type': VARCHAR(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'visitors',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'created_at',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': 'current_date()',
  'autoincrement': False,
  'comment': None}]

The same inspection works with the SQLite database:

In [14]:
sqlite_engine = sqlalchemy.create_engine("sqlite:///data/basic_examples.db")
inspect(sqlite_engine).get_columns("scene")
Out[14]:
[{'name': 'id',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 1},
 {'name': 'biome',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'city',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'visitors',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 0},
 {'name': 'created_at',
  'type': DATETIME(),
  'nullable': True,
  'default': "DATETIME('now')",
  'primary_key': 0}]
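
Since get_columns returns a list of dictionaries, a small convenience (not part of the inspector API itself) is to wrap the result in a DataFrame for easier reading:

# Same column metadata as above, viewed as a DataFrame.
pd.DataFrame(inspect(sqlite_engine).get_columns("scene"))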

Example: table creation with constraints

A more advanced example of creating tables with primary keys, foreign keys, and a CHECK constraint:

In [15]:
%%sql

DROP TABLE IF EXISTS grade;
DROP TABLE IF EXISTS assignment;
DROP TABLE IF EXISTS student;


CREATE TABLE student (
    student_id INTEGER PRIMARY KEY,
    name VARCHAR,
    email VARCHAR
);

CREATE TABLE assignment (
    assignment_id INTEGER PRIMARY KEY,
    description VARCHAR
);

CREATE TABLE grade (
    student_id INTEGER,
    assignment_id INTEGER,
    score REAL CHECK (score > 0 AND score <= 100),
    FOREIGN KEY (student_id) REFERENCES student(student_id),
    FOREIGN KEY (assignment_id) REFERENCES assignment(assignment_id)
);

INSERT INTO student VALUES
(123, 'JoeyG', 'jegonzal@berkeley.edu'),
(456, 'NargesN', 'norouzi@berkeley.edu');

INSERT INTO assignment VALUES
(1, 'easy assignment'),
(2, 'hard assignment');
Running query in 'duck'
Out[15]:
Count
2
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [16]:
%%sql 
INSERT INTO grade VALUES
(123, 1, 80),
(123, 2, 42),
(456, 2, 100);
Running query in 'duck'
Out[16]:
Count
3
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [17]:
%sql SELECT * FROM grade;
Running query in 'duck'
Out[17]:
student_id assignment_id score
123 1 80.0
123 2 42.0
456 2 100.0
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
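
These constraints are enforced at insert time. A minimal sketch, assuming the SQLAlchemy engine from earlier still points at the same database, showing the CHECK constraint rejecting an out-of-range score (150 is just an illustrative value):

from sqlalchemy import text

# A score above 100 violates CHECK (score > 0 AND score <= 100), so this insert should fail.
try:
    with engine.begin() as conn:
        conn.execute(text("INSERT INTO grade VALUES (456, 1, 150);"))
except Exception as e:
    print("Insert rejected:", e)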







Basic Queries¶

Every SQL query must contain a SELECT and FROM clause.

  • SELECT: specify the column(s) to return in the output
  • FROM: specify the database table from which to extract data
In [18]:
%%sql
SELECT * FROM Dragon;
Running query in 'duck'
Out[18]:
name year cute
hiccup 2010 10
drogon 2011 -100
dragon 2 2019 0
puff 2010 100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [19]:
%%sql
SELECT cute, year FROM Dragon;
Running query in 'duck'
Out[19]:
cute year
10 2010
-100 2011
0 2019
100 2010
None 2011
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Aliasing with AS

In [20]:
%%sql
SELECT cute AS cuteness,
       year AS "birth year"
FROM Dragon;
Running query in 'duck'
Out[20]:
cuteness birth year
10 2010
-100 2011
0 2019
100 2010
None 2011
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Uniqueness with DISTINCT

In [21]:
%%sql
SELECT DISTINCT year
FROM Dragon;
Running query in 'duck'
Out[21]:
year
2010
2019
2011
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Filtering with WHERE

In [22]:
%%sql
SELECT name, year
FROM Dragon
WHERE cute > 0;
Running query in 'duck'
Out[22]:
name year
hiccup 2010
puff 2010
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [23]:
%%sql
SELECT name, cute, year
FROM Dragon
WHERE cute > 0 OR year > 2013;
Running query in 'duck'
Out[23]:
name cute year
hiccup 10 2010
puff 100 2010
dragon 2 0 2019
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [24]:
%%sql
SELECT name, year
FROM Dragon 
WHERE name IN ('puff', 'hiccup');
Running query in 'duck'
Out[24]:
name year
puff 2010
hiccup 2010
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [25]:
%%sql
SELECT name, cute
FROM Dragon
WHERE cute IS NOT NULL;
Running query in 'duck'
Out[25]:
name cute
hiccup 10
drogon -100
dragon 2 0
puff 100
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Ordering data using ORDER BY

In [26]:
%%sql
SELECT *
FROM Dragon
ORDER BY cute DESC;
Running query in 'duck'
Out[26]:
name year cute
puff 2010 100
hiccup 2010 10
dragon 2 2019 0
drogon 2011 -100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Restricting output with LIMIT and OFFSET

In [27]:
%%sql
SELECT *
FROM Dragon
LIMIT 2;
Running query in 'duck'
Out[27]:
name year cute
hiccup 2010 10
drogon 2011 -100
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [28]:
%%sql
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;
Running query in 'duck'
Out[28]:
name year cute
drogon 2011 -100
dragon 2 2019 0
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

What if we wanted a random sample? Two options: ORDER BY RANDOM() with a LIMIT, or DuckDB's SAMPLE clause (here a reservoir sample of 2 rows, with REPEATABLE fixing the random seed so the sample is reproducible):

In [69]:
%%sql
SELECT *
FROM Dragon
ORDER BY RANDOM() 
LIMIT 2
Running query in 'duck'
Out[69]:
name year cute
0 drogon 2011 -100
1 dragon 2 2019 0
In [82]:
%%sql
SELECT * 
FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);
Running query in 'duck'
Out[82]:
name year cute
0 puff 2010 100
1 drogon 2011 -100

Grouping Data with GROUP BY¶

In [29]:
%%sql
SELECT *
FROM Dish;
Running query in 'duck'
Out[29]:
name type cost
ravioli entree 10
ramen entree 13
taco entree 7
edamame appetizer 4
fries appetizer 4
potsticker appetizer 4
ice cream dessert 5
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

A small note: the fact that type is highlighted in green below is a consequence of Jupyter assuming that we are writing Python code (where type is a built-in function). type does not have a special meaning in SQL, so the color does not indicate any special functionality. When we run the cell, Jupyter recognizes the code as SQL.

In [30]:
%%sql
SELECT type
FROM Dish;
Running query in 'duck'
Out[30]:
type
entree
entree
entree
appetizer
appetizer
appetizer
dessert
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [31]:
%%sql
SELECT type
FROM Dish
GROUP BY type;
Running query in 'duck'
Out[31]:
type
entree
dessert
appetizer
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [32]:
%%sql
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;
Running query in 'duck'
Out[32]:
type sum("cost")
entree 30
dessert 5
appetizer 12
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [33]:
%%sql
SELECT type, 
       SUM(cost), 
       MIN(cost),
       MAX(name)
FROM Dish
GROUP BY type;
Running query in 'duck'
Out[33]:
type sum("cost") min("cost") max("name")
entree 30 7 taco
dessert 5 5 ice cream
appetizer 12 4 potsticker
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [34]:
%%sql
SELECT year, COUNT(cute)
FROM Dragon
GROUP BY year;
Running query in 'duck'
Out[34]:
year count(cute)
2010 2
2011 1
2019 1
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()
In [35]:
%%sql
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;
Running query in 'duck'
Out[35]:
year count_star()
2010 2
2011 2
2019 1
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

Working with SQL results as Python variables¶

By default, executing a query with a magic command produces output but doesn't save it into any Python variable:

In [36]:
%sql SELECT * FROM Dragon
Running query in 'duck'
Out[36]:
name year cute
hiccup 2010 10
drogon 2011 -100
dragon 2 2019 0
puff 2010 100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

This can be inconvenient if you later want to do further processing of these data in Python.

Storing one-line %sql queries¶

For simple one-line queries, you can use IPython's ability to store the result of a magic command like %sql as if it were any other Python statement, and save the output to a variable:

In [37]:
dragon_table = %sql SELECT * FROM Dragon
dragon_table
Running query in 'duck'
Out[37]:
name year cute
hiccup 2010 10
drogon 2011 -100
dragon 2 2019 0
puff 2010 100
smaug 2011 None
ResultSet : to convert to pandas, call .DataFrame() or to polars, call .PolarsDataFrame()

As noted above, the result of the query is a Python variable of type ResultSet, more specifically:

In [38]:
type(dragon_table)
Out[38]:
sql.run.ResultSet

You need to manually convert it to a Pandas DataFrame if you want to do pandas-things with its content:

In [39]:
dragon_df = dragon_table.DataFrame()
dragon_df
Out[39]:
name year cute
0 hiccup 2010 10.0
1 drogon 2011 -100.0
2 dragon 2 2019 0.0
3 puff 2010 100.0
4 smaug 2011 NaN

You can configure jupysql to automatically convert all outputs to pandas DataFrames. This can be handy if you intend to do all your Python-side work with pandas, as it saves you from having to call .DataFrame() on every output. On the other hand, you don't get access to the original SQL ResultSet object, which has a number of interesting properties and capabilities. You can learn more about those in the jupysql documentation.

For now, let's turn this on so you can see what this simplified, "pandas all the way" workflow looks like:

In [40]:
%config SqlMagic.autopandas = True
In [41]:
dragon_df = %sql SELECT * FROM Dragon
dragon_df
Running query in 'duck'
Out[41]:
name year cute
0 hiccup 2010 10.0
1 drogon 2011 -100.0
2 dragon 2 2019 0.0
3 puff 2010 100.0
4 smaug 2011 NaN
In [42]:
type(dragon_df)
Out[42]:
pandas.core.frame.DataFrame
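
Since dragon_df is now an ordinary DataFrame, the usual pandas operations apply directly. For example, the earlier GROUP BY count can be reproduced in pandas:

# Equivalent of SELECT year, COUNT(cute) FROM Dragon GROUP BY year.
dragon_df.groupby("year")["cute"].count()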

Storing multi-line %%sql queries¶

For a more complex query that won't fit on one line, such as:

In [43]:
%%sql
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;
Running query in 'duck'
Out[43]:
year count_star()
0 2010 2
1 2011 2
2 2019 1

You can use jupysql's variable << syntax to store its output (this will honor your autopandas setting and store either a sql.run.ResultSet or a pandas DataFrame):

In [44]:
%%sql dragon_years <<
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;
Running query in 'duck'
In [45]:
dragon_years
Out[45]:
year count_star()
0 2010 2
1 2011 2
2 2019 1

More Advanced SQL with DuckDB¶

You could do a lot of what we do in this class just using DuckDB and its many built-in functions:

https://duckdb.org/docs/sql/functions/overview
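
For instance, many scalar functions can be called straight from a query. A small sketch of dayname (used again on the flights data below), run through the duckdb Python API:

import duckdb

# dayname() maps a DATE to its weekday name; April 2, 2024 is a Tuesday.
duckdb.sql("SELECT dayname(DATE '2024-04-02') AS day").df()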

In [46]:
from ds100_utils import fetch_and_cache
In [47]:
url = "https://gist.github.com/domoritz/fd517a3a3a210c24a488e61870e2cf2c/raw/b1d53719e8e0eb9f6a95de82fdaccf0b001c0dea/flights-1m.parquet"
fetch_and_cache(url,"flights.parquet")
Using cached version that was downloaded (UTC): Tue Apr  2 03:41:25 2024
Out[47]:
PosixPath('data/flights.parquet')
In [48]:
%%sql
SELECT * FROM 'data/flights.parquet' LIMIT 10;
Running query in 'duck'
Out[48]:
FL_DATE DEP_DELAY ARR_DELAY AIR_TIME DISTANCE DEP_TIME ARR_TIME
0 2006-01-01 5 19 350 2475 9.083333 12.483334
1 2006-01-02 167 216 343 2475 11.783334 15.766666
2 2006-01-03 -7 -2 344 2475 8.883333 12.133333
3 2006-01-04 -5 -13 331 2475 8.916667 11.950000
4 2006-01-05 -3 -17 321 2475 8.950000 11.883333
5 2006-01-06 -4 -32 320 2475 8.933333 11.633333
6 2006-01-08 -3 -2 346 2475 8.950000 12.133333
7 2006-01-09 3 0 334 2475 9.050000 12.166667
8 2006-01-10 -7 -21 334 2475 8.883333 11.816667
9 2006-01-11 8 -10 321 2475 9.133333 12.000000
In [49]:
%%sql avg_delays <<

SELECT 
    dayname(fl_date) AS "Day of the Week", 
    mean(dep_delay) AS "Mean Departure Delay"
FROM 'data/flights.parquet'
GROUP BY "Day of the Week"
Running query in 'duck'
In [50]:
import plotly.express as px
px.bar(avg_delays, x="Day of the Week", y = "Mean Departure Delay",
       category_orders={"Day of the Week": ["Sunday", "Monday", "Tuesday", 
                      "Wednesday", "Thursday", "Friday", "Saturday"]})