# Load the `sql` IPython extension; the %sql / %%sql magics used below come from it.
%load_ext sql

# Connect to the SQLite example database and register it under the alias "sqlite".
%sql sqlite:///data/basic_examples.db --alias sqlite

# Connect to the DuckDB example database under the alias "duckdb"
# (a later cell selects this connection explicitly with `%%sql duckdb`).
%sql duckdb:///data/example_duck.db --alias duckdb

from sqlalchemy import create_engine

# Percent-encode the credentials before embedding them in the connection URL.
# Characters such as "@", "/", ":" or "%" in a user name or password would
# otherwise be read as URL delimiters and yield a wrong or unparseable URL.
# safe="" makes quote() escape "/" as well, which it leaves alone by default.
from urllib.parse import quote

snow_engine = create_engine(
    f"snowflake://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{account_identifier}"
)
%sql snow_engine --alias snow

# Databricks engine: token authentication against server_hostname, with the
# HTTP path, catalog and schema passed as URL query parameters.
db_engine = create_engine(
    url=(
        f"databricks://token:{access_token}@{server_hostname}?"
        f"http_path={http_path}&catalog={catalog}&schema={schema}"
    ),
)
%sql db_engine --alias db

# The same query written three ways: as a one-line %sql line magic, as a
# %%sql cell magic, and as a %%sql cell with the statement split across lines.
%sql SELECT * FROM Dragon;

%%sql
SELECT * FROM Dragon;

%%sql
SELECT *
FROM Dragon;

# The result of a one-line %sql query can be assigned to a Python name.
dragon_table = %sql SELECT * FROM Dragon
dragon_table

# At this point (autopandas not yet enabled) the magic returns a
# sql.run.resultset.ResultSet.
type(dragon_table)

sql.run.resultset.ResultSet

# Convert the ResultSet into a pandas DataFrame explicitly.
dragon_df = dragon_table.DataFrame()
dragon_df

# Configure the SQL magic to hand back pandas DataFrames from now on.
%config SqlMagic.autopandas = True

# Same query as before; the value is now a pandas.core.frame.DataFrame,
# so no .DataFrame() call is needed.
dragon_df = %sql SELECT * FROM Dragon
dragon_df

type(dragon_df)

pandas.core.frame.DataFrame

%%sql res <<
SELECT *
FROM Dragon;

# `res` was bound by the "res <<" prefix on the %%sql cell above.
res

import sqlalchemy 
import pandas as pd

# Approach without the SQL magic: create a SQLAlchemy engine for the DuckDB
# example database and let pandas run the query through it.
# `engine` is reused further down by sqlalchemy.inspect().
engine = sqlalchemy.create_engine("duckdb:///data/example_duck.db")

# Query text passed to pandas unchanged.
query = """
SELECT * 
FROM Dragon;
"""

# Execute the query on `engine` and collect the rows into a DataFrame.
df = pd.read_sql(query, engine)
df

import seaborn as sns
import duckdb
# Load seaborn's "mpg" example dataset into the Python variable `mpg`.
mpg = sns.load_dataset("mpg")

# The SQL text refers to `mpg` as if it were a table, yet no such table is
# created anywhere in this notebook; presumably DuckDB resolves the name to
# the in-scope variable above. .df() returns the result as a pandas DataFrame.
duckdb.query("SELECT * FROM mpg").df()

%%sql
-- Catalog view with one row per table in the connected database.
SELECT * FROM information_schema.tables

%%sql
-- Catalog view with one row per column of those tables.
SELECT * FROM information_schema.columns

%%sql
-- Column-level schema information for every table in the connected database.
SELECT * FROM information_schema.columns

# SQLite keeps its catalog in the sqlite_schema table (object type, name,
# owning table, root page and the CREATE statement). pandas is given a
# connection URL string here instead of an engine object.
pd.read_sql("SELECT * FROM sqlite_schema", "sqlite:///data/basic_examples.db")

from sqlalchemy import inspect
# Database-independent route: a SQLAlchemy inspector over `engine`
# (the DuckDB engine created earlier in this notebook).
inspector = inspect(engine)
inspector.get_table_names()

['dish', 'dragon', 'scene']

# Column metadata for the `scene` table, reflected through the same inspector.
inspector.get_columns('scene')

[{'name': 'id',
  'type': Integer(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'biome',
  'type': String(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'city',
  'type': String(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'visitors',
  'type': Integer(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'created_at',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': 'current_date()',
  'autoincrement': False,
  'comment': None}]

# Reflect the `scene` table's columns from the SQLite example database,
# using a separate engine bound to that file.
sqlite_engine = sqlalchemy.create_engine("sqlite:///data/basic_examples.db")
inspect(sqlite_engine).get_columns("scene")

[{'name': 'id',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 1},
 {'name': 'biome',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'city',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'visitors',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 0},
 {'name': 'created_at',
  'type': DATETIME(),
  'nullable': True,
  'default': "DATETIME('now')",
  'primary_key': 0}]

%sql duckdb:///data/duckdb_example.db --alias student_db

%%sql student_db

-- Rebuild the three grade-book tables from scratch on the student_db connection.
-- grade is dropped first because it declares foreign keys into the other two.
DROP TABLE IF EXISTS grade;
DROP TABLE IF EXISTS assignment;
DROP TABLE IF EXISTS student;


-- One row per student, keyed by student_id.
CREATE TABLE student (
    student_id INTEGER PRIMARY KEY,
    name VARCHAR,
    email VARCHAR
);

-- One row per assignment, keyed by assignment_id.
CREATE TABLE assignment (
    assignment_id INTEGER PRIMARY KEY,
    description VARCHAR
);

-- A score linking a student to an assignment; both ids must already exist
-- in their parent tables.
-- NOTE(review): there is no PRIMARY KEY / UNIQUE on (student_id, assignment_id),
-- so several grades for the same pair are accepted — confirm that is intended.
CREATE TABLE grade (
    student_id INTEGER,
    assignment_id INTEGER,
    -- NOTE(review): the lower bound is exclusive, so a score of exactly 0 is
    -- rejected — confirm that is intended.
    score REAL CHECK (score > 0 AND score <= 100),
    FOREIGN KEY (student_id) REFERENCES student(student_id),
    FOREIGN KEY (assignment_id) REFERENCES assignment(assignment_id)
);

-- Seed the parent tables; columns are filled in declaration order.
INSERT INTO student VALUES
(123, 'JoeyG', 'jegonzal@berkeley.edu'),
(456, 'NargesN', 'norouzi@berkeley.edu');

INSERT INTO assignment VALUES
(1, 'easy assignment'),
(2, 'hard assignment');

%%sql 
-- No connection alias is given on the magic line, so this runs on the
-- notebook's current connection (presumably student_db from the setup cell;
-- the grade table must exist there).
-- Every row references an existing student and assignment id and has a score
-- inside the CHECK range.
INSERT INTO grade VALUES
(123, 1, 80),
(123, 2, 42),
(456, 2, 100);

%sql SELECT * FROM grade;

%%sql duckdb
-- Run on the connection registered under the alias "duckdb"; the alias-less
-- cells below presumably stay on that connection.
SELECT * FROM Dragon;

%%sql
-- Project two columns only.
SELECT cute, year FROM Dragon;

%%sql
-- AS renames output columns; double quotes allow a name containing a space.
SELECT cute AS cuteness,
       year AS "birth year"
FROM Dragon;

%%sql
-- One row per distinct year value.
SELECT DISTINCT year
FROM Dragon;

%%sql
-- Keep rows whose cute value is positive; a NULL cute does not satisfy the
-- comparison, so such rows are dropped as well.
SELECT name, year
FROM Dragon
WHERE cute > 0;

%%sql
-- Keep rows that satisfy at least one of the two conditions.
SELECT name, cute, year
FROM Dragon
WHERE cute > 0 OR year > 2013;

%%sql
-- Membership test against a list of literal names.
SELECT name, year
FROM Dragon 
WHERE name IN ('puff', 'hiccup');

%%sql
-- NULL has to be tested with IS [NOT] NULL rather than with = or <>.
SELECT name, cute
FROM Dragon
WHERE cute IS NOT NULL;

%%sql
-- Sort by cute, largest value first.
SELECT *
FROM Dragon
ORDER BY cute DESC;

%%sql
-- Return at most two rows.
SELECT *
FROM Dragon
LIMIT 2;

%%sql
-- Skip the first row, then return at most two rows.
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;

%%sql
-- Random sample of two rows: order the rows by a random key and keep the
-- first two. No seed is given, so the result presumably changes between runs.
SELECT *
FROM Dragon
ORDER BY RANDOM() 
LIMIT 2

%%sql
-- Sampling clause: a reservoir sample of 2 rows. REPEATABLE (100) supplies
-- the seed 100, presumably so that reruns return the same sample.
SELECT * 
FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);

%%sql
-- Full Dish table, for reference before grouping.
SELECT *
FROM Dish;

%%sql
-- The type column of every row, duplicates included.
SELECT type
FROM Dish;

%%sql
-- Grouping collapses the rows to one per distinct type.
SELECT type
FROM Dish
GROUP BY type;

%%sql
-- Total cost per type.
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;

%%sql
-- Several aggregates per group. MAX over the text column `name` returns the
-- alphabetically last name in each group.
SELECT type, 
       SUM(cost), 
       MIN(cost),
       MAX(name)
FROM Dish
GROUP BY type;

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

	name	year	cute
0	hiccup	2010	10.0
1	drogon	2011	-100.0
2	dragon 2	2019	0.0
3	puff	2010	100.0
4	smaug	2011	NaN

Lecture 20 – Data 100, Spring 2025

Starting Up SQL

Approach #1: SQL Magic

Storing one-line `%sql` queries

Storing output of multiple SQL lines

Approach #2: `pd.read_sql`

Approach #3: DuckDB Special

Tables and Schema

Getting Schema information with SQLAlchemy

Basic Queries

`SELECT` and `FROM`

Aliasing with `AS`

Uniqueness with `DISTINCT`

Filtering with `WHERE`

Ordering data using `ORDER BY`

Restricting output with `LIMIT` and `OFFSET`

Sampling with `RANDOM()`

Grouping Data with `GROUP BY`

	mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
...	...	...	...	...	...	...	...	...	...
393	27.0	4	140.0	86.0	2790	15.6	82	usa	ford mustang gl
394	44.0	4	97.0	52.0	2130	24.6	82	europe	vw pickup
395	32.0	4	135.0	84.0	2295	11.6	82	usa	dodge rampage
396	28.0	4	120.0	79.0	2625	18.6	82	usa	ford ranger
397	31.0	4	119.0	82.0	2720	19.4	82	usa	chevy s-10

	table_catalog	table_schema	table_name	table_type	self_referencing_column_name	reference_generation	user_defined_type_catalog	user_defined_type_schema	user_defined_type_name	is_insertable_into	is_typed	commit_action	TABLE_COMMENT
0	example_duck	main	dish	BASE TABLE	None	None	None	None	None	YES	NO	None	None
1	example_duck	main	dragon	BASE TABLE	None	None	None	None	None	YES	NO	None	None
2	example_duck	main	scene	BASE TABLE	None	None	None	None	None	YES	NO	None	None

	type	name	tbl_name	rootpage	sql
0	table	sqlite_sequence	sqlite_sequence	7	CREATE TABLE sqlite_sequence(name,seq)
1	table	Dragon	Dragon	2	CREATE TABLE Dragon (\n name TEXT PRIMARY K...
2	index	sqlite_autoindex_Dragon_1	Dragon	3	None
3	table	Dish	Dish	4	CREATE TABLE Dish (\n name TEXT PRIMARY KEY...
4	index	sqlite_autoindex_Dish_1	Dish	5	None
5	table	Scene	Scene	6	CREATE TABLE Scene (\n id INTEGER PRIMARY K...

	name	type	cost
0	ravioli	entree	10
1	ramen	entree	13
2	taco	entree	7
3	edamame	appetizer	4
4	fries	appetizer	4
5	potsticker	appetizer	4
6	ice cream	dessert	5

	type	sum("cost")	min("cost")	max("name")
0	entree	30.0	7	taco
1	dessert	5.0	5	ice cream
2	appetizer	12.0	4	potsticker

	student_id	assignment_id	score
0	123	1	80.0
1	123	2	42.0
2	456	2	100.0

Lecture 20 – Data 100, Spring 2025

Starting Up SQL

Approach #1: SQL Magic

Storing one-line %sql queries

Storing output of multiple SQL lines

Approach #2: pd.read_sql

Approach #3: DuckDB Special

Tables and Schema

Getting Schema information with SQLAlchemy

Basic Queries

SELECT and FROM

Aliasing with AS

Uniqueness with DISTINCT

Filtering with WHERE

Ordering data using ORDER BY

Restricting output with LIMIT and OFFSET

Sampling with RANDOM()

Grouping Data with GROUP BY

Storing one-line `%sql` queries

Approach #2: `pd.read_sql`

`SELECT` and `FROM`

Aliasing with `AS`

Uniqueness with `DISTINCT`

Filtering with `WHERE`

Ordering data using `ORDER BY`

Restricting output with `LIMIT` and `OFFSET`

Sampling with `RANDOM()`

Grouping Data with `GROUP BY`