pip install jupysql --upgrade

%load_ext sql

There's a new jupysql version available (0.10.14), you're running 0.8.0. To upgrade: pip install jupysql --upgrade
Deploy FastAPI apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup

# Connect to the SQLite example database and register it under the alias "sqlite".
%sql sqlite:///data/basic_examples.db --alias sqlite

# Connect to the DuckDB example database and register it under the alias "duckdb".
%sql duckdb:///data/example_duck.db --alias duckdb

from sqlalchemy import create_engine

snow_engine = create_engine(
    f"snowflake://{user}:{password}@{account_identifier}")
%sql snow_engine --alias snow

db_engine = create_engine(
  url = f"databricks://token:{access_token}@{server_hostname}?" +
        f"http_path={http_path}&catalog={catalog}&schema={schema}"
)
%sql db_engine --alias db

%%sql
-- Return every column of every row in the Dragon table.
SELECT * FROM Dragon;

%%sql
-- Same query as the one-line form; only the line breaks differ.
SELECT *
FROM Dragon;

import sqlalchemy 
import pandas as pd

# SQLAlchemy engine pointing at the DuckDB example database file.
engine = sqlalchemy.create_engine("duckdb:///data/example_duck.db")

# The SQL text lives in an ordinary Python string.
query = """
SELECT * 
FROM Dragon;
"""

# Run the query through pandas and keep the result as a DataFrame; the bare
# name on the last line displays it in the notebook.
df = pd.read_sql(sql=query, con=engine)
df

import seaborn as sns
# Load seaborn's "mpg" example dataset into the Python variable `mpg`.
mpg = sns.load_dataset("mpg")

%%sql
-- NOTE(review): `mpg` is the Python variable assigned above, not a table created
-- in any database file; presumably this relies on the active connection being
-- DuckDB, which can resolve in-scope DataFrames by name -- confirm.
SELECT * FROM mpg

%%sql
-- List the tables known to the current connection via the information_schema catalog.
SELECT * FROM information_schema.tables

%%sql
-- List the columns of those tables via the information_schema catalog.
SELECT * FROM information_schema.columns

SELECT * FROM information_schema.columns

# Lift pandas' display limits (None disables each limit) so wide text columns
# are not truncated and no rows are elided when the result is shown.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
# A connection URL string is passed here in place of an engine object.
# NOTE(review): `sqlite_schema` is queried against the DuckDB file -- presumably
# DuckDB exposes a SQLite-compatible catalog view under that name; confirm.
pd.read_sql("SELECT * FROM sqlite_schema", "duckdb:///data/example_duck.db")

from sqlalchemy import inspect
# Build a SQLAlchemy inspector on the DuckDB engine created earlier and list
# the table names it reports.
inspector = inspect(engine)
inspector.get_table_names()

['dish', 'dragon', 'scene']

# Column metadata reported by the inspector for the `scene` table.
inspector.get_columns('scene')

[{'name': 'id',
  'type': INTEGER(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'biome',
  'type': VARCHAR(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'city',
  'type': VARCHAR(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'visitors',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'created_at',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': 'current_date()',
  'autoincrement': False,
  'comment': None}]

# Same inspection against the SQLite copy of the data, for comparison with the
# DuckDB column metadata.
sqlite_engine = sqlalchemy.create_engine("sqlite:///data/basic_examples.db")
inspect(sqlite_engine).get_columns("scene")

[{'name': 'id',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 1},
 {'name': 'biome',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'city',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'primary_key': 0},
 {'name': 'visitors',
  'type': INTEGER(),
  'nullable': True,
  'default': None,
  'primary_key': 0},
 {'name': 'created_at',
  'type': DATETIME(),
  'nullable': True,
  'default': "DATETIME('now')",
  'primary_key': 0}]

%%sql

-- Start from a clean slate. grade is dropped first because it holds foreign
-- keys into both student and assignment.
DROP TABLE IF EXISTS grade;
DROP TABLE IF EXISTS assignment;
DROP TABLE IF EXISTS student;


-- One row per student, identified by student_id.
CREATE TABLE student (
    student_id INTEGER PRIMARY KEY,
    name VARCHAR,
    email VARCHAR
);

-- One row per assignment, identified by assignment_id.
CREATE TABLE assignment (
    assignment_id INTEGER PRIMARY KEY,
    description VARCHAR
);

-- One row per (student, assignment) score.
-- The CHECK requires score to be strictly greater than 0 and at most 100, so a
-- score of exactly 0 is rejected.
-- The FOREIGN KEY clauses require each student_id / assignment_id to refer to
-- an existing row in the corresponding table.
CREATE TABLE grade (
    student_id INTEGER,
    assignment_id INTEGER,
    score REAL CHECK (score > 0 AND score <= 100),
    FOREIGN KEY (student_id) REFERENCES student(student_id),
    FOREIGN KEY (assignment_id) REFERENCES assignment(assignment_id)
);

-- Seed data: two students and two assignments.
INSERT INTO student VALUES
(123, 'JoeyG', 'jegonzal@berkeley.edu'),
(456, 'NargesN', 'norouzi@berkeley.edu');

INSERT INTO assignment VALUES
(1, 'easy assignment'),
(2, 'hard assignment');

%%sql 
-- Each row is (student_id, assignment_id, score); every id refers to a row
-- inserted above and every score satisfies the CHECK on grade.score.
INSERT INTO grade VALUES
(123, 1, 80),
(123, 2, 42),
(456, 2, 100);

# Show the rows that were just inserted.
%sql SELECT * FROM grade;

%%sql
-- All columns, all rows.
SELECT * FROM Dragon;

%%sql
-- Only the listed columns, in the order given.
SELECT cute, year FROM Dragon;

%%sql
-- AS renames columns in the result; the double quotes allow an alias that
-- contains a space.
SELECT cute AS cuteness,
       year AS "birth year"
FROM Dragon;

%%sql
-- DISTINCT returns each value of year once, however many rows share it.
SELECT DISTINCT year
FROM Dragon;

%%sql
-- Keep only rows whose cute value is positive.
SELECT name, year
FROM Dragon
WHERE cute > 0;

%%sql
-- OR keeps a row when either condition holds.
SELECT name, cute, year
FROM Dragon
WHERE cute > 0 OR year > 2013;

%%sql
-- IN tests membership in an explicit list of values.
SELECT name, year
FROM Dragon 
WHERE name IN ('puff', 'hiccup');

%%sql
-- NULL must be tested with IS / IS NOT rather than = / <>.
SELECT name, cute
FROM Dragon
WHERE cute IS NOT NULL;

%%sql
-- Sort rows by cute, largest first.
SELECT *
FROM Dragon
ORDER BY cute DESC;

%%sql
-- Return at most 2 rows.
SELECT *
FROM Dragon
LIMIT 2;

%%sql
-- Skip the first row, then return at most 2 rows.
SELECT *
FROM Dragon
LIMIT 2
OFFSET 1;

%%sql
-- Random sample of 2 rows: shuffle with a random sort key, then take 2.
SELECT *
FROM Dragon
ORDER BY RANDOM() 
LIMIT 2

%%sql
-- DuckDB sampling syntax: reservoir-sample 2 rows.
-- NOTE(review): REPEATABLE (100) presumably fixes the random seed so the same
-- sample is returned on each run -- confirm against the DuckDB documentation.
SELECT * 
FROM Dragon USING SAMPLE reservoir(2 ROWS) REPEATABLE (100);

%%sql
-- The full Dish table, for reference.
SELECT *
FROM Dish;

%%sql
-- The type column alone, one value per row (duplicates included).
SELECT type
FROM Dish;

%%sql
-- GROUP BY collapses the rows to one per distinct type.
SELECT type
FROM Dish
GROUP BY type;

%%sql
-- Total cost of the dishes within each type.
SELECT type, SUM(cost)
FROM Dish
GROUP BY type;

%%sql
-- Several aggregates per group; MAX also applies to the text column name.
SELECT type, 
       SUM(cost), 
       MIN(cost),
       MAX(name)
FROM Dish
GROUP BY type;

%%sql
-- COUNT(column) counts only the rows where that column is not NULL.
SELECT year, COUNT(cute)
FROM Dragon
GROUP BY year;

%%sql
-- COUNT(*) counts every row in the group, NULLs included.
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;

# A one-line query run with the %sql line magic.
%sql SELECT * FROM Dragon

# The value produced by a %sql line can be assigned to a Python variable.
dragon_table = %sql SELECT * FROM Dragon
dragon_table

# Inspect what kind of object the magic returned.
type(dragon_table)

sql.run.ResultSet

# Convert the stored query result into a pandas DataFrame.
dragon_df = dragon_table.DataFrame()
dragon_df

# Set the SqlMagic autopandas option so that later queries hand back pandas
# DataFrames directly.
%config SqlMagic.autopandas = True

# Re-run the query; dragon_df is rebound to the new result.
dragon_df = %sql SELECT * FROM Dragon
dragon_df

# Confirm the type of the new result.
type(dragon_df)

pandas.core.frame.DataFrame

%%sql
-- Number of dragons per year; the result is displayed, not stored.
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;

%%sql dragon_years <<
-- Same query; `dragon_years <<` on the magic line stores the result in the
-- Python variable dragon_years.
SELECT year, COUNT(*)
FROM Dragon
GROUP BY year;

# The stored result is now an ordinary Python variable.
dragon_years

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

	name	year	cute
0	hiccup	2010	10.0
1	drogon	2011	-100.0
2	dragon 2	2019	0.0
3	puff	2010	100.0
4	smaug	2011	NaN

mpg	cylinders	displacement	horsepower	weight	acceleration	model_year	origin	name
18.0	8	307.0	130.0	3504	12.0	70	usa	chevrolet chevelle malibu
15.0	8	350.0	165.0	3693	11.5	70	usa	buick skylark 320
18.0	8	318.0	150.0	3436	11.0	70	usa	plymouth satellite
16.0	8	304.0	150.0	3433	12.0	70	usa	amc rebel sst
17.0	8	302.0	140.0	3449	10.5	70	usa	ford torino
15.0	8	429.0	198.0	4341	10.0	70	usa	ford galaxie 500
14.0	8	454.0	220.0	4354	9.0	70	usa	chevrolet impala
14.0	8	440.0	215.0	4312	8.5	70	usa	plymouth fury iii
14.0	8	455.0	225.0	4425	10.0	70	usa	pontiac catalina
15.0	8	390.0	190.0	3850	8.5	70	usa	amc ambassador dpl

name	year	cute
hiccup	2010	10
drogon	2011	-100
dragon 2	2019	0
puff	2010	100
smaug	2011	None

Lecture 20 – Data 100, Fall 2024¶

Starting Up SQL¶

Approach #1: SQL Magic¶

Approach #2: `pd.read_sql`¶

Approach "3" -- DuckDB Special¶

Tables and Schema¶

Getting Schema information with SQLAlchemy¶

Example of table creation with interesting constraints¶

Basic Queries¶

SELECT and FROM¶

Aliasing with `AS`¶

Uniqueness with `DISTINCT`¶

Filtering with `WHERE`¶

Checking against NULL¶

Ordering data using `ORDER BY`¶

Restricting output with `LIMIT` and `OFFSET`¶

Sampling¶

Grouping Data with `GROUP BY`¶

Working with the `sql` results as Python variables¶

Storing one-line `%sql` queries¶

Storing multi-line `%%sql` queries¶

table_catalog	table_schema	table_name	table_type	self_referencing_column_name	reference_generation	user_defined_type_catalog	user_defined_type_schema	user_defined_type_name	is_insertable_into	is_typed	commit_action	TABLE_COMMENT
example_duck	main	dish	BASE TABLE	None	None	None	None	None	YES	NO	None	None
example_duck	main	dragon	BASE TABLE	None	None	None	None	None	YES	NO	None	None
example_duck	main	scene	BASE TABLE	None	None	None	None	None	YES	NO	None	None

	type	name	tbl_name	sql
0	table	dish	dish	CREATE TABLE dish("name" VARCHAR PRIMARY KEY, "type" VARCHAR, "cost" INTEGER, CHECK(("cost" >= 0)));
1	table	dragon	dragon	CREATE TABLE dragon("name" VARCHAR PRIMARY KEY, "year" INTEGER, cute INTEGER, CHECK(("year" >= 2000)));
2	table	scene	scene	CREATE TABLE scene(id INTEGER PRIMARY KEY, biome VARCHAR NOT NULL, city VARCHAR NOT NULL, visitors INTEGER, created_at TIMESTAMP DEFAULT(current_date()), CHECK((visitors >= 0)));

name	type	cost
ravioli	entree	10
ramen	entree	13
taco	entree	7
edamame	appetizer	4
fries	appetizer	4
potsticker	appetizer	4
ice cream	dessert	5

student_id	assignment_id	score
123	1	80.0
123	2	42.0
456	2	100.0

Lecture 20 – Data 100, Fall 2024¶

Starting Up SQL¶

Approach #1: SQL Magic¶

Approach #2: pd.read_sql¶

Approach "3" -- DuckDB Special¶

Tables and Schema¶

Getting Schema information with SQLAlchemy¶

Example of table creation with interesting constraints¶

Basic Queries¶

SELECT and FROM¶

Aliasing with AS¶

Uniqueness with DISTINCT¶

Filtering with WHERE¶

Checking against NULL¶

Ordering data using ORDER BY¶

Restricting output with LIMIT and OFFSET¶

Sampling¶

Grouping Data with GROUP BY¶

Working with the sql results as Python variables¶

Storing one-line %sql queries¶

Storing multi-line %%sql queries¶

Approach #2: `pd.read_sql`¶

Aliasing with `AS`¶

Uniqueness with `DISTINCT`¶

Filtering with `WHERE`¶

Ordering data using `ORDER BY`¶

Restricting output with `LIMIT` and `OFFSET`¶

Grouping Data with `GROUP BY`¶

Working with the `sql` results as Python variables¶

Storing one-line `%sql` queries¶

Storing multi-line `%%sql` queries¶