import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns


import plotly.offline as py
import plotly.express as px

import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf

cf.set_config_file(offline=True, world_readable=False)

/opt/conda/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning:

The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.


from ds100_utils import fetch_and_cache


calls_file = fetch_and_cache("https://data.cityofberkeley.info/api/views/k2nh-s5h5/rows.csv?accessType=DOWNLOAD",
                "calls_for_service.csv", force=False)

Using cached version that was downloaded (UTC): Sun Jun 27 18:20:07 2021


stops_file = fetch_and_cache("https://data.cityofberkeley.info/api/views/4tbf-3yt8/rows.json?accessType=DOWNLOAD",
                "stops.json", force=False)

Using cached version that was downloaded (UTC): Sun Jun 27 18:20:08 2021


import os
print(calls_file, "is",  os.path.getsize(calls_file) / 1e6, "MB")
print(stops_file, "is", os.path.getsize(stops_file) / 1e6, "MB")

data/calls_for_service.csv is 0.941358 MB
data/stops.json is 17.134125 MB


with open(calls_file, "r") as f:
    print(calls_file, "is", sum(1 for l in f), "lines.")

data/calls_for_service.csv is 15452 lines.


with open(stops_file, "r") as f:
    print(stops_file, "is", sum(1 for l in f), "lines.")

data/stops.json is 59573 lines.


print(calls_file, "======================")
with open(calls_file, "r") as f:
    for i in range(20):
        print(i, "\t", repr(f.readline()))

data/calls_for_service.csv ======================
0 	 'CASENO,OFFENSE,EVENTDT,EVENTTM,CVLEGEND,CVDOW,InDbDate,Block_Location,BLKADDR,City,State\n'
1 	 '19092769,THEFT MISD. (UNDER $950),12/09/2019 12:00:00 AM,13:00,LARCENY,1,09/10/2020 07:00:11 AM,"SHATTUCK AVE\n'
2 	 'Berkeley, CA",SHATTUCK AVE,Berkeley,CA\n'
3 	 '19045891,NARCOTICS,08/18/2019 12:00:00 AM,17:20,DRUG VIOLATION,0,09/10/2020 07:00:08 AM,"FRONTAGE STREET &GILMAN ST\n'
4 	 'Berkeley, CA",FRONTAGE STREET &GILMAN ST,Berkeley,CA\n'
5 	 '19060215,ASSAULT/BATTERY MISD.,10/23/2019 12:00:00 AM,10:45,ASSAULT,3,09/10/2020 07:00:10 AM,"2200 MILVIA ST\n'
6 	 'Berkeley, CA\n'
7 	 '(37.868574, -122.270415)",2200 MILVIA ST,Berkeley,CA\n'
8 	 '19092681,VANDALISM,12/01/2019 12:00:00 AM,18:40,VANDALISM,0,09/10/2020 07:00:11 AM,"VIRGINIA ST\n'
9 	 'Berkeley, CA",VIRGINIA ST,Berkeley,CA\n'
10 	 '19044228,ASSAULT/BATTERY MISD.,08/10/2019 12:00:00 AM,22:51,ASSAULT,6,09/10/2020 07:00:08 AM,"UNIVERSITY AVENUE &FRONTAGE\n'
11 	 'Berkeley, CA",UNIVERSITY AVENUE &FRONTAGE,Berkeley,CA\n'
12 	 '19092551,THEFT MISD. (UNDER $950),11/17/2019 12:00:00 AM,12:00,LARCENY,0,09/10/2020 07:00:11 AM,"ASHBY AVE\n'
13 	 'Berkeley, CA",ASHBY AVE,Berkeley,CA\n'
14 	 '19047517,BURGLARY AUTO,08/25/2019 12:00:00 AM,18:25,BURGLARY - VEHICLE,0,09/10/2020 07:00:08 AM,"CATALINA AVE\n'
15 	 'Berkeley, CA",CATALINA AVE,Berkeley,CA\n'
16 	 '19091711,VANDALISM,08/19/2019 12:00:00 AM,22:00,VANDALISM,1,09/10/2020 07:00:08 AM,"CALIFORNIA STREET & FAIRVIEW ST\n'
17 	 'Berkeley, CA",CALIFORNIA STREET & FAIRVIEW ST,Berkeley,CA\n'
18 	 '19092111,VANDALISM,09/24/2019 12:00:00 AM,20:00,VANDALISM,2,09/10/2020 07:00:09 AM,"600 CANYON RD\n'
19 	 'Berkeley, CA",600 CANYON RD,Berkeley,CA\n'


print(stops_file, "======================")
with open(stops_file, "r") as f:
    for i in range(20):
        print(i, "\t", repr(f.readline()))

data/stops.json ======================
0 	 '{\n'
1 	 '  "meta" : {\n'
2 	 '    "view" : {\n'
3 	 '      "id" : "4tbf-3yt8",\n'
4 	 '      "name" : "Berkeley PD - Stop Data (NEW)",\n'
5 	 '      "attribution" : "City of Berkeley Police Department",\n'
6 	 '      "averageRating" : 0,\n'
7 	 '      "category" : "Public Safety",\n'
8 	 '      "createdAt" : 1588602591,\n'
9 	 '      "description" : "This data was extracted from the Department’s Public Safety Server and covers data beginning January 26, 2015.  On January 26, 2015 the department began collecting data pursuant to General Order B-4 (issued December 31, 2014). Under that Order, officers were required to provide certain data after making any detention (vehicle, bicycle, pedestrian, suspicious auto).  This dataset provides information about detentions, including the race, sex, age range, of the person detained; the reason for the stop; the type of enforcement taken (if any), and whether or not a search was conducted.  Also provided are the date, time, location of the detention, as well as the incident number and call for service type.",\n'
10 	 '      "displayType" : "table",\n'
11 	 '      "downloadCount" : 48,\n'
12 	 '      "hideFromCatalog" : false,\n'
13 	 '      "hideFromDataJson" : false,\n'
14 	 '      "licenseId" : "USGOV_WORKS",\n'
15 	 '      "newBackend" : true,\n'
16 	 '      "numberOfComments" : 0,\n'
17 	 '      "oid" : 34059901,\n'
18 	 '      "provenance" : "official",\n'
19 	 '      "publicationAppendEnabled" : false,\n'


calls = pd.read_csv(calls_file, warn_bad_lines=True)
calls.head()


calls.shape[0]

5227


calls.groupby(["City", "State"]).count()


dow = pd.Series(["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"], name="Day")
dow

0       Sunday
1       Monday
2      Tuesday
3    Wednesday
4     Thursday
5       Friday
6     Saturday
Name: Day, dtype: object


df_dow = pd.DataFrame(dow)
# Notice that I am dropping the column if it already exists to
# make it so I can run this cell more than once
calls = pd.merge(calls.drop(columns="Day", errors="ignore"), 
         df_dow, left_on='CVDOW', right_index=True).sort_index()
calls


calls['Block_Location'].head(10)

0                           SHATTUCK AVE\nBerkeley, CA
1             FRONTAGE STREET &GILMAN ST\nBerkeley, CA
2    2200 MILVIA ST\nBerkeley, CA\n(37.868574, -122...
3                            VIRGINIA ST\nBerkeley, CA
4            UNIVERSITY AVENUE &FRONTAGE\nBerkeley, CA
5                              ASHBY AVE\nBerkeley, CA
6                           CATALINA AVE\nBerkeley, CA
7        CALIFORNIA STREET & FAIRVIEW ST\nBerkeley, CA
8                          600 CANYON RD\nBerkeley, CA
9    1700 FOURTH ST\nBerkeley, CA\n(37.871883, -122...
Name: Block_Location, dtype: object


calls_lat_lon = (
    # Remove newlines
    calls['Block_Location'].str.replace("\n", "\t") 
    # Extract Lat and Lon using regular expression
    .str.extract(".*\((?P<Lat>\d*\.\d*)\, (?P<Lon>-?\d*\.\d*)\)", expand=True)
)
calls_lat_lon.head(20)


(~calls_lat_lon.isnull()).mean()

Lat    0.963076
Lon    0.963076
dtype: float64


# Remove Lat and Lon if they already existed before (reproducible)
calls.drop(["Lat", "Lon"], axis=1, inplace=True, errors="ignore")
# Join in the the latitude and longitude data
calls = calls.merge(calls_lat_lon, left_index=True, right_index=True)
# calls[["Lat", "Lon"]] = calls_lat_lon
# calls.join(calls_lat_lon)
calls.head()


calls[calls['Lat'].isnull()].head(10)


import json

with open("data/stops.json", "rb") as f:
    stops_json = json.load(f)


type(stops_json)

dict


stops_json.keys()

dict_keys(['meta', 'data'])


stops_json['meta'].keys()

dict_keys(['view'])


stops_json['meta']['view'].keys()

dict_keys(['id', 'name', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'licenseId', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'columns', 'grants', 'license', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])


print(stops_json['meta']['view']['description'])

This data was extracted from the Department’s Public Safety Server and covers data beginning January 26, 2015.  On January 26, 2015 the department began collecting data pursuant to General Order B-4 (issued December 31, 2014). Under that Order, officers were required to provide certain data after making any detention (vehicle, bicycle, pedestrian, suspicious auto).  This dataset provides information about detentions, including the race, sex, age range, of the person detained; the reason for the stop; the type of enforcement taken (if any), and whether or not a search was conducted.  Also provided are the date, time, location of the detention, as well as the incident number and call for service type.


type(stops_json['meta']['view']['columns'])

list


for c in stops_json['meta']['view']['columns']:
    top = ""
    items = ""
    if "cachedContents" in c:
        top = c["cachedContents"]["top"]
        for i in top:
            items = items + "\n\t" + i["item"] + ":" + i["count"]
    print(c["name"], ":", items)

sid : 
id : 
position : 
created_at : 
created_meta : 
updated_at : 
updated_meta : 
meta : 
CreateDatetime : 
	2015-06-11T06:48:26.000:10
	2020-02-13T08:10:43.000:6
	2017-01-26T11:10:06.000:6
	2020-01-07T08:15:40.000:6
	2016-07-03T23:45:04.000:5
	2015-02-01T19:33:13.000:5
	2016-06-11T19:55:36.000:5
	2015-11-16T07:19:28.000:5
	2016-04-19T00:50:19.000:5
	2016-03-25T01:13:14.000:5
	2017-01-06T00:25:16.000:5
	2016-12-22T23:58:59.000:5
	2016-07-19T02:51:56.000:4
	2015-11-09T22:17:36.000:4
	2015-12-19T02:11:34.000:4
	2016-07-21T02:03:22.000:4
	2017-06-29T02:15:29.000:4
	2018-07-02T21:36:29.000:4
	2016-05-25T01:20:06.000:4
	2016-05-01T01:23:53.000:4
IncidentNumber : 
	2020-00001049:6
	2020-00008547:6
	2015-00033565:5
	2016-00039405:5
	2016-00017339:5
	2016-00075241:5
	2016-00034574:5
	2017-00000828:5
	2015-00033576:5
	2015-00067321:5
	2015-00006347:5
	2016-00022806:5
	2016-00030608:4
	2019-00025759:4
	2016-00049625:4
	2018-00010987:4
	2015-00033137:4
	2017-00045853:4
	2016-00042412:4
	2017-00057823:4
Address : 
	UNIVERSITY AVE / 6TH ST:525
	UNIVERSITY AVE / SAN PABLO AVE:472
	ASHBY AVE / 7TH ST:449
	6TH ST / UNIVERSITY AVE:449
	7TH ST / POTTER ST:372
	UNIVERSITY AVE / MARTIN LUTHER KING JR WAY:360
	ASHBY AVE / SAN PABLO AVE:351
	80 BOLIVAR DR:349
	SAN PABLO AVE / UNIVERSITY AVE:340
	UNIVERSITY AVE / FRONTAGE RD:320
	7TH ST / ASHBY AVE:311
	UNIVERSITY AVE / ACTON ST:296
	UNIVERSITY AVE / SACRAMENTO ST:295
	SHATTUCK AVE / KITTREDGE ST:272
	UNIVERSITY AVE / BONAR ST:242
	SAN PABLO AVE / ASHBY AVE:231
	POTTER ST / 7TH ST:217
	UNIVERSITY AVE / 7TH ST:214
	1998 SHATTUCK AVE:211
	UNIVERSITY AVE / SHATTUCK AVE:204
City : 
	BERKELEY:53060
	OAKLAND:1818
	EMERYVILLE:316
	ALBANY:129
	Berkeley:2
Lat : 
	-361:2958
	37.867993895:974
	37.869096588:812
	37.851008554:760
	37.851514871:589
	37.852140636:582
	37.871555258:500
	37.870410562:410
	37.868489249:396
	37.872151446:394
	37.866444425:392
	37.853281083:355
	37.870112645:349
	37.864546214:349
	37.868202006:335
	37.854316382:335
	37.851615813:311
	37.8693028530001:289
	37.865779797:269
	37.869696081:265
Lon : 
	-361:2958
	-122.297664389:974
	-122.292119433:812
	-122.291104028:760
	-122.291270411:589
	-122.286683357:582
	-122.273001335:500
	-122.281937988:410
	-122.267867519:396
	-122.26841392:394
	-122.305556167:392
	-122.279011568:355
	-122.301738812:349
	-122.284248929:349
	-122.29662475:335
	-122.271084145:335
	-122.289369289:311
	-122.272234021:289
	-122.267576002:269
	-122.287499477:265
CallType : 
	T:46101
	1194:6873
	1196:3247
	1194B:2497
Race : 
	White:20544
	Black:19776
	Hispanic:7534
	Other:5885
	Asian:4621
	0:358
Gender : 
	Male:41115
	Female:17245
	0:358
Age : 
	>40:21829
	18-29:19276
	30-39:15822
	<18:1433
	0:358
Reason : 
	Traffic:49525
	Investigation:5329
	Reas. Susp.:2307
	Prob./Parole:984
	0:358
	Wanted:211
Enforcement : 
	Warning:35217
	Citation:20262
	Arrest:1930
	Other:951
	0:358
Car Search : 
	No Search:49500
	Search:8860
	0:358


for i in range(3):
    print(i, "\t", stops_json['data'][i])

0 	 ['row-2j6s.qe6z_jtkn', '00000000-0000-0000-D599-E1C6017B6966', 0, 1589533219, None, 1589533219, None, '{ }', '2017-05-02T20:50:29', '2017-00024695', '1275 WALNUT ST', 'BERKELEY', '37.88452086', '-122.268652364', '1194', 'White', 'Female', '30-39', 'Investigation', 'Warning', 'No Search']
1 	 ['row-79rz~ucps-35r6', '00000000-0000-0000-B83C-7A2ADDC55727', 0, 1598605226, None, 1598605226, None, '{ }', '2018-02-20T18:42:04', '2018-00010451', '2176 KITTREDGE ST', 'BERKELEY', '37.868233788', '-122.266394615', '1194', 'White', 'Male', '>40', 'Investigation', 'Other', 'No Search']
2 	 ['row-tmr9_n9ye.6bzh', '00000000-0000-0000-3120-DC62279E5799', 0, 1598605226, None, 1598605226, None, '{ }', '2020-03-04T16:18:11', '2020-00012764', '1414 UNIVERSITY AVE', 'BERKELEY', '37.869690714', '-122.283865643', 'T', 'Black', 'Female', '>40', 'Traffic', 'Citation', 'No Search']


# Load the data from JSON and assign column titles
stops = pd.DataFrame(
    stops_json['data'],
    columns=[c['name'] for c in stops_json['meta']['view']['columns']])

stops.head()


stops.columns

Index(['sid', 'id', 'position', 'created_at', 'created_meta', 'updated_at',
       'updated_meta', 'meta', 'CreateDatetime', 'IncidentNumber', 'Address',
       'City', 'Lat', 'Lon', 'CallType', 'Race', 'Gender', 'Age', 'Reason',
       'Enforcement', 'Car Search'],
      dtype='object')


pd.set_option('display.max_columns', 100) 
pd.set_option('display.max_rows', 100)


stops.head(10)


calls.head()


print("There are", calls['CASENO'].unique().shape[0], "unique case numbers.")
print("There are", calls.shape[0], "calls in the table.")

There are 5227 unique case numbers.
There are 5227 calls in the table.


calls['CASENO'].sort_values().reset_index(drop=True).iplot(
    yTitle="Case Number", xTitle="Location in File")


calls['CASENO'].sort_values().head()

4121    17043531
2204    19010435
4893    19013597
3402    19035723
1587    19035759
Name: CASENO, dtype: int64


calls.iloc[[4121]]


calls.head(3)


calls["EVENTDT"][0]

'12/09/2019 12:00:00 AM'


dates = pd.to_datetime(calls["EVENTDT"])
dates[0]

Timestamp('2019-12-09 00:00:00')


pd.DataFrame(dict(transformed=dates, original=calls["EVENTDT"])).head()


times = pd.to_datetime(calls["EVENTTM"]).dt.time
times.head()

0    13:00:00
1    17:20:00
2    10:45:00
3    18:40:00
4    22:51:00
Name: EVENTTM, dtype: object


from datetime import datetime
timestamps = pd.concat([dates, times], axis=1).apply(
    lambda r: datetime.combine(r['EVENTDT'], r['EVENTTM']), axis=1)
timestamps.head()

0   2019-12-09 13:00:00
1   2019-08-18 17:20:00
2   2019-10-23 10:45:00
3   2019-12-01 18:40:00
4   2019-08-10 22:51:00
dtype: datetime64[ns]


calls['timestamp'] = timestamps
calls.head()


calls['timestamp'].min()

Timestamp('2019-07-01 00:00:00')


calls['timestamp'].max()

Timestamp('2019-12-20 01:30:00')


dow = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]
calls.groupby('Day')['CASENO'].count()[dow].iplot(kind='bar', yTitle="Count")


calls['hour_of_day'] = (
    calls['timestamp'].dt.hour * 60 + calls['timestamp'].dt.minute ) / 60.


py.iplot(ff.create_distplot([calls['hour_of_day']],group_labels=["Hour"],bin_size=1, show_rug=False))


px.violin(calls.sort_values("CVDOW"), y="hour_of_day", x="Day", box=True, points="all", hover_name="CVLEGEND")


calls.head()


calls['OFFENSE'].value_counts().iplot(kind="bar")


calls['CVLEGEND'].value_counts().iplot(kind="bar")


boxes = [(len(df), go.Violin(y=df["hour_of_day"], name=i)) for (i, df) in calls.groupby("CVLEGEND")]
py.iplot([r[1] for r in sorted(boxes, key=lambda x:x[0], reverse=True)])


py.iplot(ff.create_distplot([
    calls[calls['CVLEGEND'] == "NOISE VIOLATION"]['hour_of_day'],
    calls[calls['CVLEGEND'] == "DRUG VIOLATION"]['hour_of_day'],
    calls[calls['CVLEGEND'] == "LIQUOR LAW VIOLATION"]['hour_of_day'],
    calls[calls['CVLEGEND'] == "FRAUD"]['hour_of_day']
],
    group_labels=["Noise Violation", "Drug Violation", "Liquor Violation", "Fraud"], 
    ))


calls["missing_lat_lon"] = calls[['Lat', 'Lon']].isnull().any(axis=1)
calls.groupby("CVLEGEND")['missing_lat_lon'].mean().sort_values().iplot(kind="barh")


import folium
import folium.plugins # The Folium Javascript Map Library

SF_COORDINATES = (37.87, -122.28)
sf_map = folium.Map(location=SF_COORDINATES, zoom_start=13)
locs = calls[['Lat', 'Lon']].astype('float').dropna().to_numpy()
heatmap = folium.plugins.HeatMap(locs.tolist(), radius = 10)
sf_map.add_child(heatmap)


cluster = folium.plugins.MarkerCluster()
for _, r in calls[['Lat', 'Lon', 'CVLEGEND']].tail(1000).dropna().iterrows():
    cluster.add_child(
        folium.Marker([float(r["Lat"]), float(r["Lon"])], popup=r['CVLEGEND']))
    
sf_map = folium.Map(location=SF_COORDINATES, zoom_start=13)
sf_map.add_child(cluster)
sf_map

	Lat	Lon
0	NaN	NaN
1	NaN	NaN
2	37.868574	-122.270415
3	NaN	NaN
4	NaN	NaN
5	NaN	NaN
6	NaN	NaN
7	NaN	NaN
8	NaN	NaN
9	37.871883	-122.301255
10	37.870948	-122.27733
11	37.895601	-122.285524
12	37.868164	-122.256314
13	37.873687	-122.268616
14	37.885385	-122.273547
15	37.880667	-122.29489
16	37.871167	-122.268285
17	37.869058	-122.270455
18	37.873017	-122.275481
19	37.866327	-122.263359

	CASENO	OFFENSE	EVENTDT	EVENTTM	CVLEGEND	CVDOW	InDbDate	Block_Location	BLKADDR	City	State
0	19092769	THEFT MISD. (UNDER $950)	12/09/2019 12:00:00 AM	13:00	LARCENY	1	09/10/2020 07:00:11 AM	SHATTUCK AVE\nBerkeley, CA	SHATTUCK AVE	Berkeley	CA
1	19045891	NARCOTICS	08/18/2019 12:00:00 AM	17:20	DRUG VIOLATION	0	09/10/2020 07:00:08 AM	FRONTAGE STREET &GILMAN ST\nBerkeley, CA	FRONTAGE STREET &GILMAN ST	Berkeley	CA
2	19060215	ASSAULT/BATTERY MISD.	10/23/2019 12:00:00 AM	10:45	ASSAULT	3	09/10/2020 07:00:10 AM	2200 MILVIA ST\nBerkeley, CA\n(37.868574, -122...	2200 MILVIA ST	Berkeley	CA
3	19092681	VANDALISM	12/01/2019 12:00:00 AM	18:40	VANDALISM	0	09/10/2020 07:00:11 AM	VIRGINIA ST\nBerkeley, CA	VIRGINIA ST	Berkeley	CA
4	19044228	ASSAULT/BATTERY MISD.	08/10/2019 12:00:00 AM	22:51	ASSAULT	6	09/10/2020 07:00:08 AM	UNIVERSITY AVENUE &FRONTAGE\nBerkeley, CA	UNIVERSITY AVENUE &FRONTAGE	Berkeley	CA

	sid	id	created_at	created_meta	updated_at	updated_meta	meta	CreateDatetime	IncidentNumber	...	City	Lat	Lon	CallType	Race	Gender	Age	Reason	Enforcement	Car Search
0	row-2j6s.qe6z_jtkn	00000000-0000-0000-D599-E1C6017B6966	1589533219	None	1589533219	None	{ }	2017-05-02T20:50:29	2017-00024695	...	BERKELEY	37.88452086	-122.268652364	1194	White	Female	30-39	Investigation	Warning	No Search
1	row-79rz~ucps-35r6	00000000-0000-0000-B83C-7A2ADDC55727	1598605226	None	1598605226	None	{ }	2018-02-20T18:42:04	2018-00010451	...	BERKELEY	37.868233788	-122.266394615	1194	White	Male	>40	Investigation	Other	No Search
2	row-tmr9_n9ye.6bzh	00000000-0000-0000-3120-DC62279E5799	1598605226	None	1598605226	None	{ }	2020-03-04T16:18:11	2020-00012764	...	BERKELEY	37.869690714	-122.283865643	T	Black	Female	>40	Traffic	Citation	No Search
3	row-cjve_43yp_s8ec	00000000-0000-0000-4EEF-F940D151C603	1598605226	None	1598605226	None	{ }	2018-12-04T15:11:14	2018-00067166	...	BERKELEY	37.867138358	-122.268229602	1196	Hispanic	Male	30-39	Traffic	Warning	No Search
4	row-wxqp_6mgh-v4r5	00000000-0000-0000-8B94-7D86EB47C51D	1589533219	None	1589533219	None	{ }	2020-05-13T09:37:53	2020-00023435	...	BERKELEY	37.865779797	-122.267576002	1194	Black	Male	>40	Traffic	Citation	No Search

	transformed	original
0	2019-12-09	12/09/2019 12:00:00 AM
1	2019-08-18	08/18/2019 12:00:00 AM
2	2019-10-23	10/23/2019 12:00:00 AM
3	2019-12-01	12/01/2019 12:00:00 AM
4	2019-08-10	08/10/2019 12:00:00 AM

Lecture 6 – Data 100, Summer 2021¶

Introduction¶

Getting the Data¶

Calls Data¶

Stop Data¶

Most data has bad documentation:¶

Reproducible Data Science¶

Downloading the Data¶

Exploring the data¶

Structure¶

How big is the data?¶

What is the file format? (Can we trust extensions?)¶

What are some observations about Calls data?¶

What are some observations about Stops data?¶

A quick note on JSON¶

Loading the Data¶

Loading the Calls Data¶

Preliminary observations on the data?¶

Checking that the City and State fields are all Berkeley CA¶

Decoding day of the week¶

Cleaning Block Location¶

Loading the stops.json Data¶

We can now examine what keys are in the top level json object¶

Observation¶

Digging into Meta Data¶

Columns Meta data¶

Observations?¶

Examining the Data Field¶

Building a Dataframe from JSON¶

Preliminary Observations¶

Stop Data¶

EDA¶

Are Case Numbers unique?¶

What might we be observing?¶

Examining the Date¶

What time range does the data represent¶

Are there any other interesting temporal patterns¶

How about temporal patterns within a day?¶

Observations?¶

Stratified Analysis¶

Observations?¶

Examining the Event¶

The Offense Field¶

Observations?¶

CVLEGEND¶

Stratified Analysis of Time of Day by CVLEGEND¶

Examining Location information¶

Observations?¶

Examine data geographically¶

Questions¶

What are some observations about `Calls` data?¶

What are some observations about `Stops` data?¶

Loading the `stops.json` Data¶