The objective of this lecture is to introduce environmental data science to students who are already learning applied data science (e.g., Data 100). But this isn't even all of environmental data science — just the socioeconomic side and not the geophysics.
There are three key results of the lecture:
`geopandas` as an extension to `pandas`, focusing on different expressions of the `geometry` attribute.

Data 100 guest lecture by Dan Hammer.
# geopandas

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
# We'll fix this to have it installed on the hub
try:
import mapclassify
except ImportError:
%pip install mapclassify
## Draw 1,000 random longitude-latitude pairs and collect them in a dataframe.
## Longitudes are sampled uniformly from [-180, 180), latitudes from [-90, 90).
x = np.random.uniform(low=-180, high=180, size=1000)
y = np.random.uniform(low=-90, high=90, size=1000)
df = pd.DataFrame(dict(longitude=x, latitude=y))
df.head()
longitude | latitude | |
---|---|---|
0 | 167.529282 | 25.161197 |
1 | -22.754637 | 70.419415 |
2 | -133.943472 | 45.020820 |
3 | -61.090139 | -46.965735 |
4 | -146.484651 | 87.502942 |
## Promote the plain dataframe to a GeoDataFrame by building a point geometry
## from each longitude-latitude pair.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    # Declare the coordinates as lon/lat degrees (WGS 84) so the frame carries
    # a CRS and can later be reprojected with `to_crs`.
    crs="EPSG:4326",
)
gdf
longitude | latitude | geometry | |
---|---|---|---|
0 | 167.529282 | 25.161197 | POINT (167.52928 25.16120) |
1 | -22.754637 | 70.419415 | POINT (-22.75464 70.41941) |
2 | -133.943472 | 45.020820 | POINT (-133.94347 45.02082) |
3 | -61.090139 | -46.965735 | POINT (-61.09014 -46.96574) |
4 | -146.484651 | 87.502942 | POINT (-146.48465 87.50294) |
... | ... | ... | ... |
995 | 165.612730 | 2.332156 | POINT (165.61273 2.33216) |
996 | -141.963469 | -23.016917 | POINT (-141.96347 -23.01692) |
997 | 115.619938 | 26.828147 | POINT (115.61994 26.82815) |
998 | -55.055840 | 57.937209 | POINT (-55.05584 57.93721) |
999 | 26.290661 | -84.511058 | POINT (26.29066 -84.51106) |
1000 rows × 3 columns
## pandas' built-in plotting knows nothing about CRSs or projections -- it just
## scatters two numeric columns -- but points are points.
## NOTE: the finer points of projections and coordinate reference systems are
## deliberately left out of this lecture.
df.plot(kind="scatter", x="longitude", y="latitude");
## The GeoDataFrame is plotted without naming any x/y columns: the coordinates
## *mean* something, so the axes need no lat-lon labels.
gdf.plot();
## Polygon geometry rather than point geometry
## Load the `naturalearth_lowres` dataset through geopandas' dataset helper
## and preview the first rows.
## NOTE(review): `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in
## 1.0 -- confirm the hub's geopandas version still provides it.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.head()
pop_est | continent | name | iso_a3 | gdp_md_est | geometry | |
---|---|---|---|---|---|---|
0 | 889953.0 | Oceania | Fiji | FJI | 5496 | MULTIPOLYGON (((180.00000 -16.06713, 180.00000... |
1 | 58005463.0 | Africa | Tanzania | TZA | 63177 | POLYGON ((33.90371 -0.95000, 34.07262 -1.05982... |
2 | 603253.0 | Africa | W. Sahara | ESH | 907 | POLYGON ((-8.66559 27.65643, -8.66512 27.58948... |
3 | 37589262.0 | North America | Canada | CAN | 1736425 | MULTIPOLYGON (((-122.84000 49.00000, -122.9742... |
4 | 328239523.0 | North America | United States of America | USA | 21433226 | MULTIPOLYGON (((-122.84000 49.00000, -120.0000... |
## Plot the polygons in the coordinates they were loaded with.
world.plot();
## Easy to reconfigure
## The reprojected result is plotted directly and never assigned, so the name
## `world` is not rebound by this cell.
## NOTE(review): EPSG:7789 looks like a geocentric (ITRF2014) CRS -- confirm
## this is the projection intended for the demo.
world.to_crs("EPSG:7789").plot();
## Overlay the random points on a basemap of the country polygons.
fig, geo_ax = plt.subplots(figsize=(15, 10))

## Basemap: every row except Antarctica, grey fill with white borders.
world.loc[world["name"] != "Antarctica"].plot(ax=geo_ax, color="grey", edgecolor="white")

## Points: drawn onto the same axes with small markers.
gdf.plot(ax=geo_ax, markersize=2)

## Passing None toggles the frame around the current axes.
plt.box(None)