import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# We'll fix this to have it installed on the hub
try:
    import mapclassify
except ImportError:
    %pip install mapclassify


## Create a dataframe of random latitude-longitude pairs
x = np.random.uniform(-180, 180, 1000)
y = np.random.uniform(-90, 90, 1000)

df = pd.DataFrame(
    {
        "longitude": x,
        "latitude" : y
    }
)

df.head()


gdf = gpd.GeoDataFrame(
    df, 
    geometry=gpd.points_from_xy(df.longitude, df.latitude)
)

gdf


## Built-in plotting does not include CRS or Projections, but points are points
## NOTE: I won't go into the finer points of projections or coordinate reference systems 
## because I want you to like me. And you won't like me if I start talking about that.
df.plot.scatter(x="longitude", y="latitude");


## The coordinates *mean* something. No need to label the axes as lat-lon.
gdf.plot();


## Polygon geometry rather than point geometry
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.head()


world.plot();


## Easy to reconfigure 
world.to_crs("EPSG:7789").plot();


fig, geo_ax = plt.subplots(figsize=(15, 10))

world[world.name != "Antarctica"].plot(
    ax=geo_ax,
    color="grey",
    edgecolor="white"
)

gdf.plot(
    ax=geo_ax,
    markersize=2
)

plt.box(None)


# Coal fired powerplants from https://www.carbonbrief.org/mapped-worlds-coal-power-plants
# Annual carbon reported in MtCO2
power_gdf = gpd.read_file("https://global-coal-map-2020.s3.eu-west-2.amazonaws.com/data/coal2020.geojson")

fig, geo_ax = plt.subplots(figsize=(15, 10))

world[world.name != "Antarctica"].plot(
    ax=geo_ax,
    color="grey",
    edgecolor="white"
)

power_gdf.plot(
    ax=geo_ax,
    markersize=2,
    cmap="OrRd",
    column="annualCarbon",
    scheme='QUANTILES',
    legend=True
)

plt.box(None)


## US State Map
usa = gpd.read_file("https://raw.githubusercontent.com/danhammer/envirods/main/data/gz_2010_us_040_00_20m.geojson")

## Farmers markets
farmers_mkts = pd.read_csv("https://raw.githubusercontent.com/danhammer/envirods/main/data/farmers-mkts.csv")
farmers_mkts = gpd.GeoDataFrame(
    farmers_mkts, 
    geometry=gpd.points_from_xy(farmers_mkts.x, farmers_mkts.y)
)


## Limit to just the Four Corners states
states = ["New Mexico", "Arizona", "Utah", "Colorado"]

state_gdf = usa[usa.NAME.isin(states)]
mkt_gdf = farmers_mkts[farmers_mkts.State.isin(states)]


fig, ax = plt.subplots(figsize=(15,10))

state_gdf.plot(
    ax=ax,
    color="lightgrey",
    edgecolor="white"
)

mkt_gdf.plot(ax=ax);


## Remove outlier. Not my table. 
mkt_gdf = mkt_gdf[mkt_gdf.x < -100]

fig, ax = plt.subplots(figsize=(15,10))

state_gdf.plot(
    ax=ax,
    color="lightgrey",
    edgecolor="white"
)

mkt_gdf.plot(ax=ax)
plt.box(None)


mkt_gdf.head()


## Limit the dataframe and encode the attributes for the presence of features of 
## the markets.
attributes = [
    'Credit', 'WIC', 'WICcash', 'SFMNP', 'SNAP', 'Bakedgoods', 'Cheese', 'Crafts',
    'Flowers', 'Eggs', 'Seafood', 'Herbs', 'Vegetables', 'Honey', 'Jams',
    'Maple', 'Meat', 'Nursery', 'Nuts', 'Plants', 'Poultry', 'Prepared',
    'Soap', 'Trees', 'Wine'
]

X = mkt_gdf[attributes]
X = (X == "Y").astype(int)
X


from sklearn.cluster import AgglomerativeClustering

# the distance between points is given by `euclidean` while the clusters 
# are defined by minimizing variance using the Ward method.
cluster = AgglomerativeClustering(
    n_clusters=5, 
    affinity='euclidean', 
    linkage='ward'
)

# assign a new column based on the label and print the values in the new column
label = cluster.fit_predict(X.values)

# what are the different labels?
set(label)

{0, 1, 2, 3, 4}


# plot the labels, incrementally, on a map to figure out if there is a geographic
# pattern associated with it.
fig, ax = plt.subplots(figsize=(9,9))

state_gdf.plot(
    ax=ax,
    color="lightgrey",
    edgecolor="white"
)

# cycle through the labels,
# note that the label "1" seems to reflect farmers' markets in New Mexico
mkt_gdf[label != 1].plot(
    ax=ax
)

# plot the New Mexico farmers' markets
mkt_gdf[label == 1].plot(
    ax=ax,
    color="orange"
);


# import this library for a simple linear regression
from scipy import stats


# Create a dummy dataset
N = 10000

beta0 = 200000
beta1 = 500
beta2 = -1000

alpha0 = 20
alpha1 = 0.5

_df = pd.DataFrame(
    data = {
        "greenness": np.random.uniform(0, 100, N),
        "epsilon": np.random.normal(0, 7000, N),
        "gamma": np.random.normal(0, 10, N)
    }
)

_df["distance"] = alpha0 + alpha1*_df.greenness + _df.gamma
_df["homeprice"] = beta0 + beta1*_df.greenness + beta2*_df.distance + _df.epsilon


plt.scatter(
    _df.distance,
    _df.homeprice,
    s=0.5
)

m, b, _, _, _ = stats.linregress(_df.distance, _df.homeprice)

plt.plot(
    _df.distance,
    b + m * _df.distance,
    color="red"
)

print(f"True impact: {beta2}")
print(f"Estimated impact: {m}")

True impact: -1000
Estimated impact: -322.85524361176374


## Add the true relationship between distance and home price in orange
## We know TRUTH because we set it, as the all-knowing creator
## Shows that if something isn't measured, it's not valued
plt.scatter(
    _df.distance,
    _df.homeprice,
    s=0.5
)

plt.plot(
    _df.distance,
    b + m * _df.distance,
    color="red"
)

plt.plot(
    _df.distance,
    beta0 + beta2*_df.distance,
    color="orange"
)

print(f"True impact: {beta2}")
print(f"Estimated impact: {m}")

True impact: -1000
Estimated impact: -322.85524361176374

	longitude	latitude
0	167.529282	25.161197
1	-22.754637	70.419415
2	-133.943472	45.020820
3	-61.090139	-46.965735
4	-146.484651	87.502942

	longitude	latitude	geometry
0	167.529282	25.161197	POINT (167.52928 25.16120)
1	-22.754637	70.419415	POINT (-22.75464 70.41941)
2	-133.943472	45.020820	POINT (-133.94347 45.02082)
3	-61.090139	-46.965735	POINT (-61.09014 -46.96574)
4	-146.484651	87.502942	POINT (-146.48465 87.50294)
...	...	...	...
995	165.612730	2.332156	POINT (165.61273 2.33216)
996	-141.963469	-23.016917	POINT (-141.96347 -23.01692)
997	115.619938	26.828147	POINT (115.61994 26.82815)
998	-55.055840	57.937209	POINT (-55.05584 57.93721)
999	26.290661	-84.511058	POINT (26.29066 -84.51106)

	pop_est	continent	name	iso_a3	gdp_md_est	geometry
0	889953.0	Oceania	Fiji	FJI	5496	MULTIPOLYGON (((180.00000 -16.06713, 180.00000...
1	58005463.0	Africa	Tanzania	TZA	63177	POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...
2	603253.0	Africa	W. Sahara	ESH	907	POLYGON ((-8.66559 27.65643, -8.66512 27.58948...
3	37589262.0	North America	Canada	CAN	1736425	MULTIPOLYGON (((-122.84000 49.00000, -122.9742...
4	328239523.0	North America	United States of America	USA	21433226	MULTIPOLYGON (((-122.84000 49.00000, -120.0000...

	FMID	State	Zip	Schedule	x	y	Location	Credit	WIC	WICcash	...	Meat	Nursery	Nuts	Plants	Poultry	Prepared	Soap	Trees	Wine	geometry
22	22004647	Colorado	NaN	NaN	-105.073000	40.395400	NaN	N	N	N	...	N	N	N	N	N	N	N	N	N	POINT (-105.07300 40.39540)
51	1002771	New Mexico	NaN	June - October Saturday 7:00 AM to 12:00 PM	-106.565838	35.103988	Other	Y	Y	N	...	N	N	N	N	N	N	N	N	N	POINT (-106.56584 35.10399)
52	1002771	New Mexico	NaN	June - October Tuesday 7:00 AM to 12:00 PM	-106.633172	35.082375	Healthcare Institution	Y	Y	N	...	N	N	N	N	N	N	N	N	N	POINT (-106.63317 35.08237)
53	1000998	Colorado	80903	June - October Monday 10:00 AM to 3:00 PM	-104.823038	38.838084	Local government building grounds	Y	N	N	...	Y	N	N	Y	N	Y	Y	N	N	POINT (-104.82304 38.83808)
75	22002651	Arizona	85233	NaN	-111.726000	33.321900	NaN	N	N	N	...	N	N	N	N	N	N	N	N	N	POINT (-111.72600 33.32190)

	Credit	WIC	WICcash	SFMNP	SNAP	Bakedgoods	Cheese	Crafts	Flowers	Eggs	...	Maple	Meat	Nursery	Nuts	Plants	Poultry	Prepared	Soap	Trees	Wine
22	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
51	1	1	0	1	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
52	1	1	0	1	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
53	1	0	0	0	1	1	1	1	1	1	...	0	1	0	0	1	0	1	1	0	0
75	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7626	0	0	0	0	1	1	1	1	1	0	...	1	1	1	1	1	1	1	1	0	1
7726	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7779	0	0	0	0	1	1	1	1	1	1	...	0	1	1	0	1	1	1	1	1	1
7780	0	0	0	0	1	1	1	1	0	1	...	0	1	0	1	1	1	1	1	0	0
7861	1	0	0	0	0	1	1	0	1	1	...	0	0	1	1	1	0	1	1	0	0

Environmental Data Science: An introductory lecture¶

Spatial data¶

An introduction to `geopandas`¶

A natural experiment from spatial data¶

Omitted variable bias¶

Environmental Data Science: An introductory lecture¶

Spatial data¶

An introduction to geopandas¶

A natural experiment from spatial data¶

Omitted variable bias¶

An introduction to `geopandas`¶