Calculating spatial averages for each country after spatial join - python

Hello, I am using the code at the bottom to extract countries from coordinates. A more detailed explanation of the code is given in the following question: Extracting countries from NetCDF data using geopandas.
My main variable/value is the monthly mean PDSI value from: https://psl.noaa.gov/data/gridded/data.pdsi.html. The image below shows a portion of the visualization created by the code below: the shaded squares represent the spatial regions of PDSI values, overlaid on a shapefile of the world.
From the image of Belgium, you can see that the 4 squares touching the land area of Belgium also touch other countries. If I attribute these values to Belgium as-is, I believe this overestimates the mean PDSI value; in particular, the bottom two squares barely touch Belgium, so their weight when calculating the mean should be significantly lower. Is there a way to incorporate some sort of weighted average, where the area of each square within a country is used as the weight to adjust each PDSI value? Additionally, I would like to standardize this process not only for Belgium, but for all countries as well.
Any help would be greatly appreciated!
import geopandas as gpd
import numpy as np
import plotly.express as px
import requests
from pathlib import Path
from zipfile import ZipFile
import urllib
import shapely.geometry
import xarray as xr
# download NetCDF data...
# fmt: off
url = "https://psl.noaa.gov/repository/entry/get/pdsi.mon.mean.selfcalibrated.nc?entryid=synth%3Ae570c8f9-ec09-4e89-93b4-babd5651e7a9%3AL2RhaV9wZHNpL3Bkc2kubW9uLm1lYW4uc2VsZmNhbGlicmF0ZWQubmM%3D"
f = Path.cwd().joinpath(Path(urllib.parse.urlparse(url).path).name)
# fmt: on
if not f.exists():
    r = requests.get(url, stream=True, headers={"User-Agent": "XY"})
    with open(f, "wb") as fd:
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)
ds = xr.open_dataset(f)
pdsi = ds.to_dataframe()
pdsi = pdsi.reset_index().dropna() # don't care about places in oceans...
# use subset for testing... last 5 times...
pdsim = pdsi.loc[pdsi["time"].isin(pdsi.groupby("time").size().index[-5:])]
# create geopandas dataframe
gdf = gpd.GeoDataFrame(
    pdsim, geometry=pdsim.loc[:, ["lon", "lat"]].apply(shapely.geometry.Point, axis=1)
)
# make sure that data supports using a buffer...
assert (
    gdf["lat"].diff().loc[lambda s: s.ne(0)].mode()
    == gdf["lon"].diff().loc[lambda s: s.ne(0)].mode()
).all()
# how big should the square buffer be around the point??
buffer = gdf["lat"].diff().loc[lambda s: s.ne(0)].mode().values[0] / 2
gdf["geometry"] = gdf["geometry"].buffer(buffer, cap_style=3)
# Import shapefile from geopandas
path_to_data = gpd.datasets.get_path("naturalearth_lowres")
world_shp = gpd.read_file(path_to_data)
# the solution... spatial join buffered polygons to countries
# comma separate associated countries
gdf = gdf.join(
    world_shp.sjoin(gdf.set_crs("EPSG:4326"))
    .groupby("index_right")["name"]
    .agg(",".join)
)
gdf["time_a"] = gdf["time"].dt.strftime("%Y-%b-%d")
# simplest way to test is visualise...
px.choropleth_mapbox(
    gdf,
    geojson=gdf.geometry,
    locations=gdf.index,
    color="pdsi",
    hover_data=["name"],
    animation_frame="time_a",
    opacity=0.3,
).update_layout(
    mapbox={"style": "carto-positron", "zoom": 1},
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)

- using GeoSeries.intersection (https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.intersection.html) you can get the part of a grid square that intersects with a country polygon
- using the area of that intersection, you can calculate the proportion of overlap
- from this I have generated two visualisations:
  - one showing which countries a grid square overlaps and by how much
  - one aggregating to countries using a weighted average, plus other measures that can be used for transparency

I do not know whether it is mathematically / scientifically sound to aggregate PDSI in this way (either as means or weighted averages), but this does demonstrate how to get the results your question requests. A minimal sketch of the intersection/area idea follows, then the full solution.
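Here is the core idea in isolation, a minimal sketch using two hand-made shapely boxes as hypothetical stand-ins for a grid square and a country polygon (the coordinates are invented for illustration):
import shapely.geometry

grid_square = shapely.geometry.box(0, 0, 2, 2)  # a PDSI-style grid cell
country = shapely.geometry.box(1, 1, 5, 5)      # a pretend country
part = grid_square.intersection(country)        # portion of the cell inside the country
weight = part.area / grid_square.area           # proportion of overlap, used as the weight
print(weight)  # 0.25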
# the solution... spatial join buffered polygons to countries
# plus work out overlap between PDSI grid and country. Area of each grid is constant...
gdf_c = (
    world_shp.sjoin(gdf.set_crs("EPSG:4326"))
    .merge(
        gdf.loc[:, "geometry"],
        left_on="index_right",
        right_index=True,
        suffixes=("", "_pdsi"),
    )
    .assign(
        overlap=lambda d: (
            d["geometry"]
            .intersection(gpd.GeoSeries(d["geometry_pdsi"], crs="EPSG:4326"))
            .area
            / (buffer * 2) ** 2
        ).round(3)
    )
)
# comma separate associated countries and a list of overlaps
gdf_pdsi = gdf.loc[:, ["geometry", "time", "pdsi"]].join(
    gdf_c.groupby("index_right").agg({"name": ",".join, "overlap": list})
)
gdf_pdsi["time_a"] = gdf_pdsi["time"].dt.strftime("%Y-%b-%d")
# simplest way to test is visualise...
fig_buf = px.choropleth_mapbox(
    gdf_pdsi,
    geojson=gdf_pdsi.geometry,
    locations=gdf_pdsi.index,
    color="pdsi",
    hover_data=["name", "overlap"],
    animation_frame="time_a",
    opacity=0.3,
).update_layout(
    mapbox={"style": "carto-positron", "zoom": 1},
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig_buf
import pandas as pd
# prepare data to plot by country
df_pdsi = (
    gdf_c.groupby(["name", "time"])
    .apply(
        lambda d: pd.Series(
            {
                "weighted_pdsi": (d["pdsi"] * d["overlap"]).sum() / d["overlap"].sum(),
                "unweighted_pdsi": d["pdsi"].mean(),
                "min_pdsi": d["pdsi"].min(),
                "max_pdsi": d["pdsi"].max(),
                "min_overlap": d["overlap"].min(),
                "max_overlap": d["overlap"].max(),
                "size_pdsi": len(d["pdsi"]),
                # "pdsi_list": [round(v, 2) for v in d["pdsi"]],
            }
        )
    )
    .reset_index()
)
df_pdsi["time_a"] = df_pdsi["time"].dt.strftime("%Y-%b-%d")
fig = px.choropleth_mapbox(
    df_pdsi,
    geojson=world_shp.set_index("name").loc[:, "geometry"],
    locations="name",
    color="weighted_pdsi",
    hover_data=df_pdsi.columns,
    animation_frame="time_a",
    opacity=0.3,
).update_layout(
    mapbox={"style": "carto-positron", "zoom": 1},
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)
fig

Related

Draw polygons around a set of points and create clusters in python

I have a Pandas DataFrame containing lat/lon coordinates. How do I draw non-overlapping polygons around clusters of points and aggregate the geometries into a geopandas DataFrame? Below is sample code to work with:
import pandas as pd
import numpy as np
import geopandas as gpd
df = pd.DataFrame({
    'yr': [2018, 2017, 2018, 2016],
    'id': [0, 1, 2, 3],
    'v': [10, 12, 8, 10],
    'lat': [32.7418248, 32.8340583, 32.8340583, 32.7471895],
    'lon': [-97.524066, -97.0805484, -97.0805484, -96.9400779]
})
# note: columns are named 'lat' and 'lon' above
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['lon'], df['lat']))
# set crs for buffer calculations
df.set_crs("ESRI:102003", inplace=True)
The polygons can be of any shape, but each must include a minimum of 5 points. I tried creating a buffer around the points, but a circle is not the ideal solution; I am looking for a way to draw a more flexible polygon. This polygon representation will be added as a new column to the pandas DataFrame containing the points.
https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.buffer.html
Your question and sample data are inconsistent: you say you want clusters of 5 points or more, but only provide 4 points, which leaves whoever answers mandated to find some data. Better practice is to provide an MWE of what you've tried, which can then evolve into the solution you want. I have used UK hospitals to get some data with lat / lon.
From your other scatter-gun questions, it's clear you have tried using geohash as a solution, so let's explore this (a minimal geohash illustration follows, then the full example):
- get a geohash for each point with geolib.geohash.encode()
- aggregate points in the same geohash using dissolve(); this gives a MULTIPOINT geometry, which is converted to a POLYGON using convex_hull
- we now have polygons that do not overlap and that cluster the points; note this does not ensure that a cluster has a minimum of 5 points
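As a quick aside, this sketch shows the property the clustering relies on: a geohash is a prefix code, so truncating to a fixed precision buckets nearby points together. The coordinates are invented, and the argument order assumes geolib's documented encode(lat, lon, precision) signature:
import geolib.geohash

h5 = geolib.geohash.encode(51.5007, -0.1246, 5)  # a point in London, precision 5
h3 = geolib.geohash.encode(51.5007, -0.1246, 3)  # same point, precision 3
print(h5, h3, h5.startswith(h3))  # the coarser hash is a prefix of the finer one -> True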
import requests, io
import pandas as pd
import numpy as np
import geopandas as gpd
import geolib.geohash
import folium

# get some data that matches the sample, with enough points
df = (
    pd.read_csv(
        io.StringIO(requests.get("https://assets.nhs.uk/data/foi/Hospital.csv").text),
        sep="¬",
        engine="python",
    )
    .rename(columns={"Latitude": "lat", "Longitude": "lon"})
    .loc[:, ["lat", "lon"]]
).dropna()
df["id"] = df.index
df["yr"] = np.random.choice(range(2016, 2019), len(df))
df["v"] = np.random.randint(0, 11, len(df))
# get geohash so points in the same area can be clustered
# (geolib's encode takes lat, lon, precision)
df["geohash"] = df.apply(lambda r: geolib.geohash.encode(r["lat"], r["lon"], 3), axis=1)
# construct geodataframe
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326"
)
# cluster points to polygons
gdf2 = gdf.dissolve(by="geohash", aggfunc={"v": "sum", "id": "count", "yr": "mean"})
gdf2["geometry"] = gdf2["geometry"].convex_hull
# let's visualise everything
m = gdf2.explore(color="green", name="cluster", height=300, width=600)
m = gdf.explore(column="geohash", m=m, name="points")
folium.LayerControl().add_to(m)
m
Use GeoPandas convex_hull.
The convex hull of a geometry is the smallest convex polygon containing all the points in each geometry.
https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoSeries.convex_hull.html
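A minimal, self-contained sketch of that behaviour (the coordinates are invented):
import geopandas as gpd
import shapely.geometry

pts = gpd.GeoSeries(
    [shapely.geometry.MultiPoint([(0, 0), (2, 0), (2, 2), (0, 2), (1, 1)])]
)
# the hull is the polygon over the four corner points; the interior point (1, 1) is dropped
print(pts.convex_hull[0])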

Define a circle that circumscribes a set of points (shapefile) in python

I have a shapefile of points, defined by X and Y coordinates, and an ID feature.
I have at least 3 different points with the same ID number.
I would like to define, for each ID, the shapefile of a circle that circumscribes the points.
How can this be done in a Python environment?
There is a library that does it: https://pypi.org/project/miniball/
It's pretty straightforward to integrate into the standard pandas groupby pattern: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
The solution really reduces to this:
def circle(points):
    p, r = miniball.get_bounding_ball(np.array([points.x, points.y]).T)
    return shapely.geometry.Point(p).buffer(math.sqrt(r))

col = "group"
# generate circles around groups of points
gdf_c = cities.groupby(col, as_index=False).agg(geometry=("geometry", circle))
With the sample example and visualisation, the circles do become distorted due to EPSG:4326 projection limitations; one possible mitigation is sketched below.
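A hedged sketch of that mitigation (my own suggestion, not part of the original answer): compute the circles in a metre-based projected CRS, then reproject back to EPSG:4326 for display. It reuses circle and col from the snippet above and cities from the full example below; the choice of EPSG:3857 is an assumption, and an equal-area or local CRS may suit better:
# build the circles in a projected CRS, then convert back for plotting
cities_m = cities.to_crs("EPSG:3857")
gdf_c = gpd.GeoDataFrame(
    cities_m.groupby(col, as_index=False).agg(geometry=("geometry", circle)),
    geometry="geometry",
    crs="EPSG:3857",
).to_crs("EPSG:4326")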
full working example
import geopandas as gpd
import numpy as np
import shapely
import miniball
import math
import pandas as pd

cities = gpd.read_file(gpd.datasets.get_path("naturalearth_cities"))
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
# a semi-synthetic grouping of cities
world["size"] = world.groupby("continent")["pop_est"].apply(
    lambda d: pd.cut(d, 2, labels=list("ab"), duplicates="drop").astype(str)
)
cities = cities.sjoin(world.loc[:, ["continent", "iso_a3", "size", "geometry"]])
cities["group"] = cities["continent"] + cities["size"]

def circle(points):
    p, r = miniball.get_bounding_ball(np.array([points.x, points.y]).T)
    return shapely.geometry.Point(p).buffer(math.sqrt(r))

col = "group"
# generate circles around groups of points
# (wrapped in a GeoDataFrame so that .explore() works on the result)
gdf_c = gpd.GeoDataFrame(
    cities.groupby(col, as_index=False).agg(geometry=("geometry", circle)),
    geometry="geometry",
    crs="EPSG:4326",
)
# visualise it
m = cities.explore(column=col, height=300, width=600, legend=False)
gdf_c.loc[~gdf_c["geometry"].is_empty].explore(
    m=m, column=col, marker_kwds={"radius": 20}, legend=False
)
output

Plotly scatter large volume geographic data

I tried to write a program that creates a visualization of all forest fires that happened during the year 2021. The CSV file containing the data is around 1.5 GB. The program looks correct to me, but when I try to run it, it gets stuck without displaying any visualization or error message. The last time I tried, it ran for almost half a day until Python crashed.
I don't know if I have an infinite loop, if the file is too big, or if there is something else I am missing.
Can anyone provide feedback, please?
Here is my code:
import csv
from datetime import datetime
from plotly.graph_objs import Scattergeo, Layout
from plotly import offline

filename = 'fire_nrt_J1V-C2_252284.csv'
with open(filename) as f:
    reader = csv.reader(f)
    header_row = next(reader)
    lats, lons, brights, dates = [], [], [], []
    for row in reader:
        date = datetime.strptime(row[5], '%Y-%m-%d')
        lat = row[0]
        lon = row[1]
        bright = row[2]
        lats.append(lat)
        lons.append(lon)
        brights.append(bright)
        dates.append(date)

data = [{
    'type': 'scattergeo',
    'lon': lons,
    'lat': lats,
    'text': dates,
    'marker': {
        'size': [5 * bright for bright in brights],
        'color': brights,
        'colorscale': 'Reds',
        'colorbar': {'title': 'Fire brightness'},
    },
}]
my_layout = Layout(title="Forestfires during the year 2021")
fig = {'data': data, 'layout': my_layout}
offline.plot(fig, filename='global_fires_2021.html')
I have found the data you describe here: https://wifire-data.sdsc.edu/dataset/viirs-i-band-375-m-active-fire-data/resource/3ce73b20-f584-44f7-996b-2f319c480294
Plotly uses resources for every point plotted on a scatter, so there is a limit before you run out of resources. There are other approaches to plotting a larger number of points:
- https://plotly.com/python/mapbox-density-heatmaps/ has fewer limits, but is still limited on very large data sets
- https://plotly.com/python/datashader/ can work with very large data sets as it generates an image, but it is more challenging to work with (installing and navigating the API)
data sourcing
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
df = pd.read_csv("https://firms.modaps.eosdis.nasa.gov/data/active_fire/noaa-20-viirs-c2/csv/J1_VIIRS_C2_Global_7d.csv")
df
scatter_geo
limited to a random sample of 1000 rows
px.scatter_geo(
    df.sample(1000),
    lat="latitude",
    lon="longitude",
    color="bright_ti4",
    # size="size",
    hover_data=["acq_date"],
    color_continuous_scale="reds",
)
density mapbox
px.density_mapbox(
    df.sample(5000),
    lat="latitude",
    lon="longitude",
    z="bright_ti4",
    radius=3,
    color_continuous_scale="reds",
    zoom=1,
    mapbox_style="carto-positron",
)
datashader mapbox
- uses all of the data
- some of these libraries are more difficult to install and use
- you need to deal with this issue: https://community.plotly.com/t/datashader-image-distorted-when-passed-to-mapbox/39375/2
import datashader as ds, colorcet
from pyproj import Transformer

t3857_to_4326 = Transformer.from_crs(3857, 4326, always_xy=True)
# project CRS to ensure the image overlays appropriately back over mapbox
# https://community.plotly.com/t/datashader-image-distorted-when-passed-to-mapbox/39375/2
df.loc[:, "longitude_3857"], df.loc[:, "latitude_3857"] = ds.utils.lnglat_to_meters(
    df.longitude, df.latitude
)
RESOLUTION = 1000
cvs = ds.Canvas(plot_width=RESOLUTION, plot_height=RESOLUTION)
agg = cvs.points(df, x="longitude_3857", y="latitude_3857")
img = ds.tf.shade(agg, cmap=colorcet.fire).to_pil()
fig = go.Figure(go.Scattermapbox())
fig.update_layout(
    mapbox={
        "style": "carto-positron",
        "layers": [
            {
                "sourcetype": "image",
                "source": img,
                # the coordinates array contains [longitude, latitude] pairs for the
                # image corners, listed in clockwise order: top left, top right,
                # bottom right, bottom left
                "coordinates": [
                    t3857_to_4326.transform(
                        agg.coords["longitude_3857"].values[a],
                        agg.coords["latitude_3857"].values[b],
                    )
                    for a, b in [(0, -1), (-1, -1), (-1, 0), (0, 0)]
                ],
            }
        ],
    },
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)

How to set cell alpha values for a heatmap in Python?

I would like to show a heatmap in any Python library, where the color of each cell is determined by the mean value of the data points in that cell. Additionally, I'd like to visualize the uncertainty of these means by using the number of data points in each cell as the alpha value, so that cells with few data points still have a color, but faded.
What is the easiest way to do this in Python, when the full color is determined by two values?
import pandas as pd
import numpy as np
import seaborn as sns
from operator import attrgetter
N = 10000
D = pd.DataFrame(
{
"x": np.random.uniform(size=N),
"y": np.random.uniform(size=N),
"c": np.random.choice([0, 1], size=N),
}
)
x_group = pd.cut(D["x"], 10).apply(attrgetter("left"))
y_group = pd.cut(D["y"], 10).apply(attrgetter("left"))
means = D.pivot_table("c", x_group, y_group, aggfunc="mean")
sizes = D.pivot_table("c", x_group, y_group, aggfunc="size")
sns.heatmap(means) # here sizes should be used for the alpha channel - maybe in a postprocessing?
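One possible way to do that postprocessing (a sketch of my own, assuming matplotlib is acceptable; it builds the RGBA image by hand rather than going through seaborn):
import matplotlib as mpl
import matplotlib.pyplot as plt

# colour each cell by its mean, then overwrite the alpha channel with the
# relative cell count so that sparsely-populated cells fade out
norm = mpl.colors.Normalize(vmin=means.min().min(), vmax=means.max().max())
rgba = mpl.cm.viridis(norm(means.to_numpy()))
rgba[..., 3] = (sizes / sizes.max().max()).to_numpy()
fig, ax = plt.subplots()
ax.imshow(rgba, origin="lower")
fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap="viridis"), ax=ax)
plt.show()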

How to update histogram based on selection of points (Altair)?

My goal is to update the histogram shown on the right side of the chart based on the selection of points on the left side.
Initially the plot seems to be alright; however, once a selection is made the histogram won't be redrawn (altair 3.2.0).
Does anybody know how to do this?
below is the code to recreate the example:
import altair as alt
import pandas as pd
import numpy as np
from random import choice

dates = pd.date_range("1.1.2019", "2.28.2019")
np.random.seed(999)
Y = np.random.normal(0.5, 0.1, len(dates))
features = [choice(["a", "b", "c"]) for i in range(len(dates))]
df = pd.DataFrame({"dates": dates, "Y": Y, "features": features})
base = alt.Chart(df)
area_args = {"opacity": 0.3, "interpolate": "step"}
pts = alt.selection(type="interval", encodings=["x"])
points = (
    base.mark_circle()
    .encode(alt.X("dates:T"), alt.Y("Y:Q"), color="features:N")
    .add_selection(pts)
)
yscale = alt.Scale(domain=(0, 1))
right_hist = (
    base.mark_area(**area_args)
    .encode(
        alt.Y(
            "Y:Q", bin=alt.Bin(maxbins=20, extent=yscale.domain), stack=None, title=""
        ),
        alt.X("count()", stack=None, title=""),
        alt.Color("features:N"),
    )
    .transform_filter(pts)
)
(points | right_hist)
Edit 1: another image to clarify my point, @jvp.
Solved in the comments: it was an issue with the OP's setup and how the plots were rendered on their end.
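As an aside (not part of the original thread): alt.selection(type=...) and add_selection were deprecated in later Altair releases, so with Altair 5 the same selection would be written roughly as:
pts = alt.selection_interval(encodings=["x"])
points = (
    base.mark_circle()
    .encode(alt.X("dates:T"), alt.Y("Y:Q"), color="features:N")
    .add_params(pts)
)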
