Define a circe that circumscribes a set of points (shapefile) in python - python

I have a shapefile of points, defined by X and Y coordinates, ad the ID feature.
I have at least 3 different points with the same ID number.
I would like to define, for each ID, the shapefile of a circle that circumscribes the points.
How can this be done in python environment?

there is a library that does it: https://pypi.org/project/miniball/
it's pretty forward to integrate in standard pandas pattern https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
solution really reduces to this:
def circle(points):
p, r = miniball.get_bounding_ball(np.array([points.x, points.y]).T)
return shapely.geometry.Point(p).buffer(math.sqrt(r))
col = "group"
# generate circles around groups of points
gdf_c = cities.groupby(col, as_index=False).agg(geometry=("geometry", circle))
with sample example and visualisation, circles do become distorted due to epsg:4326 projection limitations
full working example
import geopandas as gpd
import numpy as np
import shapely
import miniball
import math
import pandas as pd
cities = gpd.read_file(gpd.datasets.get_path("naturalearth_cities"))
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
# a semi-synthetic grouping of cities
world["size"] = world.groupby("continent")["pop_est"].apply(
lambda d: pd.cut(d, 2, labels=list("ab"), duplicates="drop").astype(str)
)
cities = cities.sjoin(world.loc[:, ["continent", "iso_a3", "size", "geometry"]])
cities["group"] = cities["continent"] + cities["size"]
def circle(points):
p, r = miniball.get_bounding_ball(np.array([points.x, points.y]).T)
return shapely.geometry.Point(p).buffer(math.sqrt(r))
col = "group"
# generate circles around groups of points
gdf_c = cities.groupby(col, as_index=False).agg(geometry=("geometry", circle))
# visualize it
m = cities.explore(column=col, height=300, width=600, legend=False)
gdf_c.loc[~gdf_c["geometry"].is_empty].explore(
m=m, column=col, marker_kwds={"radius": 20}, legend=False
)
output

Related

Extract raster value using multipolygon type of shape using python

I am trying to export raster values using multipolygon shapefile in python. I have found the answer here, but the calculation there is not valid for multipolygon. Could please someone guide me, how i should correct the code in order to have not polygon but multipolygon datatype in calculation.
My code is below:
import rasterio
from rasterio.mask import mask
import geopandas as gpd
import numpy as np
from rasterio import Affine
from shapely.geometry import mapping
shapefile = gpd.read_file(r'/Users..../polygon_sector.shp')
geoms = shapefile.geometry.values
geometry = geoms[0] # shapely geometry
# transform to GeJSON format
geoms = [mapping(geoms[0])]
# extract the raster values within the polygon
with rasterio.open("/Users/.../map_reclass.tif") as src:
out_image, out_transform = mask(src, geoms, crop=True)
# no data values of the original raster
no_data=src.nodata
print(no_data)
# extract the values of the masked array
data = out_image[0,:,:]
# extract the row, columns of the valid values
row, col = np.where(data != no_data)
rou = np.extract(data != no_data, data)
# affine import Affine
T1 = out_transform * Affine.translation(0.5, 0.5) # reference the pixel centre
rc2xy = lambda r, c: (c, r) * T1
d = gpd.GeoDataFrame({'col':col,'row':row,'ROU':rou})
# coordinate transformation
d['x'] = d.apply(lambda row: rc2xy(row.row,row.col)[0], axis=1)
d['y'] = d.apply(lambda row: rc2xy(row.row,row.col)[1], axis=1)
# geometry
from shapely.geometry import Point
d['geometry'] =d.apply(lambda row: Point(row['x'], row['y']), axis=1)
# save to a shapefile
d.to_file(r'/Users/y.../result_full.shp', driver='ESRI Shapefile')
I have tried to assign the other geometry (multipolygon) but i did it wrong, since when i print the geometry it was still POLYGON, not MULTIPOLYGON. So far as i understood it should come from shapely.

Wrong points buffer using geopandas

Good evening,
I'm working on a product to detect local events (strike) within subscription areas.
The yellow polygons should be 40KM (left) and 50KM (right) circles around central red points. Green points are my strikes that should be detected in my process.
It appears that my current use of buffer() does not produce 40/50 Km buffer radius as expected and then my process in missing my two events .
My code:
# Create my two events to detect
df_strike = pd.DataFrame(
{ 'Latitude': [27.0779, 31.9974],
'Longitude': [51.5144, 38.7078]})
gdf_events = gpd.GeoDataFrame(df_strike, geometry=gpd.points_from_xy(df_strike.Longitude, df_strike.Latitude),crs = {'init':'epsg:4326'})
# Get location to create buffer
SUB_LOCATION = pd.DataFrame(
{ 'perimeter_id': [1370, 13858],
'distance' : [40.0, 50.0],
'custom_lat': [31.6661, 26.6500],
'custom_lon': [38.6635, 51.5700]})
gdf_locations = gpd.GeoDataFrame(SUB_LOCATION, geometry=gpd.points_from_xy(SUB_LOCATION.custom_lon, SUB_LOCATION.custom_lat), crs = {'init':'epsg:4326'})
# Now reproject to a crs using meters
gdf_locations = gdf_locations.to_crs({'init':'epsg:3857'})
gdf_events = gdf_events.to_crs({'init':'epsg:3857'})
# Create buffer using distance (in meters) from locations
gdf_locations['geometry'] = gdf_locations['geometry'].buffer(gdf_locations['distance']*1000)
# Matching events within buffer
matching_entln = pd.DataFrame(gpd.sjoin(gdf_locations, gdf_events, how='inner'))
But my result is an empty dataframe and should not be. If I compute distance between events and locations (distance between red and green points):
pnt1 = Point(27.0779, 51.5144)
pnt2 = Point(26.65, 51.57)
points_df = gpd.GeoDataFrame({'geometry': [pnt1, pnt2]}, crs='EPSG:4326')
points_df = points_df.to_crs('EPSG:3857')
points_df2 = points_df.shift() #We shift the dataframe by 1 to align pnt1 with pnt2
points_df.distance(points_df2)
Returns: 48662.078723 meters
and
pnt1 = Point(31.9974, 38.7078)
pnt2 = Point(31.6661, 38.6635)
points_df = gpd.GeoDataFrame({'geometry': [pnt1, pnt2]}, crs='EPSG:4326')
points_df = points_df.to_crs('EPSG:3857')
points_df2 = points_df.shift() #We shift the dataframe by 1 to align pnt1 with pnt2
points_df.distance(points_df2)
Returns: 37417.343796 meters
Then I was expecting to have this result :
>>> pd.DataFrame(gpd.sjoin(gdf_locations, gdf_events, how='inner'))
subscriber_id perimeter_id distance custom_lat custom_lon geometry index_right Latitude Longitude
0 19664 1370 40.0 31.6661 38.6635 POLYGON ((2230301.324 3642618.584, 2230089.452... 1 31.9974 38.7078
1 91201 13858 50.0 26.6500 51.5700 POLYGON ((3684499.890 3347425.378, 3684235.050... 0 27.0779 51.5144
I think my buffer is at ~47KM and ~38KM instead of 50KM and 40KM as expected. Am I missing something here which could explain that empty result ?
Certain computations with geodataframe's methods that involves distances, namely, .distance(), .buffer() in this particular case, are based on Euclidean geometry and map projection coordinate systems. Their results are not reliable, to always get the correct results one should avoid using them and use direct computation with geographic coordinates instead. Doing so with proper module/library, you will get great-circle arc distances instead of errorneous euclidean distances. Thus avoid mysterious errors.
Here I present the runnable code that show how to proceed along the line that I proposed:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import cartopy.crs as ccrs
import cartopy
import matplotlib.pyplot as plt
import numpy as np
from pyproj import Geod
# Create my two events to detect
df_strike = pd.DataFrame(
{ 'Latitude': [27.0779, 31.9974],
'Longitude': [51.5144, 38.7078]})
gdf_events = gpd.GeoDataFrame(df_strike, geometry=gpd.points_from_xy(df_strike.Longitude, df_strike.Latitude),crs = {'init':'epsg:4326'})
# Get location to create buffer
SUB_LOCATION = pd.DataFrame(
{ 'perimeter_id': [1370, 13858],
'distance' : [40.0, 50.0],
'custom_lat': [31.6661, 26.6500],
'custom_lon': [38.6635, 51.5700]})
gdf_locations = gpd.GeoDataFrame(SUB_LOCATION, geometry=gpd.points_from_xy(SUB_LOCATION.custom_lon, SUB_LOCATION.custom_lat), crs = {'init':'epsg:4326'})
# Begin: My code----------------
def point_buffer(lon, lat, radius_m):
# Use this instead of `.buffer()` provided by geodataframe
# Adapted from:
# https://stackoverflow.com/questions/31492220/how-to-plot-a-tissot-with-cartopy-and-matplotlib
geod = Geod(ellps='WGS84')
num_vtxs = 64
lons, lats, _ = geod.fwd(np.repeat(lon, num_vtxs),
np.repeat(lat, num_vtxs),
np.linspace(360, 0, num_vtxs),
np.repeat(radius_m, num_vtxs),
radians=False
)
return Polygon(zip(lons, lats))
# Get location to create buffer
# Create buffer geometries from points' coordinates and distances using ...
# special function `point_buffer()` defined above
gdf_locations['geometry'] = gdf_locations.apply(lambda row : point_buffer(row.custom_lon, row.custom_lat, 1000*row.distance), axis=1)
# Convert CRS to Mercator (epsg:3395), it will match `ccrs.Mercator()`
# Do not use Web_Mercator (epsg:3857), it is crude approx of 3395
gdf_locations = gdf_locations.to_crs({'init':'epsg:3395'})
gdf_events = gdf_events.to_crs({'init':'epsg:3395'})
# Matching events within buffer
matching_entln = pd.DataFrame(gpd.sjoin(gdf_locations, gdf_events, how='inner'))
# Visualization
# Use cartopy for best result
fig = plt.figure(figsize=(9,8))
ax = fig.add_subplot(projection=ccrs.Mercator())
gdf_locations.plot(color="green", ax=ax, alpha=0.4)
gdf_events.plot(color="red", ax=ax, alpha=0.9, zorder=23)
ax.coastlines(lw=0.3, color="gray")
ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.gridlines(crs=ccrs.PlateCarree(), draw_labels=True)
# Other helpers
# Horiz/vert lines are plotted to mark the circles' centers
ax.hlines([31.6661,26.6500], 30, 60, transform=ccrs.PlateCarree(), lw=0.1)
ax.vlines([38.6635, 51.5700], 20, 35, transform=ccrs.PlateCarree(), lw=0.1)
ax.set_extent([35, 55, 25, 33], crs=ccrs.PlateCarree())
Spatial joining:
# Matching events within buffer
matching_entln = pd.DataFrame(gpd.sjoin(gdf_locations, gdf_events, how='inner'))
matching_entln[["perimeter_id", "distance", "index_right", "Latitude", "Longitude"]] #custom_lat custom_lon
Compute distances between points for checking
This checks the result of the spatial join if computed distances are less than the buffered distances.
# Use greatcircle arc length
geod = Geod(ellps='WGS84')
# centers of buffered-circles
from_lon1, from_lon2 = [38.6635, 51.5700]
from_lat1, from_lat2 = [31.6661, 26.6500]
# event locations
to_lon1, to_lon2= [51.5144, 38.7078]
to_lat1, to_lat2 = [27.0779, 31.9974]
_,_, dist_m = geod.inv(from_lon1, from_lat1, to_lon2, to_lat2, radians=False)
print(dist_m) #smaller than 40 km == inside
# Get: 36974.419811328786 m.
_,_, dist_m = geod.inv(from_lon2, from_lat2, to_lon1, to_lat1, radians=False)
print(dist_m) #smaller than 50 km == inside
# Get: 47732.76744655724 m.
My notes
Serious geographic computation should be done directly with geodetic computation without the use of map projection of any kind.
Map projection is used when you need graphic visualization. But correct geographic values that are computed/transformed to map projection CRS correctly are expected.
Computation with map projection (grid) coordinate beyond its allowable limits (and get bad results) is often happen with inexperienced users.
Computation involving map/grid position/values using euclidean geometry should be performed within small extent of projection areas that all kinds of map distortions is very low.

Check if the point within a polygon (speed up)

I have 2 data frames. 1) data - long and lat points 2) border = shapefile of a city
I need to check which points are within the shapefile and save them. Here is my code to do that:
Data
city = pd.read_csv("D:...path.../data.csv")
crs = {'init':'epsg:4326'}
geometry = [Point(xy) for xy in zip(city.longitude,city.latitude)]
city_point = gpd.GeoDataFrame(city,crs=crs,geometry=geometry)
Border
border = gpd.read_file("C:...path.../border.shp")
border_gdf = gpd.GeoDataFrame(border, geometry='geometry')
Final check
city_point['inside'] = city_point['geometry'].apply(border_gdf.contains)
city_point = city_point[city_point.inside != True]
Libraries
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
city_point[city_point.geometry.within(border_gdf.iloc[0].geometry)]

Creating a shape file from a bounding box coordinates list

There is already few existing questions about this topic, but I unfortunately did not find something that could fix my problem.
I have a point Lat, Long coordinate i.e. Lat= 10 and Long = 10. I want to create a shape file of a 0.5 degree bounding box around this point, so the bounding box should be as follow:
minimum Long= 9.75
minimum Lat = 9.75
maximum Long = 10.25
maximum Lat = 10.25
Does anyone knows how to do that in Python?
Here's one way to do it using shapely, geopandas and pandas:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
def bbox(lat,lng, margin):
return Polygon([[lng-margin, lat-margin],[lng-margin, lat+margin],
[lng+margin,lat+margin],[lng+margin,lat-margin]])
gpd.GeoDataFrame(pd.DataFrame(['p1'], columns = ['geom']),
crs = {'init':'epsg:4326'},
geometry = [bbox(10,10, 0.25)]).to_file('poly.shp')
I want to enchance Bruno Carballo's code. I hope it will easier for you
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
# function to return polygon
def bbox(long0, lat0, lat1, long1):
return Polygon([[long0, lat0],
[long1,lat0],
[long1,lat1],
[long0, lat1]])
test = bbox(9.75, 9.75, 10.25, 10.25)
gpd.GeoDataFrame(pd.DataFrame(['p1'], columns = ['geom']),
crs = {'init':'epsg:4326'},
geometry = [test]).to_file('poly.shp')
And here is an implementation of Bruno Carballo's answer that applies it to en entire DataFrame:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon
# function to return polygon
def bbox(vec):
long0, lat0, lat1, long1 = vec[0], vec[1], vec[2], vec[3]
return Polygon([[long0, lat0],
[long0,lat1],
[long1,lat1],
[long1, lat0]])
def extentPolygon(df):
return(
pd.DataFrame({'geometry' : df[['ext_min_x','ext_min_y','ext_max_y','ext_max_x']].apply(bbox, axis = 1)})
)
df = pd.DataFrame({'ext_min_x' : [9.75, 9.78], 'ext_max_x' : [10.25, 10.28],
'ext_min_y' : [9.75, 9.78], 'ext_max_y' : [10.25, 10.28]})
df = extentPolygon(df)
After which you can easily turn the resulting DataFrame into a GeoDataFrame:
df_gp = gdp.GeoDataFrame(df)

get bins coordinates with hexbin in matplotlib

I use matplotlib's method hexbin to compute 2d histograms on my data.
But I would like to get the coordinates of the centers of the hexagons in order to further process the results.
I got the values using get_array() method on the result, but I cannot figure out how to get the bins coordinates.
I tried to compute them given number of bins and the extent of my data but i don't know the exact number of bins in each direction. gridsize=(10,2) should do the trick but it does not seem to work.
Any idea?
I think this works.
from __future__ import division
import numpy as np
import math
import matplotlib.pyplot as plt
def generate_data(n):
"""Make random, correlated x & y arrays"""
points = np.random.multivariate_normal(mean=(0,0),
cov=[[0.4,9],[9,10]],size=int(n))
return points
if __name__ =='__main__':
color_map = plt.cm.Spectral_r
n = 1e4
points = generate_data(n)
xbnds = np.array([-20.0,20.0])
ybnds = np.array([-20.0,20.0])
extent = [xbnds[0],xbnds[1],ybnds[0],ybnds[1]]
fig=plt.figure(figsize=(10,9))
ax = fig.add_subplot(111)
x, y = points.T
# Set gridsize just to make them visually large
image = plt.hexbin(x,y,cmap=color_map,gridsize=20,extent=extent,mincnt=1,bins='log')
# Note that mincnt=1 adds 1 to each count
counts = image.get_array()
ncnts = np.count_nonzero(np.power(10,counts))
verts = image.get_offsets()
for offc in xrange(verts.shape[0]):
binx,biny = verts[offc][0],verts[offc][1]
if counts[offc]:
plt.plot(binx,biny,'k.',zorder=100)
ax.set_xlim(xbnds)
ax.set_ylim(ybnds)
plt.grid(True)
cb = plt.colorbar(image,spacing='uniform',extend='max')
plt.show()
I would love to confirm that the code by Hooked using get_offsets() works, but I tried several iterations of the code mentioned above to retrieve center positions and, as Dave mentioned, get_offsets() remains empty. The workaround that I found is to use the non-empty 'image.get_paths()' option. My code takes the mean to find centers but which means it is just a smidge longer, but it does work.
The get_paths() option returns a set of x,y coordinates embedded that can be looped over and then averaged to return the center position for each hexagram.
The code that I have is as follows:
counts=image.get_array() #counts in each hexagon, works great
verts=image.get_offsets() #empty, don't use this
b=image.get_paths() #this does work, gives Path([[]][]) which can be plotted
for x in xrange(len(b)):
xav=np.mean(b[x].vertices[0:6,0]) #center in x (RA)
yav=np.mean(b[x].vertices[0:6,1]) #center in y (DEC)
plt.plot(xav,yav,'k.',zorder=100)
I had this same problem. I think what needs to be developed is a framework to have a HexagonalGrid object which can then be applied to many different data sets (and it would be awesome to do it for N dimensions). This is possible and it surprises me that neither Scipy or Numpy has anything for it (furthermore there seems to be nothing else like it except perhaps binify)
That said, I assume you want to use hexbinning to compare multiple binned data sets. This requires some common base. I got this to work using matplotlib's hexbin the following way:
import numpy as np
import matplotlib.pyplot as plt
def get_data (mean,cov,n=1e3):
"""
Quick fake data builder
"""
np.random.seed(101)
points = np.random.multivariate_normal(mean=mean,cov=cov,size=int(n))
x, y = points.T
return x,y
def get_centers (hexbin_output):
"""
about 40% faster than previous post only cause you're not calculating the
min/max every time
"""
paths = hexbin_output.get_paths()
v = paths[0].vertices[:-1] # adds a value [0,0] to the end
vx,vy = v.T
idx = [3,0,5,2] # index for [xmin,xmax,ymin,ymax]
xmin,xmax,ymin,ymax = vx[idx[0]],vx[idx[1]],vy[idx[2]],vy[idx[3]]
half_width_x = abs(xmax-xmin)/2.0
half_width_y = abs(ymax-ymin)/2.0
centers = []
for i in xrange(len(paths)):
cx = paths[i].vertices[idx[0],0]+half_width_x
cy = paths[i].vertices[idx[2],1]+half_width_y
centers.append((cx,cy))
return np.asarray(centers)
# important parts ==>
class Hexagonal2DGrid (object):
"""
Used to fix the gridsize, extent, and bins
"""
def __init__ (self,gridsize,extent,bins=None):
self.gridsize = gridsize
self.extent = extent
self.bins = bins
def hexbin (x,y,hexgrid):
"""
To hexagonally bin the data in 2 dimensions
"""
fig = plt.figure()
ax = fig.add_subplot(111)
# Note mincnt=0 so that it will return a value for every point in the
# hexgrid, not just those with count>mincnt
# Basically you fix the gridsize, extent, and bins to keep them the same
# then the resulting count array is the same
hexbin = plt.hexbin(x,y, mincnt=0,
gridsize=hexgrid.gridsize,
extent=hexgrid.extent,
bins=hexgrid.bins)
# you could close the figure if you don't want it
# plt.close(fig.number)
counts = hexbin.get_array().copy()
return counts, hexbin
# Example ===>
if __name__ == "__main__":
hexgrid = Hexagonal2DGrid((21,5),[-70,70,-20,20])
x_data,y_data = get_data((0,0),[[-40,95],[90,10]])
x_model,y_model = get_data((0,10),[[100,30],[3,30]])
counts_data, hexbin_data = hexbin(x_data,y_data,hexgrid)
counts_model, hexbin_model = hexbin(x_model,y_model,hexgrid)
# if you want the centers, they will be the same for both
centers = get_centers(hexbin_data)
# if you want to ignore the cells with zeros then use the following mask.
# But if want zeros for some bins and not others I'm not sure an elegant way
# to do this without using the centers
nonzero = counts_data != 0
# now you can compare the two data sets
variance_data = counts_data[nonzero]
square_diffs = (counts_data[nonzero]-counts_model[nonzero])**2
chi2 = np.sum(square_diffs/variance_data)
print(" chi2={}".format(chi2))

Categories

Resources