Checking area of overlap between two dataframes with parallel dask GeoPandas - python

I have two different GeoDataFrames: One of which contain polygon squares in a large grid. The other contains larger, and fewer, polygons.
I wish to calculate the area of overlap within each of the grid squares with the other, larger squares.
To do so, I made a simple loop method
for _, patch in tqdm(layer.iterrows(), total=layer.shape[0], desc=name):
# Index of intersecting squares
idx = joined.intersects(patch.geometry)
intersection_polygon = joined[idx].intersection(patch.geometry)
area_of_intersection = intersection_polygon.area
joined.loc[idx, "value"] += area_of_intersection
In an attempt to speed up this method, I converted the layer DataFrame, which contains the larger patches to a Dask-DataFrame.
I implemented it the following way:
def multi_area(patch, joined=None):
# Index of intersecting squares
idx = joined.intersects(patch.geometry)
intersection_polygon = joined[idx].intersection(patch.geometry)
area_of_intersection = intersection_polygon.area
joined.loc[idx, "value"] += area_of_intersection
return joined["value"]
layer_dask = dask_geopandas.from_geopandas(layer, npartitions=8)
with ProgressBar():
joined["value"] = layer_dask.apply(multi_area, meta=joined, joined=joined, axis=1).compute(scheduler='multiprocessing')
This, however, returns the error AttributeError: 'GeoDataFrame' object has no attribute 'name', and at this point I am unsure if this is the optimal way of doing it, and what I am doing wrong.
The job I will be doing will have 400 million grid squares, so I am planning on batching this calculation out on smaller areas later, as I can't come up with a smarter way of doing it...

I managed to speed up the process quite a bit using spatial joins and overlay as suggested by Michael in the comments.
In addition I implemented Dask Dataframes so the final code becomes:
import dask_geopandas as dg
import geopandas as gpd
def dissolve_shuffle(ddf, by=None, **kwargs):
"""Shuffle and map partition"""
meta = ddf._meta.dissolve(by=by, as_index=False, **kwargs)
shuffled = ddf.shuffle(
by, npartitions=ddf.npartitions, shuffle="tasks", ignore_index=True
)
return shuffled.map_partitions(
gpd.GeoDataFrame.dissolve, by=by, as_index=False, meta=meta, **kwargs
)
def calculate_area_overlap_dask(
df_grid,
layer,
nthreads=8,
) -> gpd.GeoDataFrame:
"""This function calculates the area of overlap in each grid cell for a given map-layer
"""
layer = layer[["geometry"]]
df_grid = df_grid[["geometry"]]
# Split up the layer using the grid
_overlay = gpd.overlay(layer, df_grid, how="intersection")
# Convert the overlay to a dask geopandas dataframe and calculate the area of each new polygon
_overlay = dg.from_geopandas(_overlay, npartitions=nthreads)
_overlay["area"] = _overlay.area
_overlay = _overlay.compute()
# Convert the grid to a dask geopandas dataframe and spatial join all split layer polygons to corresponding grid cells
df_grid = dg.from_geopandas(df_grid, npartitions=nthreads)
joined = dg.sjoin(df_grid, _overlay, how="inner").reset_index()
# Faster dissolve of area within each grid cell
scored_grid = dissolve_shuffle(
joined,
"index",
)
scored_grid = scored_grid.compute()
return scored_grid
def polygon_to_grid(name: str, gdf) -> gpd.GeoDataFrame:
"""This function converts a geodataframe to a grid of polygons
"""
gdf["value"] = range(len(gdf.index))
# Rasteriser polygonet
out_grid: xr.Dataset = make_geocube(
vector_data=gdf,
measurements=["value"],
resolution=(-100, 100),
fill=np.nan,
)
vals: xr.DataArray = out_grid.value.values
vals[~np.isnan(vals)] = np.arange(len(vals[~np.isnan(vals)]), dtype=np.int32)
vals[np.isnan(vals)] = -9999
out_grid.value.values = vals
out_grid.rio.to_raster( f"{name}_raster.tif")
# Read saved raster
src: xr.Dataset = rasterio.open(f"{name}_raster.tif")
r = src.read(1).astype(np.int32)
# Convert polygons
shapes = features.shapes(r, mask=r != -9999, transform= src.transform)
polygons: list[Polygon] = list(shapes)
geom: list[Polygon] = [shapely.geometry.shape(i[0]) for i in polygons]
# Convert to geodataframe
grid = gpd.GeoDataFrame(
geometry=gpd.GeoSeries(
geom,
),
)
return grid
if __name__=="__main__":
area = gpd.read_file("some_area.shp")
layer = gpd.read_file("some_map_layer.shp")
area_grid = polygon_to_grid("area", area)
grid_evaluated = calculate_area_overlap_dask(area_grid, layer)
This mess ended up working, but it was very prone to memory-issues with large datasets. So I opted for a solution that was less precise, but much faster.

Related

Python VTK "normalize" a point cloud

I have done a lot of searching but have yet to find an answer. I am currently working on some data of a crop field. I have PLY files for multiple fields which I have successfully read into, filtered, and visualised using Python and VTK. My main goal is to eventually segment and run analysis on individual crop plots.
However to make that task easier I first want to "Normalize" my point cloud so that all plots are essentially "on the same level". From the image I have attached you can see that the point clod slopes from one corner to its opposite. So what I want to flatten out the image so the ground points are all on the same plane/ level. And the reset of the points adjusted accordingly.
Point Cloud
I've also included my code to show how I got to this point. If anyone has any advice on how I can achieve the normalising to one plane I would be very appreciative. Sadly I cannot include my data as it is work related.
Thanks.
Josh
import vtk
from vtk.util import numpy_support
import numpy as np
filename = 'File.ply'
# Reader
r = vtk.vtkPLYReader()
r.SetFileName(filename)
# Filters
vgf = vtk.vtkVertexGlyphFilter()
vgf.SetInputConnection(r.GetOutputPort())
# Elevation
pc = r.GetOutput()
bounds = pc.GetBounds()
#print(bounds)
minz = bounds[4]
maxz = bounds[5]
#print(bounds[4], bounds[5])
evgf = vtk.vtkElevationFilter()
evgf.SetInputConnection(vgf.GetOutputPort())
evgf.SetLowPoint(0, 0, minz)
evgf.SetHighPoint(0, 0, maxz)
#pc.GetNumberOfPoints()
# Look up table
lut = vtk.vtkLookupTable()
lut.SetHueRange(0.667, 0)
lut.SetSaturationRange(1, 1)
lut.SetValueRange(1, 1)
lut.Build
# Renderer
mapper = vtk.vtkPolyDataMapper()
mapper.SetInputConnection(evgf.GetOutputPort())
mapper.SetLookupTable(lut)
actor = vtk.vtkActor()
actor.SetMapper(mapper)
renderer = vtk.vtkRenderer()
renWin = vtk.vtkRenderWindow()
renWin.AddRenderer(renderer)
iren = vtk.vtkRenderWindowInteractor()
iren.SetRenderWindow(renWin)
renderer.AddActor(actor)
renderer.SetBackground(0, 0, 0)
renWin.Render()
iren.Start()
I once solved a similar problem. Find below some code that I used back then. It uses two functions fitPlane and findTransformFromVectors that you could replace with your own implementations.
Note that there are many ways to fit a plane through a set of points. This SO post discusses compares scipy.optimize.minimize with scipy.linalg.lstsq. In another SO post, the use of PCA or RANSAC and other methods are suggested. You probably want to use methods provided by sklearn, numpy or other modules. My solution simply (and non-robustly) computes ordinary least squares regression.
import vtk
import numpy as np
# Convert vtk to numpy arrays
from vtk.util.numpy_support import vtk_to_numpy as vtk2np
# Create a random point cloud.
center = [3.0, 2.0, 1.0]
source = vtk.vtkPointSource()
source.SetCenter(center)
source.SetNumberOfPoints(50)
source.SetRadius(1.)
source.Update()
source = source.GetOutput()
# Extract the points from the point cloud.
points = vtk2np(source.GetPoints().GetData())
points = points.transpose()
# Fit a plane. nRegression contains the normal vector of the
# regression surface.
nRegression = fitPlane(points)
# Compute a transform that maps the source center to the origin and
# plane normal to the z-axis.
trafo = findTransformFromVectors(originFrom=center,
axisFrom=nRegression.transpose(),
originTo=(0,0,0),
axisTo=(0.,0.,1.))
# Apply transform to source.
sourceTransformed = vtk.vtkTransformFilter()
sourceTransformed.SetInputData(source)
sourceTransformed.SetTransform(trafo)
sourceTransformed.Update()
# Visualize output...
Here my implementations of fitPlane and findTransformFromVectors:
# The following code has been written by normanius under the CC BY-SA 4.0
# license.
# License: https://creativecommons.org/licenses/by-sa/4.0/
# Author: normanius: https://stackoverflow.com/users/3388962/normanius
# Date: October 2018
# Reference: https://stackoverflow.com/questions/52716438
def fitPlane(X, tolerance=1e-10):
'''
Estimate the plane normal by means of ordinary least dsquares.
Requirement: points X span the full column rank. If the points lie in a
perfect plane, the regression problem is ill-conditioned!
Formulas:
a = (XX^T)^(-1)*X*z
Surface normal:
n = [a[0], a[1], -1]
n = n/norm(n)
Plane intercept:
c = a[2]/norm(n)
NOTE: The condition number for the pseudo-inverse improves if the
formulation is changed to homogenous notation.
Formulas (homogenous):
a = (XX^T)^(-1)*[1,1,1]^T
n = a[:-1]
n = n/norm(n)
c = a[-1]/norm(n)
Arguments:
X: A matrix with n rows and 3 columns
tolerance: Minimal condition number accepted. If the condition
number is lower, the algorithm returns None.
Returns:
If the computation was successful, a numpy array of length three is
returned that represents the estimated plane normal. On failure,
None is returned.
'''
X = np.asarray(X)
d,N = X.shape
X = np.vstack([X,np.ones([1,N])])
z = np.ones([d+1,1])
XXT = np.dot(X, np.transpose(X)) # XXT=X*X^T
if np.linalg.det(XXT) < 1e-10:
# The test covers the case where n<3
return None
n = np.dot(np.linalg.inv(XXT), z)
intercept = n[-1]
n = n[:-1]
scale = np.linalg.norm(n)
n /= scale
intercept /= scale
return n
def findTransformFromVectors(originFrom=None, axisFrom=None,
originTo=None, axisTo=None,
origin=None,
scale=1):
'''
Compute a transformation that maps originFrom and axisFrom to originTo
and axisTo respectively. If scale is set to 'auto', the scale will be
determined such that the axes will also match in length:
scale = norm(axisTo)/norm(axisFrom)
Arguments: originFrom: sequences with 3 elements, or None
axisFrom: sequences with 3 elements, or None
originTo: sequences with 3 elements, or None
axisTo: sequences with 3 elements, or None
origin: sequences with 3 elements, or None,
overrides originFrom and originTo if set
scale: - scalar (isotropic scaling)
- sequence with 3 elements (anisotropic scaling),
- 'auto' (sets scale such that input axes match
in length after transforming axisFrom)
- None (no scaling)
Align two axes alone, assuming that we sit on (0,0,0)
findTransformFromVectors(axisFrom=a0, axisTo=a1)
Align two axes in one point (all calls are equivalent):
findTransformFromVectors(origin=o, axisFrom=a0, axisTo=a1)
findTransformFromVectors(originFrom=o, axisFrom=a0, axisTo=a1)
findTransformFromVectors(axisFrom=a0, originTo=o, axisTo=a1)
Move between two points:
findTransformFromVectors(orgin=o0, originTo=o1)
Move from one position to the other and align axes:
findTransformFromVectors(orgin=o0, axisFrom=a0, originTo=o1, axisTo=a1)
'''
# Prelude with trickle-down logic.
# Infer the origins if an information is not set.
if origin is not None:
# Check for ambiguous input.
assert(originFrom is None and originTo is None)
originFrom = origin
originTo = origin
if originFrom is None:
originFrom = originTo
if originTo is None:
originTo = originFrom
if originTo is None:
# We arrive here only if no origin information was set.
originTo = [0.,0.,0.]
originFrom = [0.,0.,0.]
originFrom = np.asarray(originFrom)
originTo = np.asarray(originTo)
# Check if any rotation will be involved.
axisFrom = np.asarray(axisFrom)
axisTo = np.asarray(axisTo)
axisFromL2 = np.linalg.norm(axisFrom)
axisToL2 = np.linalg.norm(axisTo)
if axisFrom is None or axisTo is None or axisFromL2==0 or axisToL2==0:
rotate = False
else:
rotate = not np.array_equal(axisFrom, axisTo)
# Scale.
if scale is None:
scale = 1.
if scale == 'auto':
scale = axisToL2/axisFromL2 if axisFromL2!=0. else 1.
if np.isscalar(scale):
scale = scale*np.ones(3)
if rotate:
rAxis = np.cross(axisFrom.ravel(), axisTo.ravel()) # rotation axis
angle = np.dot(axisFrom, axisTo) / axisFromL2 / axisToL2
angle = np.arccos(angle)
# Here we finally compute the transform.
trafo = vtk.vtkTransform()
trafo.Translate(originTo)
if rotate:
trafo.RotateWXYZ(angle / np.pi * 180, rAxis[0], rAxis[1], rAxis[2])
trafo.Scale(scale[0],scale[1],scale[2])
trafo.Translate(-originFrom)
return trafo

Find indices of raster cells that intersect with a polygon

I want to get a list of indices (row,col) for all raster cells that fall within or are intersected by a polygon feature. Looking for a solution in python, ideally with gdal/ogr modules.
Other posts have suggested rasterizing the polygon, but I would rather have direct access to the cell indices if possible.
Since you don't provide a working example, it's bit unclear what your starting point is. I made a dataset with 1 polygon, if you have a dataset with multiple but only want to target a specific polygon you can add SQLStatement or where to the gdal.Rasterize call.
Sample polygon
geojson = """{"type":"FeatureCollection",
"name":"test",
"crs":{"type":"name","properties":{"name":"urn:ogc:def:crs:OGC:1.3:CRS84"}},
"features":[
{"type":"Feature","properties":{},"geometry":{"type":"MultiPolygon","coordinates":[[[[-110.254,44.915],[-114.176,37.644],[-105.729,36.41],[-105.05,43.318],[-110.254,44.915]]]]}}
]}"""
Rasterizing
Rasterizing can be done with gdal.Rasterize. You need to specify the properties of the target grid. If there is no predefined grid these could be extracted from the polygon itself
ds = gdal.Rasterize('/vsimem/tmpfile', geojson, xRes=1, yRes=-1, allTouched=True,
outputBounds=[-120, 30, -100, 50], burnValues=1,
outputType=gdal.GDT_Byte)
mask = ds.ReadAsArray()
ds = None
gdal.Unlink('/vsimem/tmpfile')
Converting to indices
Retrieving the indices from the rasterized polygon can be done with Numpy:
y_ind, x_ind = np.where(mask==1)
Clearly Rutger's solution above is the way to go with this, however I will leave my solution up. I developed a script that accomplished what I needed with the following:
Get the bounding box for each vector feature I want to check
Use the bounding box to limit the computational window (determine what portion of the raster could potentially have intersections)
Iterate over the cells within this part of the raster and construct a polygon geometry for each cell
Use ogr.Geometry.Intersects() to check if the cell intersects with the polygon feature
Note that I have only defined the methods, but I think implementation should be pretty clear -- just call match_cells with the appropriate arguments (ogr.Geometry object and geotransform matrix). Code below:
from osgeo import ogr
# Convert projected coordinates to raster cell indices
def parse_coords(x,y,gt):
row,col = None,None
if x:
col = int((x - gt[0]) // gt[1])
# If only x coordinate is provided, return column index
if not y:
return col
if y:
row = int((y - gt[3]) // gt[5])
# If only x coordinate is provided, return column index
if not x:
return row
return (row,col)
# Construct polygon geometry from raster cell
def build_cell((row,col),gt):
xres,yres = gt[1],gt[5]
x_0,y_0 = gt[0],gt[3]
top = (yres*row) + y_0
bottom = (yres*(row+1)) + y_0
right = (xres*col) + x_0
left = (xres*(col+1)) + x_0
# Create ring topology
ring = ogr.Geometry(ogr.wkbLinearRing)
ring.AddPoint(left,bottom)
ring.AddPoint(right,bottom)
ring.AddPoint(right,top)
ring.AddPoint(left,top)
ring.AddPoint(left,bottom)
# Create polygon
box = ogr.Geometry(ogr.wkbPolygon)
box.AddGeometry(ring)
return box
# Iterate over feature geometries & check for intersection
def match_cells(inputGeometry,gt):
matched_cells = []
for f,feature in enumerate(inputGeometry):
geom = feature.GetGeometryRef()
bbox = geom.GetEnvelope()
xmin,xmax = [parse_coords(x,None,gt) for x in bbox[:2]]
ymin,ymax = [parse_coords(None,y,gt) for y in bbox[2:]]
for cell_row in range(ymax,ymin+1):
for cell_col in range(xmin,xmax+1):
cell_box = build_cell((cell_row,cell_col),gt)
if cell_box.Intersects(geom):
matched_cells += [[(cell_row,cell_col)]]
return matched_cells
if you want to do this manually you'll need to test each cell for:
Square v Polygon intersection and
Square v Line intersection.
If you treat each square as a 2d point this becomes easier - it's now a Point v Polygon problem. Check in Game Dev forums for collision algorithms.
Good luck!

python region growing with polygons performance

I'm trying to create regions of polygons on the condition that they touch. In my example I have an example dataset with 382 polygons that need to be grouped together (but the full dataset contains 6355 polygons). (I would show a picture, but I don't have enough reputation to do that..)
I though of doing this brute force, but of course that takes very long and is not very optimal.
def groupBuildings(blds):
# blds is a list with shapely polygons
groups = []
for bld in blds:
group = []
group.append(bld)
for other in blds:
for any in group:
if any != other and any.intersects(other):
group.append(other)
groups.append(group)
return groups
I learned about region growing and thought that that would be a possible solution, but still the performance is terrible. I've implemented this in the following way:
def groupBuildings(blds):
# blds is a list with shapely polygons
others = blds
groups = []
while blds != []:
done = []
group = []
first = blds.pop(0)
done.append(first)
group.append(first)
for other in others:
if (other in blds) and first.touches(other):
group.append(other)
blds.remove(other)
return groups
But I think the problem here is that I don't have any nearest neighbors, so I still have to iterate over every building twice.
So my question is: are nearest neighbors essential for region growing? Or is there another way of doing this efficiently?
You will be best served using shapely.ops.cascaded_union() (docs here).
from shapely.geometry import Point, Polygon, MultiPolygon
from shapely.ops import cascaded_union
import numpy as np
polygons = [Point(200*x,200*y).buffer(b) for x,y,b in np.random.random((6000,3))]
multi = MultiPolygon(polygons)
unioned = cascaded_union(multi)
%%timeit
unioned = cascaded_union(multi)
# 2.8 seconds for me

Numpy Histogram - Python

I have a problem in which a have a bunch of images for which I have to generate histograms. But I have to generate an histogram for each pixel. I.e, for a collection of n images, I have to count the values that the pixel 0,0 assumed and generate an histogram, the same for 0,1, 0,2 and so on. I coded the following method to do this:
class ImageData:
def generate_pixel_histogram(self, images, bins):
"""
Generate a histogram of the image for each pixel, counting
the values assumed for each pixel in a specified bins
"""
max_value = 0.0
min_value = 0.0
for i in range(len(images)):
image = images[i]
max_entry = max(max(p[1:]) for p in image.data)
min_entry = min(min(p[1:]) for p in image.data)
if max_entry > max_value:
max_value = max_entry
if min_entry < min_value:
min_value = min_entry
interval_size = (math.fabs(min_value) + math.fabs(max_value))/bins
for x in range(self.width):
for y in range(self.height):
pixel_histogram = {}
for i in range(bins+1):
key = round(min_value+(i*interval_size), 2)
pixel_histogram[key] = 0.0
for i in range(len(images)):
image = images[i]
value = round(Utils.get_bin(image.data[x][y], interval_size), 2)
pixel_histogram[value] += 1.0/len(images)
self.data[x][y] = pixel_histogram
Where each position of a matrix store a dictionary representing an histogram. But, how I do this for each pixel, and this calculus take a considerable time, this seems to me to be a good problem to be parallelized. But I don't have experience with this and I don't know how to do this.
EDIT:
I tried what #Eelco Hoogendoorn told me and it works perfectly. But applying it to my code, where the data are a large number of images generated with this constructor (after the values are calculated and not just 0 anymore), I just got as h an array of zeros [0 0 0]. What I pass to the histogram method is an array of ImageData.
class ImageData(object):
def __init__(self, width=5, height=5, range_min=-1, range_max=1):
"""
The ImageData constructor
"""
self.width = width
self.height = height
#The values range each pixel can assume
self.range_min = range_min
self.range_max = range_max
self.data = np.arange(width*height).reshape(height, width)
#Another class, just the method here
def generate_pixel_histogram(realizations, bins):
"""
Generate a histogram of the image for each pixel, counting
the values assumed for each pixel in a specified bins
"""
data = np.array([image.data for image in realizations])
min_max_range = data.min(), data.max()+1
bin_boundaries = np.empty(bins+1)
# Function to wrap np.histogram, passing on only the first return value
def hist(pixel):
h, b = np.histogram(pixel, bins=bins, range=min_max_range)
bin_boundaries[:] = b
return h
# Apply this for each pixel
hist_data = np.apply_along_axis(hist, 0, data)
print hist_data
print bin_boundaries
Now I get:
hist_data = np.apply_along_axis(hist, 0, data)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/lib/shape_base.py", line 104, in apply_along_axis
outshape[axis] = len(res)
TypeError: object of type 'NoneType' has no len()
Any help would be appreciated.
Thanks in advance.
As noted by john, the most obvious solution to this is to look for library functionality that will do this for you. It exists, and it will be orders of magnitude more efficient than what you are doing here.
Standard numpy has a histogram function that can be used for this purpose. If you have only few values per pixel, it will be relatively inefficient; and it creates a dense histogram vector rather than the sparse one you produce here. Still, chances are good the code below solves your problem efficiently.
import numpy as np
#some example data; 128 images of 4x4 pixels
voxeldata = np.random.randint(0,100, (128, 4,4))
#we need to apply the same binning range to each pixel to get sensibble output
globalminmax = voxeldata.min(), voxeldata.max()+1
#number of output bins
bins = 20
bin_boundaries = np.empty(bins+1)
#function to wrap np.histogram, passing on only the first return value
def hist(pixel):
h, b = np.histogram(pixel, bins=bins, range=globalminmax)
bin_boundaries[:] = b #simply overwrite; result should be identical each time
return h
#apply this for each pixel
histdata = np.apply_along_axis(hist, 0, voxeldata)
print bin_boundaries
print histdata[:,0,0] #print the histogram of an arbitrary pixel
But the more general message id like to convey, looking at your code sample and the type of problem you are working on: do yourself a favor, and learn numpy.
Parallelization certainly would not be my first port of call in optimizing this kind of thing. Your main problem is that you're doing lots of looping at the Python level. Python is inherently slow at this kind of thing.
One option would be to learn how to write Cython extensions and write the histogram bit in Cython. This might take you a while.
Actually, taking a histogram of pixel values is a very common task in computer vision and it has already been efficiently implemented in OpenCV (which has python wrappers). There are also several functions for taking histograms in the numpy python package (though they are slower than the OpenCV implementations).

get bins coordinates with hexbin in matplotlib

I use matplotlib's method hexbin to compute 2d histograms on my data.
But I would like to get the coordinates of the centers of the hexagons in order to further process the results.
I got the values using get_array() method on the result, but I cannot figure out how to get the bins coordinates.
I tried to compute them given number of bins and the extent of my data but i don't know the exact number of bins in each direction. gridsize=(10,2) should do the trick but it does not seem to work.
Any idea?
I think this works.
from __future__ import division
import numpy as np
import math
import matplotlib.pyplot as plt
def generate_data(n):
"""Make random, correlated x & y arrays"""
points = np.random.multivariate_normal(mean=(0,0),
cov=[[0.4,9],[9,10]],size=int(n))
return points
if __name__ =='__main__':
color_map = plt.cm.Spectral_r
n = 1e4
points = generate_data(n)
xbnds = np.array([-20.0,20.0])
ybnds = np.array([-20.0,20.0])
extent = [xbnds[0],xbnds[1],ybnds[0],ybnds[1]]
fig=plt.figure(figsize=(10,9))
ax = fig.add_subplot(111)
x, y = points.T
# Set gridsize just to make them visually large
image = plt.hexbin(x,y,cmap=color_map,gridsize=20,extent=extent,mincnt=1,bins='log')
# Note that mincnt=1 adds 1 to each count
counts = image.get_array()
ncnts = np.count_nonzero(np.power(10,counts))
verts = image.get_offsets()
for offc in xrange(verts.shape[0]):
binx,biny = verts[offc][0],verts[offc][1]
if counts[offc]:
plt.plot(binx,biny,'k.',zorder=100)
ax.set_xlim(xbnds)
ax.set_ylim(ybnds)
plt.grid(True)
cb = plt.colorbar(image,spacing='uniform',extend='max')
plt.show()
I would love to confirm that the code by Hooked using get_offsets() works, but I tried several iterations of the code mentioned above to retrieve center positions and, as Dave mentioned, get_offsets() remains empty. The workaround that I found is to use the non-empty 'image.get_paths()' option. My code takes the mean to find centers but which means it is just a smidge longer, but it does work.
The get_paths() option returns a set of x,y coordinates embedded that can be looped over and then averaged to return the center position for each hexagram.
The code that I have is as follows:
counts=image.get_array() #counts in each hexagon, works great
verts=image.get_offsets() #empty, don't use this
b=image.get_paths() #this does work, gives Path([[]][]) which can be plotted
for x in xrange(len(b)):
xav=np.mean(b[x].vertices[0:6,0]) #center in x (RA)
yav=np.mean(b[x].vertices[0:6,1]) #center in y (DEC)
plt.plot(xav,yav,'k.',zorder=100)
I had this same problem. I think what needs to be developed is a framework to have a HexagonalGrid object which can then be applied to many different data sets (and it would be awesome to do it for N dimensions). This is possible and it surprises me that neither Scipy or Numpy has anything for it (furthermore there seems to be nothing else like it except perhaps binify)
That said, I assume you want to use hexbinning to compare multiple binned data sets. This requires some common base. I got this to work using matplotlib's hexbin the following way:
import numpy as np
import matplotlib.pyplot as plt
def get_data (mean,cov,n=1e3):
"""
Quick fake data builder
"""
np.random.seed(101)
points = np.random.multivariate_normal(mean=mean,cov=cov,size=int(n))
x, y = points.T
return x,y
def get_centers (hexbin_output):
"""
about 40% faster than previous post only cause you're not calculating the
min/max every time
"""
paths = hexbin_output.get_paths()
v = paths[0].vertices[:-1] # adds a value [0,0] to the end
vx,vy = v.T
idx = [3,0,5,2] # index for [xmin,xmax,ymin,ymax]
xmin,xmax,ymin,ymax = vx[idx[0]],vx[idx[1]],vy[idx[2]],vy[idx[3]]
half_width_x = abs(xmax-xmin)/2.0
half_width_y = abs(ymax-ymin)/2.0
centers = []
for i in xrange(len(paths)):
cx = paths[i].vertices[idx[0],0]+half_width_x
cy = paths[i].vertices[idx[2],1]+half_width_y
centers.append((cx,cy))
return np.asarray(centers)
# important parts ==>
class Hexagonal2DGrid (object):
"""
Used to fix the gridsize, extent, and bins
"""
def __init__ (self,gridsize,extent,bins=None):
self.gridsize = gridsize
self.extent = extent
self.bins = bins
def hexbin (x,y,hexgrid):
"""
To hexagonally bin the data in 2 dimensions
"""
fig = plt.figure()
ax = fig.add_subplot(111)
# Note mincnt=0 so that it will return a value for every point in the
# hexgrid, not just those with count>mincnt
# Basically you fix the gridsize, extent, and bins to keep them the same
# then the resulting count array is the same
hexbin = plt.hexbin(x,y, mincnt=0,
gridsize=hexgrid.gridsize,
extent=hexgrid.extent,
bins=hexgrid.bins)
# you could close the figure if you don't want it
# plt.close(fig.number)
counts = hexbin.get_array().copy()
return counts, hexbin
# Example ===>
if __name__ == "__main__":
hexgrid = Hexagonal2DGrid((21,5),[-70,70,-20,20])
x_data,y_data = get_data((0,0),[[-40,95],[90,10]])
x_model,y_model = get_data((0,10),[[100,30],[3,30]])
counts_data, hexbin_data = hexbin(x_data,y_data,hexgrid)
counts_model, hexbin_model = hexbin(x_model,y_model,hexgrid)
# if you want the centers, they will be the same for both
centers = get_centers(hexbin_data)
# if you want to ignore the cells with zeros then use the following mask.
# But if want zeros for some bins and not others I'm not sure an elegant way
# to do this without using the centers
nonzero = counts_data != 0
# now you can compare the two data sets
variance_data = counts_data[nonzero]
square_diffs = (counts_data[nonzero]-counts_model[nonzero])**2
chi2 = np.sum(square_diffs/variance_data)
print(" chi2={}".format(chi2))

Categories

Resources