I'm using several functions to create dot density maps in Python with pandas and GeoPandas. The functions read US Census block groups and generate random points that fall within each block-group polygon. The end result is a nice dot density map in which each point represents 300 people of a given ethnicity.
For some counties this works totally fine. Here's Denver County.
But for others I'm hitting IndexError: list index out of range and I can't figure out why. I'm using the block-group FIPS code as the index; this is what the dataframe ends up looking like.
gen_count_dot_density_map('Yakima County, WA')
Error message:
IndexError Traceback (most recent call last)
<ipython-input-25-3174d4f9a7b7> in <module>()
----> 1 gen_count_dot_density_map('Yakima County, WA')
3 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/reshape/reshape.py in stack(frame, level, dropna)
520 # we concatenate instead.
521 dtypes = list(frame.dtypes._values)
--> 522 dtype = dtypes[0]
523
524 if is_extension_array_dtype(dtype):
IndexError: list index out of range
Here are the functions I'm using.
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO, StringIO
import shapefile
import geopandas as gpd
from shapely.geometry import shape
import osr
import pandas as pd
import requests
from shapely.geometry import Point
from numpy.random import RandomState, uniform
import numpy as np
def gen_random_points_poly(poly, num_points, seed=None):
    """
    Returns a list of N randomly generated points within a polygon.
    """
    min_x, min_y, max_x, max_y = poly.bounds
    points = []
    i = 0
    while len(points) < num_points:
        s = RandomState(seed + i) if seed else RandomState(seed)
        random_point = Point([s.uniform(min_x, max_x), s.uniform(min_y, max_y)])
        if random_point.within(poly):
            points.append(random_point)
        i += 1
    return points
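A quick standalone sanity check for gen_random_points_poly (a hypothetical unit square, not part of the pipeline):
from shapely.geometry import Polygon
square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
pts = gen_random_points_poly(square, 5, seed=42)
print(len(pts), all(p.within(square) for p in pts))  # expect: 5 True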
def gen_points_in_gdf_polys(geometry, values, points_per_value=None, seed=None):
    """
    Takes a GeoSeries of Polygons along with a Series of values and returns randomly
    generated points within these polygons. Optionally takes a "points_per_value"
    integer indicating how many units of value each generated point represents.
    """
    if points_per_value:
        new_values = (values / points_per_value).astype(int)
    else:
        new_values = values
    new_values = new_values[new_values > 0]
    g = gpd.GeoDataFrame(data={'vals': new_values}, geometry=geometry)
    a = g.apply(lambda row: tuple(gen_random_points_poly(row['geometry'], row['vals'], seed)), 1)
    b = gpd.GeoSeries(a.apply(pd.Series).stack(), crs=geometry.crs)
    b.name = 'geometry'
    return b
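And a self-contained usage sketch of gen_points_in_gdf_polys on made-up data (two squares, 900 and 600 "people", one dot per 300):
from shapely.geometry import Polygon
polys = gpd.GeoSeries([Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
                       Polygon([(2, 0), (3, 0), (3, 1), (2, 1)])])
vals = pd.Series([900, 600])
pts = gen_points_in_gdf_polys(polys, vals, points_per_value=300, seed=1)
print(len(pts))  # expect 3 + 2 = 5 points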
def zip_shp_to_gdf(zip_file_name):
    """
    Returns a GeoDataFrame from a URL for a zipped Shapefile
    """
    zipfile = ZipFile(BytesIO(urlopen(zip_file_name).read()))
    filenames = [y for y in sorted(zipfile.namelist())
                 for ending in ['dbf', 'prj', 'shp', 'shx'] if y.endswith(ending)]
    dbf, prj, shp, shx = [BytesIO(zipfile.read(filename)) for filename in filenames]
    r = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
    attributes, geometry = [], []
    field_names = [field[0] for field in r.fields[1:]]
    for row in r.shapeRecords():
        geometry.append(shape(row.shape.__geo_interface__))
        attributes.append(dict(zip(field_names, row.record)))
    proj4_string = osr.SpatialReference(prj.read().decode('UTF-8')).ExportToProj4()
    gdf = gpd.GeoDataFrame(data=attributes, geometry=geometry, crs=proj4_string)
    return gdf
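As an aside, recent GeoPandas versions can usually read a zipped shapefile straight from a URL, which may make this helper unnecessary (treat the version support as an assumption):
# May work on recent geopandas/fiona installs; 53 is the Washington state FIPS prefix
bg_geo = gpd.read_file('http://www2.census.gov/geo/tiger/GENZ2015/shp/cb_2015_53_bg_500k.zip')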
def get_census_variables(year, dataset, geography, area, variables, variable_labels=None):
    """Wraps the Census API and returns a DataFrame of Census data.

    Parameters
    ----------
    year : integer
        Year representing the dataset vintage
    dataset : string
        the name of the dataset (https://api.census.gov/data.html)
    geography : string
        the census geography
    area : dictionary
        FIPS codes at each nested geographic level, e.g. {'county': '001', 'state': '06'}
    variables : list
        list of the variables to be extracted
    variable_labels : list
        optional relabeling of the variable names; must be the same length as "variables"
    """
    base_url = 'https://api.census.gov/data/{}/acs/{}'.format(year, dataset)
    # define parameters
    get_parameter = ','.join(['NAME'] + variables)
    for_parameter = '{}:*'.format(geography)
    in_parameter = '+'.join([k + ':' + v for (k, v) in area.items()])
    parameters = {'get': get_parameter,
                  'for': for_parameter,
                  'in': in_parameter}
    # make request, specifying url and parameters
    r = requests.get(base_url, params=parameters)
    # read json into pandas dataframe, specifying first row as column names
    data = r.json()
    df = pd.DataFrame(columns=data[0], data=data[1:])
    # identify geography fields; concatenate them into a FIPS code to be set as index, then delete them
    geo_fields = [x for x in df.columns if x not in ['NAME'] + variables]
    df.index = df[geo_fields].apply(lambda row: ''.join(map(str, row)), 1)
    df.index.name = 'FIPS'
    df = df.drop(geo_fields, 1)
    if variable_labels:
        df = df.rename(columns=dict(zip(variables, variable_labels)))
    # convert data to numeric
    df = df.applymap(lambda x: pd.to_numeric(x, errors='ignore'))
    return df
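For illustration, a minimal call of this wrapper (Denver County, CO is state 08, county 031) might look like this, using the same B03002 variable codes that appear below:
# Hypothetical example call: White and Hispanic counts per block group
pop = get_census_variables(2015, 'acs5', 'block group',
                           {'state': '08', 'county': '031'},
                           ['B03002_003E', 'B03002_012E'],
                           ['White', 'Hispanic'])
print(pop.head())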
def gen_count_dot_density_map(county, pts_per_person=300,
                              epsg=2163, seed=10,
                              dot_transparency=0.4, figsize=(12, 12),
                              ax=None, legend=False):
    """
    Wraps the previous functions and generates population dot density maps
    for a specified county by race.
    """
    # read in FIPS-to-county-name relationship file
    fips = pd.read_csv('https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt',
                       header=None, dtype={1: np.object, 2: np.object})
    fips['name'] = fips[3] + ', ' + fips[0]
    fips['fips'] = fips[1] + fips[2]
    # get name from fips if fips specified
    if county.isdigit():
        lookup = fips.set_index('fips')['name']
        county_fips = county
        name = lookup[county_fips]
    # get fips from name if name specified
    else:
        lookup = fips.set_index('name')['fips']
        name = county
        county_fips = lookup[name]
    # get geodataframe of the block group shapefile
    bgfile_name = 'http://www2.census.gov/geo/tiger/GENZ2015/shp/cb_2015_{}_bg_500k.zip'.format(county_fips[:2])
    bg_geo = zip_shp_to_gdf(bgfile_name)
    # subset to block groups in the county and project to the CRS
    bg_geo = bg_geo[bg_geo['GEOID'].str[:5] == county_fips].to_crs(epsg=epsg).set_index("GEOID")['geometry']
    # specify variable list and variable names for the census api function
    varlist = ['B03002_003E',
               'B03002_012E',
               'B03002_004E',
               'B03002_006E',
               'B03002_005E',
               'B03002_007E',
               'B03002_008E',
               'B03002_009E']
    names = ['White',
             'Hispanic',
             'Black',
             'Asian',
             'AI/AN',
             'NH/PI',
             'Other_',
             'Two Plus']
    # read in block group level census variables
    dems = get_census_variables(2015, 'acs5', 'block group',
                                {'county': county_fips[2:],
                                 'state': county_fips[:2]}, varlist, names)
    # calculate Other as the sum of those not in the 4 most populous race categories
    dems['Other'] = dems[['AI/AN', 'NH/PI', 'Other_', 'Two Plus']].sum(1)
    # calculate county boundary as the union of block groups
    union = gpd.GeoSeries(bg_geo.unary_union)
    # if an axes object is specified, plot to it, otherwise create a new one
    if ax:
        union.plot(color='white', figsize=figsize, ax=ax)
    else:
        ax = union.plot(color='white', figsize=figsize)
    # set aspect equal and strip the ticks
    ax.set(aspect='equal', xticks=[], yticks=[])
    # set title as county name
    ax.set_title(name, size=15)
    # annotate the dot-per-person ratio
    # ax.annotate("1 dot = {} people".format(pts_per_person),
    #             xy=(.5, .97), xycoords='axes fraction', horizontalalignment='center',
    #             fontsize=12)
    # loop over each race category and generate points for it within each block group
    list_of_point_categories = []
    for field in ['White', 'Hispanic', 'Black', 'Asian', 'Other']:
        ps = gpd.GeoDataFrame(gen_points_in_gdf_polys(geometry=bg_geo, values=dems[field],
                                                      points_per_value=pts_per_person, seed=seed))
        ps['field'] = field
        list_of_point_categories.append(ps)
    all_points = gpd.GeoDataFrame(pd.concat(list_of_point_categories))
    print(all_points.head())
    all_points.plot(ax=ax, markersize=2, alpha=dot_transparency,
                    column='field', categorical=True, legend=legend)
    return ax
Check for empty values.
The traceback ends inside pandas' stack(), which raises list index out of range when it is asked to stack a frame with no columns; here that most likely means one of the race categories generated no points at all for the county. If you have any empty geometries, you should filter them out before plotting, like so:
df[~df.geometry.is_empty].plot()
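If the empty frame does come from a category that yields zero dots anywhere in the county, a minimal guard in the plotting loop of gen_count_dot_density_map (a sketch, assuming that diagnosis) would skip such categories entirely:
# Sketch: skip race categories that would generate no dots at all,
# so stack() is never called on an empty frame (assumes the diagnosis above)
for field in ['White', 'Hispanic', 'Black', 'Asian', 'Other']:
    if int((dems[field] / pts_per_person).astype(int).sum()) == 0:
        continue  # no block group in this county yields a dot for this category
    ps = gpd.GeoDataFrame(gen_points_in_gdf_polys(geometry=bg_geo, values=dems[field],
                                                  points_per_value=pts_per_person, seed=seed))
    ps['field'] = field
    list_of_point_categories.append(ps)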
Similar problem:
Geopandas sjoin() error: list index out of range
I have code that extracts data from satellite-observation NetCDF files for a total of 8 stations.
from netCDF4 import Dataset
import numpy as np
import pandas as pd
data = Dataset('rainfall.nc', 'r')
dims = data.dimensions
ndims = len(dims)
vars = data.variables
nvars = len(vars)
attrs = data.ncattrs()
lon = data.variables['lon'][:]
lat = data.variables['lat'][:]
The longitude and latitude of desired stations are as follow:
A=(46.28, 38.08)
B=(49.62, 37.32)
C=(51.50, 36.65)
D=(47.00, 35.33)
E=(51.32, 35.68)
F=(51.67, 32.62)
G=(51.55, 30.68)
H=(52.60, 29.53)
For each station (A through H) I have to run the following 6-line task to retrieve the data (the code below uses station A as the example), where ?? stands for the desired variable, such as rainfall:
long_location, lat_location = A  # = (46.28, 38.08)
sq_dist_lat = (lat - lat_location)**2
sq_dist_lon = (lon - long_location)**2
min_index_lat = sq_dist_lat.argmin()
min_index_lon = sq_dist_lon.argmin()
variable = data.variables['rainfall'][min_index_lat, min_index_lon]
I have created a list as follows:
locations = [A, B, C, D, E, F, G, H]
However, I need a 'for' loop that applies the 6-line task above to all 8 stations and prints the data as an array with 8 columns. I will appreciate it if anyone can help me.
You mention facing the error 'float' object has no attribute 'argmin'. I guess that you want to find the closest of the points A-H to a given point (lat and lon); if not, please clarify the input and expected output.
For that, you can use this piece of code:
import numpy as np
A=(46.28, 38.08)
B=(49.62, 37.32)
C=(51.50, 36.65)
D=(47.00, 35.33)
E=(51.32, 35.68)
F=(51.67, 32.62)
G=(51.55, 30.68)
H=(52.60, 29.53)
locations = [A, B, C, D, E, F, G, H]
lon = 51
lat = 35
closest_point = locations[np.argmin([np.sqrt((item[0]-lon)**2 + (item[1]-lat)**2) for item in locations])]
print(closest_point)
output:
(51.32, 35.68)
If this is not what you were looking for, take my answer as a hint.
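For the loop the question actually asks for, repeating the 6-line extraction once per station, a sketch along these lines should work (assuming lat and lon are the 1-D coordinate arrays read from rainfall.nc, and noting that the tuples are ordered (lon, lat)):
results = []
for long_location, lat_location in locations:
    min_index_lat = ((lat - lat_location) ** 2).argmin()
    min_index_lon = ((lon - long_location) ** 2).argmin()
    results.append(data.variables['rainfall'][min_index_lat, min_index_lon])
# one value per station, printable as a single 8-column row
print(np.array(results))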
I have the following code, which takes a very long time to execute. The pandas DataFrames df and df_plants are very small (less than 1 MB). I wonder if there is any way to optimise this code:
import pandas as pd
import geopy.distance
import re
def is_inside_radius(latitude, longitude, df_plants, radius):
    if (latitude != None and longitude != None):
        lat = float(re.sub("[a-zA-Z]", "", str(latitude)))
        lon = float(re.sub("[a-zA-Z]", "", str(longitude)))
        for index, row in df_plants.iterrows():
            coords_1 = (lat, lon)
            coords_2 = (row["latitude"], row["longitude"])
            dist = geopy.distance.distance(coords_1, coords_2).km
            if dist <= radius:
                return 1
    return 0

df["inside"] = df.apply(lambda row: is_inside_radius(row["latitude"], row["longitude"], df_plants, 10), axis=1)
I use a regex to clean latitude and longitude in df because the values contain some stray characters that have to be removed.
The function is_inside_radius checks whether row["latitude"] and row["longitude"] fall within a 10 km radius of any of the points in df_plants.
Can you try this?
import pandas as pd
from geopy import distance
import re

def is_inside_radius(latitude, longitude, df_plants, radius):
    if latitude is not None and longitude is not None:
        lat = float(re.sub("[a-zA-Z]", "", str(latitude)))
        lon = float(re.sub("[a-zA-Z]", "", str(longitude)))
        coords_1 = (lat, lon)
        # itertuples() yields namedtuples, so columns are read as attributes
        for row in df_plants.itertuples():
            coords_2 = (row.latitude, row.longitude)
            if distance.distance(coords_1, coords_2).km <= radius:
                return 1
    return 0

df["inside"] = df.apply(
    lambda row: is_inside_radius(
        row["latitude"],
        row["longitude"],
        df_plants,
        10),
    axis=1)
From https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html#pandas-dataframe-iterrows: pandas.DataFrame.itertuples() returns namedtuples of the values, is generally faster than pandas.DataFrame.iterrows(), and preserves dtypes across the returned rows.
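As a tiny illustration of the namedtuple access pattern (a hypothetical one-row frame):
import pandas as pd
df_demo = pd.DataFrame({'latitude': [48.85], 'longitude': [2.35]})
for row in df_demo.itertuples():
    print(row.latitude, row.longitude)  # -> 48.85 2.35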
I've encountered such a problem before, and I see one simple optimisation: avoid the floating-point distance calculation as much as possible, which you can do as follows.
Imagine:
You have a circle, defined by Mx and My (center coordinates) and R (radius).
You have a point, defined by its coordinates X and Y.
If your point (X, Y) is not even within the square centered on (Mx, My) with side 2*R, then it cannot be within the circle defined by (Mx, My) and radius R either.
In pseudo-code:
function is_inside(X, Y, Mx, My, R):
    if (abs(Mx-X) >= R) OR (abs(My-Y) >= R)
        then return false
    else:
        // only here do you perform the floating point calculation
        return sqrt((Mx-X)^2 + (My-Y)^2) <= R
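Applied to the pandas/geopy case above, a sketch of that pre-filter (with an assumed rough conversion of about 111 km per degree of latitude, plus a cosine correction for longitude) could look like:
from math import cos, radians
import geopy.distance

def is_inside_radius(lat, lon, df_plants, radius_km):
    # cheap rectangular pre-filter in degrees (approximate conversion, assumption)
    deg_lat = radius_km / 111.0
    deg_lon = deg_lat / max(cos(radians(lat)), 0.1)
    nearby = df_plants[(df_plants["latitude"].sub(lat).abs() < deg_lat) &
                       (df_plants["longitude"].sub(lon).abs() < deg_lon)]
    # exact (and expensive) geodesic distance only for the surviving candidates
    for row in nearby.itertuples():
        if geopy.distance.distance((lat, lon), (row.latitude, row.longitude)).km <= radius_km:
            return 1
    return 0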
I am using some code from a workshop to extract data from netCDF files by the coordinates closest to my specified coordinates. When using just one set of coordinates I am able to extract the values I need without trouble as below:
import numpy as np
import netCDF4
from math import pi
from numpy import cos, sin
def tunnel_fast(latvar, lonvar, lat0, lon0):
    '''
    Find closest point in a set of (lat,lon) points to specified point
    latvar - 2D latitude variable from an open netCDF dataset
    lonvar - 2D longitude variable from an open netCDF dataset
    lat0,lon0 - query point
    Returns iy,ix such that the square of the tunnel distance
    between (latval[iy,ix],lonval[iy,ix]) and (lat0,lon0)
    is minimum.
    '''
    rad_factor = pi/180.0  # for trigonometry, need angles in radians
    # Read latitude and longitude from file into numpy arrays
    latvals = latvar[:] * rad_factor
    lonvals = lonvar[:] * rad_factor
    ny, nx = latvals.shape
    lat0_rad = lat0 * rad_factor
    lon0_rad = lon0 * rad_factor
    # Compute numpy arrays for all values, no loops
    clat, clon = cos(latvals), cos(lonvals)
    slat, slon = sin(latvals), sin(lonvals)
    delX = cos(lat0_rad)*cos(lon0_rad) - clat*clon
    delY = cos(lat0_rad)*sin(lon0_rad) - clat*slon
    delZ = sin(lat0_rad) - slat
    dist_sq = delX**2 + delY**2 + delZ**2
    minindex_1d = dist_sq.argmin()  # 1D index of minimum element
    iy_min, ix_min = np.unravel_index(minindex_1d, latvals.shape)
    return iy_min, ix_min
ncfile = netCDF4.Dataset('E:\wind_level2_1.nc', 'r')
latvar = ncfile.variables['latitude']
lonvar = ncfile.variables['longitude']
#_________GG turbine_________GAD10 Latitude 51.735516, GAD10 Longitude 1.942656
iy,ix = tunnel_fast(latvar, lonvar, 51.735516, 1.942656)
print('Closest lat lon:', latvar[iy,ix], lonvar[iy,ix])
refLAT=latvar[iy,ix]
refLON = lonvar[iy,ix]
#try to find the data for this location
SARwind = ncfile.variables['sar_wind'][:,:]
ModelWind = ncfile.variables['model_speed'][:,:]
print 'iy,ix' #appears to be the index of the value of Lat,lon
print SARwind[iy,ix]
ncfile.close()
Now I am trying to loop through a text file of coordinates (coord_list) to extract each set of coordinates, find the data, and then move on to the next set in the list. This code works on its own, as below:
import csv
from decimal import Decimal
with open('Turbine_locs_no_header.csv','rb') as f:
    reader = csv.reader(f)
    coord_list = list(reader)
end_row = len(coord_list)
lon_ind = 1
lat_ind = 2
for row in range(0, end_row-1):  # end_row - 1 due to the 0 index
    turbine_lat = coord_list[row][lat_ind]
    turbine_lon = coord_list[row][lon_ind]
    turbine_lat = [Decimal(turbine_lat)]
    print 'lat', turbine_lat, 'lon', turbine_lon, row
However, I want to pass coordinates from the text file to this part of the original code, iy,ix = tunnel_fast(latvar, lonvar, 51.94341, 1.922094888), replacing the numbers with variables: iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon). When I try to combine the two codes by creating a function get_coordinates, I get the following errors:
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 65, in <module>
get_coordinates(coord_list, latvar, lonvar)
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 51, in get_coordinates
iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 27, in tunnel_fast
lat0_rad = lat0 * rad_factor
TypeError: can't multiply sequence by non-int of type 'float'
I thought this was because turbine_lat and turbine_lon are list items and so cannot be used, but that doesn't seem to be connected to the errors. I know this code needs more work anyway, but if anyone could help me spot where I am going wrong that would be very helpful. My attempt to combine the two codes is below.
import numpy as np
import netCDF4
from math import pi
from numpy import cos, sin
import csv
# edited from https://github.com/Unidata/unidata-python-workshop/blob/a56daa50d7b343c7debe93968683613642d6b9f7/notebooks/netcdf-by-coordinates.ipynb
def tunnel_fast(latvar, lonvar, lat0, lon0):
    '''
    Find closest point in a set of (lat,lon) points to specified point
    latvar - 2D latitude variable from an open netCDF dataset
    lonvar - 2D longitude variable from an open netCDF dataset
    lat0,lon0 - query point
    Returns iy,ix such that the square of the tunnel distance
    between (latval[iy,ix],lonval[iy,ix]) and (lat0,lon0)
    is minimum.
    '''
    rad_factor = pi/180.0  # for trigonometry, need angles in radians
    # Read latitude and longitude from file into numpy arrays
    latvals = latvar[:] * rad_factor
    lonvals = lonvar[:] * rad_factor
    ny, nx = latvals.shape
    lat0_rad = lat0 * rad_factor
    lon0_rad = lon0 * rad_factor
    # Compute numpy arrays for all values, no loops
    clat, clon = cos(latvals), cos(lonvals)
    slat, slon = sin(latvals), sin(lonvals)
    delX = cos(lat0_rad)*cos(lon0_rad) - clat*clon
    delY = cos(lat0_rad)*sin(lon0_rad) - clat*slon
    delZ = sin(lat0_rad) - slat
    dist_sq = delX**2 + delY**2 + delZ**2
    minindex_1d = dist_sq.argmin()  # 1D index of minimum element
    iy_min, ix_min = np.unravel_index(minindex_1d, latvals.shape)
    return iy_min, ix_min
#________________my edits___________________________________________________
def get_coordinates(coord_list, latvar, lonvar):
    """This takes coordinates from a .csv and assigns them to variables"""
    end_row = len(coord_list)
    lon_ind = 1
    lat_ind = 2
    for row in range(0, end_row-1):  # end_row - 1 due to the 0 index
        turbine_lat = coord_list[row][lat_ind]
        turbine_lon = coord_list[row][lon_ind]
        iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)
        print('Closest lat lon:', latvar[iy, ix], lonvar[iy, ix])
#________________________________________________________________________________________________________________________
ncfile = netCDF4.Dataset('NOGAPS_wind_level2_1.nc', 'r')
latvar = ncfile.variables['latitude']
lonvar = ncfile.variables['longitude']
#____added in to pass to get coordinates function
with open('Turbine_locs_no_header.csv','rb') as f:
    reader = csv.reader(f)
    coord_list = list(reader)
#_________take latitude from coordinateas function
get_coordinates(coord_list, latvar, lonvar)
#iy,ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)#get these from the 'assign_coordinates_fromlist.py
#print('Closest lat lon:', latvar[iy,ix], lonvar[iy,ix])
SARwind = ncfile.variables['sar_wind'][:,:]
ModelWind = ncfile.variables['model_speed'][:,:]
print 'iy,ix' #appears to be the index of the value of Lat,lon
print SARwind[iy,ix]
ncfile.close()
You can unpack an argument list using *args (see the docs). In your case you could do tunnel_fast(latvar, lonvar, *coord_list[row]). You need to make sure that the order of arguments in coord_list[row] is correct and if coord_list[row] contains more than the two values then you need to slice it appropriately.
Thanks to help from a_guest, it turned out to be a simple problem: lat0 and lon0 were being passed to tunnel_fast as <type 'str'> when it requires <type 'float'>. This comes from loading coord_list as a list of rows of strings read from the CSV:
with open('Turbine_locs_no_header.csv','rb') as f:
    reader = csv.reader(f)
    coord_list = list(reader)
The workaround I used was to convert lat0 and lon0 to floats at the beginning of tunnel_fast
lat0 = float(lat0)
lon0 = float(lon0)
I am sure there is a more elegant way to do this, but it works.
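A slightly cleaner variant of the same fix (just a sketch) is to convert while unpacking each CSV row, so tunnel_fast never sees strings:
for row in coord_list:
    # columns 1 and 2 of this CSV hold longitude and latitude as strings
    turbine_lon = float(row[lon_ind])
    turbine_lat = float(row[lat_ind])
    iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)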
I'm looking to compute poleward heat fluxes at a level in the atmosphere, i.e. the mean of (u'T'). I'm aware of the covariance function in NumPy, but cannot seem to get it to work here. My code is below.
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
myfile = '/home/ubuntu/Fluxes_Test/out.nc'
Import = Dataset(myfile, mode='r')
lon = Import.variables['lon'][:] # Longitude
lat = Import.variables['lat'][:] # Latitude
time = Import.variables['time'][:] # Time
lev = Import.variables['lev'][:] # Level
wind = Import.variables['ua'][:]
temp = Import.variables['ta'][:]
lon = lon-180 # to shift co-ordinates to -180 to 180.
variable1 = np.squeeze(wind,temp, axis=0)
variable2 = np.cov(variable1)
m = Basemap(resolution='l')
lons, lats = np.meshgrid(lon,lat)
X, Y = m(lons, lats)
cs = m.pcolor(X,Y, variable2)
plt.show()
The variables wind and temp, whose flux (covariance) I am trying to compute, both have shape (3960, 64, 128), i.e. 3960 time steps of data on a 64x128 grid (with coordinates).
I tried squeezing both variables together to produce an array of shape (3960, 3960, 64, 128) so that cov could work on the two time series (the two 3960s) of wind and temp, but this didn't work.
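Assuming what's wanted is the time-mean eddy flux mean(u'T') at every one of the 64x128 grid points, np.cov (which builds one large covariance matrix) isn't the natural tool; a pointwise sketch over the time axis would be:
# Sketch, assuming axis 0 of wind/temp is time and both have shape (3960, 64, 128)
u_prime = wind - wind.mean(axis=0)             # departures from the time mean
t_prime = temp - temp.mean(axis=0)
heat_flux = (u_prime * t_prime).mean(axis=0)   # mean of u'T', shape (64, 128)
cs = m.pcolor(X, Y, heat_flux)
plt.show()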