Using NASA's SRTM data, I've generated a global elevation heatmap.
The problem is, however, the continents tend to blend in with the ocean because of the range of elevation values. Is it possible to change the colorbar's scale so that the edges of the continents are more distinct from the ocean? I've tried different cmaps, but they all seem to suffer from the problem.
Here is my code. I'm initializing a giant array (with 0s) to hold global elevation data, and then populating it file by file from the SRTM dataset. Each file is 1 degree latitude by 1 degree longitude.
Another question I had was regarding the map itself. For some reason, the Appalachian Mountains seem to have disappeared entirely.
import os
import numpy as np
from .srtm_map import MapGenerator
from ..utils.hgt_parser import HGTParser
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import richdem as rd
class GlobalMapGenerator():
    """Stitch 1x1 degree elevation tiles into one global array and plot it.

    Tiles are read from ``self.base_dir``; each file name is parsed as
    ``<N|S><2-digit lat><E|W><3-digit lon>...`` (characters 0-6).
    """

    def __init__(self):
        self.gen = MapGenerator()
        self.base_dir = "data/elevation/"
        # Every directory entry is treated as a tile file.
        self.hgt_files = os.listdir(self.base_dir)
        # Filled in by GenerateGlobalElevationMap.
        self.global_elevation_data = None

    @staticmethod
    def shrink(data, rows, cols):
        """Downsample a 2-D array to ``(rows, cols)`` by summing equal blocks.

        ``data.shape[0]`` must be a multiple of ``rows`` and ``data.shape[1]``
        a multiple of ``cols``; otherwise ``reshape`` raises ``ValueError``.
        """
        # Integer division: reshape() rejects the floats that "/" produces.
        block_rows = data.shape[0] // rows
        block_cols = data.shape[1] // cols
        return data.reshape(rows, block_rows, cols, block_cols).sum(axis=1).sum(axis=2)

    def GenerateGlobalElevationMap(self, stride):
        """Build the global elevation array, save it, and display a heatmap.

        Args:
            stride: downsampling factor; each tile is resized to
                ``1201 // stride`` pixels per side.

        Side effects: sets ``self.global_elevation_data``, writes
        ``figures/GlobalElevationMap.npy`` and ``figures/GlobalElevationMap.png``,
        and opens a matplotlib window.
        """
        res = 1201//stride
        max_N = 59
        max_W = 180
        max_S = 56
        max_E = 179
        # N59 --> N00
        # S01 --> S56
        # E000 --> E179
        # W180 --> W001

        # Initialize array global elevation (cells with no tile stay 0)
        self.global_elevation_data = np.zeros(( res*(max_S+max_N+1), res*(max_E+max_W+1) ))
        print("Output Image Shape:", self.global_elevation_data.shape)

        for hgt_file in tqdm(self.hgt_files):
            lat_letter = hgt_file[0]
            lon_letter = hgt_file[3]
            lat = int(hgt_file[1:3])
            lon = int(hgt_file[4:7])

            if lat_letter == "S":
                # Shift south down by max_N, but south starts at S01 so we translate up by 1 too
                lat_trans = max_N + lat - 1
            else:
                # Bigger N lat means further up. E.g. N59 is at index 0 and is higher than N00
                lat_trans = max_N - lat

            if lon_letter == "E":
                # Shift east right by max_W
                lon_trans = max_W + lon
            else:
                # Bigger W lon means further left. E.g. W180 is at index 0 and is more left than W001
                lon_trans = max_W - lon

            # load in data from file as resized
            data = cv2.resize(HGTParser(os.path.join(self.base_dir, hgt_file)), (res, res))

            # generate bounds (x/y --> lon.lat for data from this file for the giant array)
            lat_bounds = [res*lat_trans, res*(lat_trans+1)]
            lon_bounds = [res*lon_trans, res*(lon_trans+1)]

            try:
                self.global_elevation_data[ lat_bounds[0]:lat_bounds[1], lon_bounds[0]:lon_bounds[1] ] = data
            except (ValueError, TypeError):
                # A tile outside the pre-sized grid yields a truncated slice,
                # so the assignment fails on shape; report it and carry on.
                print("REFERENCE ERROR: " + hgt_file)
                print("lat: ", lat_bounds)
                print("lon: ", lon_bounds)

        # generate figure
        plt.figure(figsize=(20,20))
        plt.imshow(self.global_elevation_data, cmap="rainbow")
        plt.title("Global Elevation Heatmap")
        plt.colorbar()

        # Save before show(): once the window is closed the current figure can
        # be gone, and savefig() would then write an empty image.
        np.save("figures/GlobalElevationMap.npy", self.global_elevation_data)
        plt.savefig("figures/GlobalElevationMap.png")
        plt.show()

    def GenerateGlobalSlopeMap(self, stride):
        """Placeholder; not implemented yet."""
        pass
Use a TwoSlopeNorm (docs) for your norm, like the example here.
From the example:
Sometimes we want to have a different colormap on either side of a conceptual center point, and we want those two colormaps to have different linear scales. An example is a topographic map where the land and ocean have a center at zero, but land typically has a greater elevation range than the water has depth range, and they are often represented by a different colormap.
If you set the midpoint at sea level (0), then you can have two very different scalings based on ocean elevation vs land elevation.
Example code (taken from the example linked above):
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cbook as cbook
from matplotlib import cm
# Load the bundled topography/bathymetry sample and pull out its three arrays.
dem = cbook.get_sample_data('topobathy.npz', np_load=True)
topo, longitude, latitude = dem['topo'], dem['longitude'], dem['latitude']

# Build one colormap from two 256-entry slices of "terrain" so that land and
# ocean are clearly delineated and occupy equal halves (256 + 256).
colors_undersea = plt.cm.terrain(np.linspace(0, 0.17, 256))
colors_land = plt.cm.terrain(np.linspace(0.25, 1, 256))
all_colors = np.vstack((colors_undersea, colors_land))
terrain_map = colors.LinearSegmentedColormap.from_list(
    'terrain_map', all_colors)

# The norm's center is offset (vcenter=0 between -500 and 4000) so that the
# land gets more dynamic range than the sea.
divnorm = colors.TwoSlopeNorm(vmin=-500., vcenter=0, vmax=4000)

fig, ax = plt.subplots()
pcm = ax.pcolormesh(
    longitude,
    latitude,
    topo,
    rasterized=True,
    norm=divnorm,
    cmap=terrain_map,
    shading='auto',
)

# Simple geographic plot: set the aspect ratio because the distance between
# lines of longitude depends on latitude.
ax.set_aspect(1 / np.cos(np.deg2rad(49)))
ax.set_title('TwoSlopeNorm(x)')

cb = fig.colorbar(pcm, shrink=0.6)
cb.set_ticks([-500, 0, 1000, 2000, 3000, 4000])
plt.show()
See how it scales numbers with this simple usage (from docs):
>>> import matplotlib.colors as mcolors
>>> offset = mcolors.TwoSlopeNorm(vmin=-4000., vcenter=0., vmax=10000)
>>> data = [-4000., -2000., 0., 2500., 5000., 7500., 10000.]
>>> offset(data)
array([0., 0.25, 0.5, 0.625, 0.75, 0.875, 1.0])
I am having a problem with waterfall. I took this chart from matplotlib site and added my own data frame with 2 simple columns with some integer numbers. My waterfall was produced but without numbers, just empty bars. I am a bit lost and I would appreciate any suggestions.
What I am trying to build is the custom waterfall that takes one dataframe with column names, values, and some values for filters like countries. I haven't found anything like that anywhere so I am trying to build my own.
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
from matplotlib.ticker import FuncFormatter;
dataset = pd.read_csv('waterfall_test_data.csv')


# Use python 2.7+ syntax to format currency
def money(x, pos):
    """Format a tick value as whole dollars with thousands separators.

    The two args are the value and tick position; ``pos`` is part of the
    FuncFormatter callback signature and is not used.
    """
    return "${:,.0f}".format(x)


formatter = FuncFormatter(money)

# Data to plot. Do not include a total, it will be calculated
index = dataset['columns']
data = dataset['amount']

# Store data and create a blank series to use for the waterfall.
# Pass the raw values, not the Series themselves: DataFrame(data=<Series>,
# index=<labels>) re-aligns the Series (labelled 0..n-1) to the new labels,
# which leaves every amount NaN and draws empty bars.
trans = pd.DataFrame(data={'amount': data.values}, index=index.values)
blank = trans['amount'].cumsum().shift(1).fillna(0)

# Get the net total number for the final element in the waterfall
total = trans['amount'].sum()
trans.loc["net"] = total
blank.loc["net"] = total

# The steps graphically show the levels as well as used for label placement
step = blank.reset_index(drop=True).repeat(3).shift(-1)
step.iloc[1::3] = np.nan

# When plotting the last element, we want to show the full bar,
# Set the blank to 0
blank.loc["net"] = 0

# Plot and label
my_plot = trans.plot(kind='bar', stacked=True, bottom=blank, legend=None,
                     figsize=(15, 5), title="2014 Sales Waterfall")
my_plot.plot(step.index, step.values, 'k')
my_plot.set_xlabel("Transaction Types")

# Format the axis for dollars
my_plot.yaxis.set_major_formatter(formatter)

# Get the y-axis position for the labels
y_height = trans['amount'].cumsum().shift(1).fillna(0)

# Get an offset so labels don't sit right on top of the bar.
# A scalar taken from the column, under a name that does not shadow max().
largest = trans['amount'].max()
neg_offset = largest / 25
pos_offset = largest / 50
plot_offset = int(largest / 15)

# Start label loop
net_position = len(trans) - 1
for loop, amount in enumerate(trans['amount']):
    # For the last item in the list (the "net" bar), we don't want to double
    # count. Detect it by position: an ordinary bar may equal the total.
    if loop == net_position:
        y = y_height.iloc[loop]
    else:
        y = y_height.iloc[loop] + amount
    # Determine if we want a neg or pos offset
    if amount > 0:
        y += pos_offset
    else:
        y -= neg_offset
    my_plot.annotate("{:,.0f}".format(amount), (loop, y), ha="center")

# Scale up the y axis so there is room for the labels
my_plot.set_ylim(0, blank.max() + int(plot_offset))
# Rotate the labels
my_plot.set_xticklabels(trans.index, rotation=0)
my_plot.get_figure().savefig("waterfall.png", dpi=200, bbox_inches='tight')
I am creating line graphs with either the year or month along the x axis.
Here is the simplified code for the monthly line graph:
import matplotlib.pyplot as plt
import iris
import iris.coord_categorisation as iriscc
import iris.plot as iplt
import iris.quickplot as qplt
import iris.analysis.cartography
import cf_units
#this file is split into parts as follows:
#PART 1: load and format CORDEX models
#PART 2: load and format observed data
#PART 3: format data
#PART 4: plot data
def main():
    """Plot the 1989-2008 mean near-surface temperature over Malawi by month.

    Loads one CORDEX model cube (CanRCM4 / ERA-Interim) and the CRU observed
    cube, restricts both to the Malawi box and to 1989-2008, averages by
    month and then over the region with area weights, and draws both series
    on one line graph.
    """
    #PART 1: CORDEX MODELS
    #bring in all the models we need and give them a name
    CCCma = '/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/AFR_44_tas/ERAINT/1979-2012/tas_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_CCCma-CanRCM4_r2_mon_198901-200912.nc'

    #Load exactly one cube from given file
    CCCma = iris.load_cube(CCCma)

    #remove flat latitude and longitude and only use grid latitude and grid longitude to make consistent with the observed data, also make sure all of the longitudes are monotonic
    lats = iris.coords.DimCoord(CCCma.coord('latitude').points[:, 0],
                                standard_name='latitude', units='degrees')
    lons = CCCma.coord('longitude').points[0]
    # Wrap longitudes above 100 degrees into the negative range in one step.
    lons[lons > 100.] -= 360.
    lons = iris.coords.DimCoord(lons,
                                standard_name='longitude', units='degrees')

    CCCma.remove_coord('latitude')
    CCCma.remove_coord('longitude')
    CCCma.remove_coord('grid_latitude')
    CCCma.remove_coord('grid_longitude')
    CCCma.add_dim_coord(lats, 1)
    CCCma.add_dim_coord(lons, 2)

    #we are only interested in the latitude and longitude relevant to Malawi
    Malawi = iris.Constraint(longitude=lambda v: 32.5 <= v <= 36.,
                             latitude=lambda v: -17. <= v <= -9.)

    #time constraint to make all series the same
    iris.FUTURE.cell_datetime_objects = True
    t_constraint = iris.Constraint(time=lambda cell: 1989 <= cell.point.year <= 2008)

    CCCma = CCCma.extract(Malawi)
    CCCma = CCCma.extract(t_constraint)

    #PART 2: OBSERVED DATA
    #bring in all the files we need and give them a name
    CRU = '/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Actual_Data/cru_ts4.00.1901.2015.tmp.dat.nc'

    #Load exactly one cube from given file
    CRU = iris.load_cube(CRU, 'near-surface temperature')

    #same spatial box and time window as the model data
    CRU = CRU.extract(Malawi)
    CRU = CRU.extract(t_constraint)

    #PART 3: FORMAT DATA
    #data is in Kelvin, but we would like to show it in Celsius
    CCCma.convert_units('Celsius')

    #bring time data into alignment
    new_unit = cf_units.Unit('days since 1900-01-01', calendar='365_day')
    CCCma.coord('time').convert_units(new_unit)

    #add years and months to data
    iriscc.add_year(CCCma, 'time')
    iriscc.add_year(CRU, 'time')
    iriscc.add_month(CCCma, 'time')
    iriscc.add_month(CRU, 'time')

    #We are interested in plotting the data by month, so we need to take a mean of all the data by month
    CCCma_monthly = CCCma.aggregated_by('month', iris.analysis.MEAN)
    CRU_monthly = CRU.aggregated_by('month', iris.analysis.MEAN)

    #regridding scheme requires spatial areas, therefore the longitude and latitude coordinates must be bounded. If the longitude and latitude bounds are not defined in the cube we can guess the bounds based on the coordinates
    CCCma_monthly.coord('latitude').guess_bounds()
    CCCma_monthly.coord('longitude').guess_bounds()
    CRU_monthly.coord('latitude').guess_bounds()
    CRU_monthly.coord('longitude').guess_bounds()

    #Returns an array of area weights, with the same dimensions as the cube
    CCCma_grid_areas = iris.analysis.cartography.area_weights(CCCma_monthly)
    CRU_grid_areas = iris.analysis.cartography.area_weights(CRU_monthly)

    #We want to plot the mean for the whole region so we need a mean of all the lats and lons
    CCCma_mean = CCCma_monthly.collapsed(['latitude', 'longitude'], iris.analysis.MEAN, weights=CCCma_grid_areas)
    CRU_mean = CRU_monthly.collapsed(['latitude', 'longitude'], iris.analysis.MEAN, weights=CRU_grid_areas)

    #PART 4: PLOT LINE GRAPH
    #assign the line colours and set x axis to months
    qplt.plot(CCCma_mean.coord('month'), CCCma_mean, label='CanRCM4_ERAINT', lw=1.5, color='blue')
    qplt.plot(CRU_mean.coord('month'), CRU_mean, label='Observed', lw=2, color='black')

    #create a legend and set its location to under the graph
    plt.legend(loc="upper center", bbox_to_anchor=(0.5,-0.05), fancybox=True, shadow=True, ncol=2)

    #create a title
    plt.title('Mean Near Surface Temperature for Malawi by month 1989-2008', fontsize=11)

    #add grid lines
    plt.grid()

    #save the image of the graph and include full legend
    #plt.savefig('ERAINT_Temperature_LineGraph_Annual', bbox_inches='tight')

    #show the graph in the console
    iplt.show()


if __name__ == '__main__':
    main()
This produces a plot which looks like this:
How can I change the tick marks to show me all month names? I would also like the graph to finish at December (no white space after).
Similarly, for the yearly line graph, here is the simplified code:
import matplotlib.pyplot as plt
import iris
import iris.coord_categorisation as iriscc
import iris.plot as iplt
import iris.quickplot as qplt
import iris.analysis.cartography
#this file is split into parts as follows:
#PART 1: load and format CORDEX models
#PART 2: load and format observed data
#PART 3: format data
#PART 4: plot data
def main():
    """Plot the yearly mean near-surface temperature over Malawi, 1989-2008.

    Loads one CORDEX model cube (CanRCM4 / ERA-Interim) and the CRU observed
    cube, restricts both to the Malawi box and to 1989-2008, averages by
    year and then over the region with area weights, and draws both series
    on one line graph.
    """
    #PART 1: CORDEX MODELS
    #bring in all the models we need and give them a name
    CCCma = '/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/AFR_44_tas/ERAINT/1979-2012/tas_AFR-44_ECMWF-ERAINT_evaluation_r1i1p1_CCCma-CanRCM4_r2_mon_198901-200912.nc'

    #Load exactly one cube from given file
    CCCma = iris.load_cube(CCCma)

    #remove flat latitude and longitude and only use grid latitude and grid longitude to make consistent with the observed data, also make sure all of the longitudes are monotonic
    lats = iris.coords.DimCoord(CCCma.coord('latitude').points[:, 0],
                                standard_name='latitude', units='degrees')
    lons = CCCma.coord('longitude').points[0]
    # Wrap longitudes above 100 degrees into the negative range in one step.
    lons[lons > 100.] -= 360.
    lons = iris.coords.DimCoord(lons,
                                standard_name='longitude', units='degrees')

    CCCma.remove_coord('latitude')
    CCCma.remove_coord('longitude')
    CCCma.remove_coord('grid_latitude')
    CCCma.remove_coord('grid_longitude')
    CCCma.add_dim_coord(lats, 1)
    CCCma.add_dim_coord(lons, 2)

    #we are only interested in the latitude and longitude relevant to Malawi
    Malawi = iris.Constraint(longitude=lambda v: 32.5 <= v <= 36.,
                             latitude=lambda v: -17. <= v <= -9.)

    #time constraint to make all series the same
    iris.FUTURE.cell_datetime_objects = True
    t_constraint = iris.Constraint(time=lambda cell: 1989 <= cell.point.year <= 2008)

    CCCma = CCCma.extract(Malawi)
    CCCma = CCCma.extract(t_constraint)

    #PART 2: OBSERVED DATA
    #bring in all the files we need and give them a name
    CRU = '/exports/csce/datastore/geos/users/s0XXXX/Climate_Modelling/Actual_Data/cru_ts4.00.1901.2015.tmp.dat.nc'

    #Load exactly one cube from given file
    CRU = iris.load_cube(CRU, 'near-surface temperature')

    #same spatial box and time window as the model data
    CRU = CRU.extract(Malawi)
    CRU = CRU.extract(t_constraint)

    #PART 3: FORMAT DATA
    #data is in Kelvin, but we would like to show it in Celsius
    CCCma.convert_units('Celsius')

    #add years to data
    iriscc.add_year(CCCma, 'time')
    iriscc.add_year(CRU, 'time')

    #We are interested in plotting the data by year, so we need to take a mean of all the data by year
    CCCma = CCCma.aggregated_by('year', iris.analysis.MEAN)
    CRU = CRU.aggregated_by('year', iris.analysis.MEAN)

    #regridding scheme requires spatial areas, therefore the longitude and latitude coordinates must be bounded. If the longitude and latitude bounds are not defined in the cube we can guess the bounds based on the coordinates
    CCCma.coord('latitude').guess_bounds()
    CCCma.coord('longitude').guess_bounds()
    CRU.coord('latitude').guess_bounds()
    CRU.coord('longitude').guess_bounds()

    #Returns an array of area weights, with the same dimensions as the cube
    CCCma_grid_areas = iris.analysis.cartography.area_weights(CCCma)
    CRU_grid_areas = iris.analysis.cartography.area_weights(CRU)

    #We want to plot the mean for the whole region so we need a mean of all the lats and lons
    CCCma_mean = CCCma.collapsed(['latitude', 'longitude'], iris.analysis.MEAN, weights=CCCma_grid_areas)
    CRU_mean = CRU.collapsed(['latitude', 'longitude'], iris.analysis.MEAN, weights=CRU_grid_areas)

    #PART 4: PLOT LINE GRAPH
    #assign the line colours
    qplt.plot(CCCma_mean.coord('year'), CCCma_mean, label='CanRCM4_ERAINT', lw=1.5, color='blue')
    qplt.plot(CRU_mean.coord('year'), CRU_mean, label='Observed', lw=2, color='black')

    #create a legend and set its location to under the graph
    plt.legend(loc="upper center", bbox_to_anchor=(0.5,-0.05), fancybox=True, shadow=True, ncol=2)

    #create a title
    plt.title('Mean Near Surface Temperature for Malawi 1989-2008', fontsize=11)

    #add grid lines
    plt.grid()

    #save the image of the graph and include full legend
    #plt.savefig('ERAINT_Temperature_LineGraph_Annual', bbox_inches='tight')

    #show the graph in the console
    iplt.show()


if __name__ == '__main__':
    main()
and this produces this graph:
As you can see I have limited my data from 1989 to 2008, but the axis goes from 1985 to 2010, how can I make this tighter?
Thank you!
For your monthly graph you may be able to change it by setting the xticks - this has to be numeric but you can also set labels to use instead of the numbers. Something like
plt.xticks(range(12), calendar.month_abbr[1:13])
may work (depends on the format of your data, you may need to plot month number rather than month name). You will need to import calendar to get the above working.
For your yearly graph you should just be able to set the x-axis limits using
plt.xlim((xmin, xmax))
where xmin is probably 1989 and xmax is 2008.
Imagine that there are 10 houses, each containing anywhere from one to an unbounded number of persons. Each of those persons sends a number of messages, each containing their userid and the house number; a person can send any number of messages, from 1 upwards. I want to know the average number of messages sent per person for each house, so that I can later plot which house has the largest average number of messages.
Now that I've explained it conceptually: the houses aren't really houses but latitude bands, e.g. from -90 to -89 and so on, and a person can send messages from different houses.
So I've got a database with latitude and senderID. I want to plot the density of latitudes per unique senderID:
Number of rows/Number of unique userids at each latitude over an interval
This is an sample input:
lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95,
40.72, 47.59, 54.42, 63.84, 76.77, 77.43, 78.54]
userid= [5, 7, 6, 6, 6, 6, 5, 2,
2, 2, 1, 5, 10, 9 ,8]
Here are the corresponding densities:
-80 to -90: 1
-40 to -50: 1
-30 to -40: 4
-20 to -30: 1
40 to 50: 2
50 to 60: 1
60 to 70: 1
70 to 80: 1
Another input:
lat = [70,70,70,70,70,80,80,80]
userid = [1,2,3,4,5,1,1,2]
The density for latitude 70 is 1, while the density for latitude 80 is 1.5.
If I would do this through a database query/pseudocode I would do something like:
SELECT count(latitude) FROM messages WHERE latitude < 79 AND latitude > 69
SELECT count(distinct userid) FROM messages WHERE latitude < 79 AND latitude > 69
The density would then be count(latitude)/count(distinct userid) - also to be interpreted as totalmessagesFromCertainLatitude/distinctUserIds. This would be repeated for intervals from -90 to 90, i.e -90<latitude<-89 up to 89<latitude<90
To get any help with this is probably a far stretch, but I just cant organize my thoughts to do this while I'm confident there are no errors. I would be happy for anything. I'm sorry if I was unclear.
Because this packs so neatly into pandas' built-ins, it's probably fast in pandas for big datasets.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
from math import floor

lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95,
       40.72, 47.59, 54.42, 63.84, 76.77, 77.43, 78.54]

userid = [5, 7, 6, 6, 6, 6, 5, 2,
          2, 2, 1, 5, 10, 9, 8]

# Width of one latitude zone in degrees; set to 1 for one-degree zones.
zonewidth = 10

df = pd.DataFrame({'userid': userid, 'lat': lat})

# Label each row with the lower edge of its zone. Floor division rounds
# towards minus infinity, so -83.76 lands in the -90 zone. Done column-wise
# because assigning a map() object to a column fails on Python 3.
df['zone'] = (df.lat // zonewidth) * zonewidth

dfz = df.groupby('zone')  # GroupBy object; iterating yields (zone, sub-frame)

#for k, v in dfz: # useful for exploring the GroupBy object
#    print(k, v.userid.values, float(len(v.userid.values))/len(set(v.userid.values)))

# (zone, messages per distinct sender) for every zone that has data
p = [(k, float(len(v.userid.values)) / len(set(v.userid.values))) for k, v in dfz]

# plotting could be tightened up -- PatchCollection?
R = [Rectangle((x, 0), zonewidth, y, facecolor='red', edgecolor='black', fill=True) for x, y in p]
fig, ax = plt.subplots()
for r in R:
    ax.add_patch(r)
plt.xlim((-90, 90))
# Leave half a unit of headroom above the tallest bar.
tall = max(r.get_height() for r in R)
plt.ylim((0, tall + 0.5))
plt.show()
For the first set of test data:
I'm not 100% sure I've understood the output you want, but this will produce a stepped, cumulative histogram-like plot with the x-axis being latitudes (binned) and the y axis being the density you define above.
From your sample code, you already have numpy installed and are happy to use it. The approach I would take is to get two data sets rather like what would be returned by your SQL sample and then use them to get the densities and then plot. Using your existing latitude / userid data format - it might look something like this
EDIT: Removed first version of code from here and some comments which were redundant following clarification and question edits from the OP
Following comments and OP clarification - I think this is what is desired:
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby
def draw_hist(latitudes, userids):
    """Bar-plot messages per distinct sender for each one-degree latitude bin.

    Args:
        latitudes: sequence of latitudes, one per message.
        userids: sequence of sender ids, parallel to ``latitudes``.

    For each occupied bin, the bar height is (number of messages in the bin)
    divided by (number of distinct userids in the bin). The bar is positioned
    at the bin's lower edge. Shows the figure; returns nothing.
    """
    # Local import: the rewrite needs Counter, which the file does not import.
    from collections import Counter

    min_lat = -90
    max_lat = 90
    binwidth = 1
    bin_range = np.arange(min_lat, max_lat, binwidth)

    # digitize gives the index of the bin each latitude falls in; index b
    # corresponds to the lower edge bin_range[b - 1].
    binned_latitudes = np.digitize(latitudes, bin_range).tolist()

    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, which
    # cannot be sorted in place or walked twice.
    all_in_bins = list(zip(binned_latitudes, userids))

    # These two counts correspond to the SQL / pseudocode in the question,
    # for each latitude bin.
    bin_count_all = Counter(b for b, _ in all_in_bins)
    bin_count_unique = Counter(b for b, _ in set(all_in_bins))

    occupied_bins = sorted(bin_count_all)
    lower_edges = [bin_range[b - 1] for b in occupied_bins]
    densities = [bin_count_all[b] * 1.0 / bin_count_unique[b] for b in occupied_bins]

    # plot as standard bar - note you can put uneven widths in
    # as an array-like here if necessary
    plt.bar(lower_edges, densities, width=binwidth)
    plt.show()
    # can save away plot here if desired


latitudes = [-70.5, 5.3, 70.32, 70.43, 5, 32, 80, 80, 87.3]
userids = [1,1,2,2,4,5,1,1,2]
draw_hist(latitudes,userids)
Sample output with different bin widths on OP dataset
I think this solves the case, although it isn't efficient at all:
# Read (latitude, userid) rows from the dynamicMessage table and draw a
# histogram in which each integer latitude b appears int(density) times.
# NOTE(review): `lite`, `databasepath`, `itemgetter`, `time` and `plt` are not
# defined in this snippet - presumably sqlite3 imported as lite, etc.; confirm.
# NOTE(review): the original indentation was lost; the nesting below is a
# reconstruction and should be checked against the author's intent.
con = lite.connect(databasepath)
binwidth = 1  # assigned but not read anywhere below
latitudes = []
userids = []
info = []
densities = []
with con:
    cur = con.cursor()
    cur.execute('SELECT latitude, userid FROM dynamicMessage')
    con.commit()
    print "executed"
    # Pull rows one at a time until the cursor is exhausted.
    while True:
        tmp = cur.fetchone()
        if tmp != None:
            info.append([float(tmp[0]),float(tmp[1])])
        else:
            break
    # Sort rows by latitude so each bin is a prefix of the remaining lists.
    info = sorted(info, key=itemgetter(0))
    for x in info:
        latitudes.append(x[0])
        userids.append(x[1])
    x = 0  # not read below
    latitudecount = 0  # not read below
    for b in range(int(min(latitudes)),int(max(latitudes))+1):
        # Number of not-yet-consumed latitudes strictly below b.
        numlatitudes = sum(i<b for i in latitudes)
        # Only a group of two or more is consumed; a single latitude stays in
        # the lists and is counted again at the next b.
        if numlatitudes > 1:
            tempdensities = latitudes[0:numlatitudes]
            latitudes = latitudes[numlatitudes:]
            tempuserids = userids[0:numlatitudes]
            userids = userids[numlatitudes:]
            # messages / distinct senders for this group.
            # NOTE(review): the print statement above implies Python 2, where
            # int / int truncates - confirm that is intended.
            density = numlatitudes/len(list(set(tempuserids)))
            if density>1:
                # Repeat b so the histogram bar at b reflects the density.
                tempdensities = [b]*int(density)
                densities.extend(tempdensities)
    plt.hist(densities, bins=len(list(set(densities))))
    plt.savefig('latlongstats'+'t'+str(time.strftime("%H:%M:%S")), format='png')
What follows is not a complete solution in terms of plotting the required histogram, but I think it's nevertheless worthy of being reported
The bulk of the solution is the function below: we scan the list of tuples, select the ones in the required range, and count
the number of selected tuples
the unique ids, using a trick that consists of creating a set (which automatically discards duplicates) and taking its size
finally, we return the required ratio, or zero if the count of distinct ids is zero
def ratio(d, mn, mx):
    """Return messages per distinct id for latitudes in the range [mn, mx).

    ``d`` is an iterable of ``(lat, uid)`` pairs. The result is the number of
    pairs whose latitude satisfies ``mn <= lat < mx`` divided by the number
    of distinct ids among them, or 0 when no pair falls in the range.
    """
    selected = 0
    distinct_ids = set()
    for latitude, uid in d:
        if mn <= latitude < mx:
            selected += 1
            distinct_ids.add(uid)
    if not distinct_ids:
        return 0
    return 1.0 * selected / len(distinct_ids)
The data is input and assigned, via zip, to a list of tuples
lat = [-83.76, -44.88, -38.36, -35.50, -33.99, -31.91, -27.56, -22.95,
       -19.00, -12.32, -6.14, -1.11, 4.40, 10.23, 19.40, 31.18,
       40.72, 47.59, 54.42, 63.84, 76.77]
userid = [52500.0, 70100.0, 35310.0, 47776.0, 70100.0, 30991.0, 37328.0, 25575.0,
          37232.0, 6360.0, 52908.0, 52908.0, 52908.0, 77500.0, 345.0, 6360.0,
          3670.0, 36690.0, 3720.0, 2510.0, 2730.0]
# Materialise the pairs as a list: the data is scanned once per interval, and
# on Python 3 a bare zip() would be exhausted after the first scan.
data = list(zip(lat, userid))
preparation of the bins
# Bin edges every 10 degrees from -90 to 90 inclusive.
extremes = range(-90, 91, 10)
# Consecutive (lower, upper) edge pairs, kept as a list so the bins can be
# indexed and reused (a bare zip() is single-use on Python 3).
intervals = list(zip(extremes[:-1], extremes[1:]))
actual computation, the result is a list of floats that can be passed to the relevant pyplot functions
# One density per (lower, upper) interval, in interval order.
ratios = [ratio(data, lower, upper) for lower, upper in intervals]
# print() with a single argument gives the same output on Python 2 and 3.
print(ratios)
# [1.0, 0, 0, 0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 0]