Related
I am trying to construct a function with np.polyfit() to extrapolate data according to my need. I have some temperature and pressure observations which I have plotted. I need to fit a best-fit line in the observations so that I can extrapolate (get the temperature in each pressure level for a different surface temperature, i.e., the temperature of the last pressure level, assuming that the shape of the fit remains constant) the observations to my need. This is what I have done so far:
import pandas as pd
import glob
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import datetime
dfs = []
pressure = []
temp = []
#reading the data
for fname in glob.glob('/home/swadhin/project/radiosonde_data/in/pb/*.txt'):
df = pd.read_csv(fname, skiprows=1, delimiter='\s+',
names=['LVLpTYP', 'ETIME', 'PRESSURE','GPH','TEMP','RH','DPDP','WDIR','WSPD'])
p1 = df['PRESSURE'].to_numpy(dtype = np.float64)
t1 = df['TEMP'].to_numpy(dtype = np.float64)
pressure.append(p1)
temp.append(t1)
dfs.append(df)
p =[]
for i in pressure:
a = np.ma.masked_equal(i,-9999.) #masking the fill_values
p.append(a)
p = [i/100 for i in p] #converting the pressure to hPa
t =[]
for j in temp:
b = np.ma.masked_equal(j,-9999.)
c = np.ma.masked_equal(b,-8888.)
t.append(c)
t = [i/10 for i in t] #converting the temp to the appropriate unit
zipped = zip(p, t)
z_c = [np.polyfit(x,y,2) for x,y in zipped]
p_array = np.linspace(1000,0, num = 101)
for i in range(len(p)):
x = p[i]
y = t[i]
z = z_c[i]
xp = p_array[i]
p = np.poly1d(z)
plt.subplot(1,2,i+1)
plt.plot(y, x, '.', p(xp),xp, '-');
plt.gca().invert_yaxis()
But I am not getting any plots.
Earlier to plot the pressure and temperature from the observations I did this and got the following plot:
for i in range(len(p)):
plt.plot(t[i],p[i])
plt.gca().invert_yaxis()
plt.ylim(bottom = 1010)
plt.ylabel('Pressure(hPa)')
plt.xlabel('Temperature($^\circ$C)')
The pressure and temperature arrays have an inhomogeneous structure.
I am attaching the datafiles for reference:
Pressure data = https://drive.google.com/file/d/13e7u8iBZWvmHAj0yt9MXEtB1eniI9xOR/view?usp=sharing
Temp data = https://drive.google.com/file/d/13dysYQlutg0_a9aJnm2U3_lCecxSObFN/view?usp=sharing
I am trying to apply this code:
import h3
coords_1 = (52.2296756, 21.0122287)
coords_2 = (52.406374, 16.9251681)
distance = h3.point_dist(coords_1, coords_2, unit='km')
distance
to a pandas dataframe. This is my not working attempt.
data = {'lat1':[52.2296756],
'long1':[21.0122287],
'lat2':[52.406374],
'long2':[16.9251681],
}
df = pd.DataFrame(data)
df
df['distance'] = = h3.point_dist((df['lat1'], df['long1']), (df['lat2'], df['long2']), unit='km')
Any help would be very much appreciated. Thanks!
Assuming you have more than a single row for which you would like to compute the distance you can use apply as follows:
df['Dist'] = df.apply(lambda row: h3.point_dist((row['lat1'], row['long1']), (row['lat2'], row['long2'])), axis=1)
Which will add a column to your dataframe simi9lar to the following:
lat1 long1 lat2 long2 Dist
0 52.229676 21.012229 52.406374 16.925168 2.796556
1 57.229676 30.001176 48.421365 17.256314 6.565542
Please note, my distance calculations may not agree with yours, since I used a dummy function for h3.point_dist computation
It's working you need to just delete the second "="
data = {'lat1':[52.2296756],
'long1':[21.0122287],
'lat2':[52.406374],
'long2':[16.9251681],
}
df = pd.DataFrame(data)
df
df['distance'] = h3.point_dist((df['lat1'], df['long1']), (df['lat2'], df['long2']), unit='km')
Does anyone have any idea of how I can find the intersection of these two graphs? (image below)
energ_ac, price_compvend and energ_ac1, price_compven1 are set of x,y values.
Please note the following code which gets the values from a database and then plots the two graphs:
I can only get the intersection manually, and I want to get it automatically
import matplotlib.pyplot as plt
import pyodbc
import pandas as pd
import numpy as np
import string as str
import sys
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=sys.maxsize)
conn = pyodbc.connect(Trusted_Connection='yes', driver='{SQL Server}', server='srv03',
database='mercadoOMIE_curvas') # Ligação à BD no sqlserver
SQL_Query = pd.read_sql_query("""SELECT * FROM curva_pbc_uof_2020_1_12 WHERE ("4" = 'C' AND "0" = '1' AND "7" = 'O')""", conn)
df = pd.DataFrame(SQL_Query, columns=['0','1','2','3','4','5','6','7','8'])
df['5'] = df['5'].str.replace('.','', regex = True)
df['6'] = df['6'].str.replace('.','', regex = True)
df['5'] = pd.to_numeric(df['5'].str.replace(',','.'), errors='coerce')
df['6'] = pd.to_numeric(df['6'].str.replace(',','.'), errors='coerce')
energ_ac = np.zeros(len(df['5']))
energ_ac[0] = df['5'][0]
for x in range (1, len(df['5'])):
energ_ac[x] = energ_ac[x-1]+df['5'][x]
price_compvend = df['6'].to_numpy()
plt.plot(energ_ac,price_compvend)
SQL_Query1 = pd.read_sql_query("""SELECT * FROM curva_pbc_uof_2020_1_12 WHERE ("4" = 'V' AND "0" = '1' AND "7" = 'O')""", conn)
df1 = pd.DataFrame(SQL_Query1, columns=['0','1','2','3','4','5','6','7','8'])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#print(df1)
df1['5'] = df1['5'].str.replace('.','', regex = True)
df1['6'] = df1['6'].str.replace('.','', regex = True)
df1['5'] = pd.to_numeric(df1['5'].str.replace(',','.'), errors='coerce')
df1['6'] = pd.to_numeric(df1['6'].str.replace(',','.'), errors='coerce')
energ_ac1 = np.zeros(len(df1['5']))
energ_ac1[0] = df1['5'][0]
for x in range (1, len(df1['5'])):
energ_ac1[x] = energ_ac1[x-1]+df1['5'][x]
price_compvend1 = df1['6'].to_numpy()
plt.plot(energ_ac1,price_compvend1)
plt.show()
The solution is this link: np.array intersection // AttributeError: 'module' object has no attribute 'PiecewisePolynomial'
import scipy.interpolate as interpolate
import scipy.optimize as optimize
import numpy as np
x1 = np.array([1.4,2.1,3,5.9,8,9,23])
y1 = np.array([2.3,3.1,1,3.9,8,9,11])
x2 = np.array([1,2,3,4,6,8,9])
y2 = np.array([4,12,7,1,6.3,8.5,12])
# linear interpolators
opts = {'fill_value': 'extrapolate'}
f1 = interpolate.interp1d(x1,y1,**opts)
f2 = interpolate.interp1d(x2,y2,**opts)
# possible range for an intersection
xmin = np.min((x1,x2))
xmax = np.max((x1,x2))
# number of intersections
xuniq = np.unique((x1,x2))
xvals = xuniq[(xmin<=xuniq) & (xuniq<=xmax)]
# note that it's bad practice to compare floats exactly
# but worst case here is a bit of redundance, no harm
# for each combined interval there can be at most 1 intersection,
# so looping over xvals should hopefully be enough
# one can always err on the safe side and loop over a `np.linspace`
intersects = []
for xval in xvals:
x0, = optimize.fsolve(lambda x: f1(x)-f2(x), xval)
if (xmin<=x0<=xmax
and np.isclose(f1(x0),f2(x0))
and not any(np.isclose(x0,intersects))):
intersects.append(x0)
print(intersects)
print(f1(intersects))
print(f2(intersects))
Use bellow script:
diff_vector = abs(price_compvend - price_compvend1)
min_index = np.where(diff_vector == np.min(diff_vector))
print('Intersection point is ({},{})'.format(energ_ac[min_index],
price_compvend[min_index]))
You could use set.intersection() method to get intersection points of the graphs.
graph_points1 = set(zip(energ_ac,price_compvend))
graph_points2 = set(zip(energ_ac1,price_compvend1))
intersection_points = graph_points1.intersection(graph_points2)
I recently wrote some code, and it is creating the outputs I want, however it is taking an eternity... I have 700k customer lines to iterate through for 435 warehouses, and to run 100k took 3 hours.
I know this question may be vague, but im not sure why it is running so slowly. I am suspecting it is due to my nested for loops, but the data won't load in unless I chunk it.
import pandas as pd
import geopy.distance
dfware = pd.read_csv('dfware.csv', encoding = "ISO-8859-1")
dfcust = pd.read_csv(r'dfcust.csv', encoding = "ISO-8859-1")
ppmwinarray = []
#Chunk size to load in
csize=10 ** 3
Bigoutput = []
y=0
for dfcust in pd.read_csv(r'dfcust.csv', encoding = "ISO-8859-1", chunksize = csize):
#For all columns (index) and rows, in datacust, iterate through those rows.
y+=1
print(y)
ppmwinarray = []
z=0
for index,row in dfcust.iterrows():
#Assign the Lattitude variable in the row titled 'lat' to lat1
lat1 = row['Lat']
# Assign the longitude variable in the row titled 'Long' to lon1
lon1 = row['Lon']
dlist=[]
dindex=[]
print(z)
z+=1
for index2, row2 in dfware.iterrows():
y+=1
lat2 = row2['Lat']
lon2 = row2['Lon']
coords_1 = [lat1, lon1]
coords_2 = [lat2, lon2]
distance = geopy.distance.distance(coords_1, coords_2).miles
if distance > 300:
distance = 0
else:
distance = distance
dlist.append(distance)
d_i = ((300-distance)/300)
if d_i != 1:
d_i=d_i
else:
d_i=0
dindex.append(d_i)
sumdi = sum(dindex)
if sumdi == 0:
sumdi = 1
#Defining last 13 as PPM Index
ppmdindex =(dindex[-13:])
#Calculating their independent win chance
IndependentWinChance=[ x/ sumdi for x in ppmdindex]
#Store in an array
ppmarray = IndependentWinChance
#Summing independent chances to get sum chance
sumppmWinChance = sum(ppmarray)
#Appending the sum of all distance indexes
ppmarray.append(sumdi)
#Appending the sum of ppm win chance
ppmarray.append(sumppmWinChance)
ppmwinarray.append(ppmarray)
Bigoutput.extend(ppmwinarray)
Bigoutputdf = pd.DataFrame(Bigoutput)
Bigoutputdf.to_csv('customers1234.csv')
exit()
I am using some code from a workshop to extract data from netCDF files by the coordinates closest to my specified coordinates. When using just one set of coordinates I am able to extract the values I need without trouble as below:
import numpy as np
import netCDF4
from math import pi
from numpy import cos, sin
def tunnel_fast(latvar,lonvar,lat0,lon0):
'''
Find closest point in a set of (lat,lon) points to specified point
latvar - 2D latitude variable from an open netCDF dataset
lonvar - 2D longitude variable from an open netCDF dataset
lat0,lon0 - query point
Returns iy,ix such that the square of the tunnel distance
between (latval[it,ix],lonval[iy,ix]) and (lat0,lon0)
is minimum.
'''
rad_factor = pi/180.0 # for trignometry, need angles in radians
# Read latitude and longitude from file into numpy arrays
latvals = latvar[:] * rad_factor
lonvals = lonvar[:] * rad_factor
ny,nx = latvals.shape
lat0_rad = lat0 * rad_factor
lon0_rad = lon0 * rad_factor
# Compute numpy arrays for all values, no loops
clat,clon = cos(latvals),cos(lonvals)
slat,slon = sin(latvals),sin(lonvals)
delX = cos(lat0_rad)*cos(lon0_rad) - clat*clon
delY = cos(lat0_rad)*sin(lon0_rad) - clat*slon
delZ = sin(lat0_rad) - slat;
dist_sq = delX**2 + delY**2 + delZ**2
minindex_1d = dist_sq.argmin() # 1D index of minimum element
iy_min,ix_min = np.unravel_index(minindex_1d, latvals.shape)
return iy_min,ix_min
ncfile = netCDF4.Dataset('E:\wind_level2_1.nc', 'r')
latvar = ncfile.variables['latitude']
lonvar = ncfile.variables['longitude']
#_________GG turbine_________GAD10 Latitude 51.735516, GAD10 Longitude 1.942656
iy,ix = tunnel_fast(latvar, lonvar, 51.735516, 1.942656)
print('Closest lat lon:', latvar[iy,ix], lonvar[iy,ix])
refLAT=latvar[iy,ix]
refLON = lonvar[iy,ix]
#try to find the data for this location
SARwind = ncfile.variables['sar_wind'][:,:]
ModelWind = ncfile.variables['model_speed'][:,:]
print 'iy,ix' #appears to be the index of the value of Lat,lon
print SARwind[iy,ix]
ncfile.close()
Now I am trying to loop through a text file containing coordinates coord_list to extract sets of coordinates, find the data then move to the next set of coordinates in the list. This code works on it's own as below:
import csv
from decimal import Decimal
with open('Turbine_locs_no_header.csv','rb') as f:
reader = csv.reader(f)
#coord_list = list(reader)
coord_list = [reader]
end_row = len(coord_list)
lon_ind=1
lat_ind=2
for row in range(0, end_row-1):#end_row - 1 due to the 0 index
turbine_lat = coord_list[row][lat_ind]
turbine_lon = coord_list[row][lon_ind]
turbine_lat = [Decimal(turbine_lat)]
print 'lat',turbine_lat, 'lon',turbine_lon, row
However, I want to pass coordinates from the text file to this part of the original code iy,ix = tunnel_fast(latvar, lonvar, 51.94341, 1.922094888), replacing the numbers with variables iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon). I try to combine the two codes by creating a function get_coordinates, I get the following errors
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 65, in <module>
get_coordinates(coord_list, latvar, lonvar)
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 51, in get_coordinates
iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)
File "C:/Users/mm/test_nc_bycoords_GG_turbines_AGW.py", line 27, in tunnel_fast
lat0_rad = lat0 * rad_factor
TypeError: can't multiply sequence by non-int of type 'float'
I thought this is because the turbine_lat and turbine_lon are list items so cannot be used, but this doesn't seem to be connected to the errors. I know this code needs more work anyway, but if anyone could help me spot where I am going wrong that would be very helpful. My attempt to combine the two codes is below.
import numpy as np
import netCDF4
from math import pi
from numpy import cos, sin
import csv
# edited from https://github.com/Unidata/unidata-python-workshop/blob/a56daa50d7b343c7debe93968683613642d6b9f7/notebooks/netcdf-by-coordinates.ipynb
def tunnel_fast(latvar,lonvar,lat0,lon0):
'''
Find closest point in a set of (lat,lon) points to specified point
latvar - 2D latitude variable from an open netCDF dataset
lonvar - 2D longitude variable from an open netCDF dataset
lat0,lon0 - query point
Returns iy,ix such that the square of the tunnel distance
between (latval[it,ix],lonval[iy,ix]) and (lat0,lon0)
is minimum.
'''
rad_factor = pi/180.0 # for trignometry, need angles in radians
# Read latitude and longitude from file into numpy arrays
latvals = latvar[:] * rad_factor
lonvals = lonvar[:] * rad_factor
ny,nx = latvals.shape
lat0_rad = lat0 * rad_factor
lon0_rad = lon0 * rad_factor
# Compute numpy arrays for all values, no loops
clat,clon = cos(latvals),cos(lonvals)
slat,slon = sin(latvals),sin(lonvals)
delX = cos(lat0_rad)*cos(lon0_rad) - clat*clon
delY = cos(lat0_rad)*sin(lon0_rad) - clat*slon
delZ = sin(lat0_rad) - slat;
dist_sq = delX**2 + delY**2 + delZ**2
minindex_1d = dist_sq.argmin() # 1D index of minimum element
iy_min,ix_min = np.unravel_index(minindex_1d, latvals.shape)
return iy_min,ix_min
#________________my edits___________________________________________________
def get_coordinates(coord_list, latvar, lonvar):
"this takes coordinates from a .csv and assigns them to variables"
end_row = len(coord_list)
lon_ind=1
lat_ind=2
for row in range(0, end_row-1):#end_row - 1 due to the 0 index
turbine_lat = coord_list[row][lat_ind]
turbine_lon = coord_list[row][lon_ind]
iy, ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)
print('Closest lat lon:', latvar[iy, ix], lonvar[iy, ix])
#________________________________________________________________________________________________________________________
ncfile = netCDF4.Dataset('NOGAPS_wind_level2_1.nc', 'r')
latvar = ncfile.variables['latitude']
lonvar = ncfile.variables['longitude']
#____added in to pass to get coordinates function
with open('Turbine_locs_no_header.csv','rb') as f:
reader = csv.reader(f)
coord_list = list(reader)
#_________take latitude from coordinateas function
get_coordinates(coord_list, latvar, lonvar)
#iy,ix = tunnel_fast(latvar, lonvar, turbine_lat, turbine_lon)#get these from the 'assign_coordinates_fromlist.py
#print('Closest lat lon:', latvar[iy,ix], lonvar[iy,ix])
SARwind = ncfile.variables['sar_wind'][:,:]
ModelWind = ncfile.variables['model_speed'][:,:]
print 'iy,ix' #appears to be the index of the value of Lat,lon
print SARwind[iy,ix]
ncfile.close()
When I try to convert
You can unpack an argument list using *args (see the docs). In your case you could do tunnel_fast(latvar, lonvar, *coord_list[row]). You need to make sure that the order of arguments in coord_list[row] is correct and if coord_list[row] contains more than the two values then you need to slice it appropriately.
Thanks to help from a_guest
It was a simple problem of lat0 and lon0 being passed as
<type 'str'> to tunnel_fast when it requires <type 'float'>. This appears to come from loading the coord_list as a list.
with open('Turbine_locs_no_header.csv','rb') as f:
reader = csv.reader(f)
coord_list = list(reader)
The workaround I used was to convert lat0 and lon0 to floats at the beginning of tunnel_fast
lat0 = float(lat0)
lon0 = float(lon0)
I am sure there is a more elegant way to do this, but it works.