self.flights_list = ApiConnector('airlabs', 'flights', 'dep_icao,arr_icao,flight_number,flag,aircraft_icao').get_data_from_api()
self.airports_list = ApiConnector('airlabs', 'airports', 'icao_code,name,lat,lng')

def get_airport_cordinates(self, airport_name):
    for i in self.airports_list.get_data_from_api():
        if i.get('icao_code') == airport_name:
            return i['lat'], i['lng']

def list_all_flights(self):
    for i in self.flights_list:
        if i.get('dep_icao') and i.get('arr_icao'):
            print(f"Flight Number is {i['flight_number']} and the airline is {i['flag']} and the aircraft is {i['aircraft_icao']} going from {i['dep_icao']} to {i['arr_icao']}")
            print(f'Flight distance is {Emissions().calculate_distance(ApiResponse().get_airport_cordinates(i["dep_icao"]), ApiResponse().get_airport_cordinates(i["arr_icao"]))} km')
            print(f'Flight CO2 emissions is {Emissions().calculate_co2_emissions(Emissions().calculate_distance(ApiResponse().get_airport_cordinates(i["dep_icao"]), ApiResponse().get_airport_cordinates(i["arr_icao"])))} kg')
I am trying to iterate over data from the Airlabs API. There are basically two queries: one for flights and one for airports (which holds the latitude and longitude, matched to the flights via the airport codes). However, the two responses are around 8 MB, and iterating through all of them takes ages.
Is there any way to speed it up?
I need to speed up the for loops inside.
Generally, this line works flawlessly (10,000 queries within a second):
print(f"Flight Number is {i['flight_number']} and the airline is {i['flag']} and the aircraft is {i['aircraft_icao']} going from {i['dep_icao']} to {i['arr_icao']}")
However, it slows down on this section:
print(f'Flight distance is {Emissions().calculate_distance(ApiResponse().get_airport_cordinates(i["dep_icao"]), ApiResponse().get_airport_cordinates(i["arr_icao"]))} km')
which matches results from flights_list (around 8 MB) against airports_list (around 2 MB); the distance calculation itself is relatively fast. Any guidance on how to speed it up?
Problem solved by moving the static queries from the API to a local file :)
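For reference, the slow part is the linear scan inside get_airport_cordinates, which rescans the whole airport list for every flight. Loading the airports once (from the API or from the saved file) into a dictionary keyed by icao_code turns each lookup into a constant-time operation. A minimal sketch, assuming the response is a list of dicts with icao_code/lat/lng fields (the function and variable names below are illustrative):

# Build the lookup table once (e.g. in __init__), then reuse it for every flight.
airports = ApiConnector('airlabs', 'airports', 'icao_code,name,lat,lng').get_data_from_api()
coords_by_icao = {a['icao_code']: (a['lat'], a['lng']) for a in airports if a.get('icao_code')}

def get_airport_coordinates(icao):
    # O(1) dictionary lookup instead of scanning the whole airport list
    return coords_by_icao.get(icao)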
I am working with extremely high dimensional biological count data (single cell RNA sequencing where rows are cell ID and columns are genes).
Each dataset is a separate flat file (AnnData format). Each flat file can be broken down by various metadata attributes, including by cell type (eg: muscle cell, heart cell), subtypes (eg: a lung dataset can be split into normal lung and cancerous lung), cancer stage (eg: stage 1, stage 2), etc.
The goal is to pre-compute aggregate metrics for a specific metadata column, sub-group, dataset, cell-type, gene combination and keep that readily accessible such that when a person queries my web app for a plot, I can quickly retrieve results (refer to Figure below to understand what I want to create). I have generated Python code to assemble the dictionary below and it has sped up how quickly I can create visualizations.
Only issue now is that the memory footprint of this dictionary is very high (there are ~10,000 genes per dataset). What is the best way to reduce the memory footprint of this dictionary? Or, should I consider another storage framework (briefly saw something called Redis Hashes)?
One option to reduce your memory footprint but keep fast lookup is to use an hdf5 file as a database. This will be a single large file that lives on your disk instead of memory, but is structured the same way as your nested dictionaries and allows for rapid lookups by reading in only the data you need. Writing the file will be slow, but you only have to do it once and then upload to your web-app.
To test this idea, I've created two test nested dictionaries in the format of the diagram you shared. The small one has 1e5 metadata/group/dataset/celltype/gene entries, and the other is 10 times larger.
Writing the small dict to hdf5 took ~2 minutes and resulted in a file 140 MB in size while the larger dict-dataset took ~14 minutes to write to hdf5 and is a 1.4 GB file.
Querying the small and large HDF5 files took similar amounts of time, showing that the queries scale well to more data.
Here's the code I used to create the test dict-datasets, write them to HDF5, and query:
import h5py
import numpy as np
import time

def create_data_dict(level_counts):
    """
    Create test data in the same nested-dict format as the diagram you show
    The Agg_metric values are random floats between 0 and 1
    (you shouldn't need this function since you already have real data in dict format)
    """
    if not level_counts:
        return {f'Agg_metric_{i+1}': np.random.random() for i in range(num_agg_metrics)}
    level, num_groups = level_counts.popitem()
    return {f'{level}_{i+1}': create_data_dict(level_counts.copy()) for i in range(num_groups)}

def write_dict_to_hdf5(hdf5_path, d):
    """
    Write the nested dictionary to an HDF5 file to act as a database
    only have to create this file once, but can then query it any number of times
    (unless the data changes)
    """
    def _recur_write(f, d):
        for k, v in d.items():
            # check if the next level is also a dict
            sk, sv = v.popitem()
            v[sk] = sv
            if type(sv) == dict:
                # this is a 'node', move on to next level
                _recur_write(f.create_group(k), v)
            else:
                # this is a 'leaf', stop here
                leaf = f.create_group(k)
                for sk, sv in v.items():
                    leaf.attrs[sk] = sv
    with h5py.File(hdf5_path, 'w') as f:
        _recur_write(f, d)

def query_hdf5(hdf5_path, search_terms):
    """
    Query the hdf5_path with a list of search terms
    The search terms must be in the order of the dict, and have a value at each level
    Output is a dict of agg stats
    """
    with h5py.File(hdf5_path, 'r') as f:
        k = '/'.join(search_terms)
        try:
            f = f[k]
        except KeyError:
            print('oh no! at least one of the search terms wasnt matched')
            return {}
        return dict(f.attrs)

################
#    start     #
################

# this "small_level_counts" results in an hdf5 file of size 140 MB (took < 2 minutes to make)
# all possible nested dictionaries are made,
# so there are 40*30*10*3*3 = ~1e5 metadata/group/dataset/celltype/gene entries
num_agg_metrics = 7
small_level_counts = {
    'Gene': 40,
    'Cell_Type': 30,
    'Dataset': 10,
    'Unique_Group': 3,
    'Metadata': 3,
}

# "large_level_counts" results in an hdf5 file of size 1.4 GB (took 14 mins to make)
# has 400*30*10*3*3 = ~1e6 metadata/group/dataset/celltype/gene combinations
num_agg_metrics = 7
large_level_counts = {
    'Gene': 400,
    'Cell_Type': 30,
    'Dataset': 10,
    'Unique_Group': 3,
    'Metadata': 3,
}

# Determine which test dataset to use
small_test = True
if small_test:
    level_counts = small_level_counts
    hdf5_path = 'small_test.hdf5'
else:
    level_counts = large_level_counts
    hdf5_path = 'large_test.hdf5'

np.random.seed(1)

start = time.time()
data_dict = create_data_dict(level_counts)
print('created dict in {:.2f} seconds'.format(time.time() - start))

start = time.time()
write_dict_to_hdf5(hdf5_path, data_dict)
print('wrote hdf5 in {:.2f} seconds'.format(time.time() - start))

# Search terms in order of most broad to least
search_terms = ['Metadata_1', 'Unique_Group_3', 'Dataset_8', 'Cell_Type_15', 'Gene_17']
start = time.time()
query_result = query_hdf5(hdf5_path, search_terms)
print('queried in {:.2f} seconds'.format(time.time() - start))

direct_result = data_dict['Metadata_1']['Unique_Group_3']['Dataset_8']['Cell_Type_15']['Gene_17']
print(query_result == direct_result)
Although Python dictionaries themselves are fairly efficient in terms of memory usage, you are likely storing multiple copies of the strings you use as dictionary keys. From your description of your data structure, it is likely that you have 10,000 copies of "Agg metric 1", "Agg metric 2", etc. for every gene in your dataset. These duplicate strings are probably taking up a significant amount of memory. They can be deduplicated with sys.intern so that, although you still have just as many references to the string in your dictionary, they all point to a single copy in memory. You only need a minimal adjustment to your code: simply change the assignment to data[sys.intern('Agg metric 1')] = value. I would do this for all of the keys used at all levels of your dictionary hierarchy.
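A minimal sketch of the idea (the loading loop and metric values below are illustrative, not your actual code):

import sys

def make_metrics(values):
    # intern the repeated key strings so every per-gene dict shares one copy
    return {sys.intern(f'Agg_metric_{i+1}'): v for i, v in enumerate(values)}

gene_dicts = [make_metrics([0.1, 0.2, 0.3]) for _ in range(10_000)]

# without intern each f-string call creates a brand-new key object;
# with intern all 10,000 dicts reference the same key strings
assert list(gene_dicts[0])[0] is list(gene_dicts[-1])[0]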
Say I have two taxi orders, with Origin1, Destination1 and Origin2, Destination2 (O1, O2, D1, D2).
I want to calculate the possibility of ridesharing, so I need the path between each pair of different points.
And, here's my code:
def path_time(point1, point2):
    path = ox.distance.shortest_path(road, point1, point2, weight='travel_time')
    if path is None:
        # If there isn't a path, a big weight will be set, and it won't be selected during the matching process.
        route_time = 9999
    else:
        route_time = int(sum(ox.utils_graph.get_route_edge_attributes(road, path, "travel_time")))
    return route_time, path
Since there are four points, I need to do this six times, where tp means travel path:
tpO1O2 = path_time(O1,O2)
tpO1D1 = path_time(O1,D1)
tpO1D2 = path_time(O1,D2)
tpO2D1 = path_time(O2,D1)
tpO2D2 = path_time(O2,D2)
tpD1D2 = path_time(D1,D2)
It's okay if I only have two points, but I have a 2-million-order set, and each order has hundreds of potentially matched orders, so this will take a lot of time.
Does anyone know how I can speed this process up? Thank you!
You can use parallelization to speed this up. Parallel route solving is built into OSMnx. Per the documentation:
You can parallelize solving multiple paths with the cpus parameter
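For example, a minimal sketch, assuming road is the graph from the question and O1, O2, D1, D2 have already been resolved to graph node IDs: passing lists of origins and destinations to ox.distance.shortest_path lets it solve all pairs in one call, and cpus controls the parallelism.

import osmnx as ox

pairs = [(O1, O2), (O1, D1), (O1, D2), (O2, D1), (O2, D2), (D1, D2)]
origins = [p[0] for p in pairs]
dests = [p[1] for p in pairs]

# one call solves all six routes in parallel; each element of `routes` is a list
# of node IDs (or None if no path exists), in the same order as `pairs`
routes = ox.distance.shortest_path(road, origins, dests, weight='travel_time', cpus=None)

You can then compute the travel time of each returned route the same way as in path_time, falling back to the 9999 penalty when a route is None.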
I'm trying to find the best allocation for a portfolio based on backtesting data. As a general rule, I've divided stocks into large caps vs. small/mid caps and growth vs. value, and I want no more than 80% of my portfolio in large caps or 70% of my portfolio in value. I need an algorithm that is flexible enough to work for more than two stocks. So far, what I have is (including a class called Ticker):
randomBoolean = True
listOfTickers = []
listOfLargeCaps = []
listOfSmallMidCaps = []
largeCapAllocation = 0
listOfValue = []
listOfGrowthBlend = []
valueAllocation = 0

while randomBoolean:
    tickerName = input("What is the name of the ticker?")
    tickerCap = input("What is the cap of the ticker?")
    tickerAllocation = int(input("Around how much do you want to allocate in this ticker?"))
    tickerValue = input("Is this ticker a Value, Growth, or Blend stock?")
    tickerName = Ticker(tickerCap, tickerValue, tickerAllocation, tickerName)
    listOfTickers.append(tickerName)
    closer = input("Type DONE if you are finished. Type ENTER to continue entering tickers")
    if closer == "DONE":
        randomBoolean = False

for ticker in listOfTickers:
    if ticker.cap in ("Large", "large"):
        listOfLargeCaps.append(ticker)
    else:
        listOfSmallMidCaps.append(ticker)
    if ticker.value in ("Value", "value"):
        listOfValue.append(ticker)
    else:
        listOfGrowthBlend.append(ticker)

for largeCap in listOfLargeCaps:
    largeCapAllocation += largeCap.allocation

if largeCapAllocation > 80:
    # run a function that will readjust ticker stuff and decrease allocation to large cap stocks
    pass

for value in listOfValue:
    valueAllocation += value.allocation

if valueAllocation > 70:
    # run a function that will readjust ticker stuff and decrease allocation to value stocks
    pass
The "function" I have so far just iterates from -5 to 6, in a sort of:
for i in range(-5, 6):
    ticker1AllocationPercent + i
    ticker2AllocationPercent - i
    # update the bestBalance if the new allocation is better
How would I modify this algorithm to work for 3, 4, 5, etc. stocks, and how would I go about changing the allocations for the large/small-mid cap stocks and such?
As mentioned in the answer above, a quadratic solver is typically used for such problems. You can use the quadratic solver available in PyPortfolioOpt. See this link for more details.
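For example, a minimal sketch, assuming the answer refers to the PyPortfolioOpt package (pypfopt); the price file and the large-cap/value index lists below are hypothetical placeholders:

import pandas as pd
from pypfopt import expected_returns, risk_models
from pypfopt.efficient_frontier import EfficientFrontier

# hypothetical price history: one column of daily prices per ticker
prices = pd.read_csv("prices.csv", index_col=0, parse_dates=True)
large_cap_idx = [0, 1]   # hypothetical: which columns are large caps
value_idx = [1, 2]       # hypothetical: which columns are value stocks

mu = expected_returns.mean_historical_return(prices)
S = risk_models.sample_cov(prices)

ef = EfficientFrontier(mu, S)
# no more than 80% in large caps and 70% in value stocks
ef.add_constraint(lambda w: sum(w[i] for i in large_cap_idx) <= 0.80)
ef.add_constraint(lambda w: sum(w[i] for i in value_idx) <= 0.70)
ef.max_sharpe()
print(ef.clean_weights())

This scales to any number of tickers, since the constraints are expressed over index lists rather than hard-coded per-stock adjustments.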
We are trying to do a cluster analysis on a large amount of data. We are kind of new to Python and found out that an iterative function is way more efficient than a recursive one. Now we are trying to make that change, but it is much harder than we thought.
The code below is the heart of our clustering function and takes over 90 percent of the runtime. Can you help us change it into an iterative one?
Some extra information: the taunach function gets the neighbours of a point, which will later form the clusters. The problem is that we have many, many points.
def taunach(tau, delta, i, s, nach, anz):
    dis = tabelle[s].dist
    # delta=tau
    x = data[i]
    y = Skalarprodukt(data[tabelle[s].index] - x)
    a = tau - abs(dis)
    # LA.norm(data[tabelle[s].index]-x)
    if y < a*abs(a):
        nach.update({item.index for item in tabelle[tabelle[s].inner:tabelle[s].outer-1]})
        anz = anzahl(delta, i, tabelle[s].inner, anz)
        if dis > -1:
            b = dis - tau
            if y >= b*abs(b):  # *(1-0.001):
                nach, anz = taunach(tau, delta, i, tabelle[s].outer, nach, anz)
    else:
        if y < tau**2:
            nach.add(tabelle[s].index)
            if y < delta:
                anz += 1
        if tabelle[s].dist > -4:
            b = dis - tau
            if y >= b*abs(b):  # *(1-0.001)):
                nach, anz = taunach(tau, delta, i, tabelle[s].outer, nach, anz)
        if tabelle[s].dist > -1:
            if y <= (dis+tau)**2:
                nach, anz = taunach(tau, delta, i, tabelle[s].inner, nach, anz)
    return nach, anz
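For what it's worth, the usual way to remove recursion like this is to manage your own stack of nodes still to visit instead of relying on Python's call stack. A minimal, generic sketch (the Node class is a hypothetical stand-in for one entry of tabelle, not your real data structure):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Node:
    value: int
    inner: Optional["Node"] = None
    outer: Optional["Node"] = None

def collect_iterative(root):
    # gather values from a tree without recursion, using an explicit stack
    result = set()
    stack = [root]                # this list plays the role of the call stack
    while stack:
        node = stack.pop()
        if node is None:
            continue
        result.add(node.value)
        # where the recursive version would call itself on node.inner / node.outer,
        # the iterative version pushes them onto the stack instead
        stack.append(node.inner)
        stack.append(node.outer)
    return result

print(collect_iterative(Node(1, Node(2), Node(3))))   # {1, 2, 3}

In taunach, the conditions guarding each recursive call would simply become conditions guarding the corresponding stack.append.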
I'm working with a dataset that shows flights taken by different people. Each line lists a flight segment and every couple of lines makes up an itinerary. So a simplified example of what the data looks like is:
Passenger_1 First_Airport Second_Airport Distance from 1-2
Passenger_1 Second_Airport Third_Airport Distance from 2-3
Passenger_2 First_Airport Second_Airport Distance from 1-2
....
(So here Passenger_1 traveled from the First Airport to the Third stopping over in the Second, while Passenger_2 flew non-stop from First to Second)
What I want to do is pull out each itinerary (all the rows that have the same passenger in the first column), pull out the data I need and then put it all into a new file, where each line now is an itinerary rather than a flight segment. Ideally it would look something like:
Passenger Origin Destination Distance Travelled
Passenger_1 First_Airport Third_Airport Distance from 1-2+Distance from 2-3
Passenger_2 First_Airport Second_Airport Distance from 1-2
....
The way I found to do this was:
d = []
for group in data.groupby(data.MKT_ID):
    x = group[1]
    # (Do all the analysis I want to do on x)
    d.append({'Var_1': Var1(x),
              'Var_2': Var_2(x)})
Output = pd.DataFrame(d)
This works fine but runs very slowly: about 15 minutes per dataset. The datasets are a bit over 1 GB each in .csv format, each containing something like 600,000 itineraries and about 1,000,000 segments. I have something like 40 of these datasets to work with, so I would like to be able to do this munging faster if possible.
Is there a faster way to do this? I don't have much coding experience, so I'm not sure how long something like this should take, but it seems rather long to me. Also, it seems like I'm using .groupby differently from most uses I've found while searching; is there a better way to pull out subsets and run analysis on them like this?
Edit:
The (Do all the analysis I want to do on x) section is:
x=group[1].reset_index(drop=True)
x=x.sort_values(['SEQ_NUM'], ascending=1)
destIndex = x.shape[0]-1
MKT_ID = x.MKT_ID[0]
ITIN_ID = x.ITIN_ID[0]
ORIGIN = x.ORIGIN[0]
DEST = x.DEST[destIndex]
ORIGIN_AIRPORT_ID = x.ORIGIN_AIRPORT_ID[0]
ORIGIN_CITY_MARKET_ID = x.ORIGIN_CITY_MARKET_ID[0]
DEST_AIRPORT_ID = x.DEST_AIRPORT_ID[destIndex]
DEST_CITY_MARKET_ID = x.DEST_CITY_MARKET_ID[destIndex]
CARRIER = x.TICKET_CARRIER[0]
DISTANCE = sum(x.DISTANCE)
PASSENGERS = x.PASSENGERS[0]
TRANSFERS = destIndex
ROUTE = sorted([ORIGIN,DEST])
ROUTE = ROUTE[0] + '-' + ROUTE[1]
STOPS=''
for XXX in x.ORIGIN[1:]:
    STOPS = STOPS + '"' + XXX + '"'
if len(set(x.TICKET_CARRIER)) == 1:
    INTERLINE = 0
else:
    INTERLINE = 1
if (min(x.SEQ_NUM) == 1) & (max(x.SEQ_NUM) == x.COUPONS[0]):
    ONEWAY = 1
else:
    ONEWAY = 0
d.append({'MKT_ID': MKT_ID,
          'ITIN_ID': ITIN_ID,
          'ORIGIN': ORIGIN,
          'DEST': DEST,
          'ORIGIN_CITY_MARKET_ID': ORIGIN_CITY_MARKET_ID,
          'DEST_CITY_MARKET_ID': DEST_CITY_MARKET_ID,
          'CARRIER': CARRIER,
          'DISTANCE': DISTANCE,
          'PASSENGERS': PASSENGERS,
          'TRANSFERS': TRANSFERS,
          'STOPS': STOPS,
          'ROUTE': ROUTE,
          'ONEWAY': ONEWAY,
          'INTERLINE': INTERLINE})
Edit: The datafile I am working with can be found here: https://drive.google.com/open?id=0B9sRZp8Q6aVdUVhKQW9DaWlYR0k
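For comparison, a hedged sketch of how much of this per-group work can be pushed into a single vectorized groupby aggregation instead of a Python loop over groups (column names follow the question; the 'first'/'last' picks assume the frame is sorted by SEQ_NUM within each MKT_ID):

import pandas as pd

# assumes `data` already contains the columns used in the question
ordered = data.sort_values(['MKT_ID', 'SEQ_NUM'])
Output = ordered.groupby('MKT_ID', sort=False).agg(
    ITIN_ID=('ITIN_ID', 'first'),
    ORIGIN=('ORIGIN', 'first'),
    DEST=('DEST', 'last'),
    ORIGIN_CITY_MARKET_ID=('ORIGIN_CITY_MARKET_ID', 'first'),
    DEST_CITY_MARKET_ID=('DEST_CITY_MARKET_ID', 'last'),
    CARRIER=('TICKET_CARRIER', 'first'),
    DISTANCE=('DISTANCE', 'sum'),
    PASSENGERS=('PASSENGERS', 'first'),
    TRANSFERS=('SEQ_NUM', lambda s: len(s) - 1),
).reset_index()

Columns such as ROUTE, STOPS, INTERLINE, and ONEWAY can then be derived from this aggregated frame (or added as extra aggregations) without looping over each itinerary in Python, which is where most of the runtime is likely going.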