How do I speed up the for loop when using pandas? - python

I once wrote this code for work in a hurry, assuming it would only need to be run once, but now it has to be run regularly. Unfortunately, I cannot share the source data because it is confidential. Which parts of the code should be corrected to improve performance? The current version runs for about 2 hours over roughly 25K iterations.
dfs = []
for x in tqdm_notebook(range(len(group_furn))):
    good_id = group_furn.iloc[x, 0]
    name_promo = group_furn.iloc[x, 1]
    unique_stores = set(stores.loc[stores.store_type_id != 4]['store_id'].unique()) - \
                    set(promo_list.loc[(promo_list['good_id'] == good_id) &
                                       (promo_list['name_promo_mech'] == name_promo)].store_id.unique())
    bu_stores = list(unique_stores.intersection(set(bu_furn.store_id.unique())))

    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) &
                               (bu_store.store_type_id != 4)].main_store_id.unique()
    df = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                        (promo_list.store_id.isin(main_stores))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df = pd.merge(df, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df)

    main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) &
                               (bu_store.store_type_id == 4)].main_store_id.unique()
    owners = bu_store.loc[bu_store.main_store_id.isin(main_stores)].main_owner_id.unique()
    df_2 = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
                          (promo_list.owner_id.isin(owners))]
    bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
    df_2 = pd.merge(df_2, bu, how='inner', left_on='store_id', right_on='main_store_id')
    dfs.append(df_2)
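No answer is shown above, but two things stand out and are worth a hedged sketch (assuming the column names used in the question): the stores and bu_furn lookups inside the loop do not depend on the loop variable, and promo_list is re-scanned with the same (good_id, name_promo_mech) boolean mask several times per iteration. Hoisting the invariant work out of the loop and pre-grouping promo_list once usually removes most of the per-iteration cost:

import pandas as pd

# Loop-invariant work, hoisted out of the loop (computed once instead of ~25K times).
non_type4_stores = set(stores.loc[stores.store_type_id != 4, 'store_id'].unique())
bu_furn_stores = set(bu_furn.store_id.unique())

# Group promo_list once so each (good_id, name_promo_mech) lookup no longer
# rescans the whole frame with a boolean mask on every iteration.
promo_groups = {key: grp for key, grp in promo_list.groupby(['good_id', 'name_promo_mech'])}
empty_promo = promo_list.iloc[0:0]

dfs = []
for good_id, name_promo in group_furn.iloc[:, :2].itertuples(index=False):
    promo = promo_groups.get((good_id, name_promo), empty_promo)
    unique_stores = non_type4_stores - set(promo.store_id.unique())
    bu_stores = list(unique_stores & bu_furn_stores)
    # ... the per-iteration bu_store filtering and merges stay as in the original ...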

Related

Python Pandas Create Excel from Loop

Hello, I'm trying to create a program that filters my data from more than 20 Excel files, and I want to create a new Excel file to export the values from my loop. My code is a little complicated; I'm new to Python.
for index, row in data.iterrows():
    def tarih_aralik(Depo):
        try:
            deneme5 = data.loc[(data['Tarih'] >= data_time) & (data['Tarih'] <= data_time2) & (data['Depo'] == Depo)]
            right = deneme5.groupby(['Tarih', deneme5['Adet'] > 0])['Adet'].sum().unstack()
            right = right.rename(columns={True: 'Positive', False: 'Negative'})
            deneme5 = deneme5.join(right, on=None, how='right')
            deneme5 = deneme5['Positive'].sum()
            deneme5 = int(deneme5)
            print(deneme5)
        except:
            print("0")
        return result
    k101 = data.loc[data['Depo'] == 'K101', 'Adet'].sum()
    k104 = data.loc[data['Depo'] == 'K104', 'Adet'].sum()
    a = print("-->", row.T_kod, "-", row.Açıklama, "<--", "\n", k101, "adt K101 toplam", ",", k104, "adt K104 toplam", "\n", data_time, "--", data_time2)
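As for exporting the filtered values, a minimal hedged sketch of the usual pattern (the input file name input.xlsx and the per-depot split are assumptions, not taken from the question) using DataFrame.to_excel and pandas.ExcelWriter:

import pandas as pd

data = pd.read_excel('input.xlsx')  # hypothetical input file

# One output workbook per depot, each holding only that depot's rows.
for depo, depo_rows in data.groupby('Depo'):
    depo_rows.to_excel(f'{depo}.xlsx', index=False)

# Or a single workbook with one sheet per depot.
with pd.ExcelWriter('summary.xlsx') as writer:
    for depo, depo_rows in data.groupby('Depo'):
        depo_rows.to_excel(writer, sheet_name=str(depo), index=False)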

TypeError: sequence item 0: expected str instance, float found in Python

Below is the data set I was using (syn-retweet-done.csv in my code), which produced the error above.
,Unnamed: 0,created_at,tweet,category
0,0,2021-07-29 02:40:00,People Gather in numbers,Other
1,0,2021-07-29 02:40:00,No real sign of safety,Other
2,1,2021-07-27 10:40:00,President is On fire,Politics
3,1,2021-07-27 10:40:00,Election is to be held next month,Politics
Below is the code I am working with. It would be very helpful if someone could figure out the issue, which points to aggregated().
def aggregated():
    tweets = pd.read_csv(r'syn-retweet-done.csv')
    df = pd.DataFrame(tweets, columns=['created_at', 'tweet', 'category'])
    out = df.groupby(['created_at', 'category'], sort=False, as_index=False)['tweet'] \
            .apply(lambda x: ' '.join(x))[df.columns]
    # print(out)
    return out

def saveFile():
    df = pd.read_csv('test_1.csv')
    categories = df['category'].unique()
    for category in categories:
        df[df['category'] == category].to_csv(category + '.csv')

# Driver code
if __name__ == '__main__':
    print(aggregated())
    aggregated().to_csv(r'test_1.csv', index=True, header=True)
    saveFile()
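The TypeError comes from ' '.join(x): the join receives a non-string value such as NaN, which pandas stores as a float. The sample rows shown above do not contain one, but the full CSV presumably does. A minimal sketch of the usual fix inside aggregated(), dropping missing values and casting the rest to str before joining:

# Make the join robust to NaN / non-string tweets.
out = df.groupby(['created_at', 'category'], sort=False, as_index=False)['tweet'] \
        .apply(lambda x: ' '.join(x.dropna().astype(str)))[df.columns]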

Deleting empty columns with a few columns between data

I'm fetching data from a Google sheet:
values1 = pd.DataFrame(values)
aux = values1.head(1)
values1.drop(index={0}, inplace=True)
senal1 = (values1[2] == "SEÑAL")
senal = values1[senal1]
senal.dropna(axis=1, inplace=True)
print(senal)
This is my result after running the code (output screenshot omitted).
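No accepted fix is shown here, but the title suggests the goal is to drop only the columns that are completely empty while keeping sparsely filled ones. A minimal sketch under that assumption (treating empty strings from the sheet as missing is also an assumption), using how='all' so partially filled columns survive:

import pandas as pd

senal = values1[values1[2] == "SEÑAL"].copy()
senal = senal.replace("", pd.NA)          # assume empty cells arrive as "" from the Sheets API
senal = senal.dropna(axis=1, how='all')   # drop a column only if every value in it is missing
print(senal)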

Pandas DataFrame Row Items IF Comparison doesn't return correct result

I retrieve data from quandl and load it to a pandas DF object.
Afterwards I calculate SMA values (SMA21, SMA55) based on "Last Price".
Adding those SMA values as columns to my DF object.
I iterate through the DF to catch a buy signal.
I know the buy condition holds true for some dates, but my code does not print anything out. I expect it to print the buy condition at the very least.
Below you can see the condition:
kitem['SMA21'] >= kitem['Last']
My code:
import requests
import pandas as pd
import json

class URL_Params:
    def __init__(self, endPoint, symboll, startDate, endDate, apiKey):
        self.endPoint = endPoint
        self.symboll = symboll
        self.startDate = startDate
        self.endDate = endDate
        self.apiKey = apiKey

    def createURL(self):
        return self.endPoint + self.symboll + '?start_date=' + self.startDate + '&end_date=' + self.endDate + '&api_key=' + self.apiKey

    def add_url(self, _url):
        self.url_list

my_portfolio = {'BTC': 1.0, 'XRP': 0, 'DSH': 0, 'XMR': 0, 'TotalBTCValue': 1.0}
_endPoint = 'https://www.quandl.com/api/v3/datasets/BITFINEX/'
_symbolls = ['BTCEUR', 'XRPBTC', 'DSHBTC', 'IOTBTC', 'XMRBTC']
_startDate = '2017-01-01'
_endDate = '2019-03-01'
_apiKey = ''  # needs to be set for quandl

my_data = {}
my_conns = {}
my_col_names = ['Date', 'High', 'Low', 'Mid', 'Last', 'Bid', 'Ask', 'Volume']
orderbook = []

# create connection and load data for each pair/market.
# load them in a dict for later use
for idx_symbol in _symbolls:
    my_url_params = URL_Params(_endPoint, idx_symbol, _startDate, _endDate, _apiKey)
    response = requests.get(my_url_params.createURL())
    my_data[idx_symbol] = json.loads(response.text)

# Prepare Data
my_raw_data_df_xrpbtc = pd.DataFrame(my_data['XRPBTC']['dataset']['data'],
                                     columns=my_data['XRPBTC']['dataset']['column_names'])

# Set Index to Date Column and Sort
my_raw_data_df_xrpbtc['Date'] = pd.to_datetime(my_raw_data_df_xrpbtc['Date'])
my_raw_data_df_xrpbtc.index = my_raw_data_df_xrpbtc['Date']
my_raw_data_df_xrpbtc = my_raw_data_df_xrpbtc.sort_index()

# Drop unrelated columns
my_raw_data_df_xrpbtc.drop(['Date'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Ask'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Bid'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Low'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['High'], axis=1, inplace=True)
my_raw_data_df_xrpbtc.drop(['Mid'], axis=1, inplace=True)

# Calculate SMA values to create buy-sell signal
my_raw_data_df_xrpbtc['SMA21'] = my_raw_data_df_xrpbtc['Last'].rolling(21).mean()
my_raw_data_df_xrpbtc['SMA55'] = my_raw_data_df_xrpbtc['Last'].rolling(55).mean()
my_raw_data_df_xrpbtc['SMA200'] = my_raw_data_df_xrpbtc['Last'].rolling(200).mean()

# Check for each day if buy signal holds BUY if sell signal holds SELL
for idx, kitem in my_raw_data_df_xrpbtc.iterrows():
    if (kitem['SMA21'] >= kitem['Last']) is True:  # buy signal
        print("buy0")
        if my_portfolio['BTC'] > 0 is True:
            print("buy1")
    if (kitem['Last'] * my_portfolio['XRP']) >= (my_portfolio['BTC'] * 1.05) is True:  # sell signal
        print("sell0")
        if my_portfolio['XRP'] > 0 is True:
            print("sell1")
I know that there are lots of rows for which the condition holds true, but my code never enters this code path, so it does not print out what I expect.
Could anyone please help/comment what might be wrong?
The reason is that your comparison is wrong. The result of kitem['SMA21'] >= kitem['Last'] is a numpy.bool_. When you compare it to True with is, the check fails because it is not the same object as the built-in True.
If you change the comparison to == it will work as expected:
if (kitem['SMA21'] >= kitem['Last']) == True:
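A small illustration of why is fails here, plus the even simpler option of dropping the explicit comparison altogether (a sketch, not part of the original answer):

import numpy as np

result = np.float64(1.5) >= np.float64(1.0)
print(type(result))    # <class 'numpy.bool_'>
print(result is True)  # False: not the same object as the built-in True
print(result == True)  # True: value comparison works
print(bool(result))    # True

# Cleanest form inside the loop: rely on the truthiness of the comparison.
# if kitem['SMA21'] >= kitem['Last']:
#     print("buy0")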

OutOfMemory while using Jupyter notebook with spark

I am currently using IBM Data Scientist Workbench with Jupyter notebooks and Spark.
I am trying to read several CSV files to a DF and then applying some transformations to it in order to create a final dataframe with merged data from the different CSV files, but for some reason I am getting this error:
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOf(Arrays.java:2367)
    at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
    at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
    at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
    at java.lang.StringBuilder.append(StringBuilder.java:132)
The code I am using is as follows:
i = 0
count = 0
var_name = []
schema = StructType([])
df1 = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_ocurrences = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_count = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_merged = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_complete = sqlContext.createDataFrame(sc.emptyRDD(), schema)
FINAL = sqlContext.createDataFrame(sc.emptyRDD(), schema)

for file in os.listdir('/resources/data/test_variables/'):
    df1 = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/resources/data/test_variables/" + file)

    # SKIP SERIES WITH ONLY 0s
    count = df1.groupBy().sum("Bit_value")
    if count.select("sum(Bit_value)").collect()[0][0] == 0:
        continue

    i += 1
    # AGGREGATION
    df1 = df1.withColumn("Interval", ((df1.Timestamp.cast("long") / 1).cast("long") * 1).cast("timestamp"))
    # COUNT 1s
    df1_ocurrences = df1.groupBy("Interval").sum("Bit_value").sort("Interval")
    df1_ocurrences = df1_ocurrences.withColumnRenamed("sum(Bit_value)", "Sum_df1")
    # COUNT TOTAL
    df1_count = df1.groupBy("Interval").count().sort("Interval")
    df1_count = df1_count.withColumnRenamed("count", "Total_df1")
    # MERGING
    df1_merged = df1_ocurrences.join(df1_count, ["Interval"]).sort("Interval")
    var_name = file.split(".")
    df1_complete = df1_merged.withColumn(var_name[0], df1_merged.Sum_df1 / df1_merged.Total_df1)
    df1_complete = df1_complete.drop('Sum_df1')
    df1_complete = df1_complete.drop('Total_df1')

    # FINAL DATAFRAME
    if i == 1:
        FINAL = df1_complete
    else:
        FINAL = FINAL.join(df1_complete, ["Interval"]).sort("Interval")
Any advice on this? Maybe I am not writing the most efficient code but I am new to Spark.
Too much time is spent on GC and too little memory is freed up: https://developer.ibm.com/hadoop/2016/02/16/beginners-guide-apache-spark-troubleshooting/
In addition to the recommendations in the article above, what worked for me in Jupyter is this:
spark = SparkSession.builder \
    .appName("GBT Model") \
    .config("spark.executor.memory", "2000mb") \
    .master("local[*]") \
    .config("spark.executor.cores", "4") \
    .config("spark.yarn.executor.memoryOverhead", 200) \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.default.parallelism", "4") \
    .getOrCreate()
Note spark.yarn.executor.memoryOverhead is set to 10% of executor memory.
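One additional, hedged note: because the session above uses master("local[*]"), the executors run inside the driver JVM, so raising spark.driver.memory is often the setting that actually relieves GC pressure in a notebook. This is an assumption about this particular setup, and it only takes effect if no SparkContext has been started yet in the kernel:

from pyspark.sql import SparkSession

# Sketch for local / notebook mode: give the driver JVM more heap, since the
# "executors" live inside the driver process when master is local[*].
spark = SparkSession.builder \
    .appName("GBT Model") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()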
