How to add a progress bar with multiprocessing - python

I have the following function, mp_process(); I would like to add a progress bar, but I am running into a lot of issues.
Looking for help on how to add tqdm to mp_process.
from gzip import READ
import http.client
import pandas as pd
import xml.etree.cElementTree as ET
import multiprocessing as mp
from tqdm import tqdm

def mp_process(df):
    N_ROWS = 100  # number of rows in each dataframe
    with mp.Pool(10) as pool:  # use 10 processes
        # break up the dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []
        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+N_ROWS-1, :],)))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+remainder-1, :],)))
        new_dfs = [result.get() for result in results]
        # reassemble the final dataframe:
        ret_df = pd.concat(new_dfs, ignore_index=True)
        return ret_df

def mp_process(df):
    N_ROWS = 2  # number of rows in each dataframe
    total_row = len(df)
    pbar = tqdm(total=total_row)
    with mp.Pool(10) as pool:  # use 10 processes
        # break up the dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []

        def update_bar(result):
            pbar.update(N_ROWS)  # this is just for the fancy progress bar

        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+N_ROWS-1, :],), callback=update_bar))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+remainder-1, :],), callback=update_bar))
        new_dfs = [result.get() for result in results]
        # reassemble the final dataframe:
        ret_df = pd.concat(new_dfs, ignore_index=True)
        return ret_df
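
For reference, here is a minimal sketch of the same idea using pool.imap, so tqdm can advance as each chunk finishes. The chunking via np.array_split and the placeholder process_frame are illustrative assumptions, not taken from the question:

import multiprocessing as mp
import numpy as np
import pandas as pd
from tqdm import tqdm

def process_frame(chunk):
    # placeholder for the real worker, which is not shown in the question
    return chunk

def mp_process(df, n_rows=100, workers=10):
    # split the dataframe into chunks of roughly n_rows rows each
    chunks = np.array_split(df, max(1, len(df) // n_rows))
    with mp.Pool(workers) as pool:
        # imap yields results as they complete, so tqdm can track progress
        new_dfs = list(tqdm(pool.imap(process_frame, chunks), total=len(chunks)))
    return pd.concat(new_dfs, ignore_index=True)

Note that under the spawn start method (Windows/macOS) this has to be called from inside an if __name__ == '__main__': guard, and process_frame must be a module-level function so it can be pickled.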

Related

Add new columns to a dataframe in a for loop

I am able to write a for loop that adds a row to a dataframe each time, as in the following example:
from random import randint
import numpy as np
import pandas as pd

dataframe = []
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
cols = ['mean']
result = pd.DataFrame(dataframe, columns=cols)
result
This outputs a dataframe that looks like:
mean
8
8
9
9
How could I output a dataframe that looks like
mean_1 mean_2 mean_3 mean_4
8 8 9 9
I made the dataframe a pandas DataFrame from the beginning. There are then multiple ways to add a column (Add Column):
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
counter = 0
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        counter += 1
        mean = np.mean(value)
        column_name = "mean_" + str(counter)
        df.loc[1, column_name] = mean
In answer to the comment, I also moved the line where value is set. It depends, of course, on whether you want the same number per column or a new number everywhere:
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
for i in range(2):
    for j in range(2):
        value = randint(0,10)
        mean = np.mean(value)
        column_name = "mean_" + str(i + 1)
        df.loc[j + 1, column_name] = mean
import pandas as pd
from random import randint
import numpy as np

m = 2
n = 2
dataframe = pd.DataFrame([0], columns=['mean_1'])
for i in range(m):
    value = randint(0,10)
    for j in range(n):
        mean = np.mean(value)
        dataframe['mean_'+str(n*i+j+1)] = mean
I tried to keep what you wrote, adding a few details: a counter for the columns and an assembly of your final dataframe. This is not the most optimised way, but as mentioned I tried to keep your logic.
from random import randint
import numpy as np
import pandas as pd

dataframe = []
count = 1
cols = []
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
        cols.append('mean_'+str(count))
        count = count + 1
df = pd.DataFrame(columns=cols)
a_series = pd.Series(dataframe, index=cols)
d = df.append(a_series, ignore_index=True)
Output:

Replace for loop? This function works but it takes too long. I'm looking for ways to improve it

It works, but it takes 40 seconds to run 1 stock with 1 simple moving average. I'm a beginner. Is there any way to replace those for loops, or a more efficient way to run this? I'm reading about numpy but I don't understand how it could replace a loop.
I'm trying to make a csv to store all the indicator values from the current period back to the start of my dataframe.
I currently only have one moving average, but at this speed it's pointless to add anything else :)
import csv
import time

def runcheck(df, adress):
    row_count = int(0)
    row_count = len(df)
    print(row_count)
    lastp = row_count - 1
    row_count2 = int(0)
    mabuild = int(0)
    ma445_count = int(0)
    ma_count2 = int(0)
    row_count5 = int(0)
    row_count3 = int(0)
    row_count4 = int(0)
    resultat = int(0)
    timside_count = int(0)
    slott_count = int(0)
    sick_count = int(0)
    rad_data = []
    startT = time.time()
    ## this checks all the way back, e.g. today, then yesterday, then the day before
    for row in df.index:
        row_count2 += 1
        timside_count = row_count - row_count2
        if timside_count >= 445:
            for row in df.index:
                row_count5 = row_count - row_count2
                slott_count = row_count5 - row_count3
                mabuild = mabuild + df.iloc[slott_count, 5]
                row_count3 += 1
                row_count4 += 1
                if row_count4 == 445:
                    resultat = mabuild / row_count4
                    rad_data.append(resultat)
                    row_count3 = int(0)
                    row_count4 = int(0)
                    mabuild = int(0)
                    resultat = 0
                    break
        ## saves to csv before the loop starts over
        with open(adress, "a") as fp:
            wr = csv.writer(fp,)
            wr.writerow(rad_data)
            rad_data.clear()
    print('Time was :', time.time() - startT)
    stop = input('')
Try this:
import time
import csv
import numpy as np
from functools import reduce

def runcheck(df, adress):
    startT = time.time()
    rad_data = map(lambda i: reduce(lambda x, y: x + y, map(lambda z: df.iloc[z, 5], np.arange(i-445, i)))/445, np.arange(445, len(df.index)))
    '''
    Explanation
    list_1 = np.arange(445, len(df.index)) -> Create a list of integers from 445 to len(df.index)
    rad_data = map(lambda i: function, list_1) -> Apply function (see below) to each value (i) in the generated list_1
    function = reduce(lambda x, y: x + y, list_2)/445 -> Take 2 consecutive values (x, y) in list_2 (see below) and sum them, repeat until one value is left (i.e. the sum of list_2), then divide by 445
    list_2 = map(lambda z: df.iloc[z, 5], list_3) -> Map each value (z) in list_3 (see below) to df.iloc[z, 5]
    list_3 = np.arange(i-445, i) -> Create a list of integers from i-445 to i (value i from list_1)
    '''
    # writing to your csv file outside the loop once you have all the values is better,
    # as you remove the overhead of re-opening the file each time
    with open(adress, "a") as fp:
        wr = csv.writer(fp,)
        for data in rad_data:
            wr.writerow([data])
    print('Time was :', time.time() - startT)
    stop = input('')
Not sure it works, as I don't have sample data. Let me know if there are errors and I'll try to debug!
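
For comparison, pandas' built-in rolling mean avoids the explicit loops entirely. A minimal sketch, assuming (as in the posts above) that the price sits in column index 5 and the window is 445 periods; the function name runcheck_rolling is just illustrative:

import time
import pandas as pd

def runcheck_rolling(df, adress, window=445):
    startT = time.time()
    # rolling(window).mean() computes the simple moving average in optimised code
    ma = df.iloc[:, 5].rolling(window).mean().dropna()
    # write one value per row, matching the csv layout of the answer above
    ma.to_csv(adress, mode="a", header=False, index=False)
    print('Time was :', time.time() - startT)

Note that the values come out oldest-first here, so the row order may differ from the original loop, which walks the dataframe from the newest row backwards.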

What is the fastest and most efficient way to append rows to a DataFrame?

I have a large dataset which I have to convert to .csv format; it consists of 29 columns and 1M+ lines. I noticed that as the dataframe gets larger, appending rows to it gets more and more time consuming. I wonder if there is a faster way to do this; I'm sharing the relevant snippet from the code.
Any recommendations are welcome.
# imports implied by the snippet
import time
import requests
from pandas import DataFrame, concat, json_normalize

df = DataFrame()
for startID in range(0, 100000, 1000):
    s1 = time.time()
    tempdf = DataFrame()
    url = f'https://******/products?startId={startID}&size=1000'
    r = requests.get(url, headers={'****-Token': 'xxxxxx', 'Merchant-Id': '****'})
    jsonList = r.json()  # datatype= list, contains= dict
    normalized = json_normalize(jsonList)
    # type(normalized) = pandas.DataFrame
    print(startID / 1000)  # status indicator
    for series in normalized.iterrows():
        series = series[1]  # iterrows returns tuple (index, series)
        offers = series['offers']
        series = series.drop(columns='offers')
        length = len(offers)
        for offer in offers:
            n = json_normalize(offer).squeeze()  # squeeze() casts DataFrame into Series
            concatinated = concat([series, n]).to_frame().transpose()
            tempdf = tempdf.append(concatinated, ignore_index=True)
    del normalized
    df = df.append(tempdf)
    f1 = time.time()
    print(f1 - s1, ' seconds')
df.to_csv('out.csv')
As Mohit Motwani suggested, the fastest way is to collect data into dictionaries and then load them all into a data frame at the end. Below are some speed measurement examples:
import pandas as pd
import numpy as np
import time
import random
end_value = 10000
Measurement for creating a list of dictionaries and loading them all into a data frame at the end:
start_time = time.time()
dictionary_list = []
for i in range(0, end_value, 1):
    dictionary_data = {k: random.random() for k in range(30)}
    dictionary_list.append(dictionary_data)
df_final = pd.DataFrame.from_dict(dictionary_list)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 0.090153 seconds
Measurement for appending data frames to a list and concatenating them into a data frame:
start_time = time.time()
appended_data = []
for i in range(0, end_value, 1):
    data = pd.DataFrame(np.random.randint(0, 100, size=(1, 30)), columns=list('A'*30))
    appended_data.append(data)
appended_data = pd.concat(appended_data, axis=0)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 4.183921 seconds
Measurements for appending data frames:
start_time = time.time()
df_final = pd.DataFrame()
for i in range(0, end_value, 1):
    df = pd.DataFrame(np.random.randint(0, 100, size=(1, 30)), columns=list('A'*30))
    df_final = df_final.append(df)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 11.085888 seconds
Measurement for inserting data by using loc:
start_time = time.time()
df = pd.DataFrame(columns=list('A'*30))
for i in range(0, end_value, 1):
    df.loc[i] = list(np.random.randint(0, 100, size=30))
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 21.029176 seconds

Python pd.read_csv, .to_sql different length than actual data

I have a dataset with ~50 million rows. After reading it into a database I only get 21,000 rows. What am I doing wrong? Thanks.
import pandas as pd
from sqlalchemy import create_engine

chunksize = 100000
csv_database = create_engine('sqlite:///csv_database.db', pool_pre_ping=True)
i = 0
j = 0
q = 0
for df in pd.read_csv(filename, chunksize=chunksize, iterator=False):
    # df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += j
    i += 1
    df.to_sql('table', csv_database, if_exists='append')
    j = df.index[-1] + 1
    q += 1
    print("q: " + repr(q))
columnx = df.iloc[:, 0]
columny = df.iloc[:, 1]
columnz = df.iloc[:, 2]
columnmass = df.iloc[:, 3]

out: [21739 rows x 1 columns] etc etc.
In [19]: len(df)
Out[19]: 21739
'df' doesn't contain the entire csv file, since you specified a chunk size of 100000; 21739 is just the number of rows inserted in the last iteration.
If you do a count(1) of your table, I bet you'll get the full row count (some multiple of 100000 plus 21739).
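
A minimal sketch of that check, assuming the same engine and table name ('table') used in the question:

from sqlalchemy import create_engine, text

csv_database = create_engine('sqlite:///csv_database.db')
with csv_database.connect() as conn:
    # count the rows actually inserted across all chunks
    total = conn.execute(text('SELECT count(1) FROM "table"')).scalar()
print(total)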
The following code is working for me.
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

DIR = 'C:/Users/aslams/Desktop/checkpoint/'
FILE = 'SUBSCRIBER1.csv'
file = '{}{}'.format(DIR, FILE)
csv_database = create_engine('sqlite:///csv_database.db')
chunksize = 10000
i = 0
j = 0
for df in pd.read_csv(file, chunksize=chunksize, iterator=True):
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += 3
    df.to_sql('data_use', csv_database, if_exists='append')
    j = df.index[-1] + 1
    print('| index: {}'.format(j))

Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a Pandas DataFrame:
import pandas as pd

l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)

In [4]: df_pandas
Out[4]:
     name  age
0   Alice    1
1     Jim    2
2  Sandra    3
Try this:
def showDf(df, count=None, percent=None, maxColumns=0):
    if (df == None): return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if (maxColumns >= 0):
        if (maxColumns == 0): maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if (count == None and percent == None): count = 10  # Default count
    if (count != None):
        count = int(count)
        if (count == 0): count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif (percent != None):
        percent = float(percent)
        if (percent >= 0.0 and percent <= 1.0):
            import datetime
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usage:
# Shows the first ten rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)
