How to add a progress bar with multiprocessing - python

I have the following function, mp_process(); I would like to add a progress bar, but I am running into a lot of issues.
Looking for help on how to add tqdm to mp_process.
from gzip import READ
import http.client
import pandas as pd
import xml.etree.cElementTree as ET
import multiprocessing as mp
from tqdm import tqdm

def mp_process(df):
    N_ROWS = 100  # number of rows in each dataframe
    with mp.Pool(10) as pool:  # use 10 processes
        # break up the dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []
        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+N_ROWS-1, :],)))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+remainder-1, :],)))
        new_dfs = [result.get() for result in results]
        # reassemble the final dataframe:
        ret_df = pd.concat(new_dfs, ignore_index=True)
        return ret_df

def mp_process(df):
    N_ROWS = 2  # number of rows in each dataframe
    total_row = len(df)
    pbar = tqdm(total=total_row)
    with mp.Pool(10) as pool:  # use 10 processes
        # break up the dataframe into smaller dataframes of N_ROWS rows each
        cnt = len(df.index)
        n, remainder = divmod(cnt, N_ROWS)
        results = []

        def update_bar(result):
            pbar.update(N_ROWS)  # this is just for the fancy progress bar

        start_index = 0
        for i in range(n):
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+N_ROWS-1, :],), callback=update_bar))
            start_index += N_ROWS
        if remainder:
            results.append(pool.apply_async(process_frame, args=(df.loc[start_index:start_index+remainder-1, :],), callback=update_bar))
        new_dfs = [result.get() for result in results]
        # reassemble the final dataframe:
        ret_df = pd.concat(new_dfs, ignore_index=True)
        return ret_df
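
For reference, here is a minimal sketch of the same idea using pool.imap, so tqdm can advance as each chunk finishes. The chunking via np.array_split and the placeholder process_frame are illustrative assumptions, not taken from the question:

import multiprocessing as mp
import numpy as np
import pandas as pd
from tqdm import tqdm

def process_frame(chunk):
    # placeholder for the real worker, which is not shown in the question
    return chunk

def mp_process(df, n_rows=100, workers=10):
    # split the dataframe into chunks of roughly n_rows rows each
    chunks = np.array_split(df, max(1, len(df) // n_rows))
    with mp.Pool(workers) as pool:
        # imap yields results as they complete, so tqdm can track progress
        new_dfs = list(tqdm(pool.imap(process_frame, chunks), total=len(chunks)))
    return pd.concat(new_dfs, ignore_index=True)

Note that under the spawn start method (Windows/macOS) this has to be called from inside an if __name__ == '__main__': guard, and process_frame must be a module-level function so it can be pickled.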

Related

Add new columns to a dataframe in a for loop

I am able to write a for loop that adds a row to a dataframe each time, as in the following example:
from random import randint
import numpy as np
import pandas as pd

dataframe = []
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
cols = ['mean']
result = pd.DataFrame(dataframe, columns=cols)
result
This outputs a dataframe that looks like:
mean
8
8
9
9
How could I output a dataframe that looks like
mean_1 mean_2 mean_3 mean_4
8 8 9 9
I made the dataframe a pandas DataFrame from the beginning. There are then multiple ways to add a column (Add Column):
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
counter = 0
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        counter += 1
        mean = np.mean(value)
        column_name = "mean_" + str(counter)
        df.loc[1, column_name] = mean
In answer to the comment, I also moved the line where value is set. It depends, of course, on whether you want the same number per column or a new number everywhere:
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
for i in range(2):
    for j in range(2):
        value = randint(0,10)
        mean = np.mean(value)
        column_name = "mean_" + str(i + 1)
        df.loc[j + 1, column_name] = mean
import pandas as pd
from random import randint
import numpy as np

m = 2
n = 2
dataframe = pd.DataFrame([0], columns=['mean_1'])
for i in range(m):
    value = randint(0,10)
    for j in range(n):
        mean = np.mean(value)
        dataframe['mean_'+str(n*i+j+1)] = mean
I tried to keep what you wrote, adding a few details: a counter for the columns and an assembly of your final dataframe. This is not the most optimised way, but as mentioned I tried to keep your logic.
from random import randint
import numpy as np
import pandas as pd

dataframe = []
count = 1
cols = []
for i in range(2):
    value = randint(0,10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
        cols.append('mean_'+str(count))
        count = count + 1
df = pd.DataFrame(columns=cols)
a_series = pd.Series(dataframe, index=cols)
d = df.append(a_series, ignore_index=True)
Output:

Replace for loop? This function works but it takes too long. I'm looking for ways to improve it

It works, but it takes 40 seconds to run 1 stock with 1 simple moving average. I'm a beginner. Is there any way to replace those for loops, or a more efficient way to run this? I'm reading about numpy but I don't understand how it could replace a loop.
I'm trying to make a csv to store all the indicator values from the current period back to the start of my dataframe.
I currently only have one moving average, but at this speed it's pointless to add anything else :)
import csv
import time

def runcheck(df, adress):
    row_count = int(0)
    row_count = len(df)
    print(row_count)
    lastp = row_count - 1
    row_count2 = int(0)
    mabuild = int(0)
    ma445_count = int(0)
    ma_count2 = int(0)
    row_count5 = int(0)
    row_count3 = int(0)
    row_count4 = int(0)
    resultat = int(0)
    timside_count = int(0)
    slott_count = int(0)
    sick_count = int(0)
    rad_data = []
    startT = time.time()
    ## this checks all the way back, e.g. today, then yesterday, then the day before
    for row in df.index:
        row_count2 += 1
        timside_count = row_count - row_count2
        if timside_count >= 445:
            for row in df.index:
                row_count5 = row_count - row_count2
                slott_count = row_count5 - row_count3
                mabuild = mabuild + df.iloc[slott_count, 5]
                row_count3 += 1
                row_count4 += 1
                if row_count4 == 445:
                    resultat = mabuild / row_count4
                    rad_data.append(resultat)
                    row_count3 = int(0)
                    row_count4 = int(0)
                    mabuild = int(0)
                    resultat = 0
                    break
        ## saves to csv before the loop starts over
        with open(adress, "a") as fp:
            wr = csv.writer(fp,)
            wr.writerow(rad_data)
            rad_data.clear()
    print('Time was :', time.time() - startT)
    stop = input('')
Try this:
import time
import csv
import numpy as np
from functools import reduce

def runcheck(df, adress):
    startT = time.time()
    rad_data = map(lambda i: reduce(lambda x, y: x + y, map(lambda z: df.iloc[z, 5], np.arange(i-445, i)))/445, np.arange(445, len(df.index)))
    '''
    Explanation
    list_1 = np.arange(445, len(df.index)) -> Create a list of integers from 445 to len(df.index)
    rad_data = map(lambda i: function, list_1) -> Apply function (see below) to each value (i) in the generated list_1
    function = reduce(lambda x, y: x + y, list_2)/445 -> Take 2 consecutive values (x, y) in list_2 (see below) and sum them, repeat until one value is left (i.e. the sum of list_2), then divide by 445
    list_2 = map(lambda z: df.iloc[z, 5], list_3) -> Map each value (z) in list_3 (see below) to df.iloc[z, 5]
    list_3 = np.arange(i-445, i) -> Create a list of integers from i-445 to i (value i from list_1)
    '''
    # writing to your csv file outside the loop once you have all the values is better,
    # as you remove the overhead of re-opening the file each time
    with open(adress, "a") as fp:
        wr = csv.writer(fp,)
        for data in rad_data:
            wr.writerow([data])
    print('Time was :', time.time() - startT)
    stop = input('')
Not sure it works, as I don't have sample data. Let me know if there are errors and I'll try to debug!
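
For comparison, pandas' built-in rolling mean avoids the explicit loops entirely. A minimal sketch, assuming (as in the posts above) that the price sits in column index 5 and the window is 445 periods; the function name runcheck_rolling is just illustrative:

import time
import pandas as pd

def runcheck_rolling(df, adress, window=445):
    startT = time.time()
    # rolling(window).mean() computes the simple moving average in optimised code
    ma = df.iloc[:, 5].rolling(window).mean().dropna()
    # write one value per row, matching the csv layout of the answer above
    ma.to_csv(adress, mode="a", header=False, index=False)
    print('Time was :', time.time() - startT)

Note that the values come out oldest-first here, so the row order may differ from the original loop, which walks the dataframe from the newest row backwards.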

What is the fastest and most efficient way to append rows to a DataFrame?

I have a large dataset which I have to convert to .csv format; it consists of 29 columns and 1M+ lines. I noticed that as the dataframe gets larger, appending rows to it gets more and more time consuming. I wonder if there is a faster way to do this; I'm sharing the relevant snippet from the code.
Any recommendations are welcome.
# imports implied by the snippet
import time
import requests
from pandas import DataFrame, concat, json_normalize

df = DataFrame()
for startID in range(0, 100000, 1000):
    s1 = time.time()
    tempdf = DataFrame()
    url = f'https://******/products?startId={startID}&size=1000'
    r = requests.get(url, headers={'****-Token': 'xxxxxx', 'Merchant-Id': '****'})
    jsonList = r.json()  # datatype= list, contains= dict
    normalized = json_normalize(jsonList)
    # type(normalized) = pandas.DataFrame
    print(startID / 1000)  # status indicator
    for series in normalized.iterrows():
        series = series[1]  # iterrows returns tuple (index, series)
        offers = series['offers']
        series = series.drop(columns='offers')
        length = len(offers)
        for offer in offers:
            n = json_normalize(offer).squeeze()  # squeeze() casts DataFrame into Series
            concatinated = concat([series, n]).to_frame().transpose()
            tempdf = tempdf.append(concatinated, ignore_index=True)
    del normalized
    df = df.append(tempdf)
    f1 = time.time()
    print(f1 - s1, ' seconds')
df.to_csv('out.csv')
As Mohit Motwani suggested, the fastest way is to collect data into dictionaries and then load them all into a data frame at the end. Below are some speed measurement examples:
import pandas as pd
import numpy as np
import time
import random
end_value = 10000
Measurement for creating a list of dictionaries and loading them all into a data frame at the end:
start_time = time.time()
dictionary_list = []
for i in range(0, end_value, 1):
    dictionary_data = {k: random.random() for k in range(30)}
    dictionary_list.append(dictionary_data)
df_final = pd.DataFrame.from_dict(dictionary_list)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 0.090153 seconds
Measurement for appending data frames to a list and concatenating them into a data frame:
start_time = time.time()
appended_data = []
for i in range(0, end_value, 1):
    data = pd.DataFrame(np.random.randint(0, 100, size=(1, 30)), columns=list('A'*30))
    appended_data.append(data)
appended_data = pd.concat(appended_data, axis=0)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 4.183921 seconds
Measurements for appending data frames:
start_time = time.time()
df_final = pd.DataFrame()
for i in range(0, end_value, 1):
    df = pd.DataFrame(np.random.randint(0, 100, size=(1, 30)), columns=list('A'*30))
    df_final = df_final.append(df)
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 11.085888 seconds
Measurement for inserting data by using loc:
start_time = time.time()
df = pd.DataFrame(columns=list('A'*30))
for i in range(0, end_value, 1):
    df.loc[i] = list(np.random.randint(0, 100, size=30))
end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))
Execution time = 21.029176 seconds

Python pd.read_csv, .to_sql different length than actual data

I have a dataset with ~50 million rows. After reading it into a database I only get 21,000 rows. What am I doing wrong? Thanks.
import pandas as pd
from sqlalchemy import create_engine

chunksize = 100000
csv_database = create_engine('sqlite:///csv_database.db', pool_pre_ping=True)
i = 0
j = 0
q = 0
for df in pd.read_csv(filename, chunksize=chunksize, iterator=False):
    # df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += j
    i += 1
    df.to_sql('table', csv_database, if_exists='append')
    j = df.index[-1] + 1
    q += 1
    print("q: " + repr(q))
columnx = df.iloc[:, 0]
columny = df.iloc[:, 1]
columnz = df.iloc[:, 2]
columnmass = df.iloc[:, 3]

out: [21739 rows x 1 columns] etc etc.
In [19]: len(df)
Out[19]: 21739
'df' doesn't contain the entire csv file, since you specified a chunk size of 100000; 21739 is just the number of rows inserted in the last iteration.
If you do a count(1) of your table, I bet you'll get the full row count (some multiple of 100000 plus 21739).
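
A minimal sketch of that check, assuming the same engine and table name ('table') used in the question:

from sqlalchemy import create_engine, text

csv_database = create_engine('sqlite:///csv_database.db')
with csv_database.connect() as conn:
    # count the rows actually inserted across all chunks
    total = conn.execute(text('SELECT count(1) FROM "table"')).scalar()
print(total)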
The following code is working for me.
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

DIR = 'C:/Users/aslams/Desktop/checkpoint/'
FILE = 'SUBSCRIBER1.csv'
file = '{}{}'.format(DIR, FILE)
csv_database = create_engine('sqlite:///csv_database.db')
chunksize = 10000
i = 0
j = 0
for df in pd.read_csv(file, chunksize=chunksize, iterator=True):
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += 3
    df.to_sql('data_use', csv_database, if_exists='append')
    j = df.index[-1] + 1
    print('| index: {}'.format(j))

Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a Pandas DataFrame:
import pandas as pd

l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)

In [4]: df_pandas
Out[4]:
     name  age
0   Alice    1
1     Jim    2
2  Sandra    3
Try this:
def showDf(df, count=None, percent=None, maxColumns=0):
    if (df == None): return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if (maxColumns >= 0):
        if (maxColumns == 0): maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if (count == None and percent == None): count = 10  # Default count
    if (count != None):
        count = int(count)
        if (count == 0): count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif (percent != None):
        percent = float(percent)
        if (percent >= 0.0 and percent <= 1.0):
            import datetime
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usage:
# Shows the first ten rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)
