Correctly use multiprocessing - python

I am attempting to use multiprocessing for the first time and have to extract ~500,000 records (right now I have the variables set for 500). The original loop would have taken a very long time, so I am trying multiprocessing instead. Right now I have 10 processes running and it works, but it is still going to take around 4 hours to complete. I would like to run 20 or so processes, but I am afraid there may be a performance issue with my computer, and I would hate to wake up in the morning to see that the program crashed. Am I using this correctly, or is there a better way?
Full Code:
from pyETT import ett_parser
import pandas as pd
import time
from datetime import datetime
from multiprocessing import Process
import sys
c = 10
x1,y1 = 1,50
x2,y2 = 51,100
x3,y3 = 101,150
x4,y4 = 151,200
x5,y5 = 201,250
x6,y6 = 251,300
x7,y7 = 301,350
x8,y8 = 351,400
x9,y9 = 401,450
x10,y10 = 451,500
m_cols = ('user-name','elo','rank','wins','losses','last-online')
def run1():
    print('Running query 1...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x1, y1):
        try:
            if int(i) % int(c) == 0:
                print('Loop1 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_1:", i)
    #Export to excel
    file_name = 'export_file1_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run2():
    print('Running query2...')
    m_cols = ('user-name','elo','rank','wins','losses','last-online')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x2, y2):
        try:
            if int(i) % int(c) == 0:
                print('Loop2 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_2:", i)
    #Export to excel
    file_name = 'export_file2_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run3():
    print('Running query3...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x3, y3):
        try:
            if int(i) % int(c) == 0:
                print('Loop3 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_3:", i)
    #Export to excel
    file_name = 'export_file3_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run4():
    print('Running query4...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x4, y4):
        try:
            if int(i) % int(c) == 0:
                print('Loop4 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_4:", i)
    #Export to excel
    file_name = 'export_file4_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run5():
    print('Running query5...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x5, y5):
        try:
            if int(i) % int(c) == 0:
                print('Loop5 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_5:", i)
    #Export to excel
    file_name = 'export_file5_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run6():
    print('Running query6...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x6, y6):
        try:
            if int(i) % int(c) == 0:
                print('Loop6 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_6:", i)
    #Export to excel
    file_name = 'export_file6_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run7():
    print('Running query7...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x7, y7):
        try:
            if int(i) % int(c) == 0:
                print('Loop7 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_7:", i)
    #Export to excel
    file_name = 'export_file7_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run8():
    print('Running query8...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x8, y8):
        try:
            if int(i) % int(c) == 0:
                print('Loop8 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_8:", i)
    #Export to excel
    file_name = 'export_file8_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run9():
    print('Running query9...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x9, y9):
        try:
            if int(i) % int(c) == 0:
                print('Loop9 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_9:", i)
    #Export to excel
    file_name = 'export_file9_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def run10():
    print('Running query10...')
    df_master = pd.DataFrame(columns = m_cols)
    for i in range(x10, y10):
        try:
            if int(i) % int(c) == 0:
                print('Loop10 is at:', i)
            user_id = i
            line = ett.ett_parser.get_user(user_id)
            temp_df = pd.DataFrame(line, index=[i])
            df_master = df_master.append(temp_df, ignore_index = True)
        except Exception:
            print("Error_10:", i)
    #Export to excel
    file_name = 'export_file10_' + datetime.now().strftime("%H_%M_%S") + '.xlsx'
    df_master.to_excel(file_name, index = False)
    print('DataFrame(' + file_name + ') is written to Excel File successfully.')

def main():
    p = Process(target=run1)
    p.start()
    #p.join()
    p2 = Process(target=run2)
    p2.start()
    p3 = Process(target=run3)
    p3.start()
    p4 = Process(target=run4)
    p4.start()
    p5 = Process(target=run5)
    p5.start()
    p6 = Process(target=run6)
    p6.start()
    p7 = Process(target=run7)
    p7.start()
    p8 = Process(target=run8)
    p8.start()
    p9 = Process(target=run9)
    p9.start()
    p10 = Process(target=run10)
    p10.start()
    p10.join()

if __name__ == '__main__':
    start = time.time()
    print('starting main')
    main()
    print('finishing main', time.time()-start)
Updated Code
Using swaggg's answer, this code does what I want and is much shorter.
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from pyETT import ett_parser
import pandas as pd
import time

def main():
    USER_ID_COUNT = 50
    MAX_WORKERS = 2 * cpu_count() + 1
    dataframe_list = []
    #user_array = []
    user_ids = list(range(1, USER_ID_COUNT))

    def obtain_user_record(user_id):
        return ett_parser.get_user(user_id)

    with ThreadPoolExecutor(MAX_WORKERS) as executor:
        for user_id, user_record in zip(user_ids, executor.map(obtain_user_record, user_ids)):
            if user_record:
                dataframe_list.append(user_record)
    df_master = pd.DataFrame.from_dict(dataframe_list, orient='columns')
    print(df_master)

if __name__ == '__main__':
    start = time.time()
    print('starting main')
    main()
    print('finishing main', time.time() - start)

Simply use the multiprocessing.Pool, which by default uses all of your cores and doesn't need any complicated management to make it operate at 100% efficiency. Spawning too many processes is not the way to go.
It will also make your code much neater, as you only need to provide it with one function and a map over the data.
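For illustration, a minimal sketch of the Pool approach (the worker function and the ID range here are placeholders for your get_user call and your real range):

from multiprocessing import Pool

def fetch_user(user_id):
    # Placeholder for the real per-user work, e.g. ett_parser.get_user(user_id)
    return {'user_id': user_id}

if __name__ == '__main__':
    with Pool() as pool:  # defaults to one worker per logical core
        results = pool.map(fetch_user, range(1, 501))
    print(len(results))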
Also:
You can use time.strftime("%H_%M_%S") instead of datetime.now().strftime("%H_%M_%S"). The datetime module is really slow.
If you use primitives instead of pandas, especially just the basic Python open combined with str.split (or, less optimally, the Python xls reader/writer modules), you will see a huge performance increase. Pandas is really slow. There are certainly many cases where its useful features make up for its slow performance, but if the only thing you need to do is extract some data from CSV or similar files and you're working with a huge dataset, you're better off not using it.
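As a rough sketch of that "primitives" idea (assuming a simple comma-separated file with no quoted fields; the file name is made up):

# Plain-Python CSV reading: open + str.split, no pandas.
rows = []
with open('users.csv', 'r', encoding='utf-8') as f:
    header = f.readline().rstrip('\n').split(',')
    for line in f:
        rows.append(line.rstrip('\n').split(','))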
EDIT:
I've had a look at the actual module you use, and it involves sending requests, while I thought you were reading the data from the disk.
By the way, correct me if I'm wrong, but I think the ett. prefix in ett.ett_parser.get_user(user_id) is wrong; given your import, the call should just be ett_parser.get_user(user_id).
Because your code involves sending multiple requests over the net and then writing the data to disk, you should probably use multithreading instead of multiprocessing, or you can still use multiprocessing but experiment with a different number of processes: the task is I/O-bound, so you should be trying to send more concurrent requests than there are logical cores in your CPU. At the same time, pandas is slow, as I said, and another option is the xlsxwriter module, which is guaranteed to be faster, since you're really only using pandas to write the strings obtained from your JSON requests to an .xlsx file. You can also try the csv writer module if you're fine with the CSV format. But above all, the main bottleneck is going to be the requests, not the processing power it takes to write the data to disk.
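For example, a sketch of writing the rows with xlsxwriter directly instead of pandas (assuming each record is already a flat tuple in the same column order as m_cols; the sample record is made up):

import xlsxwriter

m_cols = ('user-name', 'elo', 'rank', 'wins', 'losses', 'last-online')
records = [('player1', 1500, 12, 30, 10, '2021-05-01')]  # illustrative data

workbook = xlsxwriter.Workbook('export_file1.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write_row(0, 0, m_cols)                 # header row
for row_idx, record in enumerate(records, start=1):
    worksheet.write_row(row_idx, 0, record)       # one record per row
workbook.close()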
Secondly, Booboo's solution adapts your code to be used with the pool, which is great. But it may be more convenient not to split the data into chunks and then divide these chunks between processes. Instead, because the requests are the bottleneck, you can use multithreading and append the results to a global array. If you just use a multithreading map, you only need to pass it the user ids and the function. Therefore, you could do the following:
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from pyETT import ett_parser

USER_ID_COUNT = 50000
MAX_WORKERS = 2 * cpu_count() + 1
user_array = []  # Can be adapted to Pandas or the xlsxwriter module
user_ids = list(range(1, USER_ID_COUNT))

def obtain_user_record(user_id):
    return ett_parser.get_user(user_id)

with ThreadPoolExecutor(MAX_WORKERS) as executor:
    for user_id, user_record in zip(user_ids, executor.map(obtain_user_record, user_ids)):
        user_array.append(user_record)
Then, you would write the array to disk, splitting it as you like, and you could certainly use multiprocessing for that if you felt it was worth it.
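For the write-to-disk step, a sketch using the standard csv module (assuming each user_record is a dict whose keys match the m_cols fields from the question; the output file name is made up):

import csv

m_cols = ('user-name', 'elo', 'rank', 'wins', 'losses', 'last-online')
with open('users_export.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=m_cols, extrasaction='ignore')
    writer.writeheader()
    for record in user_array:
        if record:                   # skip failed lookups
            writer.writerow(record)  # extra keys are ignored via extrasaction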
EDIT2:
Okay, in that case use Pandas. But, note the documentation page for pandas.DataFrame.append states:
Iteratively appending rows to a DataFrame can be more computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate the list with the original DataFrame all at once.
Oh, and by the way, I think the pyETT function returns a dataframe and not a string, is that right? I'm not quite sure as to the indexing of the return values, but you can figure that out. Do this:
dataframe_list = []

with ThreadPoolExecutor(MAX_WORKERS) as executor:
    for user_id, user_record in zip(user_ids, executor.map(obtain_user_record, user_ids)):
        if user_record:
            dataframe_list.append(user_record)

#If the dataframes aren't indexed, you could create a pre-filled
#dataframe_list and overwrite specific indices instead of appending,
#then pass ignore_index=True to pd.concat
df_master = pd.concat(dataframe_list)  # We're done, time to merge our dataframes into one big dataframe.
EDIT3:
Okay, then we need to convert the data to a pandas dataframe.
user_df = pd.DataFrame.from_dict({user_id:user_record}, orient='index', columns=m_cols)
dataframe_list.append(user_df)
This way, the frames will be indexed by the user ids we pass to the function earlier. And of course, you could have multiple lists and merge them into multiple dataframes, then just write the dataframes to separate .xls files or you could go for some other approach.
EDIT4:
Interesting that it lets you use pd.DataFrame.from_dict on a list of dicts. The correct function to call would be pd.DataFrame.from_records instead; you don't really need to pass orient='index' to it. What's the advantage of converting the dicts to dataframes and then merging them? Well, you can control the indexing, as I showed in Edit 3. You could accomplish the same with:
df_master = pd.DataFrame.from_records(dataframe_list, index=index_list)
df_master = df_master.sort_index()
index_list being a list of indices (that is, user ids) in the same order as dataframe_list.
You can also make a dict instead of a list:
dataframe_dict = {}

with ThreadPoolExecutor(MAX_WORKERS) as executor:
    for user_id, user_record in zip(user_ids, executor.map(obtain_user_record, user_ids)):
        if user_record:
            dataframe_dict[user_id] = user_record
Then the correct function to call is from_dict and you can once again index the records like so:
df_master = pd.DataFrame.from_dict(dataframe_dict, orient='index')
df_master = df_master.sort_index()
Not having to convert the dicts to dataframes each time is better performance-wise, so good call.

Update
First of all, when you process ranges range(1, 50), range(51, 100), range(101, 150), etc. the actual values that user_id will take on are:
1, 2, ... 49, 51, 52, ... 99, 101, 102, ... 149
That is, you will not be processing values 50, 100, 150, etc., because for range(start, stop, step) with a positive step, the contents of a range r are determined by the formula r[i] = start + step*i where i >= 0 and r[i] < stop.
I question whether it was your intention to omit those values. I will assume that it was not your intention.
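A quick check in the interpreter shows the skipped endpoints, and how using stop = MAX_USER_ID + 1 keeps them:

list(range(1, 50))[-1]    # -> 49  (50 is never produced)
list(range(51, 100))[-1]  # -> 99  (100 is never produced)
list(range(1, 501))[-1]   # -> 500 (stop = 500 + 1 keeps the endpoint)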
It appears that processing can be divided into an I/O-bound portion (ett.ett_parser.get_user(user_id), which is retrieving a URL) and a more CPU-intensive portion (creation of the pandas dataframe and exporting to Excel). For the former a large multithreading pool should be used, and for the latter a multiprocessing pool should be used.
The following code creates both pools and passes the multiprocessing pool to the worker function run, which retrieves all the URLs specified in its passed range and then passes the list of lines thus retrieved to the multiprocessing pool for assembly and conversion to Excel.
from pyETT import ett_parser
import pandas as pd
import time
from datetime import datetime
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from functools import partial
import sys

def run(multiprocessing_pool, run_number, the_range):
    c = 10
    print(f'Running query {run_number}...', flush=True)
    lines = []
    for user_id in the_range:
        try:
            if user_id % c == 0:
                print('Loop is at:', user_id, flush=True)
            line = ett.ett_parser.get_user(user_id)
            lines.append((user_id, line))
        except Exception:
            print("Error:", user_id, flush=True)
    if lines:
        multiprocessing_pool.apply(process_lines, args=(run_number, lines))

def process_lines(run_number, lines):
    m_cols = ('user-name','elo','rank','wins','losses','last-online')
    df_master = pd.DataFrame(columns = m_cols)
    for user_id, line in lines:
        temp_df = pd.DataFrame(line, index=[user_id])
        df_master = df_master.append(temp_df, ignore_index=True)
    #Export to excel
    file_name = f'export_file{run_number}_{datetime.now().strftime("%H_%M_%S")}.xlsx'
    df_master.to_excel(file_name, index=False)
    print(f'DataFrame({file_name}) is written to Excel File successfully.', flush=True)

def split(lst, n):  # function to split list in n even parts
    k, m = divmod(len(lst), n)
    return list(lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

def main():
    MIN_USER_ID = 1
    MAX_USER_ID = 500
    NUM_SPLITS = 10  # 50 user ids per split
    THREAD_POOL_SIZE = min(NUM_SPLITS, 100)  # Or maybe even 500!
    # split this range into thread_pool_size smaller ranges:
    # Next line will process 500 user ids: 1, 2, ... 500
    ranges = split(range(MIN_USER_ID, MAX_USER_ID+1), NUM_SPLITS)
    multiprocessing_pool = Pool()
    multithreading_pool = ThreadPool(THREAD_POOL_SIZE)
    multithreading_pool.starmap(partial(run, multiprocessing_pool), enumerate(ranges, start=1))

if __name__ == '__main__':
    start = time.time()
    print('starting main')
    main()
    print('finishing main', time.time() - start)

Related

Efficient way to combine multiple csv

I have over 100K CSVs (total file size north of 150 GB) which I need to join. All have standard column names, although the sequence of columns may not match and some CSVs have a few columns missing.
Right now I just create a dataframe and keep concatenating the dataframe from each CSV on every iteration, to build one standard dataframe containing all columns that I eventually intend to save as a CSV.
I tried building a dataframe from 1,000 sample CSVs and noticed that as the dataframe grew, the iteration rate dropped from 10 to 1.5 per second, which suggests a similar trend if I went all-in with 100K CSVs, taking days if not months to combine them.
Is there a better way of combining a huge number of CSV files?
Here is my code
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
    df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
    if thisCSV.endswith('type1.csv'):
        df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Possible improvement
Method 1: Using Pandas
#df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False, low_memory=False)
        #df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])  # unnecessary to loop, use next line instead
        df["File Name"] = thisCSV  # places thisCSV in every row
        #df_t1 = pd.concat([df_t1, df], axis=0, ignore_index=True)  # concat in a loop is slow, append to list instead
        df_t1_lst.append(df)
df_t1 = pd.concat(df_t1_lst, ignore_index=True)  # Form dataframe from list (faster than pd.concat in loop)
df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
print("df_t1.csv generated")
Method 1a
Using Pandas to continuously append to CSV output file
import os
import pandas as pd
from tqdm import tqdm

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','  # Separator used by CSV file
write_header = True
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]
output_file = outpath + "df_t1.csv"

with open(output_file, "w") as ofile:
    pass  # create empty output file

for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:
            df['File Name'] = thisCSV  # Add filename
            df = df.sort_index(axis=1)  # sort columns in ascending order
            # Append to output file
            df.to_csv(output_file, mode='a',
                      index=False,
                      header=write_header)
            write_header = False  # Only write header once
            del df
Method 2: Binary Files
Reading/Writing binary and using memory-map should be faster.
from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    ' String to byte array '
    result = bytearray()
    result.extend(map(ord, s))
    return result

def good_file(file_path):
    """ Check if file exists and is not empty """
    return os.path.exists(file_path) and os.stat(file_path).st_size > 0

SEPARATOR = ','  # Separator used by CSV file
header = None
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for i in tqdm(range(len(excelNames))):
        thisCSV = str(excelNames[i]).lower().strip()
        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # memory-map the file, size 0 means whole file
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    text_iter = iter(mmap_obj.read().split(b'\n'))
                    if header is None:
                        header = next(text_iter)
                        header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                        ofile.write(header)  # write header
                    else:
                        next(text_iter)  # ignore header row
                    # write data to output file
                    file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                    for line in text_iter:
                        if line.strip():  # skip blank lines
                            ofile.write(line.rstrip() + file_value)

python point inside polygon (point cloud data)

Number of points: 100,000,000 (4 GB).
I am reading a CSV file and saving the data into separate CSV files.
I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of my task?
Please provide me with alternative options; performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")
file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"
    coords = [[19.803499,15.2265],[-35.293499,33.7495],
              [-49.6675,33.726501],[-48.022499,20.4715],
              [-36.336498,-4.925],[-32.6105,-45.494499],
              [-10.5275,-38.3815],[-11.93835,-20.8235],
              [26.939501,-18.095501],[19.803499,15.2265]]
    poly = Polygon(coords)
    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # Considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if str(result) == 'True':
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
            print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
            #print('False', str(x), str(y), str(z))
I used pd.read_csv instead of csv.reader, so the performance has improved a bit.
However, I tried to do Python multiprocessing, but I don't understand it well.
Process result time: 1234 sec -> 31 sec.
import pandas as pd
from shapely.geometry import *

data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499), (16.5735,-26.269501), (0.4755,-28.6635)])
cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
For example, here is some sample code for Python multiprocessing Pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # Wait 2 seconds
    print(x*x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
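For instance, one way the Pool example might be adapted to the polygon test (an untested sketch; the chunk size, the 8-worker pool and the file path just mirror the snippets above):

import pandas as pd
from multiprocessing import Pool
from shapely.geometry import Point, Polygon

poly = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499),
                (16.5735, -26.269501), (0.4755, -28.6635)])

def check_chunk(rows):
    # rows is a list of [x, y, z, r, g, b, i] records; return each row with its test result
    return [(row, poly.intersects(Point(row[0], row[1]))) for row in rows]

if __name__ == '__main__':
    data = pd.read_csv("/sample.csv")
    cord = data.values.tolist()
    chunks = [cord[i:i + 100000] for i in range(0, len(cord), 100000)]
    with Pool(8) as p:
        results = p.map(check_chunk, chunks)  # one list of (row, bool) per chunk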

How to make my bulk uploader code more efficient?

I wrote a Python script to bulk upload files from a folder into PostgreSQL. While the script works, I don't think it's very efficient. Can anyone tell me how to improve it?
It takes a very long time for the files to actually be uploaded.
(Spacing/indentation is slightly off in the posting; this is not an issue in the actual script.)
def addFilesToDatabase(directory):
    uploadedFiles = []
    errorFiles = []
    rows_to_chunk_by = 1000
    for filename in os.listdir(directory):
        try:
            filename_used = filename.upper()
            if filename_used.endswith(".CSV"):
                file_directory = os.path.join(directory, filename)
                tableName = filename_used.replace('.CSV', '')
                df = pd.read_csv(file_directory, header=0, nrows=1)
                columns = df.columns
                while 1 == 1:
                    for skiprows in range(100000000):
                        if skiprows == 0:
                            df = pd.read_csv(file_directory, header=0, nrows=rows_to_chunk_by, skiprows=skiprows*rows_to_chunk_by)
                            df.to_sql(name=tableName, con=engine, if_exists='append', schema=None, index=False)
                        else:
                            df = pd.read_csv(file_directory, header=None, nrows=rows_to_chunk_by, skiprows=skiprows*rows_to_chunk_by)
                            df.columns = columns
                            df.to_sql(name=tableName, con=engine, if_exists='append', schema=None, index=False)
                        if len(df) < rows_to_chunk_by:
                            break
                    uploadedFiles.append(filename)
                    break
        except Exception as e:
            if str(e) == "No columns to parse from file":
                uploadedFiles.append(filename)
            elif str(e)[0:16] == "Length mismatch:":
                uploadedFiles.append(filename)
            else:
                errorFiles.append(filename)
                print('Error with ' + filename)
                print(e)
            continue
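For reference, a rough sketch of the same loop using pandas' built-in chunking (read_csv(chunksize=...)), which avoids re-scanning the file with skiprows on every pass; it assumes os, pd and engine are set up exactly as in the original script:

def addFilesToDatabase_chunked(directory):
    # Sketch only: same table naming and engine as the original, but pandas
    # yields successive DataFrames of rows_to_chunk_by rows in a single pass.
    uploadedFiles, errorFiles = [], []
    rows_to_chunk_by = 1000
    for filename in os.listdir(directory):
        if not filename.upper().endswith(".CSV"):
            continue
        file_directory = os.path.join(directory, filename)
        tableName = filename.upper().replace('.CSV', '')
        try:
            for chunk in pd.read_csv(file_directory, header=0, chunksize=rows_to_chunk_by):
                chunk.to_sql(name=tableName, con=engine, if_exists='append', index=False)
            uploadedFiles.append(filename)
        except Exception as e:
            errorFiles.append(filename)
            print('Error with ' + filename, e)
    return uploadedFiles, errorFiles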

XLSXWriter refuses to create a second Excel File

I'm working on a program to split excel files into sections of 1000. I can't seem to get it to create a second excel file, as xlsxwriter doesn't create the second file.
from os.path import join, dirname, abspath
from xlrd.sheet import ctype_text
import csv
import os
import sys
import xlrd
import xlsxwriter
import xlwt

file_paths = sys.argv[1:]
draganddrop = ''.join(file_paths)
beginGrab = 0
counting = 0
endGrab = 1000
thousands = 0

if draganddrop == "":
    fileName = raw_input("\nInput the file with extension\n>")
else:
    fileName = draganddrop

stopPoint = fileName.index('.')
prepRev = fileName[stopPoint:]
preName = fileName[:stopPoint]

if prepRev == ".csv":
    excelFile = xlsxwriter.Workbook(preName + '.xlsx')
    worksheet = excelFile.add_worksheet()
    with open(fileName, 'rb') as f:
        content = csv.reader(f)
        for index_col, data_in_col in enumerate(content):
            for index_row, data_in_cell in enumerate(data_in_col):
                worksheet.write(index_col, index_row, data_in_cell)
    excelFile.close()
    fileName = (preName + '.xlsx')
    delMe = 1
    print("Temporary Convert to xlsx done.\n")

stopPoint = fileName.index('.')
prepRev = fileName[0:stopPoint]
fname = join(dirname(abspath(__file__)), fileName)
xl_workbook = xlrd.open_workbook(fname)
sheet_names = xl_workbook.sheet_names()
xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
book = xlwt.Workbook(encoding="utf-8")
worksheet = book.add_sheet("Results", cell_overwrite_ok=True)
workbook = xlrd.open_workbook(fileName)

for sheet in workbook.sheets():
    for row in range(sheet.nrows):
        row = int(row)
        if int(row) > 1000:
            subDivide = int(row) / 1000
            while thousands != subDivide + 1:
                thousands = thousands + 1
                counting = 0
                totalName = preName + "_" + str(thousands) + ".xlsx"
                print(totalName)
                excelFile = xlsxwriter.Workbook(str(totalName))
                worksheet = excelFile.add_worksheet()
                with open(totalName, 'rb') as f:
                    col = xl_sheet.col_slice(0, 1, 10101010)
                    for idx, cell_obj in enumerate(col, start=beginGrab):
                        counting = counting + 1
                        if counting == 1000:
                            break
                        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
                        cell_obj_str = str(cell_obj)
                        telePhone = (cell_obj_str[7:19])
                        worksheet.write(idx+1, 0, "1" + telePhone)
                worksheet.write(0, 0, "Telephone Number")
                beginGrab = thousands * 1000
                endGrab = beginGrab + 1000
            excelFile.close()
            excelFile = None
        else:
            print("Mate, this is Tiny!")
print("Ding! Job Done!")
I've been rubber ducking this and I can't find where I'm at fault.
EDIT:
SOLVED!!
By creating a sheet and then closing it, the program can then grasp it. I will probably make a git issue about this.
if prepRev == ".csv":
totalName = preName + '.xlsx'
excelFile = xlsxwriter.Workbook(totalName)
excelFile.close()
Closing it lets open see it while it still contains the same info.
excelFile = xlsxwriter.Workbook(totalName)
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
Doesn't the save/close line need to be within the while loop? Otherwise it looks like it will only save either the first/last item:
while(thousands != subDivide + 1):
    # write file
excelFile.close()
That line is probably the reason why you cannot read back your file and your script crashes:
fname = join(dirname(abspath('__file__')), '%s' % fileName)
'__file__' shouldn't have quotes. I'd do:
fname = join(dirname(abspath(__file__)), fileName)

Multiprocessing Pandas eats memory

I'm using multiprocessing to try to speed up the processing of about 1000 ~500MB csv files using Pandas. I'm trying to apply a simple string regex to one column. The program works, but it seems to not free memory properly, and it eventually comes to eat up 40-80GB per process, despite none of the files being over 10GB. Do you have any idea why this could be? I've tried a number of ways to clear memory, to no avail.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle

oldc = ""
newc = ""
NUMPROC = 8
rep = None

cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")

def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep
    names = np.empty([1,1])
    oldc = oldcol
    newc = newcol
    df = pd.read_csv(filename)
    names = df.as_matrix()
    del df
    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^" + str(row[0]) + r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname
    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")
    files = [f for f in os.listdir('.') if (os.path.isfile(f) and ".csv" in f and not (f == filename or "household" in str(f) or os.path.exists("./standardized/" + f[:-4] + "_stnd.csv")))]
    shuffle(files)
    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/" + f[:-4] + "_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/" + f[:-4] + "_stnd.csv"):
                files.append(f)
    print(len(files))
    bundle = [(idx, f) for idx, f in enumerate(files)]
    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()

def process(bundle):
    global oldc
    global rep
    global newc
    fname = bundle[1]
    idx = bundle[0]
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/" + fname[:-4] + "_stnd.csv")
        del curfile
    except:
        print("error on: " + str(fname))
    finally:
        gc.collect()
        libc.malloc_trim(0)

main("lookup.csv", "namefrst", "stndfrst")
