Multiprocessing Pandas eats memory (Python)

I'm using multiprocessing to try to speed up the processing of about 1000 ~500MB CSV files with Pandas. I'm applying a simple string regex to one column. The program works, but it doesn't seem to free memory properly, and each process eventually eats up 40-80GB, even though none of the files is over 10GB. Do you have any idea why this could be? I've tried a number of ways to clear memory, to no avail.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle
oldc = ""
newc = ""
NUMPROC = 8
rep = None
cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")
def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep
    names = np.empty([1,1])
    oldc = oldcol
    newc = newcol
    df = pd.read_csv(filename)
    names = df.as_matrix()
    del df
    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^"+str(row[0])+r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname
    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")
    files = [f for f in os.listdir('.') if (os.path.isfile(f) and ".csv" in f and not (f==filename or "household" in str(f) or os.path.exists("./standardized/"+f[:-4]+"_stnd.csv")))]
    shuffle(files)
    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/"+f[:-4]+"_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/"+f[:-4]+"_stnd.csv"):
                files.append(f)
    print(len(files))
    bundle = [(idx, f) for idx, f in enumerate(files)]
    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()

def process(bundle):
    global oldc
    global rep
    global newc
    fname = bundle[1]
    idx = bundle[0]
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/"+fname[:-4]+"_stnd.csv")
        del curfile
    except:
        print("error on: " + str(fname))
    finally:
        gc.collect()
        libc.malloc_trim(0)

main("lookup.csv","namefrst","stndfrst")

Related

How to remove sheets based on array with python openpyxl

I need to remove sheets whose names are in an array. Unfortunately, neither tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex]) nor
del tempWb[wsToRemoveNameArray[wsToRemoveIndex]] works with my code.
Does anyone know how to deal with it?
def splitExcelFiles(InputPath, OutputPath, fileNameArray):
    for file in range(0, len(fileNameArray)):
        tempFile = InputPath + '\\' + fileNameArray[file]
        tempWb = load_workbook(tempFile)
        wsToRemoveNameArray = []
        if(len(tempWb.sheetnames)==1):
            #new wb
            tempWb.save(str(OutputPath) + '\\' + str(tempWb.sheetnames) + '.xlsx')
        else:
            for ws in range (0,len(tempWb.sheetnames)):
                newName = tempWb.sheetnames[ws]
                wsToRemoveNameArray = []
                #copyWs = tempWb.copy_worksheet[ws]
                # #This section will save the names to remove other sheets from ws
                for wsToRemoveName in range (0,len(tempWb.sheetnames)):
                    if newName != tempWb.sheetnames[wsToRemoveName]:
                        #print(tempWb.sheetnames[wsToRemoveName])
                        wsToRemoveNameArray.append(str(tempWb.sheetnames[wsToRemoveName]))
                for wsToRemoveIndex in range (0, len(wsToRemoveNameArray)):
                    # tem
                    #tempWb.remove(wsToRemoveNameArray[wsToRemoveIndex])
                    #del tempWb[wsToRemoveNameArray[wsToRemoveIndex]]
                    # tempWb.
                    # print(len(wsToRemoveNameArray))
                    tempWb.save(str(OutputPath) + '\\' + newName + '.xlsx')
First things first, some general tips:
Use the pathlib library whenever you deal with Paths. It makes things a lot easier. There is never a good reason to include the path delimiter in your code.
In Python, it's common to write variable and function names with underscores: save_path instead of savePath.
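For instance, a tiny illustration of the pathlib tip (the directory and file names are made up):

from pathlib import Path

input_path = Path("C:/data/workbooks")   # hypothetical directory
temp_file = input_path / "report.xlsx"   # '/' joins path parts, no '\\' needed
print(temp_file.stem, temp_file.suffix)  # report .xlsx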
Now that we have that out of the way, here is a tested example. Just change the directories to match yours.
from openpyxl import load_workbook, Workbook
from pathlib import Path
import shutil

def make_absolute_path(path, name, suffix=".xlsx"):
    input_file_path = path / name
    return input_file_path.with_suffix(suffix)

def remove_all_sheets_except_filename(input_path, output_path, filenames):
    output_files_path = []
    for i in range(0, len(filenames)):
        input_file_path = make_absolute_path(input_path, filenames[i])
        output_file_path = make_absolute_path(output_path, filenames[i])
        if not Path.is_file(input_file_path):
            print(f"Skipping {input_file_path}: not a valid file path.")
            continue
        shutil.copyfile(input_file_path, output_file_path)
        wb_source: Workbook = load_workbook(filename=output_file_path)
        sheets = wb_source.worksheets
        if len(sheets) == 1:
            save_path = make_absolute_path(output_path, str(wb_source.sheetnames[0]))
            wb_source.save(save_path)
            output_files_path.append(str(save_path))
        else:
            for sheet in wb_source.sheetnames:
                if not sheet == input_file_path.stem:
                    wb_source.remove(wb_source[sheet])
            if len(wb_source.worksheets) == 1:
                save_path = make_absolute_path(
                    output_path, str(wb_source.sheetnames[0])
                )
                wb_source.save(save_path)
                output_files_path.append(str(save_path))
            else:
                print(
                    f"Failed to process {input_file_path} with following "
                    f"sheets: {','.join(wb_source.sheetnames)}."
                )
                raise ValueError("")
    return output_files_path

def main():
    # Adjust to where you have the xlsx files and where you want them
    input_directory = Path("path/to/your/xlsx/files")
    output_directory = Path("path/to/the/output/directory")
    file_names = ["input", "foo", "bar"]
    paths = remove_all_sheets_except_filename(
        input_directory, output_directory, file_names
    )

if __name__ == "__main__":
    main()

python point inside polygon (point cloud data)

Number of points: 100,000,000 (4 GB).
I am reading a CSV file and saving the data into separate CSV files.
I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of my task?
Please provide me with alternative options.
Performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")
file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"
    coords = [[19.803499,15.2265],[-35.293499,33.7495],
              [-49.6675,33.726501],[-48.022499,20.4715],
              [-36.336498,-4.925],[-32.6105,-45.494499],
              [-10.5275,-38.3815],[-11.93835,-20.8235],
              [26.939501,-18.095501],[19.803499,15.2265]]
    poly = Polygon(coords)
    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if str(result) == 'True':
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
            print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = ([str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),('\n')])
                file.writelines(writeData)
            #print('False', str(x), str(y), str(z))
I used pd.read_csv instead of csv.reader, so the performance improved a bit.
However, I tried to use Python multiprocessing, but I don't understand it well.
Processing time: 1234 sec -> 31 sec.
import pandas as pd
from shapely.geometry import *
data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499), (16.5735,-26.269501), (0.4755,-28.6635)])
cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
For example, here is sample code for Python multiprocessing Pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # Wait 2 seconds
    print(x*x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
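
One rough, untested way to apply the Pool pattern to the polygon test above is to read the CSV in chunks and classify each chunk in a worker; the chunk size, column order, and output file names here are assumptions:

import pandas as pd
from multiprocessing import Pool
from shapely.geometry import Point, Polygon

# polygon from the pandas snippet above
POLY = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499),
                (16.5735, -26.269501), (0.4755, -28.6635)])

def classify_chunk(chunk):
    # assumes the first two columns of each row are x and y
    inside = chunk.apply(lambda row: POLY.intersects(Point(row.iloc[0], row.iloc[1])), axis=1)
    return chunk[inside], chunk[~inside]

if __name__ == "__main__":
    reader = pd.read_csv("/sample.csv", chunksize=1_000_000)  # assumed chunk size
    with Pool(8) as pool:
        for inside_df, outside_df in pool.imap(classify_chunk, reader):
            inside_df.to_csv("sample_TRUE.csv", mode="a", header=False, index=False)
            outside_df.to_csv("sample_FALSE.csv", mode="a", header=False, index=False)

The workers only evaluate the point-in-polygon test; the parent process does all the file writing, so the two output files are never written concurrently.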

File names on CSV with bad line errors

I'm using the code below to capture bad-line errors when reading a CSV through pandas. I'm having trouble getting the filename included. I tried appending to a list during the loop, but that resulted in every file showing an error instead of just the files with errors.
How can I get the filename included?
import os
import glob
import sys
from io import StringIO
import pandas as pd
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mydict = {}
    mylist = []
    old_stderr = sys.stderr
    result = StringIO()
    sys.stderr = result
    x = ''
    for f in glob.glob(zipPath):
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False)
        result_string = result.getvalue()
        f_name = os.path.basename(f)
        if len(result_string) > 1:
            with open('bad_lines.txt', 'w') as bad_lines:
                for line in result_string.split(r'\n'):
                    if len(line) > 5:
                        bad_lines.write(line.replace('\n', '').replace('b', '').replace("'", ''))
                        bad_lines.write('\n')
    sys.stderr = old_stderr

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))
I was able to get the following working code :
import os
import sys
import glob
import pandas as pd
from io import StringIO
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mylist = []
    for f in glob.glob(zipPath):
        f_name = os.path.basename(f)
        old_stderr = sys.stderr
        result = StringIO()
        sys.stderr = result
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False, warn_bad_lines=True)
        result_string = result.getvalue()
        sys.stderr = old_stderr
        if len(result_string) > 5:
            mylist.append([result_string, f_name])
    mynewlist = []
    for i in mylist:
        i[0] = i[0].replace('b', '').replace("'", '')
        for x in i[0].replace('\n', '').split('\\n'):
            if len(x) > 1:
                mynewlist.append([x, i[1]])
    df = pd.DataFrame(mynewlist, columns=['Error', 'File'])
    print(df)

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))
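
In pandas 1.3+ the error_bad_lines/warn_bad_lines arguments are deprecated, and from 1.4 on_bad_lines also accepts a callable (python engine only), which lets you record the file name with each bad line without redirecting stderr. A rough sketch along those lines, reusing UnzipFilePoint from above:

import glob
import os
import pandas as pd

def find_bad_lines(zip_path):
    records = []
    for f in glob.glob(zip_path):
        f_name = os.path.basename(f)
        def collect(bad_line, _name=f_name):
            # called by pandas for each malformed row; returning None drops it
            records.append({'Error': ','.join(bad_line), 'File': _name})
            return None
        pd.read_csv(f, dtype=str, encoding="ISO-8859-1",
                    engine="python", on_bad_lines=collect)
    return pd.DataFrame(records, columns=['Error', 'File'])

print(find_bad_lines(str(UnzipFilePoint / "*")))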

Export Excel Module via Python

I'm trying to replicate the exporting of a Code Module from an Excel sheet in Python.
The following works in VBA:
Public Sub ExportModules()
    Dim wb As Workbook
    Set wb = ThisWorkbook
    Dim D As String
    Dim N
    D = ThisWorkbook.Path
    For Each VBComp In wb.VBProject.VBComponents
        If (VBComp.Type = 1) Then
            N = D + "\" + VBComp.Name + ".txt"
            VBComp.Export N
        End If
    Next
End Sub
And I have the following in Python:
import os
import sys
import glob
from win32com.client import Dispatch

scripts_dir = 'folder address'
com_instance = Dispatch("Excel.Application")
com_instance.Visible = False
com_instance.DisplayAlerts = False
for script_file in glob.glob(os.path.join(scripts_dir, "*.xlsm")):
    print "Processing: %s" % script_file
    (file_path, file_name) = os.path.split(script_file)
    objworkbook = com_instance.Workbooks.Open(script_file)
    for xlmodule in objworkbook.VBProject.VBComponents:
        xlmodule.Export('export file name')
My question is, what do I have to do in Python to replicate the Export of the file as per the VBA code?
Use oletools: xltrail describes a good way to extract .bas files from .xlsm or other Excel files with it.
import os
import shutil
from oletools.olevba3 import VBA_Parser

EXCEL_FILE_EXTENSIONS = ('xlsb', 'xls', 'xlsm', 'xla', 'xlt', 'xlam',)

def parse(workbook_path):
    vba_path = workbook_path + '.vba'
    vba_parser = VBA_Parser(workbook_path)
    vba_modules = vba_parser.extract_all_macros() if vba_parser.detect_vba_macros() else []
    for _, _, _, content in vba_modules:
        decoded_content = content.decode('latin-1')
        lines = []
        if '\r\n' in decoded_content:
            lines = decoded_content.split('\r\n')
        else:
            lines = decoded_content.split('\n')
        if lines:
            name = lines[0].replace('Attribute VB_Name = ', '').strip('"')
            content = [line for line in lines[1:] if not (
                line.startswith('Attribute') and 'VB_' in line)]
            if content and content[-1] == '':
                content.pop(len(content)-1)
            lines_of_code = len(content)
            non_empty_lines_of_code = len([c for c in content if c])
            if non_empty_lines_of_code > 0:
                if not os.path.exists(os.path.join(vba_path)):
                    os.makedirs(vba_path)
                with open(os.path.join(vba_path, name + '.bas'), 'w') as f:
                    f.write('\n'.join(content))

if __name__ == '__main__':
    for root, dirs, files in os.walk('.'):
        for f in dirs:
            if f.endswith('.vba'):
                shutil.rmtree(os.path.join(root, f))
        for f in files:
            if f.endswith(EXCEL_FILE_EXTENSIONS):
                parse(os.path.join(root, f))
I have tried it and it works great.
Ref: https://www.xltrail.com/blog/auto-export-vba-commit-hook
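
If you prefer to stay with the win32com approach from the question, a rough sketch along the lines of the VBA code (the folders are placeholders, and 'Trust access to the VBA project object model' must be enabled in Excel's Trust Center for VBProject to be reachable):

import os
import glob
from win32com.client import Dispatch

scripts_dir = r"C:\path\to\workbooks"  # placeholder input folder
export_dir = r"C:\path\to\exports"     # placeholder output folder

com_instance = Dispatch("Excel.Application")
com_instance.Visible = False
com_instance.DisplayAlerts = False
try:
    for script_file in glob.glob(os.path.join(scripts_dir, "*.xlsm")):
        print("Processing: %s" % script_file)
        wb = com_instance.Workbooks.Open(script_file)
        for component in wb.VBProject.VBComponents:
            if component.Type == 1:  # 1 = standard code module, as in the VBA snippet
                component.Export(os.path.join(export_dir, component.Name + ".txt"))
        wb.Close(SaveChanges=False)
finally:
    com_instance.Quit()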

How to join arrays with mpi4py?

I'm new to MPI and I'm trying to parallelize the following code with mpi4py. The code reads roughly 100 CSV files and rearranges them.
The problem comes after the parallelization: how do I join the arrays 'data' and 'mapData' in order?
The original code, without parallelization:
import csv
import sys
import numpy as np
import StringIO
import re

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

#def sprintf(buf, fmt, *args):
#    buf.write(fmt % args)

from os import listdir
from os.path import isfile, join
import os, glob

folder = os.getcwd()+'/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

filelist = [[]] * len(files_in_dir)
for file in xrange(len(files_in_dir)):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index
# DO NOT!!! use P
data = [[]]*len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]]*len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

with open('mapdata.csv', 'wb') as mapdatares:
    writer = csv.writer(mapdatares)
    writer.writerows(mapData)
And this is what I have so far; my problem is when I call comm.allreduce:
import csv
import sys
import numpy as np
import StringIO
import re
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

from os import listdir
from os.path import isfile, join
import os, glob
import math

folder = os.getcwd()+'/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

# count number of csv files
print "number of files is: ", len(files_in_dir)

# divide files among processors
filesrank = len(files_in_dir)/size
intero = math.trunc(filesrank)
fremain = len(files_in_dir) % size
if fremain > rank:
    sizeproc = intero + 1
else:
    sizeproc = intero

# extremes of this rank's interval
a = rank * sizeproc
b = a + sizeproc

# parallelize
filelist = [[]] * len(files_in_dir[a:b])
for file in xrange(len(files_in_dir[a:b])):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)  # read csv file, skipping the header
        for row in reader:
            #next(reader)
            filelist[file].np.append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index
data = [[]]*len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]]*len(filelist[0])
for i in xrange(len(data)):
    data = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData = filelist[0][i][4:7]

totmapData = []
totData = []
comm.allreduce(mapData, totmapData, op=MPI.SUM)
comm.allreduce(data, totData, op=MPI.SUM)
Any suggestions? Thanks.
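
One way the per-rank lists could be joined in rank order is comm.allgather on Python objects instead of comm.allreduce. A minimal, untested sketch, assuming data and mapData are the lists of rows each rank built for its own slice of files (i.e. filled with data[i] = ... and mapData[i] = ... as in the serial version, rather than overwritten on each iteration as in the loop above):

# allgather returns one entry per rank, ordered by rank, so concatenating
# the pieces preserves the original file order without needing MPI.SUM.
gathered_data = comm.allgather(data)      # [data_from_rank_0, data_from_rank_1, ...]
gathered_map = comm.allgather(mapData)

totData = [row for per_rank in gathered_data for row in per_rank]
totmapData = [row for per_rank in gathered_map for row in per_rank]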
