I'm new to MPI and I'm trying to parallelize the following code with mpi4py. It's a script that reads roughly 100 CSV files and rearranges them.
The problem comes after the parallelization: how do I join the arrays 'data' and 'mapData' in the right order?
The original code, without parallelization:
import csv
import sys
import numpy as np
import StringIO
import re

numbers = re.compile(r'(\d+)')

def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

#def sprintf(buf, fmt, *args):
#    buf.write(fmt % args)

from os import listdir
from os.path import isfile, join
import os, glob

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

filelist = [[] for _ in xrange(len(files_in_dir))]  # one independent list per file (avoids the [[]]*n aliasing trap)
for file in xrange(len(files_in_dir)):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in reader:
            filelist[file].append(row)  # save each row

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]] * len(filelist[0])      # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

with open('mapdata.csv', 'wb') as mapdatares:
    writer = csv.writer(mapdatares)
    writer.writerows(mapData)
And this is what I have done so far. My problem is the call to 'comm.allreduce':
import csv
import sys
import numpy as np
import StringIO
import re
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

numbers = re.compile(r'(\d+)')

def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

from os import listdir
from os.path import isfile, join
import os, glob
import math

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

# count the csv files
print "number of files is: ", len(files_in_dir)

# divide the files among the processes
filesrank = len(files_in_dir) / size
intero = math.trunc(filesrank)
fremain = len(files_in_dir) % size
if fremain > rank:
    sizeproc = intero + 1
else:
    sizeproc = intero

# extremes of the local interval
a = rank * sizeproc
b = a + sizeproc

# parallel part: each rank reads its own slice of files
filelist = [[] for _ in xrange(len(files_in_dir[a:b]))]  # one independent list per file
for file in xrange(len(files_in_dir[a:b])):
    fileName = folder + files_in_dir[a + file]  # index into this rank's slice
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)  # read the csv, header already skipped
        for row in reader:
            filelist[file].append(row)  # save each row

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]] * len(filelist[0])      # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

totmapData = []
totData = []
comm.allreduce(mapData, totmapData, op=MPI.SUM)
comm.allreduce(data, totData, op=MPI.SUM)
Any suggestions? Thanks.
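One way to join the per-rank pieces in rank order is to gather the plain Python lists with comm.allgather (or comm.gather onto a single root) and then flatten the result, since the gathered list already comes back ordered by rank. Note also that in recent mpi4py the lowercase comm.allreduce takes only the send object and returns the result, rather than filling a receive list. A minimal sketch, assuming each rank has already built its local 'data' and 'mapData'; the tiny placeholder lists below just stand in for them:

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# placeholders for the per-rank 'data' and 'mapData' built above
local_data = [[rank, 0], [rank, 1]]
local_mapdata = [[rank, 'map']]

# allgather returns one entry per rank, already ordered by rank
gathered_data = comm.allgather(local_data)
gathered_mapdata = comm.allgather(local_mapdata)

# flatten while preserving rank order
totData = [row for part in gathered_data for row in part]
totmapData = [row for part in gathered_mapdata for row in part]

if rank == 0:
    print(len(totData), len(totmapData))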
Related
I want to store numpy arrays (images), data, and a timestamp to a file in Python.
Later I want to scroll through the images and show the data.
I wrote my data to a JSON file, but it takes a long time, and with one image (640x480) and one set of data it is already 10 MB in size.
Here is my approach:
import json as json
import base64
import io
import sys
import cv2
import numpy as np
import os
from numpy import asarray
from json import JSONEncoder

path = r'C:/Videotool/Data'
name = 'testfile'

cam = cv2.VideoCapture(0)
result, image = cam.read()
if result:
    print(f'image:{image}')
else:
    print('no image')

def json_write_to_file(path, name, data):
    file = path + '/' + name + '.json'
    file_exists = os.path.exists(file)  # check the full path, not just the bare name
    readed_data = []
    data_list = []
    sum_dict = []
    if file_exists:
        print(f'file {name}.json exists\n')
        with open(file) as fp:
            readed_data = json.load(fp)
            print(f'readed_data:{readed_data}\n')
            print(f'data:{data}\n')
        readed_data.append(data)
        sum_dict = readed_data
        print(f'sum_dict:{sum_dict}\n')
        with open(file, 'w') as fp:
            json.dump(sum_dict, fp, indent=2)
            print(f'length of readed_data: {len(readed_data)}\n')
    else:
        sum_dict.append(data)
        print(f'file {name}.json does not exist\n')
        with open(file, 'w') as fp:
            json.dump(sum_dict, fp, indent=2)

print(f'Number of Dimensions of image:{image.ndim}')
print(f'Shape of image:{image.shape}')
print(f'Size of image:{image.size}')
low_image_array = image.reshape(-1)

ori_array = np.zeros((2,3,4))
print(f'Number of Dimensions of ori_array:{ori_array.ndim}')
print(f'Shape of ori_array:{ori_array.shape}')
print(f'Size of ori_array:{ori_array.size}')
print(f'ori_array:{ori_array}')
low_1_array = ori_array.reshape(-1)
print(f'Number of Dimensions of low_1_array:{low_1_array.ndim}')
print(f'Shape of low_1_array:{low_1_array.shape}')
print(f'Size of low_1_array:{low_1_array.size}')
print(f'low_1_array:{low_1_array}')

data_set = {}
data_set['Stream_1'] = low_1_array.tolist()
data_set['Stream_2'] = low_image_array.tolist()
print(f'data_set: {data_set}')

json_write_to_file(path, name, data_set)
Is there a better way to store images and data in a file?
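One alternative, sketched under the assumption that the goal is simply to keep the raw image, the extra array, and a timestamp together in one compact file, is numpy's savez_compressed, which stores the arrays in binary instead of converting them to JSON lists. The file name 'capture_0.npz' is just a placeholder:

import time
import numpy as np
import cv2

cam = cv2.VideoCapture(0)
ok, image = cam.read()
if ok:
    np.savez_compressed(
        'capture_0.npz',
        image=image,                      # stored as binary, no .tolist() needed
        data=np.zeros((2, 3, 4)),         # any extra array you want to keep
        timestamp=np.array(time.time()),  # scalar timestamp stored alongside
    )
    # later: everything comes back as arrays
    loaded = np.load('capture_0.npz')
    print(loaded['image'].shape, float(loaded['timestamp']))

For many frames, one .npz per frame (or appending to an HDF5 file) usually stays far smaller and faster than a single growing JSON file.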
I'm using Python 3 and I want to merge a few CSV files by columns.
Is it possible to do this without pandas?
For example, if I have these two CSVs:
df1:
Name Surname PCName
Max Petrov wrs123
Ivan Ivanov wrs321
df2:
Surname Name PCName
Sidorov Vasily wrs223
Dmitriev Alex wrs331
With pandas I've got this solution:
import os
import pandas as pd  # $ pip install pandas
import time

def cls():
    os.system('cls' if os.name == 'nt' else 'clear')

cls()
today = time.strftime("%y%m%d")
fldpath = 'C:/tmp2/test/'
filepath = fldpath + today + "_merged.csv"

print(os.listdir(fldpath))
print("type beginning of file names")
fmask = input()

file_list = [fldpath + f for f in os.listdir(fldpath) if f.startswith(fmask)]

csv_list = []
for file in sorted(file_list):
    csv_list.append(pd.read_csv(file).assign(File_Name=os.path.basename(file)))

csv_merged = pd.concat(csv_list, ignore_index=True)
csv_merged.to_csv(filepath, index=False)
You could use Python's csv.DictReader() and csv.DictWriter() to do this as follows:
import csv
import os
import time

def cls():
    os.system('cls' if os.name == 'nt' else 'clear')

cls()
today = time.strftime("%y%m%d")
fldpath = 'C:/tmp2/test/'
filepath = fldpath + today + "_merged.csv"

print(os.listdir(fldpath))
print("type beginning of file names")
fmask = input()

file_list = [fldpath + f for f in os.listdir(fldpath) if f.startswith(fmask)]

with open(filepath, 'w', newline='') as f_output:
    csv_output = csv.DictWriter(f_output, fieldnames=["Name", "Surname", "PCName"])
    csv_output.writeheader()

    for file in sorted(file_list):
        with open(file) as f_input:
            csv_input = csv.DictReader(f_input)
            csv_output.writerows(csv_input)
For your given example, this would produce an output of:
Name,Surname,PCName
Max,Petrov,wrs123
Ivan,Ivanov,wrs321
Vasily,Sidorov,wrs223
Alex,Dmitriev,wrs331
This assumes each CSV file has the same field names (the order is not important).
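If the files might have different headers, one hedged variation is a two-pass version of the same idea: first collect the union of all field names, then write with a DictWriter that fills missing columns with an empty string. The helper name merge_csvs exists only for this sketch:

import csv

def merge_csvs(file_list, out_path):
    # first pass: union of all header names, keeping first-seen order
    fieldnames = []
    for file in sorted(file_list):
        with open(file, newline='') as f_input:
            for name in csv.DictReader(f_input).fieldnames or []:
                if name not in fieldnames:
                    fieldnames.append(name)

    # second pass: write all rows, missing columns become ''
    with open(out_path, 'w', newline='') as f_output:
        csv_output = csv.DictWriter(f_output, fieldnames=fieldnames, restval='')
        csv_output.writeheader()
        for file in sorted(file_list):
            with open(file, newline='') as f_input:
                csv_output.writerows(csv.DictReader(f_input))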
I have 118 CSVs, and I need to go into each CSV and change F1, F2, F3 and so on to 0.
For example: in csv1, F1 = 0; in csv2, F2 = 0; in csv3, F3 = 0; and so on.
The CSV has headers:
I am assuming all of your CSV files have the same format and that you are trying to set column F to 0 in all of them.
You can use the Python csv library to help you, as follows:
import csv
import glob

for filename in glob.glob('*.csv'):
    print(f'Processing: {filename}')

    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        rows = [[*row[:5], '0'] for row in csv_input]

    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
This reads all .csv files from a given folder and changes the Multi Col 2 values to 0. It does this for all rows but leaves the header the same.
Thank you all, I made my own solution; it is a lot less classy than the ones posted here, but I automated it from needing x number of files through to amending the col/row.
#==============================================================================
# Import the necessary packages
import os
#import glob
import shutil
import pathlib
import pandas as pd
#import numpy as np
#==============================================================================
InputPath = 'F:\\cells\\bc_dbase\\bc_dbase1.csv'
OutputPath = 'F:\\cells\\bc_dbase'
str1 = 'Name '
str2 = 'Mult Col 2'
NoStart = 1
NoEnd = 119
#==============================================================================
# Create complete path of folders
def CreatePath(FullPath, File=False):
    Parts = pathlib.Path(FullPath).parts
    for [n1, Folder] in enumerate(Parts):
        if File==True and n1==len(Parts)-1 and "." in Parts[n1]:
            continue
        elif n1==0:
            FolderPath = Parts[n1]
        else:
            FolderPath = os.path.join(FolderPath, Folder)
        if os.path.exists(FolderPath)==False:
            os.mkdir(FolderPath)
#==============================================================================
# Delete folder
def DeleteFolder(FullPath):
    FullPath = pathlib.Path(FullPath)
    try:
        shutil.rmtree(FullPath)
    except:
        pass
#==============================================================================
CreatePath(OutputPath, File=False)
[FolderPath, File] = os.path.split(InputPath)
[FileName, FileExt] = os.path.splitext(os.path.basename(InputPath))
ReversedFileName = FileName[::-1]
AdjFileName = FileName
for n1 in reversed(range(len(AdjFileName))):
    char = FileName[n1]
    if char.isdigit():
        AdjFileName = AdjFileName[:n1] + AdjFileName[(n1+1):]
    else:
        break

Data1 = pd.read_csv(InputPath)
Data2 = pd.DataFrame.copy(Data1)
NameCols = Data1.columns
if str2 in NameCols:
    Data2.loc[:, str2] = 1

for n1 in range(NoStart, NoEnd+1):
    NewFile = AdjFileName + str(n1) + FileExt
    NewFilePath = os.path.join(OutputPath, NewFile)
    Data3 = pd.DataFrame.copy(Data2)
    index = Data3[Data3[str1]==n1].index[0]
    Data3.loc[index, str2] = 0
    Data3.to_csv(NewFilePath, index=False)
    print('[INFO] Storing file:', NewFilePath)
#==============================================================================
Mr. Evans has pretty neat code using the Python csv library, so I will expand on it a bit to answer your specific question.
import csv
import glob

file_count = 0
for filename in glob.glob('*.csv'):
    file_count += 1
    print(f'Processing: {filename}')

    with open(filename) as f_input:
        csv_input = csv.reader(f_input)
        header = next(csv_input)
        line_count = 0
        rows = []
        for row in csv_input:
            line_count += 1
            if line_count == file_count:
                rows.append([*row[:5], '0'])
            else:
                rows.append([*row[:6]])

    with open(filename, 'w', newline='') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow(header)
        csv_output.writerows(rows)
Note: the code will run for all the .csv files in the working directory and will process them in alphabetical order.
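If the files are numbered (csv1.csv, csv2.csv, ..., csv118.csv), plain alphabetical order would visit csv10.csv before csv2.csv. A small sketch of a natural-sort key that iterates them in numeric order instead, assuming the number is part of the file name:

import glob
import re

def numeric_key(name):
    # split into text and digit chunks so '2' sorts before '10'
    return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

for filename in sorted(glob.glob('*.csv'), key=numeric_key):
    print(f'Processing: {filename}')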
Number of points: 100,000,000 (4 GB).
I am reading a CSV file and saving the data to separate CSV files.
I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of this task?
Please provide me with alternative options.
Performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")

file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header

    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"

    coords = [[19.803499,15.2265],[-35.293499,33.7495],
              [-49.6675,33.726501],[-48.022499,20.4715],
              [-36.336498,-4.925],[-32.6105,-45.494499],
              [-10.5275,-38.3815],[-11.93835,-20.8235],
              [26.939501,-18.095501],[19.803499,15.2265]]
    poly = Polygon(coords)

    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if str(result) == 'True':
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = [str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),'\n']
                file.writelines(writeData)
            print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = [str(x),',',str(y),',',str(z),',',str(r),',',str(g),',',str(b),',',str(i),'\n']
                file.writelines(writeData)
            #print('False', str(x), str(y), str(z))
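A large share of the time in the loop above goes to reopening the two output files for every single point. A hedged sketch of the same split with both files opened once and csv.writer handling the formatting (same column order assumed: x, y, z, r, g, b, i):

import csv
from shapely.geometry import Point, Polygon

def split_points(in_path, true_path, false_path, poly):
    with open(in_path, newline='') as f_in, \
         open(true_path, 'w', newline='') as f_true, \
         open(false_path, 'w', newline='') as f_false:
        reader = csv.reader(f_in)
        next(reader)  # skip header
        true_writer = csv.writer(f_true)
        false_writer = csv.writer(f_false)
        for row in reader:
            point = Point(float(row[0]), float(row[1]))
            writer = true_writer if point.within(poly) else false_writer
            writer.writerow(row[:7])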
I used pd.read_csv instead of csv.reader, so the performance improved a bit.
However, I tried to use Python multiprocessing, but I don't understand it well.
Processing time: 1234 sec -> 31 sec.
import pandas as pd
from shapely.geometry import *

data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655,-22.758499), (17.0525,-21.657499), (16.5735,-26.269501), (0.4755,-28.6635)])
cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
For example, here is example code for Python multiprocessing Pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # wait 2 seconds
    print(x*x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
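One possible way to apply the Pool here, as a minimal sketch: read the CSV in chunks with pandas and let each worker test one chunk against the polygon. The chunk size, the path /sample.csv, and what you do with the results are all placeholders:

from multiprocessing import Pool

import pandas as pd
from shapely.geometry import Point, Polygon

poly = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499),
                (16.5735, -26.269501), (0.4755, -28.6635)])

def check_chunk(chunk):
    # one (inside?, x, y) tuple per row of this chunk
    return [(poly.intersects(Point(x, y)), x, y)
            for x, y in zip(chunk.iloc[:, 0], chunk.iloc[:, 1])]

if __name__ == '__main__':
    chunks = pd.read_csv("/sample.csv", chunksize=1_000_000)
    with Pool(8) as pool:
        for result in pool.imap(check_chunk, chunks):
            pass  # collect or write out each chunk's results here

Whether this actually helps depends on how much of the time is spent in the geometry test versus reading and pickling the chunks.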
I'm using the code below to capture bad-line errors when reading CSVs through pandas. I'm having trouble getting the filename included. I tried appending to a list during the loop, but that resulted in every file showing an error instead of just the files with errors.
How can I get the filename included?
import os
import glob
import sys
from io import StringIO
import pandas as pd
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mydict = {}
    mylist = []
    old_stderr = sys.stderr
    result = StringIO()
    sys.stderr = result
    x = ''
    for f in glob.glob(zipPath):
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False)
        result_string = result.getvalue()
        f_name = os.path.basename(f)
        if len(result_string) > 1:
            with open('bad_lines.txt', 'w') as bad_lines:
                for line in result_string.split(r'\n'):
                    if len(line) > 5:
                        bad_lines.write(line.replace('\n', '').replace('b', '').replace("'", ''))
                        bad_lines.write('\n')
    sys.stderr = old_stderr

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))
I was able to get the following working code:
import os
import sys
import glob
import pandas as pd
from io import StringIO
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mylist = []
    for f in glob.glob(zipPath):
        f_name = os.path.basename(f)
        old_stderr = sys.stderr
        result = StringIO()
        sys.stderr = result
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False, warn_bad_lines=True)
        result_string = result.getvalue()
        sys.stderr = old_stderr
        if len(result_string) > 5:
            mylist.append([result_string, f_name])

    mynewlist = []
    for i in mylist:
        i[0] = i[0].replace('b', '').replace("'", '')
        for x in i[0].replace('\n', '').split('\\n'):
            if len(x) > 1:
                mynewlist.append([x, i[1]])

    df = pd.DataFrame(mynewlist, columns=['Error', 'File'])
    print(df)

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))
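As a side note, in newer pandas (1.4 and later) error_bad_lines and warn_bad_lines are deprecated and eventually removed, and on_bad_lines can be a callable with the python engine, which avoids redirecting stderr altogether. A rough, untested sketch of the same idea in that API:

import glob
import os
import pandas as pd

def find_bad_lines(zip_path):
    records = []
    for f in glob.glob(zip_path):
        f_name = os.path.basename(f)

        def log_bad_line(bad_line, _file=f_name):
            records.append([' '.join(bad_line), _file])  # keep the bad line and its file
            return None  # returning None skips the bad row

        pd.read_csv(f, dtype=str, encoding="ISO-8859-1",
                    engine="python", on_bad_lines=log_bad_line)
    return pd.DataFrame(records, columns=['Error', 'File'])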