Number of points: 100,000,000 (a 4 GB CSV file).
I am reading a CSV file and saving the data to separate CSV files.
I'm using csv.reader, which works fine, but I noticed that this code takes too much time.
How can I improve the performance of this task?
Please provide me with alternative options.
Performance is the main concern here.
from shapely.geometry import Point, Polygon
import csv
import os

req1 = input("path of the CSV file: ")
file_name = os.path.splitext(req1)
file_name = os.path.split(file_name[0])
path = file_name[0]
file_name = file_name[1]

with open(req1, "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip header

    os.makedirs(path + "/" + file_name + "_output", exist_ok=True)
    outpath = path + "/" + file_name + "_output" + "/"

    coords = [[19.803499, 15.2265], [-35.293499, 33.7495],
              [-49.6675, 33.726501], [-48.022499, 20.4715],
              [-36.336498, -4.925], [-32.6105, -45.494499],
              [-10.5275, -38.3815], [-11.93835, -20.8235],
              [26.939501, -18.095501], [19.803499, 15.2265]]
    poly = Polygon(coords)

    for row in reader:
        geom = Point(float(row[0]), float(row[1]))  # considering the order of elements that you gave
        x = float(row[0])
        y = float(row[1])
        z = float(row[2])
        r = int(row[3])
        g = int(row[4])
        b = int(row[5])
        i = int(row[6])
        result = geom.within(poly)
        if result:
            with open(outpath + file_name + "_TRUE.csv", "a", newline="") as file:
                writeData = [str(x), ',', str(y), ',', str(z), ',', str(r), ',', str(g), ',', str(b), ',', str(i), '\n']
                file.writelines(writeData)
            print('True', str(x), str(y), str(z))
        else:
            with open(outpath + file_name + "_FALSE.csv", "a", newline="") as file:
                writeData = [str(x), ',', str(y), ',', str(z), ',', str(r), ',', str(g), ',', str(b), ',', str(i), '\n']
                file.writelines(writeData)
            # print('False', str(x), str(y), str(z))
I used pd.read_csv instead of csv.reader,
so the performance has improved a bit.
However, I tried to use Python multiprocessing,
but I don't understand it well.
Process result time (1234 sec -> 31 sec)
import pandas as pd
from shapely.geometry import *

data = pd.read_csv("/sample.csv")
poly = Polygon([(-0.7655, -22.758499), (17.0525, -21.657499), (16.5735, -26.269501), (0.4755, -28.6635)])

cord = data.values.tolist()
for i in cord:
    print(poly.intersects(Point(i[0], i[1])), i)
For example, here is sample code for Python multiprocessing Pools:
import time
from multiprocessing import Pool

def f(x):
    time.sleep(2)  # wait 2 seconds
    print(x * x)

p = Pool(8)
p.map(f, [1, 2, 3, 4])
p.close()
p.join()
How should I apply this?
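One way to put the pieces together is to let pandas read the big file in chunks and hand each chunk to a Pool worker, which also avoids reopening the output files for every single row. This is only a sketch under a few assumptions: the first two columns are x and y, the polygon is the one from the question, sample.csv is the input, and the chunk size and output file names are placeholders.

import pandas as pd
from multiprocessing import Pool
from shapely.geometry import Point, Polygon

POLY = Polygon([(19.803499, 15.2265), (-35.293499, 33.7495), (-49.6675, 33.726501),
                (-48.022499, 20.4715), (-36.336498, -4.925), (-32.6105, -45.494499),
                (-10.5275, -38.3815), (-11.93835, -20.8235), (26.939501, -18.095501)])

def classify_chunk(chunk):
    # Boolean mask: True where the point (x, y) lies inside the polygon.
    mask = chunk.apply(lambda row: POLY.contains(Point(row.iloc[0], row.iloc[1])), axis=1)
    return chunk[mask], chunk[~mask]

if __name__ == "__main__":
    # chunksize makes read_csv yield DataFrames lazily, so the 4 GB file is never fully in memory.
    chunks = pd.read_csv("sample.csv", chunksize=1_000_000)  # placeholder chunk size
    with Pool(8) as pool:
        for inside, outside in pool.imap(classify_chunk, chunks):
            inside.to_csv("sample_TRUE.csv", mode="a", header=False, index=False)
            outside.to_csv("sample_FALSE.csv", mode="a", header=False, index=False)

pool.imap keeps the chunks in order, and only the main process writes to the output files, so no locking is needed.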
Related
I'm trying to fetch requests with multithreading and save the data,
but when multithreading is in use, the results are not saved to the CSV correctly.
What can I do?
Must I add time.sleep? It seems it cannot do the fetching
and the saving at the same time. I would be happy for some help, thanks.
This is the script:
import requests
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time
import json

url_list = []
with open('bookslinks.csv', newline='') as f:
    reader = csv.reader(f)
    # urls = list(reader)
    for row in reader:
        url_list.append(row[0])

def download_file(url):
    string = url
    say = "="
    after = string[string.index(say) + len(say):]
    bugeycheck = "https://xxxx/" + after + "data=" + after
    j = json.loads(requests.get(bugeycheck, timeout=20, stream=True).content)
    dataname = "bbbb/" + str(after) + "bbbb" + ".txt"
    print(j["xx"])
    with open('beneficiary.csv', 'a') as newFile:
        newFileWriter = csv.writer(newFile)
        newFileWriter.writerow([after, j["xx"]])
    return

start = time()
processes = []
with ThreadPoolExecutor(max_workers=100) as executor:
    for url in url_list:
        processes.append(executor.submit(download_file, url))
for task in as_completed(processes):
    print("test")
print(f'Time taken: {time() - start}')
The Python console output when multithreading is in use:
#####
false
false
falsefalse
false
####
false = added to the CSV file
false = added to the CSV file
falsefalse = only one of the two added to the CSV file
false = added to the CSV file
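If the problem is simply that many threads append to beneficiary.csv at the same time, one common approach is to serialize the write with a lock. A minimal sketch, reusing the question's imports and its (elided) URL scheme, with only the writing part changed:

import threading

csv_lock = threading.Lock()  # one lock shared by all worker threads

def download_file(url):
    after = url[url.index("=") + 1:]
    j = json.loads(requests.get("https://xxxx/" + after + "data=" + after, timeout=20).content)
    # Only one thread at a time may append a row to the shared CSV file.
    with csv_lock:
        with open('beneficiary.csv', 'a', newline='') as newFile:
            csv.writer(newFile).writerow([after, j["xx"]])

Another option is to return (after, j["xx"]) from the worker and write all rows from the main thread as the futures complete.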
I'm a Blender novice and have been using the following script to dump all the blendshape weights per frame for an object into a text file, each new line being a frame in the animation sequence.
import bpy

sce = bpy.context.scene
ob = bpy.context.object

filepath = "blendshape_tracks.txt"
file = open(filepath, "w")

for f in range(sce.frame_start, sce.frame_end + 1):
    sce.frame_set(f)
    vals = ""
    for shapeKey in bpy.context.object.data.shape_keys.key_blocks:
        if shapeKey.name != 'Basis':
            v = str(round(shapeKey.value, 8)) + " "
            vals += v
    vals = vals[0:-2]
    file.write(vals + "\n")

file.close()  # release the file handle once all frames are written
As you can see this is super easy in Blender, but now I'm trying to do the same thing in Maya. Beforehand I attempted to just bring the 3D model over in different formats, DAE and FBX (I tried both ASCII and binary, and different year versions), but Blender just won't import them (I receive numerous errors each time).
So basically what I am asking is: how do I do this same thing in Maya via Python or MEL? I checked out the MotionBuilder API but haven't a clue where to get started.
Cheers in advance.
Edit: Okay, I figured it out; see my answer below. Surprisingly it is just as easy once you get to grips with the cmds library.
Answering my own question:
import maya.cmds as cmds

filepath = "blendshape_tracks.txt"
file = open(filepath, "w")

startFrame = cmds.playbackOptions(query=True, ast=True)
endFrame = cmds.playbackOptions(query=True, aet=True)

for i in range(int(startFrame), int(endFrame)):
    vals = ""
    cmds.currentTime(int(i), update=True)
    weights = cmds.blendShape('blendshapeName', query=True, w=True)
    for w in weights:
        v = str(round(w, 8)) + " "
        vals += v
    vals = vals[0:-2]
    file.write(vals + "\n")

file.close()  # release the file handle once all frames are written
I'm working on a program to split Excel files into sections of 1,000 rows. I can't seem to get it to create a second Excel file; xlsxwriter just doesn't create the second file.
from os.path import join, dirname, abspath
from xlrd.sheet import ctype_text
import csv
import os
import sys
import xlrd
import xlsxwriter
import xlwt

file_paths = sys.argv[1:]
draganddrop = ''.join(file_paths)

beginGrab = 0
counting = 0
endGrab = 1000
thousands = 0

if draganddrop == "":
    fileName = raw_input("\nInput the file with extension\n>")
else:
    fileName = draganddrop

stopPoint = fileName.index('.')
prepRev = fileName[stopPoint:]
preName = fileName[:stopPoint]

if prepRev == ".csv":
    excelFile = xlsxwriter.Workbook(preName + '.xlsx')
    worksheet = excelFile.add_worksheet()
    with open(fileName, 'rb') as f:
        content = csv.reader(f)
        for index_col, data_in_col in enumerate(content):
            for index_row, data_in_cell in enumerate(data_in_col):
                worksheet.write(index_col, index_row, data_in_cell)
    excelFile.close()
    fileName = (preName + '.xlsx')
    delMe = 1
    print("Temporary Convert to xlsx done.\n")

stopPoint = fileName.index('.')
prepRev = fileName[0:stopPoint]

fname = join(dirname(abspath(__file__)), fileName)
xl_workbook = xlrd.open_workbook(fname)
sheet_names = xl_workbook.sheet_names()
xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])

book = xlwt.Workbook(encoding="utf-8")
worksheet = book.add_sheet("Results", cell_overwrite_ok=True)

workbook = xlrd.open_workbook(fileName)
for sheet in workbook.sheets():
    for row in range(sheet.nrows):
        row = int(row)
        if int(row) > 1000:
            subDivide = int(row) / 1000
            while thousands != subDivide + 1:
                thousands = thousands + 1
                counting = 0
                totalName = preName + "_" + str(thousands) + ".xlsx"
                print(totalName)
                excelFile = xlsxwriter.Workbook(str(totalName))
                worksheet = excelFile.add_worksheet()
                with open(totalName, 'rb') as f:
                    col = xl_sheet.col_slice(0, 1, 10101010)
                    for idx, cell_obj in enumerate(col, start=beginGrab):
                        counting = counting + 1
                        if counting == 1000:
                            break
                        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
                        cell_obj_str = str(cell_obj)
                        telePhone = cell_obj_str[7:19]
                        worksheet.write(idx + 1, 0, "1" + telePhone)
                    worksheet.write(0, 0, "Telephone Number")
                beginGrab = thousands * 1000
                endGrab = beginGrab + 1000
            excelFile.close()
            excelFile = None
        else:
            print("Mate, this is Tiny!")
print("Ding! Job Done!")
I've been rubber-ducking this and I can't find where I'm at fault.
EDIT:
SOLVED!
By creating the workbook and then closing it, the program can then pick it up. I will probably open a git issue about this.
if prepRev == ".csv":
    totalName = preName + '.xlsx'
    excelFile = xlsxwriter.Workbook(totalName)
    excelFile.close()
Closing it lets open() see the file, and it still contains the same info.
excelFile = xlsxwriter.Workbook(totalName)
worksheet = excelFile.add_worksheet()
with open(fileName,'rb') as f:
Doesn't the save/close line need to be within the while loop? Otherwise it looks like it will only save either the first/last item:
while(thousands != subDivide + 1):
    # write file
excelFile.close()
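For reference, here is a minimal sketch, not the asker's exact code, of how the splitting loop looks when each part's workbook is closed inside the loop; the input file name and the single telephone-number column are placeholder assumptions:

import xlrd
import xlsxwriter

CHUNK = 1000
book = xlrd.open_workbook("input.xlsx")   # placeholder input file
sheet = book.sheet_by_index(0)

for part, start in enumerate(range(0, sheet.nrows, CHUNK), start=1):
    out = xlsxwriter.Workbook("input_%d.xlsx" % part)
    ws = out.add_worksheet()
    ws.write(0, 0, "Telephone Number")
    # Copy up to CHUNK values from column 0 into this part.
    for offset, row in enumerate(range(start, min(start + CHUNK, sheet.nrows)), start=1):
        ws.write(offset, 0, sheet.cell_value(row, 0))
    out.close()  # closing inside the loop is what makes every part get written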
That line is probably the reason why you cannot read your file back and why your script crashes:
fname = join(dirname(abspath('__file__')), '%s' % fileName)
'__file__' shouldn't have quotes. I'd do:
fname = join(dirname(abspath(__file__)), fileName)
I'm using multiprocessing to try to speed up the processing of about 1000 CSV files of roughly 500 MB each using pandas. I'm applying a simple string regex replacement to one column. The program works, but it seems not to free memory properly, and it eventually comes to eat up 40-80 GB per process, despite none of the files being over 10 GB. Do you have any idea why this could be? I've tried a number of ways to clear memory, to no avail.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle

oldc = ""
newc = ""
NUMPROC = 8
rep = None

cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")

def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep

    names = np.empty([1, 1])
    oldc = oldcol
    newc = newcol

    df = pd.read_csv(filename)
    names = df.as_matrix()
    del df

    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^" + str(row[0]) + r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname

    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")

    files = [f for f in os.listdir('.') if (os.path.isfile(f) and ".csv" in f and not (f == filename or "household" in str(f) or os.path.exists("./standardized/" + f[:-4] + "_stnd.csv")))]
    shuffle(files)

    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/" + f[:-4] + "_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/" + f[:-4] + "_stnd.csv"):
                files.append(f)

    print(len(files))
    bundle = [(idx, f) for idx, f in enumerate(files)]

    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()

def process(bundle):
    global oldc
    global rep
    global newc
    fname = bundle[1]
    idx = bundle[0]
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/" + fname[:-4] + "_stnd.csv")
        del curfile
    except:
        print("error on: " + str(fname))
    finally:
        gc.collect()
        libc.malloc_trim(0)

main("lookup.csv", "namefrst", "stndfrst")
I'm new to MPI and I'm trying to parallelize the following code with mpi4py. It's a code that reads more or less 100 CSV files and rearranges them.
The problem, after the parallelization, is how to join the arrays 'data' and 'mapData' in order.
The original code without parallelization:
import csv
import sys
import numpy as np
import StringIO
import re

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

#def sprintf(buf, fmt, *args):
#    buf.write(fmt % args)

from os import listdir
from os.path import isfile, join
import os, glob

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

filelist = [[]] * len(files_in_dir)
for file in xrange(len(files_in_dir)):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index
# DO NOT!!! use P

data = [[]] * len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

with open('mapdata.csv', 'wb') as mapdatares:
    writer = csv.writer(mapdatares)
    writer.writerows(mapData)
And this is what I have done so far; my problem is when I call comm.allreduce:
import csv
import sys
import numpy as np
import StringIO
import re
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

from os import listdir
from os.path import isfile, join
import os, glob
import math

folder = os.getcwd() + '/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

# count the number of csv files
print "number of files is: ", len(files_in_dir)

# divide the files among the processors
filesrank = len(files_in_dir) / size
intero = math.trunc(filesrank)
fremain = len(files_in_dir) % size
if fremain > rank:
    sizeproc = intero + 1
else:
    sizeproc = intero

# extremes of the interval handled by this rank
a = rank * sizeproc
b = a + sizeproc

# parallelize
filelist = [[]] * len(files_in_dir[a:b])
for file in xrange(len(files_in_dir[a:b])):
    fileName = folder + files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)  # read the csv file, skipping the header
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]] * len(filelist[0])  # len(filelist[0]) = column length
mapData = [[]] * len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

totmapData = []
totData = []
comm.allreduce(mapData, totmapData, op=MPI.SUM)
comm.allreduce(data, totData, op=MPI.SUM)
Any suggestions? Thanks.
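Not a full answer, but one way to combine the per-rank lists while preserving rank order is mpi4py's lowercase, pickle-based allgather, which returns one entry per rank in rank order. A minimal sketch, assuming each rank has built its local data and mapData as above:

# Gather every rank's local lists; the result is ordered by rank.
gathered_data = comm.allgather(data)
gathered_map = comm.allgather(mapData)

# Flatten in rank order so rank 0's rows come first, then rank 1's, and so on.
totData = [row for chunk in gathered_data for row in chunk]
totmapData = [row for chunk in gathered_map for row in chunk]

allreduce with op=MPI.SUM does concatenate Python lists, but a reduction is free to combine contributions in any order, so allgather plus an explicit flatten is the simpler way to guarantee the ordering.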