File names on CSV with bad line errors - python

I'm using the code below to capture bad-line errors when reading CSV files through pandas, but I'm having trouble getting the filename included. I tried appending to a list during the loop, but that resulted in every file showing an error instead of just the files that actually had errors.
How can I get the filename included?
import os
import glob
import sys
from io import StringIO
import pandas as pd
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mydict = {}
    mylist = []
    old_stderr = sys.stderr
    result = StringIO()
    sys.stderr = result
    x = ''
    for f in glob.glob(zipPath):
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False)
        result_string = result.getvalue()
        f_name = os.path.basename(f)
        if len(result_string) > 1:
            with open('bad_lines.txt', 'w') as bad_lines:
                for line in result_string.split(r'\n'):
                    if len(line) > 5:
                        bad_lines.write(line.replace('\n', '').replace('b', '').replace("'", ''))
                        bad_lines.write('\n')
    sys.stderr = old_stderr

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))

I was able to get the following code working:
import os
import sys
import glob
import pandas as pd
from io import StringIO
from pathlib import Path

UnzipFilePoint = Path(str(os.getcwd()) + '/Unzipped/')

def FindBadLines(zipPath):
    mylist = []
    for f in glob.glob(zipPath):
        f_name = os.path.basename(f)
        old_stderr = sys.stderr
        result = StringIO()
        sys.stderr = result
        df = pd.read_csv(f, dtype=str, encoding="ISO-8859-1", error_bad_lines=False, warn_bad_lines=True)
        result_string = result.getvalue()
        sys.stderr = old_stderr
        if len(result_string) > 5:
            mylist.append([result_string, f_name])
    mynewlist = []
    for i in mylist:
        i[0] = i[0].replace('b', '').replace("'", '')
        for x in i[0].replace('\n', '').split('\\n'):
            if len(x) > 1:
                mynewlist.append([x, i[1]])
    df = pd.DataFrame(mynewlist, columns=['Error', 'File'])
    print(df)

zipPath = UnzipFilePoint / "*"
FindBadLines(str(zipPath))
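Note that error_bad_lines / warn_bad_lines are deprecated in newer pandas releases and removed in 2.0 in favour of on_bad_lines. As a minimal sketch (assuming pandas 1.4+ and the python engine, where on_bad_lines accepts a callable), the bad rows can be captured per file without redirecting stderr at all:

import glob
import os
import pandas as pd

bad_rows = []  # collects [raw_fields, file_name] pairs

def FindBadLines(zipPath):
    for f in glob.glob(zipPath):
        f_name = os.path.basename(f)

        def log_bad_line(fields, f_name=f_name):
            # Called by read_csv for each malformed row; returning None skips the row.
            bad_rows.append([fields, f_name])
            return None

        pd.read_csv(f, dtype=str, encoding="ISO-8859-1",
                    engine="python", on_bad_lines=log_bad_line)
    return pd.DataFrame(bad_rows, columns=['Error', 'File'])

This avoids parsing the filename back out of the warning text entirely, since the callable is bound to the file currently being read.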

Related

import modin.pandas and ray() don't close file

I'm trying to use modin and ray(), but I can't move a file after reading it. At the line shutil.move(f"./IMPORT/"+file,f"./IMPORTED/"+file) the file is still open. Is there a way to close it so it can be moved to another folder?
Here is the entire code:
import os
from pathlib import Path
import shutil
import ray
import ray.util
ray.init()
import modin.pandas as pd

current_directory = os.getcwd()
import_folder_path = os.path.join(current_directory, 'IMPORT')
folder_path: Path = Path(import_folder_path)

file_list = []
file_list = list(
    filter(lambda x: x if x.endswith('.xlsx') else None,
           os.listdir(folder_path))
)

df2 = []
if len(file_list):
    excl_list = []
    excl_merged = pd.DataFrame()
    imported_file_path = os.path.join(current_directory, 'IMPORTED\\')
    for file in file_list:
        file_path = os.path.join(folder_path, file)
        df = pd.read_excel(file_path)
        df = df[df['Delivery Status'] != 'Delivered']
        df2 = df.append(df)
        shutil.move(f"./IMPORT/"+file, f"./IMPORTED/"+file)
    output_file_path = os.path.join(folder_path, 'output.xlsx')
    df2.to_excel(output_file_path, index=False)
else:
    print("No excel file found")
Thank you for your help
There is a mention of this problem in https://github.com/pandas-dev/pandas/issues/29803. The suggested workaround is to manage the file handle lifetime yourself:
...
for file in file_list:
    file_path = os.path.join(folder_path, file)
    with open(file_path, "rb") as xlfile:
        df = pd.read_excel(xlfile)
Pandas can read from a file handle, and this way the with ensures the handle is closed.
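Applied to the loop above, a minimal sketch (an assumption on my part, reusing the file_list, folder_path and output_file_path defined earlier, not code from the linked issue) might look like this; it also replaces the df2 = df.append(df) accumulation, which appends the frame to itself, with a list plus pd.concat:

import os
import shutil
import modin.pandas as pd

frames = []
for file in file_list:
    file_path = os.path.join(folder_path, file)
    # Opening the workbook ourselves guarantees the handle is closed
    # before the move, even if read_excel would otherwise keep it open.
    with open(file_path, "rb") as xlfile:
        df = pd.read_excel(xlfile)
    frames.append(df[df['Delivery Status'] != 'Delivered'])
    shutil.move(os.path.join("IMPORT", file), os.path.join("IMPORTED", file))

if frames:
    pd.concat(frames, ignore_index=True).to_excel(output_file_path, index=False)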

How do I retrieve the datemodtime of each file based on filename in python?

My current process involves looping through my source directory and adding the name of each file to my DataFrame in Python. I want to get the modified date for each of these files as well.
import datetime
import os
import pandas as pd

# set src directory
os.chdir('C:/Users/jj/Desktop/do/Claims/globmove')

def read_files(filenames):
    result = []
    for filename in filenames:
        file = read_sheets(filename)
        file['Filename'] = filename
        result.append(file)
    return pd.concat(result, ignore_index=True)

def modification_date(filename):
    t = os.path.getmtime(filename)
    return datetime.datetime.fromtimestamp(t)

folder_path = os.path.abspath('C:/Users/jj/Desktop/do/Claims/globmove')
files = [file for file in os.listdir(folder_path) if file.endswith(".xlsx")]
dfooc = read_files(files)
I am able to run this without errors, but the modified-date timestamp is not added to the final dataframe, dfooc. How can I get it added?
Edit: I'm getting an indentation error after changing the order of my original code above:
def read_files(filenames):
    result = []
    for filename in filenames:
        file = read_sheets(filename)
        file['Filename'] = filename
        def modification_date(filename):
            t = os.path.getmtime(filename)
            return datetime.datetime.fromtimestamp(t)
        file['ModificationDate'] = filename
        result.append(file)
    return pd.concat(result, ignore_index=True)
        return pd.concat(result, ignore_index=True)
        ^
IndentationError: unexpected indent
Here's how I do it.
import os
from pathlib import Path
import pandas as pd
import pendulum

class FileDates:
    def __init__(self, **kwargs):
        self.file_type = kwargs.get("file_type")
        self.file_path = kwargs.get("file_path")
        self.path = kwargs.get("path")
        self.tz = pendulum.now().timezone.name

    def main(self) -> pd.DataFrame:
        files = self.get_files()
        dates = self.get_dates(files)
        return pd.DataFrame(list(zip([str(Path(x)).split("/")[-1] for x in files], dates)), columns=["file", "date"])

    def get_files(self) -> list:
        files = [str(x) for x in self.file_path.rglob("*") if x.is_file()]
        return [x for x in files if self.file_type in x]

    def get_dates(self, files: list) -> list:
        return [pendulum.from_timestamp(os.path.getmtime(Path(x))).in_tz(self.tz).to_date_string() for x in files]

file_type = ".xlsx"
file_path = Path(f"{Path.home()}/Desktop/do/Claims/globmove/")
data = FileDates(file_type=file_type, file_path=file_path).main()
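Alternatively, a minimal sketch of the original read_files (assuming read_sheets returns a DataFrame, as in the question) could attach the modification time directly inside the loop:

import datetime
import os
import pandas as pd

def modification_date(filename):
    # Convert the file's mtime (seconds since the epoch) to a datetime.
    return datetime.datetime.fromtimestamp(os.path.getmtime(filename))

def read_files(filenames):
    result = []
    for filename in filenames:
        file = read_sheets(filename)  # assumed helper returning a DataFrame
        file['Filename'] = filename
        file['ModificationDate'] = modification_date(filename)
        result.append(file)
    return pd.concat(result, ignore_index=True)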

how to convert folder of pickle files into single csv file

I have a directory containing about 1700 pickle files; every file holds all the Twitter posts of one user. I want to convert it into a folder of CSV files, where every CSV file name is the name of the pickle file and each row contains one tweet of the user.
After that, I want to keep just the top 20 CSVs with more samples than the others. How can I do that?
# khabarlist = open_file_linebyline(pkl_path)

def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        # if len(file.name.split()) > 1:
        #     continue
        # if file.split('.')[-1] != "pkl":
        with open(file, 'r', encoding='utf8') as f:
            items = [i.strip() for i in f.read().split(",")]
            my_dict[file.replace(".pkl", "")] = items
        df = pd.DataFrame(my_dict)
        df.to_excel(file.replace(".pkl", "") + "xlsx")

open_dir_in_dict("Raw/")
I wrote the sample code for it and it did not work:
def open_dir_in_dict(input_path):
    files = os.scandir(input_path)
    my_dict = {}
    for file in files:
        if len(file.name.split()) > 1:
            continue
        if file.split('.')[-1] != "pkl":
            with open(file, 'r', encoding='utf-8', errors='replace') as f:
                print(f.readlines())
                items = [i.strip() for i in f.read().split(",")]  # encode('utf-8').strip()
                my_dict[file.replace(".pkl", "")] = items
            df = pd.DataFrame(my_dict)
            df.to_excel(file.replace(".pkl", "") + "xlsx")

# open_dir_in_dict("Raw/")
And here is a better answer:
import os
import pandas as pd
import regex as re

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"
for path in os.listdir(data_path):
    my_tweets = []
    df = pd.read_pickle(data_path + path)
    for tweet in df.tweet:
        url = re.findall(r"http\S+", tweet)
        if url == []:
            my_tweets.append(tweet)
    new_df = pd.DataFrame({"tweets": my_tweets, "author": path.replace(".pkl", "")})  # path[:-4]
    new_df.to_csv("/content/drive/My Drive/twint/final.csv", index=False, mode="a")
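The question also asks to keep only the top 20 authors with the most samples. As a minimal sketch of that selection step (reusing the same data_path, and counting raw rows per pickle rather than the URL-filtered tweets), it could look like this:

import os
import pandas as pd

data_path = "/content/drive/My Drive/twint/Data/pkl/Data/"

# Count rows in each pickled frame and keep the 20 authors with the most.
counts = {path: len(pd.read_pickle(data_path + path)) for path in os.listdir(data_path)}
top20 = sorted(counts, key=counts.get, reverse=True)[:20]
print([p.replace(".pkl", "") for p in top20])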

Multiprocessing Pandas eats memory

I'm using multiprocessing to try to speed up the processing of about 1000 CSV files of ~500 MB each using pandas. I'm trying to apply a simple string regex to one column. The program works, but it does not seem to free memory properly, and it eventually eats up 40-80 GB per process, despite none of the files being over 10 GB. Do you have any idea why this could be? I've tried a number of ways to clear memory, to no avail.
import pandas as pd
import numpy as np
import os
import multiprocessing
import gc
from ctypes import cdll, CDLL
from random import shuffle

oldc = ""
newc = ""
NUMPROC = 8
rep = None

cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")

def main(filename, oldcol, newcol):
    global oldc
    global newc
    global rep

    names = np.empty([1,1])
    oldc = oldcol
    newc = newcol

    df = pd.read_csv(filename)
    names = df.as_matrix()
    del df

    rep = {}
    rep[newc] = {}
    for row in names[1:]:
        oldname = r"^"+str(row[0])+r"( .*|$)"
        newname = str(row[1]) + r"\1"
        rep[newc][oldname] = newname

    if not os.path.exists("./standardized/"):
        print("Making dir!")
        os.makedirs("./standardized/")

    files = [f for f in os.listdir('.') if (os.path.isfile(f) and ".csv" in f and not (f == filename or "household" in str(f) or os.path.exists("./standardized/"+f[:-4]+"_stnd.csv")))]
    shuffle(files)

    allfiles = [f for f in os.listdir('.') if ".csv" in f]
    for f in allfiles:
        if os.path.exists("./standardized/"+f[:-4]+"_stnd.csv"):
            if os.path.getsize(f) > os.path.getsize("./standardized/"+f[:-4]+"_stnd.csv"):
                files.append(f)

    print(len(files))

    bundle = [(idx, f) for idx, f in enumerate(files)]
    pool = multiprocessing.Pool(processes=NUMPROC, maxtasksperchild=1)
    r = pool.map_async(process, bundle)
    pool.close()
    pool.join()

def process(bundle):
    global oldc
    global rep
    global newc
    fname = bundle[1]
    idx = bundle[0]
    try:
        print(idx)
        libc.malloc_trim(0)
        curfile = pd.read_csv(fname, dtype="str")
        curfile[newc] = curfile[oldc].str.lower()
        curfile.replace(to_replace=rep, regex=True, inplace=True)
        curfile.to_csv("./standardized/"+fname[:-4]+"_stnd.csv")
        del curfile
    except:
        print("error on: " + str(fname))
    finally:
        gc.collect()
        libc.malloc_trim(0)

main("lookup.csv","namefrst","stndfrst")

How to join arrays with mpi4py?

I'm new to MPI and I'm trying to parallelize the following code with mpi4py. The code reads roughly 100 CSV files and rearranges them.
The problem comes after the parallelization: how do I join the arrays 'data' and 'mapData' in order?
The original code, without parallelization:
import csv
import sys
import numpy as np
import StringIO
import re

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

#def sprintf(buf, fmt, *args):
#    buf.write(fmt % args)

from os import listdir
from os.path import isfile, join
import os, glob

folder = os.getcwd()+'/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

filelist = [[]] * len(files_in_dir)
for file in xrange(len(files_in_dir)):
    fileName = folder+files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index
# DO NOT!!! use P

data = [[]]*len(filelist[0])      # len(filelist[0]) = column length
mapData = [[]]*len(filelist[0])
for i in xrange(len(data)):
    data[i] = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData[i] = filelist[0][i][4:7]

with open('mapdata.csv', 'wb') as mapdatares:
    writer = csv.writer(mapdatares)
    writer.writerows(mapData)
And this is what I have come up with; my problem is when I call 'comm.allreduce':
import csv
import sys
import numpy as np
import StringIO
import re
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

from os import listdir
from os.path import isfile, join
import os, glob
import math

folder = os.getcwd()+'/'
files_in_dir = []
for i in sorted(glob.glob('*.csv'), key=numericalSort):
    print "Current File Being Processed is: " + i
    files_in_dir.append(i)

# count number of csv files
print "number of files is: ", len(files_in_dir)

# divide files among processors
filesrank = len(files_in_dir)/size
intero = math.trunc(filesrank)
fremain = len(files_in_dir) % size
if fremain > rank:
    sizeproc = intero + 1
else:
    sizeproc = intero

# extremes of the interval
a = rank * sizeproc
b = a + sizeproc

# parallelize
filelist = [[]] * len(files_in_dir[a:b])
for file in xrange(len(files_in_dir[a:b])):
    fileName = folder+files_in_dir[file]
    with open(fileName, 'rt') as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)  # rewind
        incsv = csv.reader(f)
        if has_header:
            next(incsv)
        reader = csv.reader(f, quoting=csv.QUOTE_NONNUMERIC)  # read csv file, header skipped above
        for row in reader:
            #next(reader)
            filelist[file].append(row)  # save each row
    f.close()

filelist = [sorted(filelist[ijk], key=lambda x: x[7]) for ijk in xrange(len(filelist))]  # order rows by numerical index

data = [[]]*len(filelist[0])      # len(filelist[0]) = column length
mapData = [[]]*len(filelist[0])
for i in xrange(len(data)):
    data = [filelist[k][i][0:4] for k in xrange(len(filelist))]
    mapData = filelist[0][i][4:7]

totmapData = []
totData = []
comm.allreduce(mapData, totmapData, op=MPI.SUM)
comm.allreduce(data, totData, op=MPI.SUM)
Any suggestions? Thanks.
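As a hedged sketch of one way to join the pieces in rank order (assuming each rank builds data and mapData as lists of rows, as in the serial version), the pickle-based comm.gather collects every rank's list at the root already ordered by rank, so flattening the gathered pieces preserves the original file order:

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Gather the per-rank Python lists at rank 0; the result is a list indexed by rank.
gathered_data = comm.gather(data, root=0)
gathered_map = comm.gather(mapData, root=0)

if rank == 0:
    # Flatten in rank order so rows keep the original file ordering.
    totData = [row for part in gathered_data for row in part]
    totmapData = [row for part in gathered_map for row in part]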
