modin.pandas with Ray doesn't close the file after read_excel - python

I'm trying to use modin with Ray, but I can't move a file after reading it. At the line shutil.move(f"./IMPORT/"+file,f"./IMPORTED/"+file)
the file is still open. Is there a way to close it so that it can be moved to another folder?
Here is the entire code:
import os
from pathlib import Path
import shutil
import ray
import ray.util
ray.init()
import modin.pandas as pd
current_directory = os.getcwd()
import_folder_path = os.path.join(current_directory, 'IMPORT')
folder_path: Path = Path(import_folder_path)
file_list = []
file_list = list(
filter(lambda x: x if x.endswith('.xlsx') else None,
os.listdir(folder_path))
)
df2 = []
if len(file_list):
excl_list=[]
excl_merged = pd.DataFrame()
imported_file_path = os.path.join(current_directory, 'IMPORTED\\')
for file in file_list:
file_path = os.path.join(folder_path,file)
df=pd.read_excel(file_path)
df = df[df['Delivery Status'] != 'Delivered']
df2 = df.append(df)
shutil.move(f"./IMPORT/"+file,f"./IMPORTED/"+file)
output_file_path = os.path.join(folder_path,'output.xlsx')
df2.to_excel(output_file_path, index=False)
else:
print("No excel file found")
Thank you for your help

There is a mention of this problem in https://github.com/pandas-dev/pandas/issues/29803. The suggested workaround is to manage the file handle lifetime yourself:
...
for file in file_list:
    file_path = os.path.join(folder_path, file)
    # Open the workbook ourselves so the handle's lifetime is explicit:
    # leaving the ``with`` block closes it, after which the file can be moved.
    with open(file_path, "rb") as xlfile:
        df = pd.read_excel(xlfile)
pandas can read from an open file handle, and the with block guarantees that the handle is closed as soon as the read finishes, so the file can then be moved.

Related

read in csv files from a folder and create html files

I'm new to Python and hoping for some help reading in CSV files from a folder and converting each file to an HTML file. This is what I have so far:
import pandas as pd
import os
import glob
path = "htmlplots"
csv_files = glob.glob(os.path.join(path, "*.csv"))
for file in csv_files:
# read the csv file
df = pd.read_csv(file)
# print the filename
print('File Name:', file.split("\\")[-1])
# print the content
display(df)
Ideally I then need to create html files from the resulting csv files that have a 'next' and 'previous' link from one to two, two to three (next) and three to two, two to one (previous).
Use:
import pandas as pd
import os
import glob
# Directory scanned for CSV files; "" means the current working directory.
path = ""
csv_files = glob.glob(os.path.join(path, "*.csv"))
last_index = len(csv_files) - 1

for i, file in enumerate(csv_files):
    df = pd.read_csv(file, header=None)
    # Append a 'prev' and a 'next' row holding a link to the neighbouring
    # file; the first file has no predecessor and the last no successor,
    # so those rows are left blank.
    df.loc['prev', :] = f'http://{csv_files[i - 1]}' if i > 0 else ''
    df.loc['next', :] = f'http://{csv_files[i + 1]}' if i != last_index else ''
    # render_links=True turns the URL cells into clickable anchors.
    df.to_html(f"{file}.html", render_links=True)
Input csv file:
Output html:

Renaming multiple csv files within a folder in Python

I have a folder with 50 .csv files. The .csv files are auto-generated output from a process-based model, so their names are long and assigned automatically: for example, sandbox_username_vetch_scaleup_IA_1.csv, sandbox_username_vetch_scaleup_IA_2.csv, and so on up to sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names so that the files are named IA_1, IA_2 ... up to IA_50, and subsequently add the new .csv file name as a column to the data frame. Here is what I have tried so far:
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re
data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd()
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
#Loop over
for f in range(len(fnames)):
print(f'fname: {fnames[f]}\n')
pfile = pd.read_csv(fnames[f], delimiter=",") # read in file
#extract filename
filename = fnames[f]
parts = filename.split(".") # giving you the number in file name and .csv
only_id = parts[0].split("_") # if there is a bracket included
# get IA from your file
filestate = pfile["IA"][0] # assuming this is on the first row
filestate = str(filestate)
# get new filename
newfilename = only_id[0]+"-"+filestate+parts[1]
# save your file (don't put a slash at the end of your directories on top)
pfile.to_csv(output_p+"/"+newfilename, index = False, header = True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd
path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=';', parse_dates=[0], infer_datetime_format=True,header=None )
file_df['file_name'] = file_
df = df.append(file_df)
#However, this adds the old csv file name and not the renamed one
In order to copy these files to a new location under their shortened names, all you need is:
import glob
import os
import shutil
import sys
SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'

for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    # Search the base name only: a directory component that happens to
    # contain '_IA_' would otherwise be matched first and leak path
    # separators into the target file name.
    basename = os.path.basename(file)
    # Keep everything from 'IA_' onwards, e.g. '..._scaleup_IA_1.csv' -> 'IA_1.csv'.
    filename = basename[basename.index('_IA_') + 1:]
    target = os.path.join(TARGET, filename)
    if os.path.exists(target):
        # Never overwrite an existing file; report it and carry on.
        print(f'Target file {target} already exists', file=sys.stderr)
    else:
        shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different otherwise this can lead to ambiguous results

XLSX Conversion to CSV file whenever a new file is added into the folder

I try to convert a .xlsx file into .csv file whenever a new file is added into the Inputfolder and put the conversion .csv file in the OutputFolder.
import glob
import time
import os
import pandas as pd
#Get timestamp
timestr = time.strftime("%Y%m%d_%H%M%S")
#Input file path
input_filepath = 'C:/Documents/InputFile'
folderSize = 0
#Function to convert file
def format_csv(latest_file):
#Output file path
filenamepath = 'C:/Documents/OutputFile/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
while True:
checkFolder = folderSize
folderSize = 0
#Check the size of the Input Folder
for path, dirs, files in os.walk(input_filepath):
for f in files:
fp = os.path.join(path, f)
folderSize += os.path.getsize(fp)
print(folderSize)
#Create new .csv file if the Input folder has new file added
if(folderSize > checkFolder):
list_of_files = glob.glob('C:/Documents/InputFile/*.xlsx')
latest_file = max(list_of_files, key=os.path.getctime)
format_csv(latest_file)
print(latest_file)
time.sleep(15)
Right now the program only converts the first .xlsx file. If I add a new .xlsx file to the input folder, it is not converted.
You could try scanning the folder for all .xlsx files and converting each one you find to .csv.
Here we read through the directory for all .xlsx files and write a .csv copy of each one that has not been converted yet. (The code below does not delete the original .xlsx files; remove them afterwards with os.remove if that is what you want.)
import pandas as pd
import os
path = 'C:/Documents/InputFile'
files = os.listdir(path)

for file in files:
    # Match on the suffix: a substring test would also accept names such as
    # 'report.xlsx.bak', for which stripping five characters is wrong.
    if not file.endswith('.xlsx'):
        continue
    filename = file[:-5]
    # Skip workbooks that already have a CSV counterpart in the folder.
    if filename + ".csv" in files:
        continue
    new_filename = path + "/" + filename + ".csv"
    # os.listdir returns bare names, so join with the folder before reading;
    # otherwise the workbook is looked up in the current working directory.
    df = pd.read_excel(os.path.join(path, file))
    df.to_csv(new_filename)
I have since improved my original code. Now, whenever I put a new Excel file into InputFolder, the program converts it to .csv format and writes the converted file to OutputFolder.
import glob
import time
import os
import pandas as pd
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
#Function if new file is created in the folder
def on_created(event):
    """Convert a newly created ``.xlsx`` file to CSV.

    The path is taken from the event itself instead of guessing the newest
    workbook in the folder, so unrelated creations (directories, other file
    types) no longer re-convert an old workbook, and an input folder without
    any workbook no longer raises from ``max()`` on an empty list.

    Args:
        event: The filesystem event passed by the observer; only its
            ``is_directory`` and ``src_path`` attributes are read.
    """
    if event.is_directory:
        return
    created_path = os.fsdecode(event.src_path)
    if not created_path.lower().endswith('.xlsx'):
        return
    format_csv(created_path)
#Function to convert .xlsx to .csv
def format_csv(latest_file):
# Get timestamp
timestr = time.strftime("%d%m%Y_%H%M%S")
#Output file path
filenamepath = 'C:/Users/Documents/OutputFolder/' + timestr + '.csv'
read_Excelfile = pd.read_excel(latest_file)
read_Excelfile.to_csv(filenamepath, index=None, header=True)
print(filenamepath)
if __name__ == "__main__":
event_handler = FileSystemEventHandler()
#Calling function for file insertion
event_handler.on_created = on_created
#Input Folder
path = 'C:/Users/Documents/InputFolder'
#Function to observe file
observer = Observer()
observer.schedule(event_handler, path, recursive=True)
observer.start()
try:
#Check every one second
while True:
time.sleep(1)
except KeyboardInterrupt:
#Program stop if keyboard interupt
observer.stop()
observer.join()

Python Pandas: Doing loop for every file in directory

So I have code which works for one particular file in directory.
I want to make a loop which will do the following but for every .csv file in directory
1) Open file 2) Add one column 3) Save file to new location
My code
import pandas as pd
import os
import glob
plik = pd.read_csv('C:\Python\zrodlo\CSCO.csv', delimiter=";")
plik['Change'] = ((plik['Close'] - plik['Open'])/plik['Open']*100)
plik.to_csv('C:\Python\zrodlo\csv_nowe_pliki\ew_file.csv')
Lines which might come in handy for the loop. I did not know how to make good use of them
os.chdir('C:\Python\zrodlo')
print(os.getcwd())
for filename in os.listdir('C:\Python\zrodlo'):
if filename.endswith(".csv"):
print(filename)
2)
path = "C:\Python\zrodlo\*.csv"
for fname in glob.glob(path):
print(fname)```
Thank you for your input
EDIT, Question is what to put into last line for loop to save multiple files?
import pandas as pd
import os
import glob
path = "C:\Python\zrodlo\*.csv"
for fname in glob.glob(path):
print(fname)
plik = pd.read_csv(fname, delimiter=";")
plik['Change'] = ((plik['Close'] - plik['Open']) / plik['Open'] * 100)
plik.to_csv('C:\Python\zrodlo\csv_nowe_pliki\ew_file.csv')
EDIT, SOLUTION
import pandas as pd
import os
import glob
os.chdir('C:\Python\zrodlo')
print(os.getcwd())
for filename in os.listdir('C:\Python\zrodlo'):
if filename.endswith(".csv"):
print(filename)
plik = pd.read_csv('C:\Python\zrodlo\\'+filename, delimiter=";")
plik['Change'] = ((plik['Close'] - plik['Open'])/plik['Open']*100)
os.chdir('C:\Python\zrodlo\csv_nowe_pliki')
plik.to_csv(filename)
os.chdir('C:\Python\zrodlo')
Is this what you want?
import pandas as pd
import os
import glob
# Raw strings keep the Windows backslashes literal; a normal string ending
# in a backslash would escape its own closing quote and fail to parse.
SOURCE_DIR = r'C:\Python\zrodlo'
OUTPUT_DIR = os.path.join(SOURCE_DIR, 'csv_nowe_pliki')

os.chdir(SOURCE_DIR)
print(os.getcwd())
for filename in os.listdir(SOURCE_DIR):
    if filename.endswith(".csv"):
        print(filename)
        plik = pd.read_csv(os.path.join(SOURCE_DIR, filename), delimiter=";")
        # Percentage change between the opening and the closing price.
        plik['Change'] = ((plik['Close'] - plik['Open'])/plik['Open']*100)
        # Reuse the input file name in the output folder so that each file
        # gets its own result instead of overwriting a single shared one.
        plik.to_csv(os.path.join(OUTPUT_DIR, filename))

Removing python path hardcode and passing list

**Update 1/8/2019 0945 EST
I have passed the script through the function given by bhakta0007 but received a path error "The system cannot find the path specified:".
After review, I added the below statement to the end of the script to pass the list through the function and the code works.
for f in fList:
excel_csv(fList)
I have added an answer to the question below.
I have a small script that I run to convert excel files to .csv. Currently , I have to repeat the script with the paths hardcoded in. The current paths have the exact same structure with the exceptions of a 3 digit identifier which I would like to create a list that I can call from. Below is my code. You will see I have variables that have the paths and I pass these variables where needed.I have looked into os.path, glob, and pathlib, but I can't find a good solution for the problem.
Original Code
import os
import glob
import pandas as pd
import shutil
Target_Path = os.path.join(os.path.dirname('//fs/Unprocessed/261/Edlog/Working/'))
Move_Path = os.path.join(os.path.dirname('//fs/Unprocessed/261/Edlog/ToProcess/'))
Process_Path = os.path.join(os.path.dirname('//fs/Unprocessed/261/Edlog/Processed/'))
os.chdir(Target_Path)
try:
for f in glob.glob('*.xls'):
out = f.split('.')[0]+'.csv'
df = pd.read_excel(f,)
df.to_csv(out, index=False)
finally:
for f in glob.glob('*.xlsx'):
out = f.split('.')[0]+'.csv'
df = pd.read_excel(f,)
df.to_csv(out, index=False)
xlsCounter = len(glob.glob1(Target_Path,"*.xls"))
xlsxCounter = len(glob.glob1(Target_Path,"*.xlsx"))
csvcounter = len(glob.glob1(Target_Path,"*.csv"))
if csvcounter == xlsCounter + xlsxCounter :
print('Complete Convert')
else:
print('Failed Convert')
for files in glob.glob('*.csv'):
shutil.move(files, Move_Path)
for files in glob.glob('*.xls'):
shutil.move(files, Process_Path)
for files in glob.glob('*.xlsx'):
shutil.move(files, Process_Path)
if len(os.listdir(Target_Path) ) == 0:
print('Complete Move')
else:
print('Failed Move')
I have used the function created from Bhakta0007, but received "The system cannot find the path specified:" error.
-Revisions added-
I added in a "For" clause at the end of the script and passed the list through the function and was able to run the script successfully in all directories.
I also used an fstring for the "Facility" instead of .format(facility)
Below is the working Code
import os
import glob
import pandas as pd
import shutil
def excel_csv(facility):
    """Convert the Excel workbooks of one or more facilities to CSV.

    For each facility id the working directory is changed to
    ``//fs/Unprocessed/<id>/Edlog/Working``. A ``.csv`` is written next to
    every ``.xls`` / ``.xlsx`` file found there, then the CSV files are moved
    to ``ToProcess`` and the workbooks to ``Processed``. The outcome of the
    conversion and of the move is reported with ``print``.

    Args:
        facility: A single facility id (``str`` or ``int``) or an iterable
            of facility ids.

    Raises:
        OSError: If a facility's ``Working`` folder cannot be entered or a
            file cannot be moved.
    """
    # Wrap a bare id so that a string is not iterated character by character.
    if isinstance(facility, (str, int)):
        facility_ids = [facility]
    else:
        facility_ids = list(facility)

    def convert(pattern):
        """Write a CSV next to every file in the cwd matching *pattern*."""
        for workbook in glob.glob(pattern):
            # splitext keeps dots inside the stem ('a.b.xls' -> 'a.b.csv').
            out = os.path.splitext(workbook)[0] + '.csv'
            pd.read_excel(workbook).to_csv(out, index=False)

    def move_all(pattern, destination):
        """Move every file in the cwd matching *pattern* to *destination*."""
        for name in glob.glob(pattern):
            shutil.move(name, destination)

    for facility_id in facility_ids:
        root = f'//fs/Unprocessed/{facility_id}/Edlog'
        target_path = f'{root}/Working'
        move_path = f'{root}/ToProcess'
        process_path = f'{root}/Processed'
        os.chdir(target_path)

        # The .xlsx pass runs even when the .xls pass raises; the error
        # still propagates once the .xlsx files have been attempted.
        try:
            convert('*.xls')
        finally:
            convert('*.xlsx')

        # The cwd is target_path, so relative patterns count its contents.
        workbook_count = len(glob.glob('*.xls')) + len(glob.glob('*.xlsx'))
        csv_count = len(glob.glob('*.csv'))
        if csv_count == workbook_count:
            print('Complete Convert')
        else:
            print('Failed Convert')

        move_all('*.csv', move_path)
        move_all('*.xls', process_path)
        move_all('*.xlsx', process_path)

        if not os.listdir(target_path):
            print('Complete Move')
        else:
            print('Failed Move')
fList = ['261', '262', '278', '300']
# Call once per facility id rather than passing the whole list on every
# iteration.
for f in fList:
    excel_csv(f)
import os
import glob
import pandas as pd
import shutil
def process(folders):
    """Run the conversion steps once for every facility folder.

    Args:
        folders: Iterable of facility identifiers (for example ``261``);
            each one is substituted into the
            ``//fs/Unprocessed/<id>/Edlog/...`` paths.

    Raises:
        OSError: If a facility's ``Working`` folder cannot be entered.
    """
    # The loop variable must be the name used in the format calls below;
    # iterating as ``f`` while formatting ``folder`` raises NameError.
    for folder in folders:
        Target_Path = os.path.join(os.path.dirname('//fs/Unprocessed/{}/Edlog/Working/').format(folder))
        Move_Path = os.path.join(os.path.dirname('//fs/Unprocessed/{}/Edlog/ToProcess/').format(folder))
        Process_Path = os.path.join(os.path.dirname('//fs/Unprocessed/{}/Edlog/Processed/').format(folder))
        os.chdir(Target_Path)
        # <Rest of our code>
fList = [261, 262, 278, 300]
process(fList)

Categories

Resources