Invalid extension for engine problem, iterating through directories and files - python

I have code which works properly if I manually insert strings for the path, directory and file names. Here is the code:
path = r"test//ab3b//ab3b_all_anal.xlsx"
directory = "test"
file1 = "test//ab3b//ab3b80.csv"
df1 = all_calc_80(file1, directory)
file2 = "test//ab3b//ab3b80m.csv"
df2 = all_calc_80m(file2, directory)
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')
df1.to_excel(writer, sheet_name = '80')
df2.to_excel(writer, sheet_name = '80m')
writer.close()
The test directory has subdirectories named ab3b, bg3a, ge3b, etc., and each subdirectory contains files named the same way: ab3b80.csv, ab3b80m.csv; bg3a80.csv, bg3a80m.csv; ge3b80.csv, ge3b80m.csv.
Each file is analysed with a different function depending on whether it ends in 80.csv or 80m.csv.
The final output is one Excel workbook with sheets named after the endings of the csv files.
Now I am working on iterating through the whole test directory, where I just give the name of the directory and everything proceeds automatically from there. So far I have:
import os
import pandas as pd
import xlsxwriter

rootdir = 'test'
slovar = {}
for subdir, dirs, files in os.walk(rootdir):
    slovar[subdir] = files
for key, value in slovar.items():
    if len(key) > 4: # to get just the subdirectories I need
        end = key[-4:]
        path = 'r' + '\'' + key + '\\\\' + end + '_all_anal.xlsx' + '\''
        print(path)
        for vrednost in value:
            if vrednost.endswith('80.csv'):
                file1 = vrednost
                df1 = all_calc_80(file1, rootdir)
            elif vrednost.endswith('80m.csv'):
                file2 = vrednost
                df2 = all_calc_80m(file2, rootdir)
        writer = pd.ExcelWriter(path, engine='xlsxwriter')
        df1.to_excel(writer, sheet_name='80')
        df2.to_excel(writer, sheet_name='80m')
        writer.close()
But I got this error message: Invalid extension for engine '<property object at 0x000002123659D0E0>': 'xlsx''.
I think there might be some problems due to / and \ in Windows paths, or the types of the objects, even though when I print out just the keys and values I get useful output, and the path name is printed properly.
But I don't really understand why everything works manually and not automated.

If someone is still searching for this answer, I found a solution.
The main discovery was how to append the path and file name to the list.
With os.walk, it is done with os.path.join(dirpath, filename).
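In hindsight, the error message also points at the real cause: the path was built as 'r' + '\'' + key + ..., so the r and the quote characters became part of the string itself, and the file name no longer ended in .xlsx (note the stray quote in 'xlsx'' in the error). A quick sketch of the difference; the subdirectory name ab3b is just an example:
import os

key = os.path.join('test', 'ab3b')  # what os.walk yields for one subdirectory
end = key[-4:]                      # 'ab3b'

# what the failing loop built: r and the quotes are literal characters here
path_bad = 'r' + '\'' + key + '\\\\' + end + '_all_anal.xlsx' + '\''
print(os.path.splitext(path_bad)[1])  # ".xlsx'" -- not a valid extension

# r'...' is only source-code syntax; at runtime, just join the pieces
path_ok = os.path.join(key, end + '_all_anal.xlsx')
print(os.path.splitext(path_ok)[1])   # '.xlsx'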
Here is the working code:
import os
import pandas as pd

seznam80 = []
seznam80m = []
seznam120 = []
seznam120m = []
seznam150 = []
seznam150m = []
seznamSMT = []
dirp = []
for dirpath, dirnames, filenames in os.walk(directory): # directory with all folders of participants
    for filename in [f for f in filenames if f.endswith("80.csv")]: # search for all 80 files
        seznam80.append(os.path.join(dirpath, filename))
        dirp.append(dirpath)
for dirpath, dirnames, filenames in os.walk(directory): # directory with all folders of participants
    for filename in [f for f in filenames if f.endswith("80m.csv")]: # search for all 80m files
        seznam80m.append(os.path.join(dirpath, filename))
for vsak80, vsak80m, pot in zip(seznam80, seznam80m, dirp):
    path = pot + '_all_anal.xlsx'
    file1 = vsak80
    df1 = all_calc_80(file1, directory)
    file2 = vsak80m
    df2 = all_calc_80m(file2, directory)
    writer = pd.ExcelWriter(path, engine='xlsxwriter')
    df1.to_excel(writer, sheet_name='80')
    df2.to_excel(writer, sheet_name='80m')
    writer.close()
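As a side note, the two walks can be folded into a single pass that pairs the files per subdirectory, instead of relying on the two lists lining up by index. A sketch under the same assumptions (each subdirectory holds exactly one 80.csv and one 80m.csv, and the same all_calc_80/all_calc_80m functions are available):
import os
import pandas as pd

for dirpath, dirnames, filenames in os.walk(directory):
    file80 = next((f for f in filenames if f.endswith('80.csv')), None)
    file80m = next((f for f in filenames if f.endswith('80m.csv')), None)
    if file80 is None or file80m is None:
        continue  # skip directories without a complete pair
    df1 = all_calc_80(os.path.join(dirpath, file80), directory)
    df2 = all_calc_80m(os.path.join(dirpath, file80m), directory)
    # one workbook per subdirectory, named the same way as above
    with pd.ExcelWriter(dirpath + '_all_anal.xlsx', engine='xlsxwriter') as writer:
        df1.to_excel(writer, sheet_name='80')
        df2.to_excel(writer, sheet_name='80m')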

Related

Combining multiple csv files from multiple subfolders in one folder

I'm trying to combine multiple files located in a directory. Each of the files is located 3 subfolders down from the main folder (each subfolder has another folder or file), and I am unable to combine all of them. The best I can do is combine the ones in each bottommost subfolder. I can get a list of every specific file I want to combine from scanning, but I can't combine them. I've gone through several methods and tutorials and can't find a way to do this. The code I have is below:
import pandas as pd
import os
import glob
os.getcwd()
path_of_the_directory = 'C:\\Users\\user\\Downloads\\top_folder'
ext = ('.csv')
for files in os.listdir(path_of_the_directory):
    if files.endswith(ext):
        print(files)
    else:
        continue
def list_files(dir):
    r = []
    for root, dirs, files in os.walk(dir):
        for name in files:
            filepath = root + os.sep + name
            if filepath.endswith(".csv"):
                r.append(os.path.join(root, name))
    return r
print(r)
files = []
for file in r:
    #for dir, dir_name, file_list in os.walk(path):
    files.append(os.path.join(path, file))
combined_df = pd.concat([pd.read_csv(file) for file in files])
df = pd.concat([pd.read_csv(f) for f in files])
df.to_csv("merged.csv")
print(files)
list_files(data_dir)
data_dir = r'C:\\Users\\user\\Downloads\top_folder'
sub_folders = os.listdir(data_dir)
sub_folders
path = os.path.join(data_dir, sub_folders[2])
os.chdir(path)
files = glob.glob(path + ".\*\*.csv")
files
df = pd.concat([pd.read_csv(f) for f in chat_files])
df.to_csv("merged.csv")
Any help or direction would be extremely appreciated.
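For reference, a minimal sketch of one way to do this, letting os.walk collect every csv recursively and concatenating them in one go (assuming the default comma separator; top_folder is the path from the question):
import os
import pandas as pd

top_folder = r'C:\Users\user\Downloads\top_folder'

# collect every .csv anywhere under the top folder, however deep
csv_paths = []
for root, dirs, files in os.walk(top_folder):
    for name in files:
        if name.endswith('.csv'):
            csv_paths.append(os.path.join(root, name))

# read and concatenate them all at once
combined = pd.concat((pd.read_csv(p) for p in csv_paths), ignore_index=True)
# note: merged.csv itself would match on a second run; write it elsewhere if that matters
combined.to_csv(os.path.join(top_folder, 'merged.csv'), index=False)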

Pyinstaller one file works in python shell but fails as an exe

I have a script that takes a few csv files and concatenates them. It works as intended when run in the Python shell, but it fails when I build a one-file exe with PyInstaller.
This is the error I get when I run my script:
The part that seems to fail is this part:
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list = list()
# format columns
dict_conv = {'line_item': lambda x: str(x),
             'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=";", converters=dict_conv, encoding='latin1') # test latin1
    df_list.append(df)
    # print the location and filename
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])
# add data frames to a list
RLI_combined = pd.concat(df_list, axis=0)
This is my whole script, for context:
# import necessary libraries
import pandas as pd
import os
import glob
from datetime import datetime

# Set filename
file_name = 'daglig_LCR_RLI'
# in the folder
path = os.path.dirname(os.path.abspath(__file__))
# Delete CSV file
# first check whether the file exists or not
# calling remove method to delete the csv file
# in remove method you need to pass file name and type
del_file = path + "\\" + file_name + '.csv'
## If file exists, delete it ##
if os.path.isfile(del_file):
    os.remove(del_file)
    print("File deleted")
else: ## Show an error ##
    print("File not found: " + del_file)
# use glob to get all the csv files
csv_files = glob.glob(os.path.join(path, "*.csv"))
df_list = list()
# format columns
dict_conv = {'line_item': lambda x: str(x),
             'column_item': lambda x: str(x)}
# loop over the list of csv files
for f in csv_files:
    # read the csv file
    df = pd.read_csv(f, sep=";", converters=dict_conv, encoding='latin1') # test latin1
    df_list.append(df)
    # print the location and filename
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])
# add data frames to a list
RLI_combined = pd.concat(df_list, axis=0)
# Write date to approval_text
now = datetime.now()
# dd/mm/YY
print_date = now.strftime("%d/%m/%Y")
RLI_combined.loc[:, 'approval_text'] = print_date
# replace value_text with n/a
RLI_combined.loc[:, 'value_text'] = "n/a"
# Sum columns
m = RLI_combined['column_item'].isin(['0030', '0050', '0080'])
RLI_combined_sum = RLI_combined[~m].copy()
RLI_combined_sum['amount'] = RLI_combined_sum.groupby(['report_name', 'line_item', 'column_item'])['amount'].transform('sum')
RLI_combined_sum = RLI_combined_sum.drop_duplicates(['report_name', 'line_item', 'column_item'])
RLI_combined = pd.concat([RLI_combined_sum, RLI_combined[m]])
# export to csv
RLI_combined.to_csv(path + "//" + file_name + '.csv', index=False, sep=";", encoding='latin1')
# Make log
# Create the directory
directory = "Log"
parent_dir = path
# Path
path_log = os.path.join(parent_dir, directory)
try:
    os.mkdir(path_log)
    print('Log folder dannet')
except OSError as error:
    print('Log folder eksisterer')
# export to csv
log_name = now.strftime("%d-%m-%Y_%H-%M-%S")
print(log_name)
RLI_combined.to_csv(path + "//" + 'Log' + "//" + file_name + '_' + log_name + '.csv', index=False, sep=";", encoding='latin1')
I hope you can point me in the right direction.
With the PyInstaller one-file executable you will often run into problems like that. When starting the *.exe it is extracted to a temporary directory, and that directory is, for example, the start location for relative path definitions.
So even if you get your script running and it exports your *.csv, the file will often be somewhere on your HD and not next to the *.exe where you perhaps expect it.
I think in your case the variable df_list stays empty because no files are listed in csv_files. This is because there are no *.csv files in the temp dir (its location is written at the top of the output).
Please try printing the content of csv_files when running the one-file *.exe to check whether this guess is right.
If it is, start by building a one-dir *.exe; if that works, you know you have a problem with your path definitions.
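If that is the cause, a common workaround (a sketch, not specific to this script) is to anchor the path to the executable when running frozen, because __file__ then points into the temporary extraction directory:
import os
import sys

if getattr(sys, 'frozen', False):
    # running as a PyInstaller bundle: use the folder the .exe lives in
    path = os.path.dirname(os.path.abspath(sys.executable))
else:
    # running as a plain script: use the folder of the .py file
    path = os.path.dirname(os.path.abspath(__file__))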

Update a specific sheet name in all the excels inside a folder

In a folder, I have 50 Excel files with multiple sheets in each file. I have to update the name of the sheet in these files wherever the sheet_name contains "XYZ".
So for each file, if the sheet_name has "XYZ", change that sheet_name to "ABC". I tried looping through the files using the following code, but could not write the code to change the sheet names:
filelist = []
for path, subdirs, files in os.walk(directory):
    for file in files:
        if (file.endswith('.xlsx') or file.endswith('.xls') or file.endswith('.XLS')):
            filelist.append(os.path.join(path, file))
You can simplify listing the files like this:
import os
mypath = r'C:\your\files\path'
filenames = [x for x in os.listdir(mypath) if x.endswith('.xls') or x.endswith('.xlsx') or x.endswith('.XLS')]
for filename in filenames:
    a = filename.replace('XYZ', 'ABC')
    os.rename(mypath + "/" + filename, mypath + "/" + a)
You can use openpyxl and glob:
import glob
from openpyxl import load_workbook

paths = glob.glob("directory/*xls*") + glob.glob("directory/*XLS*")  # "directory" is the folder to search
for path in paths:
    wb = load_workbook(path)
    for sheetname in wb.sheetnames:
        ws = wb[sheetname]
        if "XYZ" in ws.title:
            ws.title = "ABC"
    wb.save(path)
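If the workbooks sit in subfolders, as in the question's os.walk loop, glob can also match recursively (Python 3.5+); a sketch, with "directory" again standing in for the root path:
import glob
import os

paths = glob.glob(os.path.join("directory", "**", "*.xls*"), recursive=True) \
      + glob.glob(os.path.join("directory", "**", "*.XLS*"), recursive=True)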

Create a list of all file names and their file extension in a directory

I am trying to create a dataset using pd.DataFrame to store the file name and file extension of all the files in my directory. I eventually want to have two variables named Name and Extension. The Name variable will have a list of file names, and the Extension variable should have the file type, such as xlsx or png.
I am new to Python and was only able to get this far. This gives me a list of file names, but I don't know how to incorporate the file extension part. Could anyone please help?
List = pd.DataFrame()
path = 'C:/Users/documnets/'
filelist = []
filepath = []
# r=root, d=directories, f = files
for subdir, dirs, files in os.walk(path):
    for file in files:
        filelist.append(file)
        filename, file_extension = os.path.splitext('/path/to/somefile.xlsx')
        filepath.append(file_extension)
List = pd.DataFrame(filelist, filepath)
Also, for this part: os.path.splitext('/path/to/somefile.xlsx'), can I leave what's in the parentheses as it is, or should I replace it with my directory path?
Thank you
You can do this:
import os
import pandas as pd

path = 'C:/Users/documnets/'
filename = []
fileext = []
for file in os.listdir(path):
    # os.path.splitext is safer than file.split('.') for names with extra dots
    name, ext = os.path.splitext(file)
    filename.append(name)
    fileext.append(ext.lstrip('.'))  # drop the leading dot to keep just 'xlsx', 'png', ...
columns = ["Name", "Extension"]
data = [filename, fileext]
df = pd.DataFrame(data, columns).transpose()
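Passing the two lists as a dict gives the same frame without the transpose, and names the columns directly:
df = pd.DataFrame({"Name": filename, "Extension": fileext})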

trouble with FeatureClassToGeodatabase_conversion in arcpy, using .da.Walk

With this code, I am trying to read all the files in a directory and all its subdirectories. I have another list of file names; if the search finds files in the directories that are on the other list, I want to copy those feature classes to another location. When the code gets to FeatureClassToGeodatabase, I keep getting an error that the input features data type is not supported or does not exist. I wasn't sure if I needed to somehow get the path as well as the filename, so I created a couple of lists to capture that separately, but I'm kind of stuck here:
import arcpy
import os

workspace = r'F:\SF_HMP - transferred to Ydrive'
output_loc = r'C:\temp\temp.gdb'
mssng_files = r'F:\SF_HMP - transferred to Ydrive\Maps\broken_links_missing_files.txt'
files_to_find = []
layers_list = []
layers_path = []
with open(mssng_files) as filelist:
    for line in filelist:
        files_to_find.append(line.strip())
for dirpath, dirnames, filenames in arcpy.da.Walk(workspace, datatype="FeatureClass"):
    for filename in filenames:
        layers_list.append(filename)
        layers_path.append(os.path.join(dirpath, filename))
for lyr in layers_list:
    if lyr in files_to_find:
        arcpy.FeatureClassToGeodatabase_conversion(lyr, output_loc)
I realized I needed to specify the workspace for each file to be copied over. I also repeated the code to search for and copy over rasters and tables:
import arcpy, os, easygui, sys

mssng_files = r'L:\SF_HMP - transferred to Ydrive\Maps\broken_links_missing_files.txt'
wkspc = easygui.enterbox("Enter workspace path:", title='Search for Files')
output_loc = easygui.enterbox("Output location:", title='Copy Files')
files_to_find = []  # this list needs to be initialised before the loop fills it
with open(mssng_files) as filelist:
    for line in filelist:
        files_to_find.append(line.strip())
for dirpath, dirnames, filenames in arcpy.da.Walk(wkspc, datatype='FeatureClass'):
    for filename in filenames:
        if filename in files_to_find:
            ws_l = os.path.join(dirpath, filename)
            arcpy.env.workspace = ws_l
            arcpy.FeatureClassToGeodatabase_conversion(ws_l, output_loc)
for dirpath, dirnames, filenames in arcpy.da.Walk(wkspc, datatype='RasterDataset'):
    for filename in filenames:
        if filename in files_to_find:
            ws_r = os.path.join(dirpath, filename)
            arcpy.env.workspace = ws_r
            arcpy.RasterToGeodatabase_conversion(ws_r, output_loc)
for dirpath, dirnames, filenames in arcpy.da.Walk(wkspc, datatype='Table'):
    for filename in filenames:
        if filename in files_to_find:
            ws_t = os.path.join(dirpath, filename)
            arcpy.env.workspace = ws_t
            arcpy.TableToGeodatabase_conversion(ws_t, output_loc)
