I have a folder with 50 .csv files. The .csv files are auto-generated and a results/ output from a process-based model (long and automatically named). For example, sandbox_username_vetch_scaleup_IA_1.csv; sandbox_username_vetch_scaleup_IA_2.csv, and it continues till sandbox_username_vetch_scaleup_IA_50.csv.
I am trying to shorten the file names so that the files are named IA_1, IA_2 ... up to IA_50, and subsequently add the new .csv file name as a column to the data frame. Here is what I have tried so far:
# import necessary libraries
import pandas as pd
import os
import glob
import sys
from pathlib import Path
import re
# Rename attempt: read every CSV found in data_p and write a copy into output_p
# under a name assembled from pieces of the old name and the file's "IA" value.
# NOTE(review): this listing carries no indentation; the statements after the
# `for` line are presumably its body - confirm against the original post.
data_p = "/Users/Username/Documents/HV_Scale/CWAD"
output_p = "/Users/Username/Documents/HV_Scale/CWAD"
retval = os.getcwd()
print (retval) # see in which folder you are
os.chdir(data_p) # move to the folder with your data
os.getcwd() # return value is not used
filenames = sorted(glob.glob('*.csv'))
fnames = list(filenames) # get the names of all your files
#print(fnames)
#Loop over
for f in range(len(fnames)):
print(f'fname: {fnames[f]}\n')
pfile = pd.read_csv(fnames[f], delimiter=",") # read in file
#extract filename
filename = fnames[f]
parts = filename.split(".") # pieces of the name around each "."; the dots themselves are dropped
only_id = parts[0].split("_") # underscore-separated pieces of the first part
# get IA from your file
filestate = pfile["IA"][0] # assuming this is on the first row
filestate = str(filestate)
# get new filename
# NOTE(review): parts[1] has no leading ".", so nothing here puts a dot before
# the extension; only_id[0] is just the first underscore-separated piece.
newfilename = only_id[0]+"-"+filestate+parts[1]
# save your file (don't put a slash at the end of your directories on top)
pfile.to_csv(output_p+"/"+newfilename, index = False, header = True)
Here is the code for adding the csv file name as a column
import glob
import os
import shutil
import sys
import pandas as pd
# Read every CSV under `path`, tag each frame with a file_name column and
# accumulate the frames in `df`.
# NOTE(review): indentation is missing in this listing; the three statements
# after the `for` line are presumably its body.
path = '/Users/Username/Documents/HV_Scale/IA_CWAD/short'
all_files = glob.glob(os.path.join(path, "*.csv"))
# `names` is not used by any later line shown here; its pattern is built with a
# backslash while `path` uses forward slashes.
names = [os.path.basename(x) for x in glob.glob(path+'\*.csv')]
df = pd.DataFrame()
for file_ in all_files:
file_df = pd.read_csv(file_,sep=';', parse_dates=[0], infer_datetime_format=True,header=None )
# file_ is the string returned by glob for os.path.join(path, "*.csv"), so the
# column receives the whole path rather than just the base name.
file_df['file_name'] = file_
# NOTE(review): DataFrame.append is not available in newer pandas releases -
# confirm against the installed version.
df = df.append(file_df)
#However, this adds the old csv file name and not the renamed one
In order to rename and move these files, all you need is:
import glob
import os
import shutil
import sys
SOURCE = '<Your source directory>'
TARGET = '<Your target directory>'

# Copy every "*_IA_*.csv" file from SOURCE into TARGET under a shortened name
# that starts at "IA_" (e.g. "..._scaleup_IA_7.csv" -> "IA_7.csv").
for file in glob.glob(os.path.join(SOURCE, '*_IA_*.csv')):
    # Search the base name only: a directory component that happens to contain
    # "_IA_" must not influence where the short name starts.
    basename = os.path.basename(file)
    idx = basename.index('_IA_')
    filename = basename[idx + 1:]
    target = os.path.join(TARGET, filename)
    if os.path.exists(target):
        # Never overwrite an existing target; report it and move on.
        print(f'Target file {target} already exists', file=sys.stderr)
    else:
        shutil.copy(file, target)
As there's nothing in the OP's question that tries to handle modification of the CSV files, that is left as an exercise for the OP.
Source and target directories should be different otherwise this can lead to ambiguous results
Related
I'm new to python and hoping for some help to read in csv files from a folder and converting each file to a html folder...this is what I have so far:
import pandas as pd
import os
import glob
# Read each CSV found in the "htmlplots" folder, print its name and show it.
# NOTE(review): indentation is missing in this listing; the statements after
# the `for` line are presumably its body.
path = "htmlplots"
csv_files = glob.glob(os.path.join(path, "*.csv"))
for file in csv_files:
# read the csv file
df = pd.read_csv(file)
# print the filename (only the text after the last backslash is kept)
print('File Name:', file.split("\\")[-1])
# print the content
# NOTE(review): `display` is not imported in the lines shown - presumably the
# IPython/Jupyter helper; confirm.
display(df)
Ideally I then need to create html files from the resulting csv files that have a 'next' and 'previous' link from one to two, two to three (next) and three to two, two to one (previous).
Use:
import pandas as pd
import os
import glob
path = ""
# Sort the listing so "prev"/"next" follow a stable, predictable order.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
last_index = len(csv_files) - 1

# Write one HTML table per CSV, with two extra rows labelled 'prev' and 'next'
# that hold a link to the neighbouring entry (empty at either end).
# NOTE(review): as in the original answer the links are built from the
# neighbouring CSV paths, not from the generated ".html" names - confirm that
# this is the intended target.
for i, file in enumerate(csv_files):
    df = pd.read_csv(file, header=None)
    prev_link = f'http://{csv_files[i-1]}' if i > 0 else ''
    next_link = f'http://{csv_files[i+1]}' if i != last_index else ''
    df.loc['prev', :] = prev_link
    df.loc['next', :] = next_link
    df.to_html(f"{file}.html", render_links=True)
Input csv file:
Output html:
Hi guys, I am currently moving files based on the file names listed in my CSV file, but the script always seems to move the files first and only then read the file name, so I always get an "already exists" error like this:
Error: Destination path 'Sortir/Membuka kertas contekan/aug1_Data16_133_86.jpg' already exists
CODE
import pandas as pd
# For every row of train.csv, move images/<filename> into Sortir/<class>,
# creating the class folder when it is missing.
# NOTE(review): indentation is missing in this listing; which statements sit
# inside the `for` and the `if` has to be inferred.
data = pd.read_csv('train.csv')
filenames = data['filename'].values
filenames = filenames.tolist()
classes = data['class'].values
classes = classes.tolist()
# The two lists are only printed; the loop below reads `data` directly.
print(filenames)
print(classes)
import shutil
import os
for index, row in data.iterrows():
print(row['filename'], os.path.join("Sortir",row['class']))
if not os.path.exists(os.path.join("Sortir",row['class'])):
print("[INFO] 'creating {}' directory".format(os.path.join("Sortir",row['class'])))
os.mkdir(os.path.join("Sortir",row['class']))
# The destination passed here is the class folder itself, without a file name.
shutil.move(os.path.join("images",row["filename"]), os.path.join("Sortir",row['class']))
Does anyone know how to read the row first and then move the file? Or how to keep reading the other rows even if the file I want to move has already been moved?
Found the Answer Code here :
import shutil
import os
import pandas as pd
data = pd.read_csv('test.csv')
filenames = data['filename'].tolist()
classes = data['class'].tolist()
print(filenames)
print(classes)

# Move images/<filename> to SortirTest/<class>/<filename> for every CSV row.
for index, row in data.iterrows():
    class_dir = os.path.join("SortirTest", row['class'])
    if not os.path.exists(class_dir):
        print("[INFO] 'creating {}' directory".format(class_dir))
        # makedirs also creates "SortirTest" itself when it is missing, which
        # a plain mkdir of the nested folder cannot do.
        os.makedirs(class_dir, exist_ok=True)
    input_name = os.path.join("images", row["filename"])
    # Full destination path (folder + file name).
    output_name = os.path.join(class_dir, row['filename'])
    if os.path.exists(input_name):
        shutil.move(input_name, output_name)
        print("This File Has Been Moved:", input_name)
    else:
        # Source is missing (e.g. listed twice and already moved): report it
        # and carry on with the next row.
        print("This File Doesnt Exist :", input_name)
In shutil.move() function you have to add the filename in the new directory too:
# Loop-body fragment (expects `row` from the surrounding iteration): the
# destination handed to shutil.move includes the file name, not only the folder.
input_name = os.path.join("images", row["filename"])
output_name = os.path.join("Sortir", row['class'], row['filename'])
shutil.move(input_name, output_name)
Have you tried to clear the 'Sortir' folder before running the script?
I am trying to read an xlsx file, compare all the reference numbers in one column with the files inside a folder and, where they correspond, rename each file to the email address associated with its reference number.
Excel File has fields such as:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
... .....
My folder applicants just contains doc files named as the Reference column:
How can I compare the contents of the applicantsCVs folder, to the Reference field inside my excel file and if it matches, rename all of the files as the corresponding email address ?
Here is What I've tried so far:
import os
import pandas as pd
# Collect the file stems found under "applicantCVs" and the references from the
# sheet (as text), then print every reference that also occurs as a file stem.
# NOTE(review): indentation is missing in this listing; the nesting of the
# loops has to be inferred.
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
references = dfOne['Reference']
emailAddress = dfOne['EmailAddress']
# Entries whose text is 'nan' are filtered out, so positions in this list need
# not line up with positions in `references`.
cleanedEmailList = [x for x in emailAddress if str(x) != 'nan']
print(cleanedEmailList)
excelArray = []
filesArray = []
for root, dirs, files in os.walk("applicantCVs"):
for filename in files:
print(filename) #Original file name with type 1233.doc
reworkedFile = os.path.splitext(filename)[0]
filesArray.append(reworkedFile)
for entry in references:
excelArray.append(str(entry))
for i in excelArray:
if i in filesArray:
print(i, "corresponds to the file names")
I compare the reference names to the folder contents and print it out if it's the same:
# Print every entry of excelArray that also appears in filesArray.
for i in excelArray:
if i in filesArray:
print(i, "corresponds to the file names")
I've tried to rename it with os.rename(filename, cleanedEmailList ) but it didn't work because cleanedEmailList is an array of emails.
How can I match and rename the files?
Update:
from os.path import dirname
import pandas as pd
from pathlib import Path
import os
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
emailAddress = dfOne['EmailAddress']


def _reference_key(value):
    """Return *value* as the text expected in a file name (1123.0 -> '1123')."""
    # A column holding NaN is read as float, so whole numbers arrive as
    # 1123.0; strip the ".0" so the key can equal a file stem.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Drop incomplete rows BEFORE converting to text: astype(str) would turn a
# missing reference into the literal string 'nan', which dropna cannot see.
complete = dfOne.dropna(subset=["Reference", "EmailAddress"])
references = dict(zip(complete["Reference"].map(_reference_key), complete["EmailAddress"]))
print(references)

# Snapshot the listing so the renames below cannot influence the iteration.
files = list(Path("applicantCVs").glob("*"))
for file in files:
    new_name = references.get(file.stem)
    if new_name is None:
        # No e-mail known for this stem: leave the file untouched.
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
based on sample data:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
nan jane.smith#example.com
1334 samuel.manuel#yahoo.com
First you assemble a dict with the set of references as keys and the new names as values:
# Rows missing either column are dropped; the remaining Reference values become
# the keys and the EmailAddress values the values.
references = dict(df.dropna(subset=["Reference","EmailAddress"]).set_index("Reference")["EmailAddress"])
{'1123': 'bob.smith#yahoo.com',
'1233': 'john.drako#gmail.com',
'1334': 'samuel.manuel#yahoo.com'}
Note that the references are strs here. If they aren't in your original database, you can use astype(str)
Then you use pathlib.Path to look for all the files in the data directory:
# Every entry directly inside ../data/renames (the pattern "*" is not recursive).
files = Path("../data/renames").glob("*")
[WindowsPath('../data/renames/1123.docx'),
WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/1233.txt')]
The renaming can be made very simple:
# Rename each file whose stem is a known reference; everything else is left
# alone. list() snapshots the listing so renaming cannot affect the iteration.
for file in list(files):
    new_name = references.get(file.stem, file.stem)
    if new_name == file.stem:
        # Unknown reference: the name would not change, so skip the rename.
        continue
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
The references.get asks for the new filename, and if it doesn't find it, use the original stem.
[WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/bob.smith#yahoo.com.docx'),
WindowsPath('../data/renames/john.drako#gmail.com.txt')]
How about adding the associated email (your new name, I guess?) to a dictionary whose keys are your reference numbers?
This could look something like:
# Map every reference that has a matching file to its e-mail address.
cor_dict = {}
# excelArray holds the references as text, so compare against a text view.
reference_text = dfOne['Reference'].astype(str)
for i in excelArray:
    if i in filesArray:
        # Boolean-mask selection needs .loc (not .at); keep the first non-empty hit.
        matches = dfOne.loc[reference_text == i, 'EmailAddress'].dropna()
        if not matches.empty:
            cor_dict[i] = matches.iloc[0]

path = 'path to file...'
for reference, email in cor_dict.items():
    filename = str(reference)+'.doc'
    new_filename = str(email).replace('#','_') + '_.doc'
    filepath = os.path.join(path, filename)
    new_filepath = os.path.join(path,new_filename)
    # Rename using the full paths; the bare names would be resolved against
    # the current working directory instead of `path`.
    os.rename(filepath, new_filepath)
This is one approach using a simple iteration.
Ex:
import os
import pandas as pd

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555],'EmailAddress': ["bob.smith#yahoo.com", "john.drako#gmail.com", "samuel.manuel#yahoo.com", np.nan, "samuel.manuel#yahoo.com"]})
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
dfOne.dropna(inplace=True) #Drop rows with NaN


def _reference_key(value):
    """Return *value* as the text expected in a file name (1123.0 -> '1123')."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Exact-match look-up table built once; a substring search would let the stem
# "112" match reference 1123. setdefault keeps the first row on duplicates.
email_by_reference = {}
for reference, email in zip(dfOne['Reference'], dfOne['EmailAddress']):
    email_by_reference.setdefault(_reference_key(reference), email)

for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = email_by_reference.get(file_name)
        if email is not None:
            # Keep the original extension, swap the stem for the e-mail.
            os.rename(os.path.join(root, file), os.path.join(root, email+ext))
Or if you have only .docx file to rename
import os
import pandas as pd

dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols = "A:D")
# Drop rows with NaN first: converting to text beforehand would turn a missing
# reference into the string 'nan', which dropna no longer recognises.
dfOne.dropna(inplace=True)


def _reference_key(value):
    """Return *value* as the text expected in a file name (1123.0 -> '1123')."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


dfOne["Reference"] = dfOne["Reference"].map(_reference_key)
ext = ".docx"
for root, dirs, files in os.walk("applicantsCVs"):
    # Set of names in this folder for exact, O(1) membership tests.
    present = set(files)
    # Read the two columns by name so the pairing does not depend on column
    # order or on how many columns "A:D" yields.
    for ref, email in zip(dfOne["Reference"], dfOne["EmailAddress"]):
        old_name = ref + ext
        if old_name in present:
            os.rename(os.path.join(root, old_name), os.path.join(root, email+ext))
            # The file is gone under its old name; do not try it again.
            present.discard(old_name)
You could do it directly in your dataframe using df.apply():
import glob
import os.path
#Filter out null addresses (copy so the new column is added to an independent frame)
df2 = df.dropna(subset=['EmailAddress']).copy()
#Add a column listing the files that match each reference, whatever their extension
# NOTE(review): a float Reference formats as e.g. "1123.0" - confirm the column
# holds integers or text before relying on the pattern.
df2['Existing_file'] = [glob.glob("applicantsCVs/{}.*".format(reference)) for reference in df2['Reference']]
renamed = 0
for email, matches in zip(df2['EmailAddress'], df2['Existing_file']):
    if not matches:
        # No file for this reference.
        continue
    old_path = matches[0]
    extension = old_path.split('.')[-1]
    os.rename(old_path, 'applicantsCVs/{}.{}'.format(email, extension))
    renamed += 1
print(renamed, "existing files renamed")
EDIT :
works now with any extension (.doc, .docx) by using glob module
Let us consider that the sample data in our Excel sheet is the following:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
nan python#gmail.com
There are following steps involved to solve this problem.
Step 1
import the data properly from excel sheet "my.xlsx". Here I am using the sample data
import pandas as pd
import os
# Load the sheet ...
df = pd.read_excel('my.xlsx')
# ... and discard every row that contains a missing value
df = df.dropna()
# Preview the first rows to confirm the data looks as expected
df.head()
You will see that the data type in the references are in float type here
Step 2
Change the data type in the reference column to integer and then into string
# astype returns a new object and takes no `inplace` argument, so the result is
# assigned back explicitly: first whole numbers, then text for every column.
df['Reference'] = df.Reference.astype(int)
df = df.astype(str)
df.head()
Now the data is in desirable format
Step 3
Renaming the files in the desired folder. Zip the lists of 'Reference' and 'EmailAddress' to use in for loop.
#absolute path to folder. I consider you have the folder "application cv" in the home directory
path_to_files='/home/applicant cv/'
# Rename <Reference>.doc to <EmailAddress>.doc for every row of the sheet.
for ref, email in zip(df['Reference'], df['EmailAddress']):
    source = os.path.join(path_to_files, ref + '.doc')
    target = os.path.join(path_to_files, email + '.doc')
    try:
        os.rename(source, target)
    except FileNotFoundError:
        # No document for this reference: nothing to rename.
        print ("File name doesn't exist in the list, I am leaving it as it is")
    except OSError as error:
        # Any other OS-level failure is reported instead of being silently
        # mistaken for a missing file; the loop still continues.
        print("Could not rename {}: {}".format(source, error))
Step 1: import the data from excel sheet "Book1.xlsx"
import pandas as pd
# Load the workbook into a DataFrame and show it
df = pd.read_excel(
    r'path of your file here\Book1.xlsx'
)
print(df)
Step 2: Choose path that your ".docx" files are in and store their names.
Get only relevent part of filename to compare.
from os import listdir,rename
from os.path import isfile, join

mypath = r'path of docx files\doc files'
# Keep only the directory entries that are regular files
onlyfiles = list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))
#print(onlyfiles)
# Text before the first "." of the first file found
currentfilename = onlyfiles[0].partition(".")[0]
This is how I stored the files
Step 3: Run loop to check if name matches with the Reference. And just use rename(src,dest) function from os
# Look-up table reference (as text) -> e-mail, covering every row of the sheet
# rather than a fixed number of rows; the first row wins on duplicates.
email_by_reference = {}
for reference, email in zip(df['Reference'], df['EmailAddress']):
    email_by_reference.setdefault(str(reference), email)

# Check every file that was found, not only the first one.
for docname in onlyfiles:
    if not docname.endswith(".docx"):
        continue
    corrosponding_email = email_by_reference.get(docname[:-len(".docx")])
    if corrosponding_email is not None:
        # join() uses the platform's separator instead of a hard-coded "\\".
        rename(join(mypath, docname), join(mypath, corrosponding_email + ".docx"))
checkout the code with example:https://github.com/Vineet-Dhaimodker
I am new to Python and am trying to write code that adds a column to multiple .xlsx files and saves these files under their original names in a new folder.
I have started with the code below, but I am missing the part that opens all the files and saves them to my DestPath. I would be pleased if anyone has a solution for this:
from os import listdir, path
import pandas as pd
import xlrd
# Source and destination folders.
# NOTE(review): in a non-raw string literal the backslash directly before the
# closing quote escapes that quote - confirm how these paths should be written.
SourcePath = 'C:\' #Source Path
DestPath = 'C:\' #Destination Path
# Listing up all .xlsx files from Source
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
    """Return the names in *path_to_dir* that end with *suffix*.

    Only bare names are returned (no directory part), in sorted order so the
    result does not depend on the order in which the OS lists the folder.
    """
    filenames = listdir(path_to_dir)
    return sorted(filename for filename in filenames if filename.endswith(suffix))
# Add a column to every workbook found in SourcePath and write it back out.
# NOTE(review): indentation is missing in this listing; the statements after
# the `for` line are presumably its body.
filenames = find_xlsx_filenames(SourcePath)
fname = path.join(SourcePath, filenames[0]) # Takes the first file in the folder.
# `outputdata` is not defined in the lines shown, and outname is computed once,
# from the first file only, before the loop starts.
outname = path.join(outputdata, filenames[0])
for i in range(len(filenames)):
fname = path.join(SourcePath, filenames[i])
df = pd.read_excel(fname) #Read Excel file as a DataFrame
df['new_col'] = 'Sort Data' #Adds a column named 'new_col' whose value is 'Sort Data'
#To save it back as Excel
# NOTE(review): two positional arguments are passed here (DestPath, then
# outname) - check them against the to_excel signature.
df.to_excel(DestPath, outname) #Write DateFrame back as Excel file
Thanks in Advance
check if this works
import os
import pandas as pd
path = 'C:/'
# Where the modified workbooks are written. Defaults to the source folder
# (files are rewritten in place); point it at another folder to keep the
# originals untouched. It should not be a sub-folder of `path`, otherwise the
# walk would pick up the freshly written copies.
dest_path = path

for roots, dirs, files in os.walk(path):
    xlsfile = [name for name in files if name.endswith('.xlsx')]
    if not xlsfile:
        continue
    # Mirror the folder layout of the source underneath dest_path.
    out_dir = os.path.normpath(os.path.join(dest_path, os.path.relpath(roots, path)))
    os.makedirs(out_dir, exist_ok=True)
    for xlsf in xlsfile:
        df = pd.read_excel(os.path.join(roots, xlsf))
        df['Sort Data'] = ' '
        # Same file name as the source workbook.
        df.to_excel(os.path.join(out_dir, xlsf), index=False)
I have an Excel file that contains long product tag name like(for now, just working on 3 of them):
4049047000037
4049047000044
4049047118954
and i have a folder on my desktop called "1" containing .jpg files with tag names like:
4049047000037.jpg
4049047000044.jpg
4049047118954.jpg
I want to write code that, if a tag name is in my Excel file, copies the matching .jpg file to another folder.
import os
import pandas as pd
# List the source folder, read the second column of sheet "sayfa4" and print
# each value with ".jpg" appended. Written with Python 2 print statements.
# NOTE(review): indentation is missing in this listing; the two statements
# after the `for` line are presumably its body.
# NOTE(review): both paths are ordinary (non-raw) strings containing
# backslashes, so sequences such as "\1" are subject to escape processing.
movdir = ["C:\Users\muhammedcan\Desktop\1"]
basedir = "C:\Users\muhammedcan\Desktop\2"
i=0
#to see what i have in my folder
print os.listdir("C:/Users/muhammedcan/Desktop/1/")
df= pd.read_excel("deneme.xls", sheetname= "sayfa4")
df1= df.columns[1]
listSepalWidth = df[df1]
print listSepalWidth
#to make file name and product tag name same
for i in listSepalWidth:
i=str(i)+(".jpg")
print i
can you help me with copying file into an other file if it is exist in my excel?
this is my result so far:
['4049047000037.jpg', '4049047000044.jpg', '4049047000068.jpg',
'4049047000075.jpg', '4049047000082.jpg', '4049047000105.jpg',
'4049047118947.jpg', '4049047118954.jpg']
4049047000037.jpg
4049047000044.jpg
4049047118954.jpg
4049047000068.jpg
4049047000075.jpg
4049047000082.jpg
4049047118947.jpg
4049047000105.jpg
I used following code, and I am recieving error.
from shutil import copyfile
# NOTE(review): both arguments are the two folder paths themselves, written as
# non-raw strings with backslashes - check against what copyfile expects.
copyfile("C:\Users\muhammedcan\Desktop\1", "C:\Users\muhammedcan\Desktop\2")
Error is:
C:\Python27\python.exe "C:/Users/muhammedcan/Desktop/summer
courses/programing/MP4/solution/my_work.py"
Traceback (most recent call last):
File "C:/Users/muhammedcan/Desktop/summer courses/programing/MP4/solution/my_work.py", line 3, in <module>
copyfile("C:\Users\muhammedcan\Desktop\1",
Process finished with exit code 1
The following should do what you are looking for:
import os
import glob
import pandas as pd
import shutil
source_folder = r"C:\Users\muhammedcan\Desktop\1"
destination_folder = r"C:\Users\muhammedcan\Desktop\2"

# Base names of every .jpg present in the source folder.
available_filenames = [os.path.basename(fn) for fn in glob.glob(os.path.join(source_folder, '*.jpg'))]

# `sheet_name` is the current spelling of the keyword formerly called `sheetname`.
df = pd.read_excel("deneme.xls", sheet_name="sayfa4")

# Walk the second column of the sheet and copy each tag's picture if it exists.
for tag_name in df.iloc[:,1]:
    filename = "{}.jpg".format(tag_name)
    if filename in available_filenames:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print("{} - found".format(filename))
        shutil.copyfile(os.path.join(source_folder, filename), os.path.join(destination_folder, filename))
    else:
        print("{} - not found".format(filename))
It first creates a list of .jpg filenames found in the source_folder. It then loads the Excel file into pandas and iterates over the second column. If the tag name is found in the list of available_filenames, the shutil.copyfile() function is used to copy the file from 1 to 2. Note that os.path.join() is used to safely join the parts of a file path together.
To make it into a function to let you also do 'pdf' you could do:
import os
import glob
import pandas as pd
import shutil
source_folder = r"C:\Users\muhammedcan\Desktop\1"
destination_folder = r"C:\Users\muhammedcan\Desktop\2"
# `sheet_name` is the current spelling of the keyword formerly called `sheetname`.
df = pd.read_excel("deneme.xls", sheet_name="sayfa4")
def copy_files(source_folder, destination_folder, extension, tag_names=None):
    """Copy ``<tag>.<extension>`` files from *source_folder* to *destination_folder*.

    Args:
        source_folder: Folder searched for ``*.<extension>`` files.
        destination_folder: Existing folder that receives the copies.
        extension: File extension without the leading dot, e.g. ``'jpg'``.
        tag_names: Iterable of tag names to look for. When omitted, the second
            column of the module-level ``df`` is used, as before.

    A "found"/"not found" line is printed for every tag name.
    """
    if tag_names is None:
        tag_names = df.iloc[:,1]
    # A set gives constant-time membership tests while looping over the tags.
    available_filenames = {
        os.path.basename(fn)
        for fn in glob.glob(os.path.join(source_folder, '*.{}'.format(extension)))
    }
    for tag_name in tag_names:
        filename = "{}.{}".format(tag_name, extension)
        if filename in available_filenames:
            # print(...) with a single argument behaves the same on Python 2 and 3.
            print("{} - found".format(filename))
            shutil.copyfile(os.path.join(source_folder, filename), os.path.join(destination_folder, filename))
        else:
            print("{} - not found".format(filename))
# Run the copy once per extension.
copy_files(source_folder, destination_folder, 'jpg')
copy_files(source_folder, destination_folder, 'pdf')
This assumes the same deneme.xls is used for both. If not it could be passed as another argument to the function.