I am running the following code, which imports CSV files and appends all the data into a single DATA list. But when I try to store this list in HDF5, I keep getting the error AttributeError: 'list' object has no attribute 'to_hdf'.
Please help me understand what I am missing.
import pandas as pd
import h5py
import glob
import os

path = "Z:\Test Data"

def get_CSV_files(path):
    results = []
    for root, dirs, files in os.walk(path):
        for file in files:
            fileExt = os.path.splitext(file)[-1]
            if fileExt.lower() == '.csv':
                results.append(os.path.join(root, file))
        for directory in dirs:
            results += get_CSV_files(os.path.join(root, directory))
    return results

def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    DATA = []
    for file_name in files:
        data = pd.DataFrame.from_csv(file_name, sep="\t")
        DATA.append(data)
    store = pd.HDFStore('STORE.h5')
    DATA.to_hdf('STORE.h5', 'table', append=True)
    store.close()
    return DATA
DATA is a list you define with DATA = [], and a list does not have a to_hdf attribute.
You can find some examples of how to use the pandas HDFStore here.
You would probably need something like this:
def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    DATA = []
    store = pd.HDFStore('STORE.h5')
    for file_name in files:
        # pd.read_csv replaces the removed DataFrame.from_csv
        data = pd.read_csv(file_name, sep="\t", index_col=0)
        DATA.append(data)
        # append each DataFrame to the 'my_file' table inside the store
        store.append('my_file', data)
    store.close()
    return DATA
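Alternatively, since the loop already collects every DataFrame in DATA, a minimal sketch (reusing the 'table' key from the question) is to concatenate the list once and write it in a single call:

def store_all_data_hdf5(path):
    files = get_CSV_files(path)
    # read every CSV into a list of DataFrames
    DATA = [pd.read_csv(f, sep="\t", index_col=0) for f in files]
    # concatenate the list into one DataFrame and write a single table
    pd.concat(DATA).to_hdf('STORE.h5', key='table', format='table')
    return DATA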
Hi guys, I am currently moving files based on the filename column in my CSV file, but it always moves the files first and then reads the filename, so I always get an "already exists" error like this:
Error: Destination path 'Sortir/Membuka kertas contekan/aug1_Data16_133_86.jpg' already exists
CODE
import pandas as pd

data = pd.read_csv('train.csv')
filenames = data['filename'].values
filenames = filenames.tolist()
classes = data['class'].values
classes = classes.tolist()
print(filenames)
print(classes)

import shutil
import os

for index, row in data.iterrows():
    print(row['filename'], os.path.join("Sortir", row['class']))
    if not os.path.exists(os.path.join("Sortir", row['class'])):
        print("[INFO] 'creating {}' directory".format(os.path.join("Sortir", row['class'])))
        os.mkdir(os.path.join("Sortir", row['class']))
    shutil.move(os.path.join("images", row["filename"]), os.path.join("Sortir", row['class']))
Does anyone know how to read the row first and then move the file? Or how to keep reading the other rows even if the file I want to move has already been moved?
Found the answer; the code is here:
import shutil
import os
import pandas as pd

data = pd.read_csv('test.csv')
filenames = data['filename'].values
filenames = filenames.tolist()
classes = data['class'].values
classes = classes.tolist()
print(filenames)
print(classes)

for index, row in data.iterrows():
    if not os.path.exists(os.path.join("SortirTest", row['class'])):
        print("[INFO] 'creating {}' directory".format(os.path.join("SortirTest", row['class'])))
        os.mkdir(os.path.join("SortirTest", row['class']))
    input_name = os.path.join("images", row["filename"])
    output_name = os.path.join("SortirTest", row['class'], row['filename'])
    if os.path.exists(input_name):
        dest = shutil.move(input_name, output_name)
        print("This File Has Been Moved:", input_name)
    else:
        print("This File Doesnt Exist :", input_name)
        continue
In the shutil.move() function you have to add the filename to the new directory path too:
input_name = os.path.join("images", row["filename"])
output_name = os.path.join("Sortir", row['class'], row['filename'])
shutil.move(input_name, output_name)
Have you tried to clear the 'Sortir' folder before running the script?
I am trying to create 2 functions with Python:
the first function zips multiple Excel files that exist in the given path;
the second function reads the content of the zip file and then merges all the existing files into one Excel file (all files have the same structure).
The problem is that when I run the script it crashes when it comes to reading the zip file and displays the error below:
AttributeError: 'ZipFile' object has no attribute 'seek'
code:
import pandas as pd
import numpy as np
import zipfile
import os

def get_all_file_path(directory):
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)
    return file_paths

# Excel file merge function
def excel_file_merge(zip_file_name):
    df = pd.DataFrame()
    archive = zipfile.ZipFile(zip_file_name, 'r')
    with zipfile.ZipFile(zip_file_name, "r") as f:
        for file in f.namelist():
            xlfile = archive.open(file)
            if file.endswith('.xlsx'):
                # Add a note indicating the file name that this dataframe originates from
                df_xl = pd.read_excel(xlfile, engine='openpyxl')
                df_xl['Note'] = file
                # Appends content of each Excel file iteratively
                df = df.append(df_xl, ignore_index=True)
    return df

uploaded_file = 'F:/AIenv/test_zip'
file_paths = get_all_file_path(uploaded_file)

print("following files will be zipped: ")
for file_name in file_paths:
    print(file_name)

with zipfile.ZipFile("my _python_files.zip", "w") as f:
    for file in file_paths:
        f.write(file)
    f.close()

print("All Files Zipped successfully")

df = excel_file_merge(f)
print(df)
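A likely cause, judging from the traceback, is that excel_file_merge is called with the ZipFile object f (already closed by the with block) rather than the path of the archive, so zipfile tries to treat f as a file object and looks for .seek(). A minimal sketch of that change, assuming the archive written above is the one to merge:

# pass the path of the zip archive on disk, not the (closed) ZipFile object
df = excel_file_merge("my _python_files.zip")
print(df)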
I am trying to rename my Excel files in a folder based on the content of cell A1. I am using pandas to parse the files and store the value of cell A1, then using that value to rename the file.
The code is as follows:
import os
import pandas as pd
import glob

source_dir = r'C:\Users\Ahmed_Abdelmuniem\Desktop\RenameFolder'
file_names = glob.glob(os.path.join(source_dir, '*.xlsx'))

for file_name in file_names:
    df1 = pd.read_excel(file_name)
    new_name = df1.iat[0, 0]
    # print(new_name)
    file_name.rename(file_name.with_name(new_name))
I get the following traceback:
C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\python.exe C:/Users/Ahmed_Abdelmuniem/PycharmProjects/Renamer/main.py
Traceback (most recent call last):
File "C:\Users\Ahmed_Abdelmuniem\PycharmProjects\Renamer\main.py", line 13, in <module>
file_name.rename(file_name.with_name(f"new_name"))
AttributeError: 'str' object has no attribute 'rename'
Process finished with exit code 1
I am not sure what AttributeError: 'str' object has no attribute 'rename' means, as the value stored in new_name is a str; I tested it with print. It produces the following result:
XXX.xlsx
YYY.xlsx
The variable file_name you are trying to rename is just a str, which does not have a rename method. You can rename a file using os.rename. See the code below.
import os
import pandas as pd
import glob

source_dir = ''
file_names = glob.glob(os.path.join(source_dir, '*.xlsx'))

for file_name in file_names:
    df1 = pd.read_excel(file_name)
    new_name = df1.iat[0, 0]
    os.rename(os.path.join(source_dir, file_name), os.path.join(source_dir, new_name + ".xlsx"))
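If you prefer the Path-style rename()/with_name() calls from the original attempt, here is a small sketch that should behave the same, assuming cell A1 always holds a plain string:

from pathlib import Path

import pandas as pd

source_dir = Path(r'C:\Users\Ahmed_Abdelmuniem\Desktop\RenameFolder')  # folder from the question
# snapshot the listing first so renaming does not disturb the iteration
for file_path in list(source_dir.glob('*.xlsx')):
    new_name = str(pd.read_excel(file_path).iat[0, 0])
    # Path objects have .rename()/.with_name(), unlike the str returned by glob.glob
    file_path.rename(file_path.with_name(new_name + '.xlsx'))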
I am trying to read an xlsx file, compare all the reference numbers from a column to the files inside a folder and, if they correspond, rename the files to the email address associated with the reference number.
Excel File has fields such as:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
... .....
My applicants folder just contains .doc files named after the Reference column.
How can I compare the contents of the applicantsCVs folder to the Reference field inside my Excel file and, if it matches, rename each of the files to the corresponding email address?
Here is what I've tried so far:
import os
import pandas as pd

dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")

references = dfOne['Reference']
emailAddress = dfOne['EmailAddress']
cleanedEmailList = [x for x in emailAddress if str(x) != 'nan']
print(cleanedEmailList)

excelArray = []
filesArray = []

for root, dirs, files in os.walk("applicantCVs"):
    for filename in files:
        print(filename)  # Original file name with type 1233.doc
        reworkedFile = os.path.splitext(filename)[0]
        filesArray.append(reworkedFile)

for entry in references:
    excelArray.append(str(entry))

for i in excelArray:
    if i in filesArray:
        print(i, "corresponds to the file names")
I compare the reference names to the folder contents and print it out if it's the same:
for i in excelArray:
    if i in filesArray:
        print(i, "corresponds to the file names")
I've tried to rename it with os.rename(filename, cleanedEmailList ) but it didn't work because cleanedEmailList is an array of emails.
How can I match and rename the files?
Update:
from os.path import dirname
import pandas as pd
from pathlib import Path
import os

dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")
emailAddress = dfOne['EmailAddress']
dfOne['Reference'] = dfOne['Reference'].astype(str)
references = dict(dfOne.dropna(subset=["Reference", "EmailAddress"]).set_index("Reference")["EmailAddress"])
print(references)

files = Path("applicantCVs").glob("*")
for file in files:
    new_name = references.get(file.stem, file.stem)
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
based on sample data:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
nan jane.smith#example.com
1334 samuel.manuel#yahoo.com
First you assemble a dict with the set of references as keys and the new names as values:
references = dict(df.dropna(subset=["Reference","EmailAddress"]).set_index("Reference")["EmailAddress"])
{'1123': 'bob.smith#yahoo.com',
'1233': 'john.drako#gmail.com',
'1334': 'samuel.manuel#yahoo.com'}
Note that the references are strs here. If they aren't in your original DataFrame, you can convert them with astype(str).
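For example, assuming the column is named Reference as in the sample data:

df["Reference"] = df["Reference"].astype(str)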
Then you use pathlib.Path to look for all the files in the data directory:
files = Path("../data/renames").glob("*")
[WindowsPath('../data/renames/1123.docx'),
WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/1233.txt')]
The renaming can be made very simple:
for file in files:
    new_name = references.get(file.stem, file.stem)
    file.rename(file.with_name(f"{new_name}{file.suffix}"))
references.get looks up the new filename and, if it doesn't find one, falls back to the original stem.
[WindowsPath('../data/renames/1156.pptx'),
WindowsPath('../data/renames/bob.smith#yahoo.com.docx'),
WindowsPath('../data/renames/john.drako#gmail.com.txt')]
How about adding the "email associate" (your new name, I guess?) to a dictionary, where the keys are your reference numbers?
This could look something like:
cor_dict = {}
for i in excelArray:
    if i in filesArray:
        # look up the email address that belongs to this reference
        cor_dict[i] = dfOne.loc[dfOne.Reference.astype(str) == i, 'EmailAddress'].iloc[0]

for entry in cor_dict.items():
    path = 'path to file...'
    filename = str(entry[0]) + '.doc'
    new_filename = str(entry[1]).replace('#', '_') + '_.doc'
    filepath = os.path.join(path, filename)
    new_filepath = os.path.join(path, new_filename)
    os.rename(filepath, new_filepath)
This is one approach using a simple iteration.
Ex:
import os
import pandas as pd

#Sample Data#
#dfOne = pd.DataFrame({'Reference': [1123, 1233, 1334, 4444, 5555], 'EmailAddress': ["bob.smith#yahoo.com", "john.drako#gmail.com", "samuel.manuel#yahoo.com", np.nan, "samuel.manuel#yahoo.com"]})
dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")
dfOne.dropna(inplace=True)  # Drop rows with NaN

for root, dirs, files in os.walk("applicantsCVs"):
    for file in files:
        file_name, ext = os.path.splitext(file)
        email = dfOne[dfOne['Reference'].astype(str).str.contains(file_name)]["EmailAddress"]
        if email.values.size:
            os.rename(os.path.join(root, file), os.path.join(root, email.values[0] + ext))
Or, if you have only .docx files to rename:
import os
import pandas as pd

dfOne = pd.read_excel('Book2.xlsx', na_values=['NA'], usecols="A:D")
dfOne["Reference"] = dfOne["Reference"].astype(str)
dfOne.dropna(inplace=True)  # Drop rows with NaN
ext = ".docx"

for root, dirs, files in os.walk("applicantsCVs"):
    files = r"\b" + "|".join(os.path.splitext(i)[0] for i in files) + r"\b"
    for email, ref in dfOne[dfOne['Reference'].astype(str).str.contains(files, regex=True)].values:
        os.rename(os.path.join(root, ref + ext), os.path.join(root, email + ext))
You could do it directly in your dataframe using df.apply():
import glob
import os.path
#Filter out null addresses
df2 = df.dropna(subset=['EmailAddress'])

#Add a column to check if the file exists
df2['Existing_file'] = df2.apply(lambda row: glob.glob("applicantsCVs/{}.*".format(row['Reference'])), axis=1)

df2.apply(lambda row: os.rename(row.Existing_file[0], 'applicantsCVs/{}.{}'.format(row.EmailAddress, row.Existing_file[0].split('.')[-1])) if len(row.Existing_file) else None, axis=1)
print(df2.Existing_file.map(len), "existing files renamed")
EDIT:
This now works with any extension (.doc, .docx) by using the glob module.
Let's consider the following sample data in the Excel sheet:
Reference EmailAddress
1123 bob.smith#yahoo.com
1233 john.drako#gmail.com
1334 samuel.manuel#yahoo.com
nan python#gmail.com
The following steps are involved in solving this problem.
Step 1
Import the data properly from the Excel sheet "my.xlsx". Here I am using the sample data.
import pandas as pd
import os
#import data from excel sheet and drop rows with nan
df = pd.read_excel('my.xlsx').dropna()
#check the head of data if the data is in desirable format
df.head()
You will see that the data type of the Reference column is float here.
Step 2
Change the data type in the Reference column to integer and then to string.
# assign the result back: astype() does not take an inplace argument
df['Reference'] = df.Reference.astype(int)
df = df.astype(str)
df.head()
Now the data is in the desired format.
Step 3
Rename the files in the folder. Zip the 'Reference' and 'EmailAddress' lists to use them in a for loop.
# absolute path to the folder; I assume you have the folder "applicant cv" in the home directory
path_to_files = '/home/applicant cv/'
for ref, email in zip(list(df['Reference']), list(df['EmailAddress'])):
    try:
        os.rename(path_to_files + ref + '.doc', path_to_files + email + '.doc')
    except OSError:
        print("File name doesn't exist in the list, I am leaving it as it is")
Step 1: import the data from excel sheet "Book1.xlsx"
import pandas as pd
df = pd.read_excel (r'path of your file here\Book1.xlsx')
print (df)
Step 2: Choose the path where your ".docx" files are and store their names.
Get only the relevant part of each filename to compare.
mypath = r'path of docx files\doc files'
from os import listdir,rename
from os.path import isfile, join
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
#print(onlyfiles)
currentfilename=onlyfiles[0].split(".")[0]
This is how I stored the files
Step 3: Run a loop to check whether the name matches the Reference, and just use the rename(src, dst) function from os.
for i in range(3):
    # print(currentfilename, df['ref'][i])
    if str(currentfilename) == str(df['Reference'][i]):
        corrosponding_email = df['EmailAddress'][i]
        # print(mypath + "\\" + corrosponding_email)
        rename(mypath + "\\" + str(currentfilename) + ".docx", mypath + "\\" + corrosponding_email + ".docx")
Check out the code with an example: https://github.com/Vineet-Dhaimodker
My question: is there a way to load data from all the files in a directory using Python?
Input: get all files in a given directory of mine (wow.txt, testting.txt, etc.)
Process: I want to run all the files through a def function.
Output: I want the output to be all the file names with their respective content below each one. For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys

# Define the file path
path = "/home/my_files"
file_name = "wow.txt"

# Load Data Function
def load_data(path, file_name):
    """
    Input  : path and file_name
    Purpose: loading text file
    Output : list of paragraphs/documents and
             title (initial 100 words considered as title of document)
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "rt", encoding='latin-1') as fin:
        for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)
    print("Total Number of Documents:", len(documents_list))
    titles.append(text[0:min(len(text), 100)])
    return documents_list, titles

# Output
load_data(path, file_name)
Here is my output:
My problem is that my output only takes one file and shows its content. Obviously, I defined the path and file name in my code to point to one file, but I am confused as to how to write the path in a way that loads all the files and outputs each of their contents separately. Any suggestions?
Using glob:
import glob

files = glob.glob("*.txt")  # get all the .txt files
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code
Using os.listdir():
import os

arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        # rest of the code
Try this:
import glob

for file in glob.glob("test/*.xyz"):
    print(file)
This is assuming my directory name is "test" and I have lots of .xyz files in it...
You can use glob and pandas
import pandas as pd
import glob

path = r'some_directory'  # use your path
all_files = glob.glob(path + "/*.txt")

li = []
for filename in all_files:
    # read file here
    # if you decide to use pandas you might need to use the 'sep' parameter as well
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
I will take advantage of the function you have already written, so use the following:
data = []
path = "/home/my_files"
dirs = os.listdir(path)

for file in dirs:
    data.append(load_data(path, file))
In this case you will have all data in the list data.
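If you also want the output format described in the question (each path followed by its content), here is a small sketch assuming load_data returns (documents_list, titles) as above:

for file, (documents_list, titles) in zip(dirs, data):
    print(os.path.join(path, file))    # the file's path
    print("\n".join(documents_list))   # all of its content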
Hi, you can use a for loop over listdir:
os.listdir(<path of your directory>)
This gives you the list of files in your directory, but it also gives you the names of the folders in that directory.
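A minimal sketch of keeping only the files, assuming the same path variable as in the question:

import os

path = "/home/my_files"  # directory from the question
# keep regular files only, skipping any sub-folders that listdir returns
only_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]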
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
    import os
    import re
    fileList = list()
    for (dir, _, files) in os.walk(dirName):
        for f in files:
            path = os.path.join(dir, f)
            if os.path.exists(path):
                fileList.append(path)
    fList = list()
    prog = re.compile('.txt$')
    for k in range(len(fileList)):
        binMatch = prog.search(fileList[k])
        if binMatch:
            fList.append(binMatch.string)
    return fList

def load_data2(file_list):
    documents_list = []
    titles = []
    for file_path in file_list:
        with open(file_path, "rt", encoding='latin-1') as fin:
            for line in fin.readlines():
                text = line.strip()
                documents_list.append(text)
        print("Total Number of Documents:", len(documents_list))
        titles.append(text[0:min(len(text), 100)])
    return documents_list, titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)