I have 20 folders, each containing 50 txt files, I need to read all of them in order to compare the word counts of each folder. I know how to read multiple files in one folder, but it is slow, is there a more efficient way instead of reading the folder one by one like below?
import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import os
import glob
1. folder1
folder_path = '/home/runner/Final-Project/folder1'
# Visit every .txt file directly inside the folder and report its size.
for txt_file in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(txt_file, 'r') as handle:
        contents = handle.read()
    print(txt_file)
    print(len(contents))
2. folder2
folder_path = '/home/runner/Final-Project/folder2'
# Same scan as for folder1: read each .txt file and print name + length.
for txt_file in glob.glob(os.path.join(folder_path, '*.txt')):
    with open(txt_file, 'r') as handle:
        contents = handle.read()
    print(txt_file)
    print(len(contents))
You can do something similar using glob like you have, but with the directory names.
folder_path = '/home/runner/Final-Project'
# The first '*' matches any direct sub-directory name, so one glob call
# finds every .txt file one level below folder_path.
for filename in glob.glob(os.path.join(folder_path, '*', '*.txt')):
    # A loop body consisting only of a comment is a SyntaxError in Python,
    # so give the placeholder a real statement.
    pass  # process your files
The first '*' in the os.path.join() represents directories of any name. So calling glob.glob() like this will go through and find any text file in any direct sub-directory within folder_path
Below function will return list of files in all the directories and sub-directories without using glob. Read from the list of files and open to read.
def list_of_files(dirName):
    """Recursively collect the full paths of all files under *dirName*.

    Directories are descended into; only non-directory entries end up in
    the result.

    Args:
        dirName: path of the directory to scan.

    Returns:
        List of full paths of every file found anywhere below dirName.
    """
    all_files = list()
    for entry in os.listdir(dirName):
        # Create full path
        full_path = os.path.join(dirName, entry)
        if os.path.isdir(full_path):
            # extend() appends in place; the original `all_files =
            # all_files + ...` rebuilt the whole list on every directory.
            all_files.extend(list_of_files(full_path))
        else:
            all_files.append(full_path)
    return all_files
print(list_of_files(<Dir Path>)) # <Dir Path> ==> your directory path
Related
This question already has answers here:
Python Glob without the whole path - only the filename
(10 answers)
Closed 4 months ago.
Can anyone help write a script, the goal is to find files with extension and save the name and path in TXT or CSV
I have a script which finds and prints the file type and path, but how can I save the result to a CSV/TXT file?
import fnmatch
import os
import csv

rootPath = '/'
pattern = '*.exe'
filepath = 'C:/Users/user/Desktop/filetest.txt'

# Walk the whole tree under rootPath and print the full path of every
# file whose name matches the pattern.
for dirpath, subdirs, names in os.walk(rootPath):
    # NOTE: the loop variable reuses (shadows) the `filepath` name above.
    for filepath in fnmatch.filter(names, pattern):
        match_path = os.path.join(dirpath, filepath)
        print(match_path)
I tried this one, but it saves only the last line.
import fnmatch
import os
import csv

rootPath = '/'
pattern = '*.exe'
filepath = 'C:/Users/user/Desktop/filetest.txt'

for root, dirs, files in os.walk(rootPath):
    # NOTE: this inner loop variable shadows the output path `filepath`
    # defined above.
    for filepath in fnmatch.filter(files, pattern):
        x = (os.path.join(root, filepath))
        # BUG (the subject of this question): the file is re-opened in
        # 'w' mode on every iteration, which truncates it each time, so
        # only the last match survives in the output file.
        file = open(filepath, 'w')
        file.write(x)
        file.close()
        print(x)
I think the reason is that you open the file inside the loop with open(filepath, 'w'); the 'w' mode always overwrites the file. If you want to append you can use 'a', but in this case that is not a good solution either, because the underlying problem is that you reopen the file on every loop iteration.
By using your code, I think you can solve it by putting the open command outside the loop
import fnmatch
import os
import csv

rootPath = '/'
pattern = '*.exe'
filepath = 'C:/Users/user/Desktop/filetest.txt'

# Open the output file exactly once, outside the loop; re-opening it in
# 'w' mode inside the loop would truncate it on every iteration. The
# `with` block also guarantees the file is closed.
with open(filepath, 'w') as out_file:
    for root, dirs, files in os.walk(rootPath):
        # Use a distinct loop name so it no longer shadows `filepath`.
        for match_name in fnmatch.filter(files, pattern):
            full_path = os.path.join(root, match_name)
            out_file.write(full_path + '\n')
from glob import glob
import os

# Gather every match (non-recursive, unlike os.walk), order the list,
# then write the whole result in one shot.
files = glob(os.path.join(rootPath, pattern))
files.sort()
with open(filepath, 'w') as out:
    out.write('\n'.join(files))
I would like to read all the contents from all the text files in a directory. I have 4 text files in the "path" directory, and my codes are;
for filename in os.listdir(path):
    filepath = os.path.join(path, filename)
    with open(filepath, mode='r') as f:
        content = f.read()
        # BUG (the subject of this question): `thelist` is rebound on
        # every iteration instead of being appended to, so after the loop
        # only the last file's lines remain.
        thelist = content.splitlines()
        # Redundant: the `with` block already closes the file.
        f.close()
    print(filepath)
    print(content)
    print()
When I run the codes, I can only read the contents from only one text file.
I would be thankful for any advice or suggestions, or for pointers to any other informative questions about this topic on Stack Overflow.
If you need to filter the files' name per suffix, i.e. file extension, you can either use the string method endswith or the glob module of the standard library https://docs.python.org/3/library/glob.html
Here an example of code which save each file content as a string in a list.
import os

path = '.' # or your path
files_content = []
# Collect the full text of every *txt file in `path`, one string per file.
for filename in os.listdir(path):
    # Same suffix test the original lambda-based filter performed.
    if not filename.endswith("txt"):
        continue
    filepath = os.path.join(path, filename)
    with open(filepath, mode='r') as f:
        files_content.append(f.read())
With the glob way here an example
import glob

# Print the name of every file in the current working directory whose
# name ends in "txt" (note: the pattern '*txt' also matches e.g. 'atxt').
for filename in glob.glob('*txt'):
    print(filename)
This should list your file and you can read them one by one. All the lines of the files are stored in all_lines list. If you wish to store the content too, you can keep append it too
from pathlib import Path
from os import listdir
from os.path import isfile, join

path = "path_to_dir"
# Keep only the regular-file entries of the directory listing, skipping
# sub-directories.
only_files = [name for name in listdir(path) if isfile(join(path, name))]

all_lines = []
# Read each file, remember its lines, and echo its full contents.
for file_name in only_files:
    file_path = Path(path) / file_name
    with open(file_path, 'r') as f:
        file_content = f.read()
    all_lines.append(file_content.splitlines())
    print(file_content)
# use all_lines
Note: when using with you do not need to call close() explicitly
Reference: How do I list all files of a directory?
Basically, if you want to read all the files, you need to save them somehow. In your example, you are overriding thelist with content.splitlines() which deletes everything already in it.
Instead you should define thelist outside of the loop and use thelist.append(content.splitlines) each time, which adds the content to the list each iteration
Then you can iterate over thelist later and get the data out.
Folder A has more than 100 files, folder B is my destination folder. I want to copy 10 files in folder A to folder B. The 10 files names are in the text file C.
import os
import shutil
from glob import glob

namelist = open('/Users/C.txt').read().splitlines()
# NOTE: `input` shadows the built-in of the same name.
input = '/Users/A'
output = '/Users/B'
path = '/Users/A'
# BUG: the pattern has no wildcard, so glob() matches only the directory
# path itself — this never lists the files inside folder A.
files = glob(path)
for path in files:
    filedir, filename = os.path.split(path)
    for filename in namelist:
        # BUG (the error in this question): both arguments are the two
        # directory paths — the matched file is never passed to copy2().
        shutil.copy2(input,output)
It returns an Error. Please help me to do it in Python, thanks a lot!
There are a lot of things that you can do with your code:
import os
import shutil
from glob import glob

#namelist = open('/Users/C.txt').read().splitlines()
# context manager will take care of closing the file after open
with open('/Users/C.txt') as fp:
    # splitlines() strips the trailing newlines; readlines() keeps them,
    # so names like 'x.txt\n' would never match a real file name.
    namelist = fp.read().splitlines()

input = '/Users/A'
output = '/Users/B'
path = '/Users/A'
files = os.listdir(path)
# dont need glob import as you already imported os
#files = glob(path)

# loop only through files mentioned in the text file and see if they are
# available in folder A
for file_name in namelist:
    # os.listdir() returns bare names, so compare the bare name against
    # the listing — comparing the joined path would never match.
    if file_name in files:
        file_path = os.path.join(input, file_name)
        dest_path = os.path.join(output, file_name)
        shutil.copy(file_path, dest_path)

#for path in files:
#    filedir, filename = os.path.split(path)
#    for filename in namelist:
#        shutil.copy2(input,output)
I do not have sample data or error message to check. From what i can see in your code,
for path in files:
    filedir, filename = os.path.split(path)
    if filename in namelist:
        # Copy the matched file itself into the destination folder; the
        # original passed the two directory paths to copy2() instead.
        shutil.copy2(path, os.path.join(output, filename))
Your paths are from the root folder because of the starting forward slash. Try putting a dot in front of them if the folders and files are relative to the location of your .py file or no preceding slash:
./Users/A or Users/A
I'm looking for help with the code below, which should remove from the folder any files that are not listed in the given CSV file.
I read the input file in the pandas' data frame and convert it into the list then
Then I read the file names from the folder and compare each one with the names in the list: if it exists, continue; if not, remove it. But it is removing all of the files, even the ones that match.
I only want to remove the files which are not present in the file I'm reading using pandas data frame.
import os
import pandas as pd

path = "Adwords/"
flist = pd.read_csv('C:/mediaops/mapping/adword/file_name.csv')
file_name = flist['fileName'].tolist()
for filename in os.listdir(path):
    print(filename)
    # BUG (the subject of this question): `filename` is a single string
    # while `file_name` is a list, so `==` is never true and `!=` is
    # always true — every file is removed. Membership tests (`in` /
    # `not in`) are needed instead.
    if filename == file_name:
        continue
    elif filename != file_name:
        # Also: listdir() yields bare names; os.remove() needs the name
        # joined with `path` unless the script runs inside that folder.
        os.remove(filename)
# Delete every file in `path` whose name is not in the keep-list.
for filename in os.listdir(path):
    print(filename)
    if filename not in file_name:
        # listdir() yields bare names, so join with the directory before
        # removing; a bare name would be resolved against the CWD.
        os.remove(os.path.join(path, filename))
In your original solution, you are trying to do filename == file_name and filename != file_name, but you cannot do that.
See filename is a string and file_name is a list, and you cannot use == to compare them, you need to use membership operators like in and not in, like if filename not in file_name: which I did in my answer below
(Thanks to Tobias's Answer)
Now since that is out of the window, now you can iterate through all files using os.listdir, then use os.remove to remove the necessary files, in addition using os.path.join to get the full path of the file!
import os

# Scan the folder and delete anything whose name is missing from the
# keep-list that was read out of the CSV.
for entry in os.listdir(path):
    if entry in file_name:
        continue  # listed — keep it
    # listdir() gives bare names, so build the full path before removing.
    os.remove(os.path.join(path, entry))
The problem is that file_name is a list if string, whereas filename is a single string, so the check filename != file_name will always be true and the file thus always be removed. Instead, use in and not in to check whether the string is (not) in the list of strings. Also, using a set would be faster. Also, those variable names are really confusing.
# Build the set once so each membership probe is O(1) instead of a
# linear scan of the list.
set_of_files = set(file_name)
for filename in os.listdir(path):
    if filename in set_of_files:
        continue
    os.remove(filename)
Also, as noted in Devesh's answer, you may have to join the filename to the path in order to be able to actually remove the file.
When I implemented these answers, they deleted all files in the directory, not just the ones missing from my list. So I wrote one for any weary traveler that may need this script. The user needs to fill in the path to where their files are and make a CSV file with the basenames of the files they want to keep. You can also fill in the extension of the files to look at, if they all happen to share the same one.
The process is making the csv into a list based on each element in the first column and then checking to see if the files in the current dir are present in the list. If they are not then remove.
import os
import csv
import argparse
import sys
import pathlib

data_path = path = "/path/to/your/dir"
csv_guide = "filenamestokeep.csv"
csv_path = os.path.join(data_path, csv_guide)
ext = "input.your.extention.of.files.to.look.at.as.ext, like .txt"

# The first column of each non-empty CSV row is a file name to keep.
with open(csv_path, 'r') as csvfile:
    good_files = [row[0] for row in csv.reader(csvfile) if len(row) > 0]
    print(good_files)

all_files = os.listdir(data_path)
for filename in all_files:
    # Skip anything with the wrong extension or present in the keep-list.
    if not filename.endswith(ext) or filename in good_files:
        print(f"Ignored -- (unknown)")
        continue
    print(filename)
    full_file_path = os.path.join(data_path, filename)
    print("File to delete: {} ".format(filename))
    os.remove(full_file_path)
I am trying to count the number times "Tmp" occurs in a file and what file the count belongs to. I created a script that works but I have to setup the input file and output directory for each file. To improve it I would like the script to go through each file in a folder after setting it up once.
I have been experimenting with:
import tkFileDialog
import glob
import os

directory = tkFileDialog.askdirectory()
for infile in glob.glob(os.path.join(directory, "*.*")):
    # The file's contents are read here but never stored anywhere.
    open(infile, "r").read()
    # BUG (the subject of this question): this counts "Tmp" in the file
    # *name* string, not in the contents that were just read and
    # discarded; the result is also never printed or saved.
    infile.count("Tmp")
Currently I am counting the number of times "Tmp" occurs in the file name and not the actual file, when I type:
print infile
it outputs the contents of the text files but not the directory? I am just confused on where to go or what to do.
I would use os.walk rather than glob:
import tkFileDialog
import os
import os.path
import re

directory = tkFileDialog.askdirectory()
# os.walk visits the chosen directory and every sub-directory below it,
# unlike the single-level glob in the question.
for dirpath, dirnames, filenames in os.walk(directory):
    for filename in filenames:
        path = os.path.join(dirpath, filename)
        with open(path) as file:
            contents = file.read()
        # Python 2 print statement: truncated path, the number of times
        # 'Tmp' occurs in the file contents, and every 'Tmp' followed by
        # five digits found in the contents.
        print path[:30], contents.count('Tmp'), re.findall('Tmp\d{5}', contents)
That should be:
# Store the file's contents first, then count 'Tmp' in the data rather
# than in the file name (which is what the question's code did).
data = open(infile, 'r').read()
print data.count('Tmp')
import os
import glob
import tkFileDialog

directory = tkFileDialog.askdirectory()
# '*' matches every entry in the chosen directory; isfile() skips
# sub-directories so only real files are opened.
for infile in glob.glob(os.path.join(directory, '*')):
    if os.path.isfile(infile):
        f = open(infile)
        # Python 2 print: the bare file name next to the number of times
        # 'Tmp' occurs in the file's contents.
        print os.path.split(infile)[-1], f.read().count('Tmp')