How to read text from multiple files in Python

I have many text files (around 3000) in a folder, and in each file the 193rd line is the only line with important information. How can I read all of these files into one single text file using Python?

There is a function named listdir in the os module. This function returns a list of all the files in a given directory. Then you can access each file using a for loop.
This tutorial will be helpful for the listdir function: https://www.geeksforgeeks.org/python-os-listdir-method/
The example code is below.
import os

# your file path
path = ""

# to store the files, or you can pass the call directly to the for loop, like:
# for file in os.listdir(path)
dir_files = os.listdir(path)

# this list stores the important text line (the 193rd line) of each file
texts = []
for file in dir_files:
    with open(os.path.join(path, file)) as f:
        # t holds the 193rd line (index 192) on each pass
        t = f.read().split('\n')[192]
        # append each file's line to the texts list
        texts.append(t)

# print each important line
for text in texts:
    print(text)
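Since the question asks for one combined text file rather than printed output, a minimal sketch of that last step could look like this (it assumes the texts list built above; combined.txt is an illustrative name):

# write the collected 193rd lines into one output file
# ("combined.txt" is an assumed, illustrative name)
with open('combined.txt', 'w') as out:
    for text in texts:
        out.write(text + '\n')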

You will need to list the directory and read each file, like so:

import os

for filename in os.listdir("/path/to/folder"):
    with open(f"/path/to/folder/{filename}", "r") as f:
        desired_line = f.readlines()[192]  # the 193rd line is at index 192
        ...  # Do whatever you need with the line

You can loop through your file paths, read the text from every file, and then split it into lines.
file_paths = [f"/path/to/file{i}.ext" for i in range(3000)]
info = []
for p in file_paths:
    with open(p, "r") as file:
        text = file.read()             # read() takes no path argument
        line_list = text.splitlines()  # splits on any line ending
        info.append(line_list[192])    # the 193rd line
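If memory is a concern with 3000 files, a variant of the same loop can stop reading after the 193rd line instead of loading each whole file; this is a sketch using itertools.islice, assuming the same file_paths list as above:

from itertools import islice

info = []
for p in file_paths:
    with open(p, "r") as file:
        # islice(file, 192, 193) yields only the line at index 192 (the 193rd line)
        line_193 = next(islice(file, 192, 193), None)  # None if the file is shorter
        if line_193 is not None:
            info.append(line_193.rstrip('\n'))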

You could try this, if it's what you want:
import os

important_lines = []
for textfile in os.listdir('data'):
    with open(os.path.join('data', textfile)) as f:
        text_from_file = f.readlines()
    if len(text_from_file) >= 193:  # the file must have at least 193 lines
        important_line = text_from_file[192]
        important_lines.append(important_line)
# important_lines is now the list of 193rd lines from the files

Related

Processing each file in directory, output with different extension

I have written a Python program that reads a test input file and writes it to an output file:

inputFile = open('test.dat', 'r')
outputFile = open('test.log', 'w')
outputFile.write(inputFile.read())
inputFile.close()
outputFile.close()
I'd like to read all the .dat files in a folder and output corresponding .log files, keeping the file name prefix the same. Could somebody help me?
I've figured out that I could list all the .dat files with the following code, but I don't know what to do from there.

import os

for file in os.listdir("."):
    if file.endswith(".dat"):
        print(os.path.join("xxx", file))
Also: is there any way to know the count of .dat files in a directory? This way, while processing each file, I could display the progress status, like: "Processing File 1 of 999 data files", etc.
Thanks a bunch
Joanna
You can use glob to list only the files you want and shutil.copyfile to copy them to the new file names. Use the size of the list you get from glob to print progress along the way.

from glob import glob
import shutil

dat_files = list(glob("*.dat"))
dat_file_len = len(dat_files)
for i, dat_file in enumerate(dat_files, 1):
    print(f"copying {i} of {dat_file_len}")
    shutil.copyfile(dat_file, dat_file[:-4] + ".log")
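For comparison, the same idea expressed with pathlib, where with_suffix() handles the extension swap without slicing (a sketch assuming the same *.dat layout):

from pathlib import Path
import shutil

dat_files = sorted(Path('.').glob('*.dat'))
for i, dat_file in enumerate(dat_files, 1):
    print(f"copying {i} of {len(dat_files)}")
    shutil.copyfile(dat_file, dat_file.with_suffix('.log'))  # file.dat -> file.log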
First you would have to get a list of all the .dat files contained in the directory; the code would look like this:

import glob

dat_files = glob.glob('*.dat')
for i, dat_file in enumerate(dat_files):
    print("Writing file %d..." % (i + 1))
    inputFile = open(dat_file, 'r')
    outputFile = open(dat_file[:-3] + "log", 'w')  # remove "dat" from the end and insert the new extension "log"
    outputFile.write(inputFile.read())
    inputFile.close()
    outputFile.close()
You can use the program below:

import glob

files_list = glob.glob('*.dat')
count_dat_file = len(files_list)
print("Count of .dat files is : {}\n".format(count_dat_file))
count_var = 1
for item in files_list:
    print("Processing file {} of {}".format(count_var, count_dat_file))
    print("Old file name is : {}".format(item))
    file_name_list = item.split('.')
    file_name_list[-1] = 'log'  # swap the extension
    new_file_name = '.'.join(file_name_list)
    print("New file name is : {}".format(new_file_name))
    with open(item, 'r') as input_file:
        with open(new_file_name, 'w') as output_file:
            output_file.write(input_file.read())
    print("Data written to new file : {}".format(new_file_name))
    count_var += 1
    print("\n")

Python: Convert all files in directory into one .TXT?

I have been trying to convert a number of DOCX files into TXT.
It works for a single file using the code below:
import docx

def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

if __name__ == '__main__':
    filename = '/content/drive/My Drive/path/file.DOCX'  # file name
    fullText = getText(filename)
    print(fullText)
    file = open("copy.txt", "w")
    file.write(fullText)
    file.close()
I tried different options (e.g. glob) but did not manage to get it to perform the above operation on all files in a folder.
Ideally the output should be one large text file, not separate ones.
I will need to do some formatting and assigning of IDs in that file in a next step.
Thank you for your help!
corp-alt
With file = open("copy.txt", "w") you open the file and replace its content with write().
With file = open("copy.txt", "a") you append to the existing file with write(). Or maybe even better:
With file = open("copy.txt", "a+") you append to an existing file with write(), or create a new file if it doesn't exist yet.
To go through all files in a folder you can loop over them:
import os
import docx

def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

if __name__ == '__main__':
    foldername = '/content/drive/My Drive/path/'  # folder name
    all_files = os.listdir(foldername)  # get all filenames
    # keep only .docx filenames (lower() also matches the .DOCX spelling above)
    docx_files = [filename for filename in all_files if filename.lower().endswith('.docx')]
    file = open("copy.txt", "a+")
    for docx_file in docx_files:  # loop over the .docx files
        fullText = getText(os.path.join(foldername, docx_file))
        file.write(fullText)
    file.close()

How to unzip all folders/files that end in .zip and extract the “file.txt” file from each zipped folder

My code currently unzips one zip archive and finds and extracts the file called file.txt. Now I need to unzip multiple archives that have the .zip extension. The problem is that I now have to find a file called file.txt in each of those .zip archives and extract only that file, and also store each file.txt in a separate folder with the same name as the archive it came from. Thank you in advance for your time.
import re
import os
from zipfile import ZipFile

def pain():
    print("\t\t\tinput_files.zip has been unzipped")
    with ZipFile('input_files.zip', 'r') as zipObj:
        zipObj.extractall()
        listOfFileNames = zipObj.namelist()
        for fileName in listOfFileNames:
            if fileName.endswith('.txt'):
                zipObj.extract(fileName, 'storage')

    outfile = "output2.txt"  # this is the filename the code will write to
    baconFile = open(outfile, "wt")
    file_name1 = "file.txt"
    header = 'Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n'
    print(header)            # print the master column header in the shell
    baconFile.write(header)  # and write it to the output file
    # for filename in os.listdir(os.getcwd() + "/input_files"):
    for filename in os.listdir(r'C:\Users\M29858\Desktop\TestPy\Version10\input_files'):  # raw string for the backslashes
        with open("input_files/" + filename, 'r') as f:
            if file_name1 in filename:
                output_contents(filename, f, baconFile)
    baconFile.close()  # close the file the loop was writing to

def output_contents(filename, f, baconFile):  # read the open file and report per-line stats
    index = 0
    for line in f:
        # create a list of all of the comma-separated values in the line
        content = line.split(',')  # used to count how many values the commas separate
        whitespace_found = False
        tab_found = False
        false_string = "False (end of file)"
        carriage_found = false_string
        sigfigs = ""
        index += 1  # line counter
        if " " in line:   # check for whitespace
            whitespace_found = True
        if "\t" in line:  # check for tabs
            tab_found = True
        if '\n' in line:  # check for a newline at the end of the line
            carriage_found = True
        sigfigs = (','.join(str(len(g)) for g in re.findall(r'\d+\.?(\d+)?', line)))  # count the digits after each decimal point
        row = filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}".format(
            index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found))
        print(row)  # the .format() call controls how the data is laid out
        baconFile.write('\n')
        baconFile.write(row)

if __name__ == '__main__':
    pain()
# THIS WORKS
import glob
import os
from zipfile import ZipFile

def main():
    for fname in glob.glob("*.zip"):  # get all the zip files
        with ZipFile(fname) as archive:
            # if there's no file.txt, ignore this archive and go on to the next zip file
            if 'file.txt' not in archive.namelist():
                continue
            # make a new directory named after the zip file
            dirname = fname.rsplit('.', 1)[0]
            os.mkdir(dirname)
            # extract file.txt into the directory you just created
            archive.extract('file.txt', path=dirname)
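One caveat with this sketch: os.mkdir() raises FileExistsError if a directory of that name already exists, e.g. when the script runs twice. If re-runs should overwrite rather than fail, os.makedirs() with exist_ok=True is a possible adjustment (it assumes the dirname variable from the loop above):

os.makedirs(dirname, exist_ok=True)  # no error if the directory already exists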

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python?
Input: Get all the files in a given directory of mine (wow.txt, testting.txt, etc.)
Process: I want to run all the files through a function
Output: I want the output to be all the file names, each with its content below it. For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import functions
import os
import sys

# Define the file path
path = "/home/my_files"
file_name = "wow.txt"

# Load-data function
def load_data(path, file_name):
    """
    Input  : path and file_name
    Purpose: loading a text file
    Output : list of paragraphs/documents, and
             titles (the first 100 characters are taken as the title of a document)
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "rt", encoding='latin-1') as fin:
        for line in fin.readlines():
            text = line.strip()
            documents_list.append(text)
    print("Total Number of Documents:", len(documents_list))
    titles.append(text[0:min(len(text), 100)])
    return documents_list, titles

# Output
load_data(path, file_name)
My problem is that my output only takes one file and shows its content. Obviously, I pointed the path and file name in my code at one file, but I am confused about how to write the path so that it loads all the files and outputs each of their contents separately. Any suggestions?
Using glob:

import glob

files = glob.glob("*.txt")  # get all the .txt files
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        ...  # rest of the code

Using os.listdir():

import os

arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files:  # iterate over the list of files
    with open(file, "r") as fin:  # open the file
        ...  # rest of the code
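To produce the exact output format asked for (each file's path followed by its content), the "rest of the code" part could look like this sketch, which assumes the .txt files live in the current directory:

import os

for file in [x for x in os.listdir() if x.endswith('.txt')]:
    with open(file, "r") as fin:
        print(os.path.abspath(file))  # e.g. /home/file/wow.txt
        print(fin.read())             # all of its content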
Try this:

import glob

for file in glob.glob("test/*.xyz"):
    print(file)

if my directory name was "test" and I had lots of .xyz files in it...
You can use glob and pandas:

import pandas as pd
import glob

path = r'some_directory'  # use your path
all_files = glob.glob(path + "/*.txt")

li = []
for filename in all_files:
    # read the file here
    # if you decide to use pandas you might need to use the 'sep' parameter as well
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
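Note that read_csv() parses each file as a table; if the goal is simply one large text output, the concatenated frame can then be written back out in one call (the output name is illustrative):

# write the combined frame to a single file
frame.to_csv("all_files.txt", index=False)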
I will take advantage of the function you have already written, so use the following:

import os

data = []
path = "/home/my_files"
dirs = os.listdir(path)
for file in dirs:
    data.append(load_data(path, file))

In this case you will have all the data in the list data.
Hi, you can use a for loop over os.listdir():

os.listdir(<path of your directory>)

This gives you the list of files in your directory, but it also includes the names of any folders in that directory.
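To keep only the files and drop the folders, you can filter with os.path.isfile, as in this sketch (the path is an example):

import os

path = "/home/my_files"  # example path
only_files = [f for f in os.listdir(path)
              if os.path.isfile(os.path.join(path, f))]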
Try generating a file list first, then passing that to a modified version of your function.

def dir_recursive(dirName):
    import os
    import re
    fileList = list()
    for (dir, _, files) in os.walk(dirName):
        for f in files:
            path = os.path.join(dir, f)
            if os.path.exists(path):
                fileList.append(path)
    fList = list()
    prog = re.compile(r'\.txt$')  # escape the dot so only real .txt suffixes match
    for k in range(len(fileList)):
        binMatch = prog.search(fileList[k])
        if binMatch:
            fList.append(binMatch.string)
    return fList

def load_data2(file_list):
    documents_list = []
    titles = []
    for file_path in file_list:
        with open(file_path, "rt", encoding='latin-1') as fin:
            for line in fin.readlines():
                text = line.strip()
                documents_list.append(text)
    print("Total Number of Documents:", len(documents_list))
    titles.append(text[0:min(len(text), 100)])
    return documents_list, titles

# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)
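For comparison, pathlib can express the recursive .txt search from dir_recursive() in a few lines; this sketch assumes the same dirName argument:

from pathlib import Path

def dir_recursive_pathlib(dirName):
    # rglob walks the directory tree recursively, matching only .txt files
    return [str(p) for p in Path(dirName).rglob('*.txt')]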

Concatenate multiple files' data into one file and also rename the file?

Using Python, how can I combine all the text files in a specified directory into one text file, and name the output text file after the input filenames?
For example: Filea.txt and Fileb_2.txt are in the root directory, and the generated output file is Filea_Fileb_2.txt
Filea.txt
123123
21321
Fileb_2.txt
2344
23432
Filea_Fileb_2.txt
123123
21321
2344
23432
my script:
import glob

PWD1 = '/home/jenkins/workspace'
files = glob.glob(PWD1 + '/' + '*.txt')
for f in files:
    with open(f, 'r') as file:
        for line in file:
            outputfile = open('outputfile.txt', 'a')
            outputfile.write(line)
            outputfile.close()
Here's another way to combine text files.

#! python3
from pathlib import Path
import glob

folder_File1 = r"C:\Users\Public\Documents\Python\CombineFIles"
txt_only = r"\*.txt"
files_File1 = glob.glob(f'{folder_File1}{txt_only}')
new_txt = f'{folder_File1}\\newtxt.txt'

newFile = []
for indx, file in enumerate(files_File1):
    if file == new_txt:  # skip the output file if it is already in the folder
        pass
    else:
        contents = Path(file).read_text()
        newFile.append(contents)

file = open(new_txt, 'w')
file.write("\n".join(newFile))
file.close()
This is a working solution which stores both the file names and the file contents in lists, joins the file names to create the "combined" output filename, and then writes the contents of all the files to it. Because lists append in the order the data is read, this is sufficient (my example filenames are filea.txt and fileb.txt, but it will work for the filenames you've used):

import os
import sys

path = sys.argv[1]
files = []
contents = []
for f in os.listdir(path):
    if f.endswith('.txt'):  # in case there are other file types in there
        files.append(str(f.replace('.txt', '')))  # chop off ".txt" so we can join the names later
        with open(os.path.join(path, f)) as cat:
            for line in cat:
                contents.append(line)  # put the file contents in the list

outfile_name = '_'.join(x for x in files) + '.txt'  # create your output filename
outfile = open(outfile_name, 'w')
for line in contents:
    outfile.write(line)
outfile.close()
to run this on a specific directory, just pass it on the command line:

$ python3.6 catter.py /path/to/my_text_files/
output filename:
filea_fileb.txt
contents:
123123
21321
2344
23432
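For comparison, a more compact sketch of the same idea using pathlib (assuming the same command-line usage and directory layout):

from pathlib import Path
import sys

path = Path(sys.argv[1])
txt_files = sorted(path.glob('*.txt'))  # sorted for a deterministic name and order
out_name = '_'.join(f.stem for f in txt_files) + '.txt'  # e.g. filea_fileb.txt
with open(out_name, 'w') as out:
    for f in txt_files:
        out.write(f.read_text())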
