I need to process two types of files in a directory - .txt and .gz.
There are two types of open statements for this purpose:
.gz files:
with gzip.open(file_name, 'rt', encoding='utf-8') as f:
line = next(f)
while line:
some code
.txt files:
with open(file_name, 'r', encoding='utf-8') as f:
line = next(f)
while line:
some code
Any further processing commands are absolutely identical. Now I see two options to process these two file types:
Option 1 - Use two identical functions that differ only by open statement. Sounds ugly...
Option 2 - Use if construction as following:
if ext == '.gz':
f = gzip.open(file_name, 'rt', encoding='utf-8')
elif ext == '.txt':
f = open(file_name, 'r', encoding='utf-8')
line = next(f)
while line:
some code
But it still looks awkward to me :/
Question: what's pythonic way in Python 3.x to use open statement according to a file extension?
why not:
with (gzip.open if ext==".gz" else open)(file_name, 'rt', encoding='utf-8') as f:
The first argument of the with statement is a conditional expression that selects which open function to use, depending on the extension. I used 'rt' in both cases; it is the default mode for the standard open. This approach has the advantage of avoiding copy/paste while still allowing a context manager.
Maybe a generic open could be provided by a helper function:
def myopen(file_name):
    """Open *file_name* for text reading, transparently handling .gz files.

    Uses gzip.open when the extension is '.gz' and the built-in open
    otherwise; both return a text-mode handle decoded as UTF-8.
    """
    # NOTE: the original was missing the colon after the def header.
    opener = gzip.open if os.path.splitext(file_name)[1] == ".gz" else open
    return opener(file_name, 'rt', encoding='utf-8')
use like:
with myopen(file_name):
an alternative is to use a defaultdict with the extension as key
import gzip
from collections import defaultdict
from pathlib import Path

# Map Path.suffix (which INCLUDES the leading dot, e.g. ".gz") to a tuple of
# (opener, positional args, keyword args).  The defaultdict's factory handles
# every other extension as plain UTF-8 text.
# Fixes vs. the original: the kwargs dicts used the bare name `encoding`
# (a NameError) instead of the string key "encoding", and the key "gz"
# could never match a Path.suffix value.
open_functions = defaultdict(lambda: (open, ("r",), {"encoding": "utf-8"}))
open_functions[".gz"] = (gzip.open, ("rt",), {"encoding": "utf-8"})

filename = Path(filename)
open_function, args, kwargs = open_functions[filename.suffix]
with open_function(filename, *args, **kwargs) as f:
    ...
I would like to suggest the following way:
#------------------------------------
import zipfile
#-----------------------Common Code-------------------------
def disp_line(filee):
    """Print every line of the already-open file object *filee*."""
    # Iterate the handle directly instead of readlines(): same output,
    # but no need to materialize the whole file in memory first.
    for line in filee:
        print(line)
#-----------------------First File-----------------------
# Use a context manager so the archive handle is closed even on error
# (the original ZipFile object was never closed).
with zipfile.ZipFile('D:\\DC-Data\\baby_names.zip', "r") as z:
    for name in z.namelist():
        with z.open(name) as filee:
            disp_line(filee)
#-----------------------2nd File-------------------------
with open('D:\\DC-Data\\iris.txt', 'r') as filee:
    disp_line(filee)
#------------------------End ----------------------
Related
Here below is my code about how to edit text file.
Since Python can't edit a line in place and save the file at the same time,
I first save the file's existing content into a list and then write it back out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os

# Collect every text file that sits in the same folder as this script.
path = os.path.dirname(os.path.abspath(__file__))
txtlist = glob.glob(path + '\\*.txt')

for file in txtlist:
    # Read all lines, replacing any line that mentions 'apple'.
    with open(file, 'r') as fp1:
        strings = ['banana\n' if 'apple' in line else line for line in fp1]
    # Rewrite the same file with the updated content.  The context managers
    # guarantee both handles are closed; the original leaked fp1/fp2 until
    # the end of the loop body and opened the second handle as 'w+' while
    # the first was still open.
    with open(file, 'w') as fp2:
        fp2.writelines(strings)
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is : Is there any method can do the same thing?
Like using the other functions or using the other data type rather than list.
Thank you everyone.
Simplify it to this:
with open(fname) as f:
    content = f.readlines()
# 'apple' in line reads better than line.find('apple') != -1.  Keep the
# trailing '\n' on the replacement (as the question's own code did) so the
# rewritten lines don't run together when written back out.
content = ['banana\n' if 'apple' in line else line for line in content]
and then write value of content to file back.
Instead of putting all the lines in a list and writing them, you can read the whole file into memory, replace the text, and write it back to the same file.
def replace_word(filename):
    """Replace every occurrence of 'word1' with 'word2' in *filename*, in place."""
    # Read the whole file, substitute, then rewrite it from scratch.
    with open(filename, 'r') as source:
        contents = source.read()
    updated = contents.replace('word1', 'word2')
    with open(filename, 'w') as destination:
        destination.write(updated)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob

# inplace=True swaps stdout so that print() writes into the file currently
# being processed, replacing its previous contents.
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
    for current in files:
        # Emit either the replacement line or the original, unchanged.
        replacement = 'banana\n' if 'apple' in current else current
        print(replacement, end='')
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path, replace_table: dict) -> None:
    """Rewrite *file_path*, replacing each line that contains a key of
    *replace_table* with the corresponding value (first matching key wins).

    The file is only rewritten when at least one line actually changed,
    so untouched files keep their modification time.
    """
    list_lines = []
    need_rewrite = False
    with open(file_path, 'r') as f:
        for line in f:
            for key, new_val in replace_table.items():
                if key in line:
                    list_lines.append(new_val + '\n')
                    need_rewrite = True
                    break  # only replace on the first key found in the line
            else:
                # No key matched: keep the original line verbatim.
                list_lines.append(line)
    if not need_rewrite:
        return
    with open(file_path, 'w') as f:
        # writelines instead of the original side-effect list comprehension.
        f.writelines(list_lines)
if __name__ == '__main__':
    # Apply the replacement table to every .txt file beside this script.
    work_dir = os.path.dirname(os.path.abspath(__file__))
    replace_dict = dict(apple='banana', orange='grape')
    for txt_path in glob.glob(work_dir + '/*.txt'):
        replace_line(txt_path, replace_dict)
I am very new to python, and I have a python script to run for a particular file (input1.txt) and generated a output (output1.fasta), but I would like to run this script for multiple files, for example: input2.txt, input3.txt...and generate the respective output: output2.fasta, output3.fasta
from Bio import SeqIO

fasta_file = "sequences.txt"   # master FASTA-formatted sequence file
wanted_file = "input1.txt"     # one wanted sequence id per line
result_file = "output1.fasta"  # filtered output

# Collect the non-empty, stripped ids we want to keep.
wanted = set()
with open(wanted_file) as f:
    for line in f:
        line = line.strip()
        if line != "":
            wanted.add(line)

# Open the FASTA source with a context manager so the handle is closed when
# done (the original open(fasta_file) was never closed).
with open(fasta_file) as fasta_handle, open(result_file, "w") as f:
    for seq in SeqIO.parse(fasta_handle, 'fasta'):
        if seq.id in wanted:
            SeqIO.write([seq], f, "fasta")
I tried to add the glob function, but I do not know how to deal with the output file name.
from Bio import SeqIO
import glob

# Master FASTA file; NOTE(review): the '*.txt' glob below also matches this
# very file, so it gets processed as if it were an input list.
fasta_file = "sequences.txt"
for filename in glob.glob('*.txt'):
    # Ids to keep, read from the current input file.
    wanted = set()
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    fasta_sequences = SeqIO.parse(open(fasta_file),'fasta')
    # BUG (the reported NameError): result_file is never assigned anywhere
    # in this version of the script.
    with open(result_file, "w") as f:
        for seq in fasta_sequences:
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")
The error message is: NameError: name 'result_file' is not defined
Your glob is currently pulling your "sequences" file as well as the inputs because *.txt includes the sequences.txt file. If the "fasta" file is always the same and you only want to iterate the input files, then you need
for filename in glob.glob('input*.txt'):
Also, to iterate through your entire process, perhaps you want to put it within a method. And if the output filename is always created to correspond to the input, then you can create that dynamically.
from Bio import SeqIO

def create_fasta_outputs(fasta_file, wanted_file):
    """Write the sequences from *fasta_file* whose ids are listed in
    *wanted_file* to a correspondingly-named output file.

    The output name is derived from the input name, e.g. input2.txt ->
    output2.fasta.
    """
    result_file = wanted_file.replace("input","output").replace(".txt",".fasta")
    # Ids to keep: one per line, blank lines ignored.
    wanted = set()
    with open(wanted_file) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    # Close the FASTA handle when finished instead of leaking it
    # (the original passed a bare open(fasta_file) that was never closed).
    with open(fasta_file) as fasta_handle, open(result_file, "w") as f:
        for seq in SeqIO.parse(fasta_handle, 'fasta'):
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")
import glob  # the snippet used glob.glob without ever importing glob

fasta_file = "sequences.txt"
for wanted_file in glob.glob('input*.txt'):
    create_fasta_outputs(fasta_file, wanted_file)
I would like to loop through files into a directory, make something on these files and then for each file write out the result.
But my files can't be read, because Python treats the file names as plain strings rather than readable file objects.
Is there a way to avoid this?
import re
import os
def create_filename_for_fileout(f1):
    """Map an input file name to its output path: 'TT' becomes 'out' and the
    result is placed under the fixed output directory."""
    out_name = f1.replace("TT", "out")
    return "C:\\Users\\KP\\Desktop\\FSC_Treetag\\out\\" + str(out_name)
for file_in in os.listdir('C:\\Users\\KP\\Desktop\\FSC_Treetag'):
    filename = str(file_in)
    file_out = create_filename_for_fileout(filename)
    # BUG: the file objects returned by open() are discarded; file_in and
    # file_out remain plain strings, which triggers the AttributeError below.
    open(file_in, 'r')
    open(file_out, 'w')
    # Fails here: str has no readlines (see the traceback that follows).
    content_file = file_in.readlines()
    for ln in content_file:
        # Match lines whose middle tab-separated field is one of these tags.
        regex = re.compile('(.*\t(ADJ|ADV|NOM|VER:cond|VER:futu|VER:impe|VER:impf|VER:infi|VER:pper|VER:pres|VER:pres|VER:simp|VER:subi|VER:subp)\t(.*))')
        res = regex.search(ln)
        if res:
            # categ = res.group(2)
            lemme = res.group(3)
            # BUG: file_out is also still a string, so .write would raise too.
            file_out.write(str(lemme)+"\n")
    file_out.close()
    file_in.close()
Result:
content_file = file_in.readlines()
AttributeError: 'str' object has no attribute 'readlines'
>>>
You're not assigning your open to any variable to use.
# Change
open(file_in, 'r')
open(file_out, 'w')
# to
input_file = open(file_in, 'r')
output_file = open(file_out, 'w')

for ln in input_file:
    # do your processing
    if res:
        lemme = res.group(3)
        output_file.write(str(lemme) + "\n")
# NOTE(review): remember to close both handles afterwards (or use "with").
You are not assigning the open functions to the respective handlers (open is returning an object of the file type).
filename = str(file_in)
file_out = create_filename_for_fileout(filename)
open(file_in, 'r')
open(file_out, 'w')
Should be:
file_out = open(create_filename_for_fileout(file_in), 'w')
# Rebinds file_in from the name (a str) to the opened file object.
file_in = open(file_in, 'r')
NOTE: for clarity's sake, it's a good idea to use a different name for the input-file handle.
Check: https://docs.python.org/2/library/functions.html#open
open(name[, mode[, buffering]])
Open a file, returning an object of the file type described in section File Objects. If the file cannot be opened, IOError is raised.
Before I begin I would like to state that I am very much a beginner in Python. I am trying to create a function called fcopy() that will take two file names as arguments and copies the content of the first file into the second, when the second file does not exist at the time of the copy. Then get the function to close all files and open the second file for reading. I am using a for statement to read and print the lines in the second file. Here is what I have so far:
def fcopy(file1, file2):
    os.chdir('C:/Users/Noah/Documents/myPython')
    # NOTE(review): the two arguments are immediately overwritten by
    # prompting the user, defeating the point of passing them in.
    file1 = str(input('Enter file with text: '))
    file2 = str(input('Enter empty file: '))
    opened_file= open(file1, 'r')
    # BUG: this iterates the characters of the *name* string file1,
    # not the lines of opened_file.
    for lines in file1:
        # BUG: file2 is still a string (no open), so .write raises;
        # it would also write the literal text 'file1', not the contents.
        file2.write('file1')
        print(lines)
The filenames are passed in as arguments; you don't need to prompt the user to type in the names.
def fcopy(file1, file2):
    """Copy file1's contents into file2, then reopen file2 and print each line."""
    os.chdir('C:/Users/Noah/Documents/myPython')
    # Copy phase: both handles are closed when the with-block exits.
    with open(file1, 'r') as source, open(file2, 'w') as target:
        for row in source:
            target.write(row)
    # Read-back phase: reopen the copy and echo every line.
    with open(file2, 'r') as reread:
        for row in reread:
            print(row)
If you still want to test if the file2 is non existent at the time of the writing and abort otherwise, you can make it with os.path.isfile in John Gordon code.
def fcopy(file1, file2):
    """Copy file1 into file2 only when file2 does not already exist, then
    reopen file2 and print each of its lines.

    Does nothing when file2 already exists (the copy must not clobber it).
    """
    os.chdir('C:/Users/Noah/Documents/myPython')
    # BUG FIX: the original tested `if os.path.isfile(file2):`, i.e. it only
    # copied when file2 already EXISTED -- the opposite of the stated intent
    # ("copies ... when the second file does not exist at the time of the
    # copy"; "abort otherwise").
    if not os.path.isfile(file2):
        open_file_1 = open(file1, 'r')
        open_file_2 = open(file2, 'w')
        for line in open_file_1:
            open_file_2.write(line)
        open_file_1.close()
        open_file_2.close()
        open_file_2 = open(file2, 'r')
        for line in open_file_2:
            print(line)
        open_file_2.close()
Currently I'm using this:
f = open(filename, 'r+')
text = f.read()
text = re.sub('foobar', 'bar', text)
f.seek(0)
f.write(text)
# NOTE: when the substituted text is shorter than the original, the tail of
# the old contents remains after the write position because the file is
# never truncated -- this is the problem described below.
f.close()
But the problem is that the old file is larger than the new file. So I end up with a new file that has a part of the old file on the end of it.
If you don't want to close and reopen the file, to avoid race conditions, you could truncate it:
# Read-modify-write in place; truncate() discards any leftover tail when the
# new text is shorter than the old contents.
handle = open(filename, 'r+')
contents = handle.read()
handle.seek(0)
handle.write(re.sub('foobar', 'bar', contents))
handle.truncate()
handle.close()
The functionality will likely also be cleaner and safer using open as a context manager, which will close the file handler, even if an error occurs!
# Same in-place rewrite, but the context manager closes the handle even if
# an error occurs mid-rewrite.
with open(filename, 'r+') as handle:
    old_text = handle.read()
    new_text = re.sub('foobar', 'bar', old_text)
    handle.seek(0)
    handle.write(new_text)
    handle.truncate()
The fileinput module has an inplace mode for writing changes to the file you are processing without using temporary files etc. The module nicely encapsulates the common operation of looping over the lines in a list of files, via an object which transparently keeps track of the file name, line number etc if you should want to inspect them inside the loop.
from fileinput import FileInput

# With inplace=1, print() is redirected into the file being processed.
for line in FileInput("file", inplace=1):
    # end='' avoids doubling the newline: *line* already ends with '\n',
    # and a bare print(line) would add a second one, inserting blank lines
    # into the rewritten file.
    print(line.replace("foobar", "bar"), end='')
Probably it would be easier and neater to close the file after text = re.sub('foobar', 'bar', text), re-open it for writing (thus clearing old contents), and write your updated text to it.
I find it easier to remember to just read it and then write it.
For example:
with open('file') as f:
data = f.read()
with open('file', 'w') as f:
f.write('hello')
To anyone who wants to read and overwrite by line, refer to this answer.
https://stackoverflow.com/a/71285415/11442980
filename = input("Enter filename: ")
with open(filename, 'r+') as file:
    lines = file.readlines()
    file.seek(0)
    for line in lines:
        value = int(line)
        # Write a newline after each number; the original wrote the bare
        # digits, so every line of a multi-line file ran together.
        file.write(str(value + 1) + '\n')
    # Drop any leftover tail from the old, longer contents.
    file.truncate()
Honestly you can take a look at this class that I built which does basic file operations. The write method overwrites and append keeps old data.
class IO:
    """Minimal binary file helpers: read, overwrite, and append."""

    def read(self, filename):
        """Return the entire contents of *filename* as bytes."""
        # Context manager closes the handle even on error.
        with open(filename, "rb") as handle:
            return handle.read()

    def write(self, filename, data):
        """Overwrite *filename* with *data* (bytes)."""
        with open(filename, "wb") as handle:
            handle.write(data)

    def append(self, filename, data):
        """Append *data* to *filename* by rewriting it with old + new contents."""
        self.write(filename, self.read(filename) + data)
Try writing it in a new file..
f = open(filename, 'r+')
f2 = open(filename2, 'a+')
text = f.read()
text = re.sub('foobar', 'bar', text)
# No seek(0) needed: we never write back into the original file.
f.close()
f2.write(text)
# BUG FIX: the original called fw.close(), but 'fw' was never defined
# (NameError); the second handle is f2.
f2.close()