Read multiple gzip files to 1 fileobject in python - python

I want to read multiple gzip files into one file object.
Currently I am doing:
import gzip
a = gzip.open(path2zipfile1)
# NOTE(review): two bugs here — the `for` is missing its trailing ':', and
# readline() returns a single line (a str), so this would iterate characters;
# to walk the lines, iterate the file object itself: `for line in a:`.
for line in a.readline()
#do some stuff
but i need to read from say 2 files
a = gzip.open(path2zipfile1) #read zip1
# NOTE(review): this simply rebinds `a`, discarding the first file object —
# open() never "appends" to an existing handle, and no mode string does that.
a = gzip.open(path2zipfile2, 'rU') #appending file object with contents of 2nd file
# NOTE(review): missing ':' after the for here as well.
for line in a.readlines()
#this should give me contents from zip1 then zip2
unable to find the right mode to do so

use itertools.chain:
import itertools, gzip
files = ['path2zipfile1', 'path2zipfile2']
# One lazily-created, text-mode gzip handle per path (generator: nothing is
# opened until chain asks for it).
handles = (gzip.open(name, 'rt') for name in files)
# chain.from_iterable yields every line of the first file, then the second.
for line in itertools.chain.from_iterable(handles):
    print(line)
another version without itertools:
def gen(files):
    """Yield the lines of each gzip file in *files*, in order.

    Lines from the first path come first, then the second, and so on —
    the no-itertools equivalent of chain.from_iterable.
    """
    for name in files:
        # `with` guarantees each handle is closed (the original leaked it),
        # and iterating the file object replaces the manual
        # while-True/readline loop with the idiomatic form.
        with gzip.open(name, 'rt') as handle:
            for line in handle:
                yield line
files = ['path2zipfile1', 'path2zipfile2']
# stream the concatenated lines of both gzip files through the generator
for line in gen(files):
    print(line)

Related

How to edit specific line for all text files in a folder by python?

Here below is my code about how to edit text file.
Since python can't just edit a line and save it at the same time,
I save the previous text file's content into a list first then write it out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os
#search all text files which are in the same folder with python script
path = os.path.dirname(os.path.abspath(__file__))
# os.path.join builds a portable pattern; the original `path + '\*.txt'`
# only worked on Windows (a backslash separator spelled via a non-escape).
txtlist = glob.glob(os.path.join(path, '*.txt'))
for fname in txtlist:
    strings = []  # buffered replacement content for this file
    # `with` closes the handle; plain 'r' suffices since we only read here
    # (the original opened 'r+' and 'w+' and never closed either handle).
    with open(fname, 'r') as fp:
        for line in fp:
            if 'apple' in line:
                strings.append('banana\n')  # replace the whole matching line
            else:
                strings.append(line)  # keep unchanged lines as-is
    # reopen in 'w' to truncate, then write the buffered lines back
    with open(fname, 'w') as fp:
        fp.writelines(strings)
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is: is there any method that can do the same thing?
For example, using other functions or another data type rather than a list.
Thank you everyone.
Simplify it to this:
# Read every line, then rebuild the list with apple-lines swapped for 'banana'.
with open(fname) as f:
    content = f.readlines()
# 'apple' in line is the idiomatic spelling of line.find('apple') != -1
content = ['banana' if 'apple' in line else line for line in content]
Instead of putting all the lines in a list and writing it, you can read the whole file into memory, replace the text, and write it back to the same file.
def replace_word(filename):
    """Replace every occurrence of 'word1' with 'word2' in *filename*,
    rewriting the file in place."""
    # slurp the whole file, transform in memory, then write it back
    with open(filename, 'r') as handle:
        text = handle.read()
    replaced = text.replace('word1', 'word2')
    with open(filename, 'w') as handle:
        handle.write(replaced)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob
# inplace=True redirects print() into the file currently being read, so each
# matched *.txt in the working directory is rewritten in place.
# NOTE(review): if the glob matches no files, fileinput falls back to reading
# stdin — confirm that is acceptable for this use case.
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
    for line in files:
        if 'apple' in line:
            print('banana')  # print() adds the newline the replacement needs
        else:
            print(line, end='')  # line already ends with '\n'
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path, replace_table: dict) -> None:
    """Rewrite *file_path*, replacing any line that contains a key of
    *replace_table* with the corresponding value (plus a newline).

    Only the first matching key per line applies; the file is rewritten
    only when at least one line actually changed.
    """
    new_lines = []
    need_rewrite = False
    with open(file_path, 'r') as f:
        for line in f:
            for key, new_val in replace_table.items():
                if key in line:
                    new_lines.append(new_val + '\n')
                    need_rewrite = True
                    break  # only replace on the first key found in the line
            else:
                # no key matched: keep the line unchanged
                # (for/else replaces the original flag_rewrite flag)
                new_lines.append(line)
    if not need_rewrite:
        return  # nothing changed: leave the file untouched
    with open(file_path, 'w') as f:
        # writelines instead of the original side-effect list comprehension
        f.writelines(new_lines)
if __name__ == '__main__':
    # apply the replacement table to every .txt file next to this script
    work_dir = os.path.dirname(os.path.abspath(__file__))
    txt_list = glob.glob(work_dir + '/*.txt')
    replace_dict = dict(apple='banana', orange='grape')
    for txt_path in txt_list:
        replace_line(txt_path, replace_dict)

Python; trying to add these files line by line with a tab inbetween

I'm reading in text files from the command line and I'm trying to produce output as follows...
Desired output given these command line arguments
Essentially, I want to read in files from the command line; take the first line from each file & print them on one line separated by a tab. Take the second line from each file & print them on the next line separated by a tab & so on.
This is the best code I've come up with (I'm a beginner and I've tried looking at other responses for far too long; glob & os hasn't been helping me understand how to do this; I'd just like to use basic loops and opening of files to do this):
import sys
l = []
# NOTE(review): sys.argv[:1] is just [script_name] — the input files are in
# sys.argv[1:], so this loop never sees the real arguments.
list_files = sys.argv[:1]
for fname in list_files:
    # NOTE(review): the `with` keyword is missing — `open(fname) as infile:`
    # is a SyntaxError as written.
    open(fname) as infile:
        for line in infile:
            line = line.strip()
            if line == '':
                l.append("''")
            else:
                l.append(line)
print(l) # List of all appended animals. Not in the right order
#(takes all names from one file, then all the names from the
#next instead of taking one line from every file on each iteration)
This is a minimally changed version that should work.
import sys
from itertools import zip_longest
files = []  # one list of cleaned-up lines per input file
# sys.argv[1:] holds the filenames passed on the command line; the original
# sys.argv[:1] sliced out only the script name, so no input was ever read.
list_files = sys.argv[1:]
for fname in list_files:
    with open(fname) as infile:  # Don't forget the `with`!
        l = []
        for line in infile:
            line = line.strip()
            if line == '':
                l.append("''")  # keep blank lines visible as ''
            else:
                l.append(line)
        files.append(l)  # list of lists: one inner list per file
# zip_longest transposes the list of lists so the i-th lines of every file
# land on the same row; shorter files are padded with ''.
for lines in zip_longest(*files, fillvalue=''):  # transpose list of lists
    print(*lines, sep='\t')  # separate with tabs.
The best way to open files in python is with with. More information can be found at https://www.pythonforbeginners.com/files/with-statement-in-python. Anyways:
import sys
# Exactly two input files are required on the command line.
if len(sys.argv) != 3:
    sys.exit(1)
first, second = sys.argv[1], sys.argv[2]
with open(first, 'r') as left, open(second, 'r') as right:
    # zip pairs the files line by line and stops at the shorter one
    for line_a, line_b in zip(left, right):
        print(line_a.strip(), line_b.strip(), sep='\t')
This can be changed to allow for more than two files:
import sys
# Accept two or more filenames. The original kept `!= 3` from the two-file
# version, which contradicted its own claim of handling "more than two files";
# `< 3` still requires at least two but no longer caps the count.
if len(sys.argv) < 3:
    sys.exit(1)
filenames = sys.argv[1:]
all_lines = []
for filename in filenames:
    with open(filename, 'r') as file:
        # one list of stripped lines per input file
        all_lines.append([l.strip() for l in file.readlines()])
# zip stops at the shortest file; each row is the i-th line of every file
for line in zip(*all_lines):
    print(*line, sep='\t')

read from line to line yelp dataset by python

I want to change this code to specifically read from line 1400001 to 1450000. What is modification?
file is composed of a single object type, one JSON-object per-line.
I want also to save the output to .csv file. what should I do?
revu=[]
with open("review.json", 'r',encoding="utf8") as f:
    for line in f:
        # NOTE(review): unbalanced bracket (SyntaxError), and the slice is
        # applied to each line's *characters*, not to the file's line range.
        revu = json.loads(line[1400001:1450000)
If it is JSON per line:
revu = []
with open("review.json", 'r', encoding="utf8") as f:
    # expensive: readlines() pulls the entire file into memory before
    # slicing, which can exhaust RAM on a large dataset
    selected = f.readlines()[1400001:1450000]
revu = [json.loads(entry) for entry in selected]
if you do it on the /etc/passwd file it is easy to test (no json of course, so that is left out)
revu = []
with open("/etc/passwd", 'r') as f:
    # expensive statement: the whole file is read before slicing
    revu = f.readlines()[5:10]
print(revu)  # gives entry 5 to 10
Or you iterate over all lines, saving you from memory issues:
revu = []
with open("...", 'r') as f:  # placeholder path from the answer
    # iterating line by line avoids loading the whole file into memory
    for i, line in enumerate(f):
        if i > 1450000:
            break  # past the wanted window: stop instead of scanning the rest
        if i >= 1400001:
            revu.append(json.loads(line))
# process revu
To CSV ...
import pandas as pd
import json
def mylines(filename, _from, _to):
    """Yield the JSON object parsed from each line of *filename* whose
    0-based index i satisfies _from <= i <= _to (inclusive bounds)."""
    with open(filename, encoding="utf8") as f:
        for i, line in enumerate(f):
            if i > _to:
                break  # past the window: don't scan the rest of the file
            if i >= _from:
                yield json.loads(line)
# materialize the selected JSON objects into a DataFrame and dump to CSV
df = pd.DataFrame([r for r in mylines("review.json", 1400001, 1450000)])
df.to_csv("/tmp/whatever.csv")

Modify python script to run for multiple input files

I am very new to python, and I have a python script to run for a particular file (input1.txt) and generated a output (output1.fasta), but I would like to run this script for multiple files, for example: input2.txt, input3.txt...and generate the respective output: output2.fasta, output3.fasta
from Bio import SeqIO
# Filter sequences.txt down to the records whose IDs are listed in input1.txt.
fasta_file = "sequences.txt"   # fasta source with all records
wanted_file = "input1.txt"     # one wanted ID per line
result_file = "output1.fasta"  # filtered output destination
wanted = set()
with open(wanted_file) as f:
    for line in f:
        line = line.strip()
        if line != "":
            wanted.add(line)
# NOTE(review): the handle from open(fasta_file) is never closed
fasta_sequences = SeqIO.parse(open(fasta_file),'fasta')
with open(result_file, "w") as f:
    for seq in fasta_sequences:
        if seq.id in wanted:
            SeqIO.write([seq], f, "fasta")
I tried to add the glob function, but I do not know how to deal with the output file name.
from Bio import SeqIO
import glob
fasta_file = "sequences.txt"
# NOTE(review): '*.txt' also matches sequences.txt itself; 'input*.txt'
# would restrict the loop to the intended input files.
for filename in glob.glob('*.txt'):
    wanted = set()
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    fasta_sequences = SeqIO.parse(open(fasta_file),'fasta')
    # NOTE(review): result_file is never assigned in this version, which is
    # exactly the NameError the asker reports.
    with open(result_file, "w") as f:
        for seq in fasta_sequences:
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")
The error message is: NameError: name 'result_file' is not defined
Your glob is currently pulling your "sequences" file as well as the inputs because *.txt includes the sequences.txt file. If the "fasta" file is always the same and you only want to iterate the input files, then you need
for filename in glob.glob('input*.txt'):
Also, to iterate through your entire process, perhaps you want to put it within a method. And if the output filename is always created to correspond to the input, then you can create that dynamically.
from Bio import SeqIO
def create_fasta_outputs(fasta_file, wanted_file):
    """Write the records of *fasta_file* whose IDs appear in *wanted_file*
    to a matching output file (inputN.txt -> outputN.fasta)."""
    # derive the output name from the input name
    result_file = wanted_file.replace("input","output").replace(".txt",".fasta")
    wanted = set()
    with open(wanted_file) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    # open the fasta source inside `with` as well — the original passed a
    # bare open() to SeqIO.parse and the handle was never closed
    with open(fasta_file) as src, open(result_file, "w") as f:
        for seq in SeqIO.parse(src, 'fasta'):
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")
fasta_file = "sequences.txt"
# NOTE(review): glob is used here but never imported in this snippet —
# `import glob` is needed at the top of the script.
for wanted_file in glob.glob('input*.txt'):
    create_fasta_outputs(fasta_file, wanted_file)

Open Multiple text files and call function

Below code works perfectly, where it opens one text files and function parse_messages gets as parameter
def parse_messages(hl7):
    """Split a raw HL7 export into individual messages on the 'MSH|' header."""
    hl7_msgs = hl7.split("MSH|")
    # re-attach the 'MSH|' prefix that split() consumed; drop empty chunks
    hl7_msgs = ["{}{}".format("MSH|", x) for x in hl7_msgs if x]
    # NOTE(review): placeholder body — a for-loop whose body is only a
    # comment will not compile as written.
    for hl7_msg in hl7_msgs:
        #does something..
with open('sample.txt', 'r') as f:
    hl7 = f.read()
df = parse_messages(hl7)
But now I have multiple text files in directory. I want to do open each one then call from parse_messages function. Here is what I tried so far.
But this only read last text file, not all of them
import glob
data_directory = "C:/Users/.../"
hl7_file = glob.glob(data_directory + '*.txt')
for file in hl7_file:
    with open(file, 'r') as hl7:
        # NOTE(review): `f` is undefined — the open handle is named `hl7`;
        # this line also rebinds `hl7` each pass, so only the last file's
        # text survives the loop (the bug the answer describes).
        hl7 = f.read()
    df = parse_messages(hl7)
In your read loop `for file in hl7_file`, you overwrite `hl7` on every iteration, so only the last file's contents remain in `hl7`.
You probably want to concatenate the contents of all the files together:
# gather every file's text, then join once instead of repeated +=
chunks = []
for path in hl7_file:
    with open(path, 'r') as handle:
        chunks.append(handle.read())
hl7 = ''.join(chunks)
df = parse_messages(hl7) # process all concatenate contents together
Alternatively, you can call the parse_messages function inside the loop and store each result in a `df` list, as below:
df = []
# parse each file separately and collect one result per file
for path in hl7_file:
    with open(path, 'r') as handle:
        df.append(parse_messages(handle.read()))
# df[0] holds the result for 1st file read, df[1] for 2nd file and so on
This should work, if I understood what you want to do
import os
all = []  # NOTE(review): `all` shadows the builtin of the same name
files = [x for x in os.listdir() if x.endswith(".txt")]
for x in files:
    # keyword arguments must come after positional ones:
    # the original open(x, encoding='utf-8', 'r') is a SyntaxError
    with open(x, 'r', encoding='utf-8') as fileobj:
        content = fileobj.read()
        # presumably this should be parse_messages (plural) from the
        # question — confirm the intended function name
        all.append(parse_message(content))

Categories

Resources