read from line to line yelp dataset by python

I want to change this code to read specifically from line 1400001 to line 1450000. What modification is needed?
The file consists of a single object type, with one JSON object per line.
I also want to save the output to a .csv file. What should I do?
revu = []
with open("review.json", 'r', encoding="utf8") as f:
    for line in f:
        revu = json.loads(line[1400001:1450000])

If it is JSON per line:
revu = []
with open("review.json", 'r', encoding="utf8") as f:
    # expensive statement, depending on your filesize this might
    # let you run out of memory
    revu = [json.loads(s) for s in f.readlines()[1400001:1450000]]
If you try it on the /etc/passwd file it is easy to test (no JSON of course, so that part is left out):
revu = []
with open("/etc/passwd", 'r') as f:
    # expensive statement
    revu = [s for s in f.readlines()[5:10]]
print(revu)  # gives entries 5 through 9 (the slice end is exclusive)
Or you iterate over all lines, saving you from memory issues:
revu = []
with open("...", 'r') as f:
    for i, line in enumerate(f):
        if 1400001 <= i <= 1450000:
            revu.append(json.loads(line))
        elif i > 1450000:
            break  # past the wanted range, stop reading
# process revu
To CSV ...
import pandas as pd
import json

def mylines(filename, _from, _to):
    with open(filename, encoding="utf8") as f:
        for i, line in enumerate(f):
            if _from <= i <= _to:
                yield json.loads(line)

df = pd.DataFrame([r for r in mylines("review.json", 1400001, 1450000)])
df.to_csv("/tmp/whatever.csv")
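As a variation, itertools.islice can slice the file iterator directly without reading the whole file into memory, and the csv module can replace pandas if you prefer the standard library. A minimal sketch using the file name and line range from the question; it assumes every JSON object has the same keys, consistent with the question's "single object type":
import csv
import json
from itertools import islice

with open("review.json", encoding="utf8") as f:
    # islice skips lines lazily; the end index is exclusive,
    # so 1450001 includes line 1450000
    rows = [json.loads(line) for line in islice(f, 1400001, 1450001)]

with open("reviews_slice.csv", "w", newline="", encoding="utf8") as out:
    # assumes all objects share the same keys (one object type per line)
    writer = csv.DictWriter(out, fieldnames=rows[0].keys())
    writer.writeheader()
    writer.writerows(rows)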

Related

Store the contents of text file in return variable and manipulate wherever required in python?

I have the function below, and I am trying to get/store the contents of a text file in another temp file (removing unnecessary lines) while prepending a special character.
But I also want the same content that is in the temp text file with a different special character next time, and I am not able to do that. The function below creates a temp file; to get the desired output I have to run the same function again and create the file every time, which is not a good way. Is there anything we can do without creating a temp/extra file, so we can store the contents in a return variable and prepend whatever special character we want, multiple times?
import os
import re

def manifest():
    pathfile = "abc_12.txt"
    with open(pathfile, 'r') as firstfile, open('tmp.txt', 'w') as s:
        for line in firstfile:
            if line.strip().startswith("-") or line.startswith("<"):
                print("ignore")
            elif re.search('\\S', line):
                name = str(os.path.basename(line))
                s.write("*" + name)

def contents():
    temppath = "temp.txt"
    with open(temppath, 'r') as f:
        lines = f.readlines()
    lines += lines
    return lines

manifest()
value = contents()
File abc_12.txt:
---ABC-123
nice/abc.py
xml/abc.py
<<NOP-123
bac\nice.py
abc.py
---CDEF-345
jkl.oy
abc.py
I want to get the contents of the abc_12.txt file back as a return value, something like:
abc.py
abc.py
nice.py
abc.py
jkl.oy
abc.py
and manipulate them wherever I want, similar to the outputs below:
Output 1:
* abc.py
* abc.py
* nice.py
* abc.py
* jkl.oy
* abc.py
Output 2:
##abc.py
##abc.py
##nice.py
##abc.py
##jkl.oy
##abc.py
Maybe first you should read the file, search for the names, and keep them on a list:
def read_data():
    results = []
    with open("abc_12.txt") as infile:
        for line in infile:
            if line.strip().startswith(("-", "<")):  # `startswith`/`endswith` can use a tuple
                print("ignore:", line)
            elif re.search('\\S', line):
                name = os.path.basename(line)
                results.append(name)
    return results
And later you can use this list to create a temp file or any other file:
data = read_data()

with open('temp.txt', 'w') as outfile:
    for line in data:
        outfile.write(f'* {line}')
        #print(f'* {line}', end='')

with open('other.txt', 'w') as outfile:
    for line in data:
        outfile.write(f'##{line}')
        #print(f'##{line}', end='')
EDIT:
Minimal working code.
I used io.StringIO only to simulate a file in memory, so everyone can simply copy and test it.
import os
import re
import io

text = r'''---ABC-123
nice/abc.py
xml/abc.py
<<NOP-123
bac\nice.py
abc.py
---CDEF-345
jkl.oy
abc.py
'''

def read_data():
    results = []
    with io.StringIO(text) as infile:
    #with open("abc_12.txt") as infile:
        for line in infile:
            line = line.strip()
            if line:
                if line.startswith(("-", "<")):  # `startswith`/`endswith` can use a tuple
                    print("ignore:", line)
                else:
                    name = os.path.basename(line)
                    results.append(name)
    return results

data = read_data()

with open('temp.txt', 'w') as outfile:
    for line in data:
        outfile.write(f'* {line}\n')
        print(f'* {line}')

with open('other.txt', 'w') as outfile:
    for line in data:
        outfile.write(f'##{line}\n')
        print(f'##{line}')
EDIT:
If you don't want to save to a file then you still need a for-loop to create the string
data = read_data()

string_1 = ''
for line in data:
    string_1 += f'* {line}\n'

string_2 = ''
for line in data:
    string_2 += f'##{line}\n'
or to create a new list (and eventually a string)
data = read_data()

list_1 = []
for line in data:
    list_1.append(f'* {line}')

list_2 = []
for line in data:
    list_2.append(f'##{line}')

string_1 = "\n".join(list_1)
string_2 = "\n".join(list_2)
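Since the point of the question is reusing the same content with different markers, the two loops above can be folded into one small helper. A sketch; the decorate name is invented for illustration:
def decorate(names, prefix):
    # prepend the given marker to every name; returns a new list
    return [f'{prefix}{name}' for name in names]

data = read_data()
output_1 = decorate(data, '* ')   # ['* abc.py', '* abc.py', ...]
output_2 = decorate(data, '##')   # ['##abc.py', '##abc.py', ...]
print("\n".join(output_1))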

Modify python script to run for multiple input files

I am very new to Python. I have a script that runs on a particular file (input1.txt) and generates an output (output1.fasta), but I would like to run this script on multiple files, for example input2.txt, input3.txt, ... and generate the respective outputs output2.fasta, output3.fasta, ...
from Bio import SeqIO

fasta_file = "sequences.txt"
wanted_file = "input1.txt"
result_file = "output1.fasta"

wanted = set()
with open(wanted_file) as f:
    for line in f:
        line = line.strip()
        if line != "":
            wanted.add(line)

fasta_sequences = SeqIO.parse(open(fasta_file), 'fasta')
with open(result_file, "w") as f:
    for seq in fasta_sequences:
        if seq.id in wanted:
            SeqIO.write([seq], f, "fasta")
I tried to add the glob function, but I do not know how to deal with the output file name.
from Bio import SeqIO
import glob

fasta_file = "sequences.txt"
for filename in glob.glob('*.txt'):
    wanted = set()
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    fasta_sequences = SeqIO.parse(open(fasta_file), 'fasta')
    with open(result_file, "w") as f:
        for seq in fasta_sequences:
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")
The error message is: NameError: name 'result_file' is not defined
Your glob is currently pulling in your "sequences" file as well as the inputs, because *.txt also matches the sequences.txt file. If the "fasta" file is always the same and you only want to iterate over the input files, then you need:
for filename in glob.glob('input*.txt'):
Also, to run your entire process for each input, perhaps you want to put it within a function. And if the output filename always corresponds to the input, then you can create it dynamically:
from Bio import SeqIO
import glob

def create_fasta_outputs(fasta_file, wanted_file):
    result_file = wanted_file.replace("input", "output").replace(".txt", ".fasta")
    wanted = set()
    with open(wanted_file) as f:
        for line in f:
            line = line.strip()
            if line != "":
                wanted.add(line)
    fasta_sequences = SeqIO.parse(open(fasta_file), 'fasta')
    with open(result_file, "w") as f:
        for seq in fasta_sequences:
            if seq.id in wanted:
                SeqIO.write([seq], f, "fasta")

fasta_file = "sequences.txt"
for wanted_file in glob.glob('input*.txt'):
    create_fasta_outputs(fasta_file, wanted_file)
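A pathlib variation on the same renaming idea, in case you prefer path objects; a sketch under the same assumptions (inputN.txt naming, outputs next to the inputs):
from pathlib import Path

fasta_file = "sequences.txt"
for wanted_path in Path('.').glob('input*.txt'):
    # e.g. input2.txt -> output2.fasta
    result_name = wanted_path.name.replace('input', 'output').replace('.txt', '.fasta')
    print(wanted_path.name, '->', result_name)  # feed these into create_fasta_outputs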

Open Multiple text files and call function

The code below works perfectly: it opens one text file and its contents are passed to the parse_messages function as a parameter.
def parse_messages(hl7):
    hl7_msgs = hl7.split("MSH|")
    hl7_msgs = ["{}{}".format("MSH|", x) for x in hl7_msgs if x]
    for hl7_msg in hl7_msgs:
        #does something..

with open('sample.txt', 'r') as f:
    hl7 = f.read()

df = parse_messages(hl7)
But now I have multiple text files in a directory. I want to open each one and then call the parse_messages function on it. Here is what I have tried so far,
but this only reads the last text file, not all of them:
import glob

data_directory = "C:/Users/.../"
hl7_file = glob.glob(data_directory + '*.txt')

for file in hl7_file:
    with open(file, 'r') as hl7:
        hl7 = f.read()

df = parse_messages(hl7)
In your read-file loop for file in hl7_file, you overwrite hl7 at every iteration, leaving only the last file's contents stored in hl7.
You probably want to concatenate the contents of all the files together:
hl7 = ''
for file in hl7_file:
    with open(file, 'r') as f:
        hl7 += f.read()

df = parse_messages(hl7)  # process all concatenated contents together
Or you can call the parse_messages function inside the loop, with a df list storing the results, as below:
df = []
for file in hl7_file:
    with open(file, 'r') as f:
        hl7 = f.read()
    df.append(parse_messages(hl7))
# df[0] holds the result for the 1st file read, df[1] for the 2nd file, and so on
This should work, if I understood what you want to do
import os

all = []
files = [x for x in os.listdir() if x.endswith(".txt")]
for x in files:
    with open(x, 'r', encoding='utf-8') as fileobj:  # the mode must come before keyword arguments
        content = fileobj.read()
    all.append(parse_messages(content))
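If you also need to know which parsed result came from which file, a dict keeps the mapping. A minimal sketch under the same assumptions (a parse_messages function and .txt files in the current directory):
import glob

results = {}
for path in glob.glob('*.txt'):
    with open(path, 'r', encoding='utf-8') as f:
        results[path] = parse_messages(f.read())

# results['sample.txt'] would be the parsed output for that specific file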

subset the data and count the lines in each file

I am trying to subset my data from a single file to two separate files and count the lines in each file separately.
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
IND4,BB,AB
IND5,BB,AA
One file would be:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
The other would be:
ID,MARK1,MARK2
IND4,BB,AB
IND5,BB,AA
Here is my code:
import re

def file_len(filename):
    with open(filename, mode = 'r', buffering = 1) as f:
        for i, line in enumerate(f):
            pass
    return i

inputfile = open("test.txt", 'r')
outputfile_f1 = open("f1.txt", 'w')
outputfile_f2 = open("f2.txt", 'w')

matchlines = inputfile.readlines()
outputfile_f1.write(matchlines[0])  # add the header to "f1.txt"
for line in matchlines:
    if re.match("sire*", line):
        outputfile_f1.write(line)
    elif re.match("dam*", line):
        outputfile_f1.write(line)
    else:
        outputfile_f2.write(line)

print 'the number of individuals in f1 is:', file_len(outputfile_f1)
print 'the number of individuals in f2 is:', file_len(outputfile_f2)

inputfile.close()
outputfile_f1.close()
outputfile_f2.close()
The code can subset the file just fine, but I particularly do not like the way I add the header to the new file; I am wondering if there is a better way to do it. Also, the line-counting function looks fine, but when I ran it, it gave me an error:
"Traceback (most recent call last):
File "./subset_individuals_based_on_ID.py", line 28, in <module>
print 'the number of individuals in f1 is:', file_len(outputfile_f1)
File "./subset_individuals_based_on_ID.py", line 7, in file_len
with open(filename, mode = 'r', buffering = 1) as f:
TypeError: coercing to Unicode: need string or buffer, file found
"
So I googled this site and added buffering = 1 (it was not originally in the code), but it still does not solve the problem.
Thank you very much for helping improve the code and clean up the error.
You can also use itertools.tee to split the input into multiple streams and process them individually.
import itertools

def write_file(match, source, out_file):
    count = -1
    with open(out_file, 'w') as output:
        for line in source:
            if count < 0 or match(line):
                output.write(line)
                count += 1
    print('Wrote {0} lines to {1}'.format(count, out_file))

with open('test.txt', 'r') as f:
    first, second = itertools.tee(f.readlines())
    write_file(lambda x: not x.startswith('IND'), first, 'f1.txt')
    write_file(lambda x: x.startswith('IND'), second, 'f2.txt')
EDIT - removed redundant elif
I might be misreading you, but I believe you are just trying to do this:
>>> with open('test', 'r') as infile:
...     with open('test_out1', 'w') as out1, open('test_out2', 'w') as out2:
...         header, *lines = infile.readlines()
...         out1.write(header)
...         out2.write(header)
...         for line in lines:
...             if line.startswith('sir') or line.startswith('dam'):
...                 out1.write(line)
...             else:
...                 out2.write(line)
Contents of test before:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
IND4,BB,AB
IND5,BB,AA
Contents of test_out1 after:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
Contents of test_out2 after:
ID,MARK1,MARK2
IND4,BB,AB
IND5,BB,AA
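Neither answer above addresses the original TypeError: file_len was given an open file object, while open() needs a filename string. Counting while writing sidesteps re-opening the files entirely; a minimal sketch of that idea, with the file names from the question:
counts = {'f1.txt': 0, 'f2.txt': 0}

with open('test.txt') as infile, \
     open('f1.txt', 'w') as out1, open('f2.txt', 'w') as out2:
    header = next(infile)
    out1.write(header)
    out2.write(header)
    for line in infile:
        # route sire/dam rows to f1, everything else to f2
        target = out1 if line.startswith(('sire', 'dam')) else out2
        target.write(line)
        counts[target.name] += 1  # a file object remembers its own name

print('the number of individuals in f1 is:', counts['f1.txt'])
print('the number of individuals in f2 is:', counts['f2.txt'])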

Want to read multiple csv file one by one and filepaths are stored in a text file using python

Here is my code for reading individual cells of one CSV file, but I want to read multiple CSV files one by one, from a .txt file where the CSV file paths are located.
import csv

ifile = open("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
data = list(csv.reader(ifile, delimiter = ';'))
REQ = []
RES = []
n = len(data)
for i in range(n):
    x = data[i][1]
    y = data[i][2]
    REQ.append(x)
    RES.append(y)
    i += 1
for j in range(2, n):
    try:
        if REQ[j] != '' and RES[j] != '':  # ignore blank cell
            print REQ[j], ' ', RES[j]
    except:
        pass
    j += 1
And csv file paths are stored in a .txt file like
C:\Desktop\Test_Specification\RDBI.csv
C:\Desktop\Test_Specification\ECUreset.csv
C:\Desktop\Test_Specification\RDTC.csv
and so on..
You can read stuff stored in files into variables. And you can use variables with strings in them anywhere you can use a literal string. So...
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()  # or was it trim()? I keep mixing them up
        ifile = open(file_name, 'rb')
        # ... the rest of your code goes here
Maybe we can fix this up a little...
import csv

with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()
        # the delimiter belongs to csv.reader, not open()
        csv_file = csv.reader(open(file_name, 'r'), delimiter=';')
        next(csv_file)  # skip the header row; a reader is not subscriptable
        for record in csv_file:
            req = record[1]
            res = record[2]
            if len(req + res):
                print(req, ' ', res)
You just need to add a loop which reads the file containing your list of file paths before your first open statement, for example:
from __future__ import with_statement

with open("myfile_which_contains_file_path.txt") as f:
    for line in f:
        ifile = open(line.strip(), 'rb')  # strip the trailing newline from the path
        # here the rest of your code
You need to use a raw string, since your path contains backslashes:
import csv

file_list = r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
with open(file_list) as f:
    for line in f:
        with open(line.strip(), 'rb') as the_file:
            reader = csv.reader(the_file, delimiter=';')
            for row in reader:
                req, res = row[1:3]
                if req and res:
                    print('{0} {1}'.format(req, res))
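For reference, a Python 3 equivalent of the same flow using pathlib, which keeps the raw-string fix and adds a guard for blank lines in the list; a sketch, with csv_paths.txt as an assumed name for the file that lists the CSV paths:
import csv
from pathlib import Path

# assumed name for the text file that lists the CSV paths, one per line
list_file = Path(r"C:\Users\BKA4ABT\Desktop\Test_Specification\csv_paths.txt")

for csv_path in list_file.read_text().splitlines():
    csv_path = csv_path.strip()
    if not csv_path:
        continue  # skip blank lines in the list
    with open(csv_path, newline='') as the_file:
        reader = csv.reader(the_file, delimiter=';')
        for row in reader:
            req, res = row[1:3]
            if req and res:
                print(req, res)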
