I am trying to subset my data from a single file into two separate files and count the lines in each file separately. Here is test.txt:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
IND4,BB,AB
IND5,BB,AA
One file would be:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
The other would be:
ID,MARK1,MARK2
IND4,BB,AB
IND5,BB,AA
Here is my code:
import re
def file_len(filename):
    with open(filename, mode = 'r', buffering = 1) as f:
        for i, line in enumerate(f):
            pass
    return i
inputfile = open("test.txt", 'r')
outputfile_f1 = open("f1.txt", 'w')
outputfile_f2 = open("f2.txt", 'w')
matchlines = inputfile.readlines()
outputfile_f1.write(matchlines[0]) #add the header to the "f1.txt"
for line in matchlines:
    if re.match("sire*", line):
        outputfile_f1.write(line)
    elif re.match("dam*", line):
        outputfile_f1.write(line)
    else:
        outputfile_f2.write(line)
print 'the number of individuals in f1 is:', file_len(outputfile_f1)
print 'the number of individuals in f2 is:', file_len(outputfile_f2)
inputfile.close()
outputfile_f1.close()
outputfile_f2.close()
The code subsets the file just fine, but I particularly don't like the way I add the header to the new file, and I am wondering if there is a better way to do it. Also, the line-counting function looks fine, but when I ran it, it gave me an error:
"Traceback (most recent call last):
File "./subset_individuals_based_on_ID.py", line 28, in <module>
print 'the number of individuals in f1 is:', file_len(outputfile_f1)
File "./subset_individuals_based_on_ID.py", line 7, in file_len
with open(filename, mode = 'r', buffering = 1) as f:
TypeError: coercing to Unicode: need string or buffer, file found
"
So I googled this site and added buffering = 1 (it was not originally in the code), but it still did not solve the problem.
Thank you very much for helping improve the code and clean up the error.
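On the traceback: file_len calls open() on its argument, so it needs a filename string, not an already-open file object (the buffering argument is unrelated). A minimal sketch of the corrected calls, counting after the writers are closed; note also that file_len as written returns the index of the last line, i.e. one less than the line count:

outputfile_f1.close()
outputfile_f2.close()

# pass filenames, not file objects, and count only after closing the writers
print 'the number of individuals in f1 is:', file_len('f1.txt')
print 'the number of individuals in f2 is:', file_len('f2.txt')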
You can also use itertools.tee to split the input into multiple streams and process them individually.
import itertools

def write_file(match, source, out_file):
    count = -1   # start at -1 so the header is written unconditionally but not counted
    with open(out_file, 'w') as output:
        for line in source:
            if count < 0 or match(line):
                output.write(line)
                count += 1
    print('Wrote {0} lines to {1}'.format(count, out_file))

with open('test.txt', 'r') as f:
    # tee duplicates the stream so each pass gets its own copy of the lines
    first, second = itertools.tee(f.readlines())
    write_file(lambda x: not x.startswith('IND'), first, 'f1.txt')
    write_file(lambda x: x.startswith('IND'), second, 'f2.txt')
EDIT - removed redundant elif
I might be misreading you, but I believe you are just trying to do this:
>>> with open('test', 'r') as infile:
...     with open('test_out1', 'w') as out1, open('test_out2', 'w') as out2:
...         header, *lines = infile.readlines()
...         out1.write(header)
...         out2.write(header)
...         for line in lines:
...             if line.startswith('sir') or line.startswith('dam'):
...                 out1.write(line)
...             else:
...                 out2.write(line)
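Note that header, *lines = infile.readlines() is Python 3 extended-unpacking syntax; under Python 2 (which the question's traceback suggests), an equivalent spelling would be:

lines = infile.readlines()
header, lines = lines[0], lines[1:]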
Contents of test before:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
IND4,BB,AB
IND5,BB,AA
Contents of test_out1 after:
ID,MARK1,MARK2
sire1,AA,BB
dam2,AB,AA
sire3,AB,-
dam1,AA,BB
Contents of test_out2 after:
ID,MARK1,MARK2
IND4,BB,AB
IND5,BB,AA
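If you also want the per-file counts from the original question, one option (a sketch continuing the code above; str.startswith accepts a tuple of prefixes) is to tally while writing instead of re-reading the output files:

count1 = count2 = 0
for line in lines:
    if line.startswith(('sir', 'dam')):
        out1.write(line)
        count1 += 1
    else:
        out2.write(line)
        count2 += 1
print('the number of individuals in f1 is:', count1)
print('the number of individuals in f2 is:', count2)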
I want to change this code to read only lines 1400001 to 1450000. What is the modification?
The file is composed of a single object type, one JSON object per line.
I also want to save the output to a .csv file. What should I do?
revu = []
with open("review.json", 'r', encoding="utf8") as f:
    for line in f:
        revu = json.loads(line[1400001:1450000])
If it is JSON per line:
import json

revu = []
with open("review.json", 'r', encoding="utf8") as f:
    # expensive statement; depending on your file size this might
    # make you run out of memory
    revu = [json.loads(s) for s in f.readlines()[1400001:1450000]]
If you try it on the /etc/passwd file it is easy to test (no JSON there of course, so that part is left out):

revu = []
with open("/etc/passwd", 'r') as f:
    # expensive statement
    revu = [s for s in f.readlines()[5:10]]

print(revu)  # gives the entries at indices 5 through 9 (the slice end is exclusive)
Or you iterate over all lines, saving you from memory issues:
import json

revu = []
with open("...", 'r') as f:
    for i, line in enumerate(f):  # note: enumerate counts from 0
        if i >= 1400001 and i <= 1450000:
            revu.append(json.loads(line))

# process revu
To CSV ...
import pandas as pd
import json

def mylines(filename, _from, _to):
    with open(filename, encoding="utf8") as f:
        for i, line in enumerate(f):
            if i >= _from and i <= _to:
                yield json.loads(line)

df = pd.DataFrame([r for r in mylines("review.json", 1400001, 1450000)])
df.to_csv("/tmp/whatever.csv")
I'm trying to write code that rewrites a specific line of a .txt file.
I can write to the line I want, but I can't erase the previous text on that line.
Here is my code (I'm trying a couple of things):
def writeline(file, n_line, text):
    f = open(file, 'r+')
    count = 0
    for line in f:
        count = count + 1
        if count == n_line:
            f.write(line.replace(str(line), text))
            #f.write('\r' + text)
You can use this code to make a test file for testing:
with open('writetest.txt', 'w') as f:
    f.write('1 \n2 \n3 \n4 \n5')

writeline('writetest.txt', 4, 'This is the fourth line')
Edit: For some reason, if I use if count == 5: the code runs OK (even if it doesn't erase the previous text), but if I use if count == n_line:, the file ends up with a lot of garbage.
The answers work, but I would like to know what the problems with my code are, and why I can't read and write. Thanks!
You are reading from the file and also writing to it. Don't do that. Instead, you should write to a NamedTemporaryFile and then rename it over the original file after you finish writing and close it.
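A minimal sketch of that approach, assuming Python 3 and 1-based line numbers (the helper name mirrors the question's function but is illustrative):

import os
import tempfile

def writeline(path, n_line, text):
    # stream the original into a temp file, swapping out line n_line (1-based)
    with open(path) as src, tempfile.NamedTemporaryFile(
            'w', dir=os.path.dirname(path) or '.', delete=False) as tmp:
        for i, line in enumerate(src, start=1):
            tmp.write(text + '\n' if i == n_line else line)
    os.replace(tmp.name, path)  # rename the temp file over the original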
Or if the size of the file is guaranteed to be small, you can use readlines() to read all of it, then close the file, modify the line you want, and write it back out:
def editline(file, n_line, text):
    with open(file) as infile:
        lines = infile.readlines()
    lines[n_line - 1] = text + ' \n'   # n_line is 1-based; list indices start at 0
    with open(file, 'w') as outfile:
        outfile.writelines(lines)
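Called like the test in the question:

editline('writetest.txt', 4, 'This is the fourth line')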
Use temporary file:
import os
import shutil

def writeline(filename, n_line, text):
    tmp_filename = filename + ".tmp"
    count = 0
    with open(tmp_filename, 'wt') as tmp:
        with open(filename, 'rt') as src:
            for line in src:
                count += 1
                if count == n_line:
                    line = line.replace(str(line), text + '\n')
                tmp.write(line)
    shutil.copy(tmp_filename, filename)
    os.remove(tmp_filename)

def create_test(fname):
    with open(fname, 'w') as f:
        f.write('1 \n2 \n3 \n4 \n5')

if __name__ == "__main__":
    create_test('writetest.txt')
    writeline('writetest.txt', 4, 'This is the fourth line')
output_filename = r"C:\Users\guage\Output.txt"
RRA:
GREQ-299684_6j
GREQ-299684_6k
CZM:
V-GREQ-299684_6k
V-GREQ-299524_9
F_65624_1
R-GREQ-299680_5
DUN:
FB_71125_1
FR:
VQ-299659_18
VR-GREQ-299659_19
VEQ-299659_28
VR-GREQ-299659_31
VR-GREQ-299659_32
VEQ-299576_1
GED:
VEQ-299622_2
VR-GREQ-299618_13
VR-GREQ-299559_1
VR-GREQ-299524_14
FB_65624_1
VR-GREQ-299645_1
MNT:
FB_71125_1
FB_71125_2
VR-534_4
The above is the content of the .txt file. How can I read its content section by section? For example:
RRA:VR-GREQ-299684_6j VR-GREQ-299684_6k VR-GREQ-299606_3 VR-GREQ-299606_4 VR-GREQ-299606_5 VR-GREQ-299606_7
and save it in a variable or something similar. Later I want to read CZM separately, and so on. I did as below.
with open(output_filename, 'r') as f:
    excel = f.read()
But how do I read it section by section? Can someone tell me how to do it?
Something like this:
def read_file_with_custom_record_separator(file_path, delimiter='\n'):
    data = ""
    with open(file_path) as fh:
        for line in fh:
            if line.strip().endswith(delimiter) and data != "":
                print "VARIABLE:\n<", data, ">\n"
                data = line
            else:
                data += line
    print "LAST VARIABLE:\n<", data, ">\n"
And then:
read_file_with_custom_record_separator("input.txt", ":")
You can use the trailing : in the text as an indicator for creating a new file, like this:
savefilename = ""
with open(filename, 'r') as f:
for line in f:
line = line.strip() # get rid of the unnecessary white chars
lastchar = line[-1:] # get the last char
if lastchar == ":": # if the last char is ":"
savefilename = line[0:-1] # get file name from line (except the ":")
sf = open(savefilename + ".txt", 'w') # create a new file
else:
sf.write(line + "\n") # write the data to the opened file
Then you should get a collection of files:
RRA.txt
CZM.txt
DUN.txt
# etc
which contains all the appropriate data:
RRA.txt
VR-GREQ-299684_6j
VR-GREQ-299684_6k
VR-GREQ-299606_3
VR-GREQ-299606_4
VR-GREQ-299606_5
VR-GREQ-299606_7
CZM.txt
VR-GREQ-299684_6k
VR-GREQ-299606_6
VR-GREQ-299606_8
VR-GREQ-299640_1
VR-GREQ-299640_5
VR-GREQ-299524_9
FB_65624_1
VR-GREQ-299680_5
DUN.txt
FB_71125_1
# and so on
You can replace the sf = open and sf.write calls with whatever way you feel best separates the data. Here, I use files...
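For instance, a variant (a sketch; the sections dict is illustrative) that collects each block into a dictionary keyed by its header instead of writing files:

sections = {}
current = None
with open(filename) as f:
    for line in f:
        line = line.strip()
        if line.endswith(':'):
            current = line[:-1]      # the header (minus ':') becomes the key
            sections[current] = []
        elif line and current is not None:
            sections[current].append(line)

# sections['RRA'] now holds the entries listed under "RRA:"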
You can iterate over the file and use the lines and indices to your advantage; something like this:
with open(output_filename, 'r') as f:
    for index, line in enumerate(f):
        # here you have access to each line and its index,
        # so you can save any number of lines you wish
        pass
What about reading it into a list, then processing its elements as you prefer?
>>> f = open('myfile.txt', 'r').readlines()
>>> len(f)
46
>>> f[0]
'RRA:\n'
>>> f[-1]
'VR-GREQ-299534_4'
>>> f[:3]
['RRA:\n', 'VR-GREQ-299684_6j \n', 'VR-GREQ-299684_6k \n']
>>>
>>> [l for l in f if l.startswith('FB_')]
['FB_65624_1 \n', 'FB_71125_1 \n', 'FB_69228_1 \n', 'FB_65624_1 \n', 'FB_71125_1 \n', 'FB_71125_2 \n']
>>>
I have a text file that looks like this:
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
it
0.451455675985 -0.0309283486444 -0.233415252863 -0.0273732833795 -0.294310277236 0.324236481567 -0.084486587459 0.340305398253 -0.56250445207 0.00640281538272
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
movie
0.444291696176 -0.110937149049 -0.259525377532 0.00986849685667 -0.311934727067 0.319610517473 -0.0644468651461 0.372562407 -0.572686043624 0.0262434708424
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
I want to keep only the lines that contain both a word and a set of values on the same line, and delete the rest.
How can I do that?
The expected output is:
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
A few ways to solve the problem.
Read the file as space-delimited CSV. If the column count is 11 (a word plus ten values) and the first column is not a blank string, write the row out:
import csv

with open('original.txt', 'r') as f, open('new.txt', 'w') as o:
    reader = csv.reader(f, delimiter=' ')
    writer = csv.writer(o, delimiter=' ')
    for row in reader:
        if len(row) == 11 and row[0]:
            writer.writerow(row)
Read the file, write out those lines where the first item is a word and there are more than 2 columns:
with open('original.txt', 'r') as f, open('new.txt', 'w') as o:
    for line in f:
        if line.lstrip().split(' ')[0].isalpha() and len(line.split(' ')) > 2:
            o.write(line)
You can implement this very nicely using the fileinput module:
import fileinput
import sys

for line in fileinput.input(inplace=True):
    # keep lines that start with a letter and also contain values after it
    if line[:1].isalpha() and ' ' in line:
        sys.stdout.write(line)
Note that this rewrites all the files given on the command line in place; their contents will be changed.
You can open the file and read it line by line. Each line that contains a letter can be copied to another file.
letters = "abcdefghijklmnopqrstuvwxyz"
letters += letters.upper()
f_in = open("myfile.txt", 'rb')
f_out = open("newfile.txt", 'wb')
for line in f_in:
for letter in letters:
if letter in line:
f_out.write(line)
break
f_in.close()
f_out.close()
Be careful: the example you have given contains only one long line, which has to be split into multiple lines first.
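An equivalent, more compact test, as a sketch: keep any line that contains at least one letter.

with open("myfile.txt") as f_in, open("newfile.txt", "w") as f_out:
    for line in f_in:
        if any(c.isalpha() for c in line):
            f_out.write(line)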
Try this. Hope this gives you the desired result.
import re

with open("i1.txt", "r") as f:
    lines = f.readlines()

for item in lines:
    # a word followed by values, or values with a word somewhere after them
    if re.match(r"[a-zA-Z]+[\s\d]+", item.strip()) or re.match(r"[\d.\s]+[a-zA-Z]+[\s\d.]*", item.strip()):
        print item.strip()
The following is the output:
people 0.508931508057 -0.280345656093 -0.0318199105583 -0.189979892892 0.00748802665945 -0.0570929853912 0.0688883067716 0.187604694632 0.114414087961 0.150298183734
well 0.634085165013 -0.130742033765 0.0833007355449 -0.304469830925 0.133714906135 -0.0221626440854 0.062845160898 0.0607120405012 0.0384326647526 -0.0102762686058
but 0.776732251824 0.0216479978956 0.326422159918 0.0654654707123 0.235569019918 0.0792330670559 0.22189299375 0.194232853917 0.102964793215 0.00926554861178
could 0.505766726467 -0.304640132821 0.015043924871 -0.42831149929 0.13475950648 0.0275223466164 0.154347034425 0.443048319277 0.229038343902 -0.209763506494
think 0.734314690035 -0.15352368041 0.383964369466 -0.283262375383 0.000534210123265 0.0452656078196 0.0174349360274 -0.0210130687293 0.0247592836651 0.0930452272721
made 0.546164908581 -0.148512160184 0.301391306124 -0.553970562504 -0.0423941756245 -0.0789194920559 -0.0336542251386 0.00929984630184 -0.030340761377 -0.112650323493
way 0.751616772605 -0.345057880564 0.10091886809 -0.147689086912 -0.0721519520719 -0.246317313253 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
0.34567 yes 0.9876 -0.00606560306655 0.0689594126233 0.0468387063595 -0.00900506150062
Input.txt File
12626232 : Bookmarks
1321121:
126262:
Here 126262: can be any text or digits; basically I will search for lines whose last character is : (colon) and delete the entire line.
Output.txt File
12626232 : Bookmarks
My Code:
def function_example():
    fn = 'input.txt'
    f = open(fn)
    output = []
    for line in f:
        if not ":" in line:
            output.append(line)
    f.close()
    f = open(fn, 'w')
    f.writelines(output)
    f.close()
Problem: when I match on :, it removes every line containing a colon, but I only want to check whether the colon is at the end of the line, and remove the line only in that case.
Any suggestion will be appreciated. Thanks.
I saw the following, but I am not sure how to use it here:
a = "abc here we go:"
print a[:-1]
I believe with this you should be able to achieve what you want.
with open(fname) as f:
    lines = f.readlines()
    for line in lines:
        if not line.strip().endswith(':'):
            print line
Here fname is the variable pointing to the file location.
You were almost there with your function. You were checking if : appears anywhere in the line, when you need to check if the line ends with it:
def function_example():
    fn = 'input.txt'
    f = open(fn)
    output = []
    for line in f:
        if not line.strip().endswith(":"):  # This is what you were missing
            output.append(line)
    f.close()
    f = open(fn, 'w')
    f.writelines(output)
    f.close()
You could have also done if not line.strip()[-1:] == ':':, but endswith() is better suited for your use case.
Here is a compact way to do what you are doing above:
def function_example(infile, outfile, limiter=':'):
    ''' Filters out all lines in :infile: that end in :limiter:
    and writes the remaining lines to :outfile: '''
    # note: 'in' is a reserved keyword, so use different names for the handles
    with open(infile) as fin, open(outfile, 'w') as fout:
        for line in fin:
            if not line.strip().endswith(limiter):
                fout.write(line)
The with statement creates a context and automatically closes files when the block ends.
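Usage, matching the question's files (writing to a new file rather than in place):

function_example('input.txt', 'output.txt')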
To check whether the last character of a line is :, do the following:
if line.strip().endswith(':'):
    ...Do Something...
You can use a regular expression:

import re

# match one or more ':' at the very end of the line
regex = re.compile(r':+$')

file_name = "path_to_file"
with open(file_name) as _file:
    lines = _file.readlines()

# keep only the lines that do NOT end with ':'
new_lines = [line for line in lines if not regex.search(line.strip())]

with open(file_name, "w") as _file:
    _file.writelines(new_lines)