Reorganizing data - python

I have to input a text file that contains comma seperated and line seperated data in the following format:
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585,05-21-11,04:00:00,REGULAR,003169415,001097588,05-21-11,08:00:00,REGULAR,003169431,001097607
Multiple sets of such data is present in the text file
I need to print all this in new lines with the condition:
1st 3 elements of every set followed by 5 parameters in a new line. So solution of the above set would be:
A002,R051,02-00-00,05-21-11,00:00:00,REGULAR,003169391,001097585
A002,R051,02-00-00,05-21-11,04:00:00,REGULAR,003169415,001097588
A002,R051,02-00-00,05-21-11,08:00:00,REGULAR,003169431,001097607
My function to achieve it is given below:
def fix_turnstile_data(filenames):
for name in filenames:
f_in = open(name, 'r')
reader_in = csv.reader(f_in, delimiter = ',')
f_out = open('updated_' + name, 'w')
writer_out = csv.writer(f_out, delimiter = ',')
array=[]
for line in reader_in:
i = 0
j = -1
while i < len(line):
if i % 8 == 0:
i+=2
j+=1
del array[:]
array.append(line[0])
array.append(line[1])
array.append(line[2])
elif (i+1) % 8 == 0:
array.append(line[i-3*j])
writer_out.writerow(array)
else:
array.append(line[i-3*j])
i+=1
f_in.close()
f_out.close()
The output is wrong and there is a space of 3 lines at the end of those lines whose length is 8. I suspect it might be the writer_out.writerow(array) which is to blame.
Can anyone please help me out?

Hmm, the logic you use ends up being fairly confusing. I'd do it more along these lines (this replaces your for loop), and this is more Pythonic:
for line in reader_in:
header = line[:3]
for i in xrange(3, len(line), 5):
writer_out.writerow(header + line[i:i+5])

Related

Overlapping points in a interval having conditions

I want to find positions that overlaps with two coordinates and also that both are in the same chromosomes.
The file with the positions looks like this
with open(file_path, 'r') as f:
lines = [l for l in f if not l.startswith('#')]
print(lines)
['chr1\t36931696\t.\tT\t.\t100\tPASS\tDP=839\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/.:100:830:839:0.0107:24:-100.0000:0.0071\n', 'chr2\t25457280\t.\tA\t.\t100\tPASS\tDP=1410\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:19:1403:1410:0.0050:24:-100.0000:0.0014\n', '\n', '\n']
# I have limited the file to have only two lines. But actually this normally have 100k lines
And the file with the intervals looks like this
print(bedregions)
[('chr1', 36931694, 36931909, 'CSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)'), ('chr2', 25466989, 25467211, 'DNMT3A.CDS.17.line.57.merged--with.DNMT3A.CDS.16.li.probe--coordinates(25466989-25467211)')]
# I have limited this file as well to have two tuples, this has actually 500 tuples
This is what I have been trying
def roi2(file_path,bedregions):
with open(file_path, 'r') as f:
lines = [l for l in f if not l.startswith('#')]
chr2position = {}
for position, line in enumerate(lines):
# If there is a empty line this will give a empty list
# Amd the following split will give a out of range error
if (len(line)) == 1:
break
# Take the chr
chr = line.strip().split()[0]
if chr not in chr2position:
chr2position[chr] = position
filtered_lines =[]
for element in bedregions:
ch, start, end, probe_name = element
for lineindex in range(start + chr2position[chr], end + chr2position[chr] ):
filtered_lines.append(lines[lineindex])
# This return a error in the last line. IndexError list index out of range
This should do what you want considering the data structure you mentioned
f = open(file_path, 'r')
lines = f.readlines()
chr2base2index = dict()
for index,line in enumerate(lines):
if (len(line)) == 1:
break
if line[0] == '#':
continue
handle = line.strip().split()
chrm, base = handle[0], int(handle[1])
if chrm not in chr2base2index:
chr2base2index[chrm] = dict()
if base not in chr2base2index[chrm]:
chr2base2index[chrm][base] = index
filtered_lines = []
for chrm, start, end, probe_name in bedregions:
if chrm not in chr2base2index:
print(f'Chromosome {chrm} not found')
continue
for base in range(start, end):
index = chr2base2index[chrm].get(base, None)
if index != None:
filtered_lines.append('\t'.join(lines[index].strip().split() + [probe_name]))
filtered_lines
['chr1\t36931696\t.\tT\t.\t100\tPASS\tDP=839\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/.:100:830:839:0.0107:24:-100.0000:0.0071\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931697\t.\tT\t.\t100\tPASS\tDP=832\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:15:829:832:0.0036:24:-100.0000:0.0154\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931698\t.\tT\t.\t100\tPASS\tDP=837\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:36:836:837:0.0012:24:-100.0000:0.0095\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931699\t.\tA\t.\t100\tPASS\tDP=836\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:36:835:836:0.0012:24:-100.0000:0.0107\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931700\t.\tC\t.\t100\tPASS\tDP=818\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:14:814:818:0.0049:24:-100.0000:0.0320\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931701\t.\tA\t.\t100\tPASS\tDP=841\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:20:838:841:0.0036:24:-100.0000:0.0047\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931702\t.\tA\t.\t100\tPASS\tDP=825\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:19:822:825:0.0036:24:-100.0000:0.0237\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931703\t.\tT\t.\t100\tPASS\tDP=833\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:26:832:833:0.0012:24:-100.0000:0.0142\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
'chr1\t36931704\t.\tA\t.\t100\tPASS\tDP=833\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:11:829:833:0.0048:24:-100.0000:0.0142\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)']
Work with as many chromosomes as found in the interval file. If a chromosome is given is the interval file but not in the searched file, this would give you and error so I have put a line that solve that issue and also inform you if that happen.
Results are given in a list, one line per element but this can be change in the last line of the code.

Finding missing lines in file

I have a 7000+ lines .txt file, containing description and ordered path to image. Example:
abnormal /Users/alex/Documents/X-ray-classification/data/images/1.png
abnormal /Users/alex/Documents/X-ray-classification/data/images/2.png
normal /Users/alex/Documents/X-ray-classification/data/images/3.png
normal /Users/alex/Documents/X-ray-classification/data/images/4.png
Some lines are missing. I want to somehow automate the search of missing lines. Intuitively i wrote:
f = open("data.txt", 'r')
lines = f.readlines()
num = 1
for line in lines:
if num in line:
continue
else:
print (line)
num+=1
But of course it didn't work, since lines are strings.
Is there any elegant way to sort this out? Using regex maybe?
Thanks in advance!
the following should hopefully work - it grabs the number out of the filename, sees if it's more than 1 higher than the previous number, and if so, works out all the 'in-between' numbers and prints them. Printing the number (and then reconstructing the filename later) is needed as line will never contain the names of missing files during iteration.
# Set this to the first number in the series -1
num = lastnum = 0
with open("data.txt", 'r') as f:
for line in f:
# Pick the digit out of the filename
num = int(''.join(x for x in line if x.isdigit()))
if num - lastnum > 1:
for i in range(lastnum+1, num):
print("Missing: {}.png".format(str(i)))
lastnum = num
The main advantage of working this way is that as long as your files are sorted in the list, it can handle starting at numbers other than 1, and also reports more than one missing number in the sequence.
You can try this:
lines = ["abnormal /Users/alex/Documents/X-ray-classification/data/images/1.png","normal /Users/alex/Documents/X-ray-classification/data/images/3.png","normal /Users/alex/Documents/X-ray-classification/data/images/4.png"]
maxvalue = 4 # or any other maximum value
missing = []
i = 0
for num in range(1, maxvalue+1):
if str(num) not in lines[i]:
missing.append(num)
else:
i += 1
print(missing)
Or if you want to check for the line ending with XXX.png:
lines = ["abnormal /Users/alex/Documents/X-ray-classification/data/images/1.png","normal /Users/alex/Documents/X-ray-classification/data/images/3.png","normal /Users/alex/Documents/X-ray-classification/data/images/4.png"]
maxvalue = 4 # or any other maximum value
missing = []
i = 0
for num in range(1, maxvalue+1):
if not lines[i].endswith(str(num) + ".png"):
missing.append(num)
else:
i += 1
print(missing)
Example: here

Splitting a large json file into multiple json files using python [duplicate]

I have a text file say really_big_file.txt that contains:
line 1
line 2
line 3
line 4
...
line 99999
line 100000
I would like to write a Python script that divides really_big_file.txt into smaller files with 300 lines each. For example, small_file_300.txt to have lines 1-300, small_file_600 to have lines 301-600, and so on until there are enough small files made to contain all the lines from the big file.
I would appreciate any suggestions on the easiest way to accomplish this using Python
lines_per_file = 300
smallfile = None
with open('really_big_file.txt') as bigfile:
for lineno, line in enumerate(bigfile):
if lineno % lines_per_file == 0:
if smallfile:
smallfile.close()
small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
smallfile = open(small_filename, "w")
smallfile.write(line)
if smallfile:
smallfile.close()
Using itertools grouper recipe:
from itertools import zip_longest
def grouper(n, iterable, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
args = [iter(iterable)] * n
return zip_longest(fillvalue=fillvalue, *args)
n = 300
with open('really_big_file.txt') as f:
for i, g in enumerate(grouper(n, f, fillvalue=''), 1):
with open('small_file_{0}'.format(i * n), 'w') as fout:
fout.writelines(g)
The advantage of this method as opposed to storing each line in a list, is that it works with iterables, line by line, so it doesn't have to store each small_file into memory at once.
Note that the last file in this case will be small_file_100200 but will only go until line 100000. This happens because fillvalue='', meaning I write out nothing to the file when I don't have any more lines left to write because a group size doesn't divide equally. You can fix this by writing to a temp file and then renaming it after instead of naming it first like I have. Here's how that can be done.
import os, tempfile
with open('really_big_file.txt') as f:
for i, g in enumerate(grouper(n, f, fillvalue=None)):
with tempfile.NamedTemporaryFile('w', delete=False) as fout:
for j, line in enumerate(g, 1): # count number of lines in group
if line is None:
j -= 1 # don't count this line
break
fout.write(line)
os.rename(fout.name, 'small_file_{0}.txt'.format(i * n + j))
This time the fillvalue=None and I go through each line checking for None, when it occurs, I know the process has finished so I subtract 1 from j to not count the filler and then write the file.
I do this a more understandable way and using less short cuts in order to give you a further understanding of how and why this works. Previous answers work, but if you are not familiar with certain built-in-functions, you will not understand what the function is doing.
Because you posted no code I decided to do it this way since you could be unfamiliar with things other than basic python syntax given that the way you phrased the question made it seem as though you did not try nor had any clue as how to approach the question
Here are the steps to do this in basic python:
First you should read your file into a list for safekeeping:
my_file = 'really_big_file.txt'
hold_lines = []
with open(my_file,'r') as text_file:
for row in text_file:
hold_lines.append(row)
Second, you need to set up a way of creating the new files by name! I would suggest a loop along with a couple counters:
outer_count = 1
line_count = 0
sorting = True
while sorting:
count = 0
increment = (outer_count-1) * 300
left = len(hold_lines) - increment
file_name = "small_file_" + str(outer_count * 300) + ".txt"
Third, inside that loop you need some nested loops that will save the correct rows into an array:
hold_new_lines = []
if left < 300:
while count < left:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
sorting = False
else:
while count < 300:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
Last thing, again in your first loop you need to write the new file and add your last counter increment so your loop will go through again and write a new file
outer_count += 1
with open(file_name,'w') as next_file:
for row in hold_new_lines:
next_file.write(row)
note: if the number of lines is not divisible by 300, the last file will have a name that does not correspond to the last file line.
It is important to understand why these loops work. You have it set so that on the next loop, the name of the file that you write changes because you have the name dependent on a changing variable. This is a very useful scripting tool for file accessing, opening, writing, organizing etc.
In case you could not follow what was in what loop, here is the entirety of the function:
my_file = 'really_big_file.txt'
sorting = True
hold_lines = []
with open(my_file,'r') as text_file:
for row in text_file:
hold_lines.append(row)
outer_count = 1
line_count = 0
while sorting:
count = 0
increment = (outer_count-1) * 300
left = len(hold_lines) - increment
file_name = "small_file_" + str(outer_count * 300) + ".txt"
hold_new_lines = []
if left < 300:
while count < left:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
sorting = False
else:
while count < 300:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
outer_count += 1
with open(file_name,'w') as next_file:
for row in hold_new_lines:
next_file.write(row)
lines_per_file = 300 # Lines on each small file
lines = [] # Stores lines not yet written on a small file
lines_counter = 0 # Same as len(lines)
created_files = 0 # Counting how many small files have been created
with open('really_big_file.txt') as big_file:
for line in big_file: # Go throught the whole big file
lines.append(line)
lines_counter += 1
if lines_counter == lines_per_file:
idx = lines_per_file * (created_files + 1)
with open('small_file_%s.txt' % idx, 'w') as small_file:
# Write all lines on small file
small_file.write('\n'.join(stored_lines))
lines = [] # Reset variables
lines_counter = 0
created_files += 1 # One more small file has been created
# After for-loop has finished
if lines_counter: # There are still some lines not written on a file?
idx = lines_per_file * (created_files + 1)
with open('small_file_%s.txt' % idx, 'w') as small_file:
# Write them on a last small file
small_file.write('n'.join(stored_lines))
created_files += 1
print '%s small files (with %s lines each) were created.' % (created_files,
lines_per_file)
import csv
import os
import re
MAX_CHUNKS = 300
def writeRow(idr, row):
with open("file_%d.csv" % idr, 'ab') as file:
writer = csv.writer(file, delimiter=',', quotechar='\"', quoting=csv.QUOTE_ALL)
writer.writerow(row)
def cleanup():
for f in os.listdir("."):
if re.search("file_.*", f):
os.remove(os.path.join(".", f))
def main():
cleanup()
with open("large_file.csv", 'rb') as results:
r = csv.reader(results, delimiter=',', quotechar='\"')
idr = 1
for i, x in enumerate(r):
temp = i + 1
if not (temp % (MAX_CHUNKS + 1)):
idr += 1
writeRow(idr, x)
if __name__ == "__main__": main()
with open('/really_big_file.txt') as infile:
file_line_limit = 300
counter = -1
file_index = 0
outfile = None
for line in infile.readlines():
counter += 1
if counter % file_line_limit == 0:
# close old file
if outfile is not None:
outfile.close()
# create new file
file_index += 1
outfile = open('small_file_%03d.txt' % file_index, 'w')
# write to file
outfile.write(line)
I had to do the same with 650000 line files.
Use the enumerate index and integer div it (//) with the chunk size
When that number changes close the current file and open a new one
This is a python3 solution using format strings.
chunk = 50000 # number of lines from the big file to put in small file
this_small_file = open('./a_folder/0', 'a')
with open('massive_web_log_file') as file_to_read:
for i, line in enumerate(file_to_read.readlines()):
file_name = f'./a_folder/{i // chunk}'
print(i, file_name) # a bit of feedback that slows the process down a
if file_name == this_small_file.name:
this_small_file.write(line)
else:
this_small_file.write(line)
this_small_file.close()
this_small_file = open(f'{file_name}', 'a')
Set files to the number of file you want to split the master file to
in my exemple i want to get 10 files from my master file
files = 10
with open("data.txt","r") as data :
emails = data.readlines()
batchs = int(len(emails)/10)
for id,log in enumerate(emails):
fileid = id/batchs
file=open("minifile{file}.txt".format(file=int(fileid)+1),'a+')
file.write(log)
A very easy way would if you want to split it in 2 files for example:
with open("myInputFile.txt",'r') as file:
lines = file.readlines()
with open("OutputFile1.txt",'w') as file:
for line in lines[:int(len(lines)/2)]:
file.write(line)
with open("OutputFile2.txt",'w') as file:
for line in lines[int(len(lines)/2):]:
file.write(line)
making that dynamic would be:
with open("inputFile.txt",'r') as file:
lines = file.readlines()
Batch = 10
end = 0
for i in range(1,Batch + 1):
if i == 1:
start = 0
increase = int(len(lines)/Batch)
end = end + increase
with open("splitText_" + str(i) + ".txt",'w') as file:
for line in lines[start:end]:
file.write(line)
start = end
In Python files are simple iterators. That gives the option to iterate over them multiple times and always continue from the last place the previous iterator got. Keeping this in mind, we can use islice to get the next 300 lines of the file each time in a continuous loop. The tricky part is knowing when to stop. For this we will "sample" the file for the next line and once it is exhausted we can break the loop:
from itertools import islice
lines_per_file = 300
with open("really_big_file.txt") as file:
i = 1
while True:
try:
checker = next(file)
except StopIteration:
break
with open(f"small_file_{i*lines_per_file}.txt", 'w') as out_file:
out_file.write(checker)
for line in islice(file, lines_per_file-1):
out_file.write(line)
i += 1

Do something to line and next lines until a symbol is hit

I have data, that is set up as the following:
//Name_1 * *
>a xyzxyzyxyzyxzzxy
>b xyxyxyzxyyxzyxyz
>c xyzyxzyxyzyxyzxy
//Name_2
>a xyzxyzyxyzxzyxyx
>b zxyzxyzxyyzxyxzx
>c zxyzxyzxyxyzyzxy
//Name_3 * *
>a xyzxyzyxyzxzyxyz
>b zxyzxyzxzyyzyxyx
>c zxyzxyzxyxyzyzxy
...
The //-line refers to an ID for the following group of sequences until the next //-line is reached.
I have been working on writing a program, that reads the position of the asterix, and print the characters on the given position for the sequences.
To simplifiy things for myself, I have been working on a subset of my data, containing only one group of sequences, so e.g.:
//Name_1 * *
>a xyzxyzyxyzyxzzxy
>b xyxyxyzxyyxzyxyz
>c xyzyxzyxyzyxyzxy
My program does what I want on this subset.
import sys
import csv
datafile = open(sys.argv[1], 'r')
outfile = open(sys.argv[1]+"_FGT_Data", 'w')
csv_out = csv.writer(outfile, delimiter=',')
csv_out.writerow(['Locus', 'Individual', 'Nucleotide', 'Position'])
with (datafile) as searchfile:
var_line = [line for line in searchfile if '*' in line]
LocusID = [line[2:13].strip() for line in var_line]
poslist = [i for line in var_line for i, x in enumerate(line) if x =='*']
datafile = open(sys.argv[1], 'r')
with (datafile) as getsnps:
lines = [line for line in getsnps.readlines() if line.startswith('>')]
for pos in poslist:
for line in lines:
snp = line[pos]
individual = line[0:7]
indistr = individual.strip()
csv_out.writerow((LocusID[0], indistr, line[pos], str(pos)))
datafile.close()
outfile.close()
However, now I am trying to modify it to work on the full dataset. I am having trouble finding a way to iterate over the data in the correct way.
I need to search through the file, and when a line containing '' is reached, I need to do as in the above code for the sequences corresponding to the given line, and then continue to the next line containing an ''. Do I need to split up my data with regards to the //-lines or what is the best approach?
I have uploaded a sample of my data to dropbox:
Data_Sample.txt contains several groups, and is the kind of data, I am trying to get the program to work on.
Data_One_Group.txt contains only one group, and is the data I have gotten the program to work on so far.
https://www.dropbox.com/sh/3j4i04s2rg6b63h/AADkWG3OcsutTiSsyTl8L2Vda?dl=0
--------EDIT---------
I am trying to implement the suggestion by #Julien Spronck below.
However, I am having trouble processing the produced block. How would I be able to search through the block line for line. E.g., why does the below not work as intended? It just prints the asterix' and not the line itself.
block =''
with open('onelocus.txt', 'r') as searchfile:
for line in searchfile:
if line.startswith('//'):
#print line
if block:
for line in block:
if '*' in line:
print line
block = line
else:
block += line
---------EDIT 2----------
I am getting closer. I understand that fact, that I need to split the string into line, to be able to search through them. The below works on one group, but when I try to itereate over several, it prints the information for the first group only. But does it for as many groups, as there are. I have tried clearing LocusID and poslist before next iteration, but this does not seem to be the solution.
block =''
with (datafile) as searchfile:
for line in searchfile:
if line.startswith('//'):
if block:
var_line = [line for line in block.splitlines() if '*' in line]
LocusID = [line[2:13].strip() for line in var_line]
print LocusID
poslist = [i for line in var_line for i, x in enumerate(line) if x == '*']
print poslist
block = line
else:
block += line
Can't you do something like:
block =''
with open(filename, 'r') as fil:
for line in fil:
if line.startswith('//'):
if block:
do_something_with(block)
block = line
else:
block += line
if block:
do_something_with(block)
In this code, I just append the lines of the file to a variable block. Once I find a line that starts with //, I process the previous block and reinitialize the block for the next iteration.
The last two lines will take care of processing the last block, which would not be processed otherwise.
do_something_with(block) could be something like this:
def do_something_with(block):
lines = block.splitlines()
j = 0
first_line = lines[j]
while first_line.strip() == '':
j += 1
first_line = lines[j]
pos = []
position = first_line.find('*')
while position != -1:
pos.append(position)
position = first_line.find('*', position+1)
for k, line in enumerate(lines):
if k > j:
for p in pos:
print line[p],
print
## prints
## z y
## x z
## z y
I have created a way to make this work with the data you provided.
You should run it with 2 file locations, 1 should be your input.txt and 2 should be your output.csv
explanation
first we create a dictionary with the locus as key and the sequences as values.
We iterate over this dictionary and get the * locations in the locus and append these to a list indexes.
We iterate over the values belonging to this key and extract the sequence
per iteration we iterate over indexes so that we gather the snps.
per iteration we append to our csv file.
We empty the indexes list so we can go to the next key.
Keep in mind
This method is highly dependant on the amount of spaces you have inside your input.txt.
You should know that this will not be the fastest way to get it done. but it does get it done.
I hope this helped, if you have any questions, feel free to ask them, and if I have time, I will happily try to answer them.
script
import sys
import csv
sequences = []
dic = {}
indexes = []
datafile = sys.argv[1]
outfile = sys.argv[2]
with open(datafile,'r') as snp_file:
lines = snp_file.readlines()
for i in range(0,len(lines)):
if lines[i].startswith("//"):
dic[lines[i].rstrip()] = sequences
del sequences[:]
if lines[i].startswith(">"):
sequences.append(lines[i].rstrip())
for key in dic:
locus = key.split(" ")[0].replace("//","")
for i, x in enumerate(key):
if x == '*':
indexes.append(i-11)
for sequence in dic[key]:
seq = sequence.split(" ")[1]
seq_id = sequence.split(" ")[0].replace(">","")
for z in indexes:
position = z+1
nucleotide = seq[z]
with open(outfile,'a')as handle:
csv_out = csv.writer(handle, delimiter=',')
csv_out.writerow([locus,seq_id,position,nucleotide])
del indexes[:]
input.txt
//Locus_1 * *
>Safr01 AATCCGTTTTAAACCAGNTCYAT
>Safr02 TTAATCCGTTTTAAACCAGNTCY
//Locus_2 * *
>Safr01 AATCCGTTTTAAACCAGNTCYAT
>Safr02 TTAATCCGTTTTAAACCAGNTCY
output.csv
Locus_1,Safr01,1,A
Locus_1,Safr01,22,A
Locus_1,Safr02,1,T
Locus_1,Safr02,22,C
Locus_2,Safr01,5,C
Locus_2,Safr01,19,T
Locus_2,Safr02,5,T
Locus_2,Safr02,19,G
This is how I ended up solving the problem:
def do_something_with(block):
lines = block.splitlines()
for line in lines:
if '*' in line:
hit = line
LocusID = hit[2:13].strip()
for i, x in enumerate(hit):
if x=='*':
poslist.append(i)
for pos in poslist:
for line in lines:
if line.startswith('>'):
individual = line[0:7].strip()
snp = line[pos]
print LocusID, individual, snp, pos,
csv_out.writerow((LocusID, individual, snp, pos))
with (datafile) as searchfile:
for line in searchfile:
if line.startswith('//'):
if block:
do_something_with(block)
poslist = list()
block = line
else:
block += line
if block:
do_something_with(block)

Splitting large text file into smaller text files by line numbers using Python

I have a text file say really_big_file.txt that contains:
line 1
line 2
line 3
line 4
...
line 99999
line 100000
I would like to write a Python script that divides really_big_file.txt into smaller files with 300 lines each. For example, small_file_300.txt to have lines 1-300, small_file_600 to have lines 301-600, and so on until there are enough small files made to contain all the lines from the big file.
I would appreciate any suggestions on the easiest way to accomplish this using Python
lines_per_file = 300
smallfile = None
with open('really_big_file.txt') as bigfile:
for lineno, line in enumerate(bigfile):
if lineno % lines_per_file == 0:
if smallfile:
smallfile.close()
small_filename = 'small_file_{}.txt'.format(lineno + lines_per_file)
smallfile = open(small_filename, "w")
smallfile.write(line)
if smallfile:
smallfile.close()
Using itertools grouper recipe:
from itertools import zip_longest
def grouper(n, iterable, fillvalue=None):
"Collect data into fixed-length chunks or blocks"
# grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
args = [iter(iterable)] * n
return zip_longest(fillvalue=fillvalue, *args)
n = 300
with open('really_big_file.txt') as f:
for i, g in enumerate(grouper(n, f, fillvalue=''), 1):
with open('small_file_{0}'.format(i * n), 'w') as fout:
fout.writelines(g)
The advantage of this method as opposed to storing each line in a list, is that it works with iterables, line by line, so it doesn't have to store each small_file into memory at once.
Note that the last file in this case will be small_file_100200 but will only go until line 100000. This happens because fillvalue='', meaning I write out nothing to the file when I don't have any more lines left to write because a group size doesn't divide equally. You can fix this by writing to a temp file and then renaming it after instead of naming it first like I have. Here's how that can be done.
import os, tempfile
with open('really_big_file.txt') as f:
for i, g in enumerate(grouper(n, f, fillvalue=None)):
with tempfile.NamedTemporaryFile('w', delete=False) as fout:
for j, line in enumerate(g, 1): # count number of lines in group
if line is None:
j -= 1 # don't count this line
break
fout.write(line)
os.rename(fout.name, 'small_file_{0}.txt'.format(i * n + j))
This time the fillvalue=None and I go through each line checking for None, when it occurs, I know the process has finished so I subtract 1 from j to not count the filler and then write the file.
I do this a more understandable way and using less short cuts in order to give you a further understanding of how and why this works. Previous answers work, but if you are not familiar with certain built-in-functions, you will not understand what the function is doing.
Because you posted no code I decided to do it this way since you could be unfamiliar with things other than basic python syntax given that the way you phrased the question made it seem as though you did not try nor had any clue as how to approach the question
Here are the steps to do this in basic python:
First you should read your file into a list for safekeeping:
my_file = 'really_big_file.txt'
hold_lines = []
with open(my_file,'r') as text_file:
for row in text_file:
hold_lines.append(row)
Second, you need to set up a way of creating the new files by name! I would suggest a loop along with a couple counters:
outer_count = 1
line_count = 0
sorting = True
while sorting:
count = 0
increment = (outer_count-1) * 300
left = len(hold_lines) - increment
file_name = "small_file_" + str(outer_count * 300) + ".txt"
Third, inside that loop you need some nested loops that will save the correct rows into an array:
hold_new_lines = []
if left < 300:
while count < left:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
sorting = False
else:
while count < 300:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
Last thing, again in your first loop you need to write the new file and add your last counter increment so your loop will go through again and write a new file
outer_count += 1
with open(file_name,'w') as next_file:
for row in hold_new_lines:
next_file.write(row)
note: if the number of lines is not divisible by 300, the last file will have a name that does not correspond to the last file line.
It is important to understand why these loops work. You have it set so that on the next loop, the name of the file that you write changes because you have the name dependent on a changing variable. This is a very useful scripting tool for file accessing, opening, writing, organizing etc.
In case you could not follow what was in what loop, here is the entirety of the function:
my_file = 'really_big_file.txt'
sorting = True
hold_lines = []
with open(my_file,'r') as text_file:
for row in text_file:
hold_lines.append(row)
outer_count = 1
line_count = 0
while sorting:
count = 0
increment = (outer_count-1) * 300
left = len(hold_lines) - increment
file_name = "small_file_" + str(outer_count * 300) + ".txt"
hold_new_lines = []
if left < 300:
while count < left:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
sorting = False
else:
while count < 300:
hold_new_lines.append(hold_lines[line_count])
count += 1
line_count += 1
outer_count += 1
with open(file_name,'w') as next_file:
for row in hold_new_lines:
next_file.write(row)
lines_per_file = 300 # Lines on each small file
lines = [] # Stores lines not yet written on a small file
lines_counter = 0 # Same as len(lines)
created_files = 0 # Counting how many small files have been created
with open('really_big_file.txt') as big_file:
for line in big_file: # Go throught the whole big file
lines.append(line)
lines_counter += 1
if lines_counter == lines_per_file:
idx = lines_per_file * (created_files + 1)
with open('small_file_%s.txt' % idx, 'w') as small_file:
# Write all lines on small file
small_file.write('\n'.join(stored_lines))
lines = [] # Reset variables
lines_counter = 0
created_files += 1 # One more small file has been created
# After for-loop has finished
if lines_counter: # There are still some lines not written on a file?
idx = lines_per_file * (created_files + 1)
with open('small_file_%s.txt' % idx, 'w') as small_file:
# Write them on a last small file
small_file.write('n'.join(stored_lines))
created_files += 1
print '%s small files (with %s lines each) were created.' % (created_files,
lines_per_file)
import csv
import os
import re
MAX_CHUNKS = 300
def writeRow(idr, row):
with open("file_%d.csv" % idr, 'ab') as file:
writer = csv.writer(file, delimiter=',', quotechar='\"', quoting=csv.QUOTE_ALL)
writer.writerow(row)
def cleanup():
for f in os.listdir("."):
if re.search("file_.*", f):
os.remove(os.path.join(".", f))
def main():
cleanup()
with open("large_file.csv", 'rb') as results:
r = csv.reader(results, delimiter=',', quotechar='\"')
idr = 1
for i, x in enumerate(r):
temp = i + 1
if not (temp % (MAX_CHUNKS + 1)):
idr += 1
writeRow(idr, x)
if __name__ == "__main__": main()
with open('/really_big_file.txt') as infile:
file_line_limit = 300
counter = -1
file_index = 0
outfile = None
for line in infile.readlines():
counter += 1
if counter % file_line_limit == 0:
# close old file
if outfile is not None:
outfile.close()
# create new file
file_index += 1
outfile = open('small_file_%03d.txt' % file_index, 'w')
# write to file
outfile.write(line)
I had to do the same with 650000 line files.
Use the enumerate index and integer div it (//) with the chunk size
When that number changes close the current file and open a new one
This is a python3 solution using format strings.
chunk = 50000 # number of lines from the big file to put in small file
this_small_file = open('./a_folder/0', 'a')
with open('massive_web_log_file') as file_to_read:
for i, line in enumerate(file_to_read.readlines()):
file_name = f'./a_folder/{i // chunk}'
print(i, file_name) # a bit of feedback that slows the process down a
if file_name == this_small_file.name:
this_small_file.write(line)
else:
this_small_file.write(line)
this_small_file.close()
this_small_file = open(f'{file_name}', 'a')
Set files to the number of file you want to split the master file to
in my exemple i want to get 10 files from my master file
files = 10
with open("data.txt","r") as data :
emails = data.readlines()
batchs = int(len(emails)/10)
for id,log in enumerate(emails):
fileid = id/batchs
file=open("minifile{file}.txt".format(file=int(fileid)+1),'a+')
file.write(log)
A very easy way would if you want to split it in 2 files for example:
with open("myInputFile.txt",'r') as file:
lines = file.readlines()
with open("OutputFile1.txt",'w') as file:
for line in lines[:int(len(lines)/2)]:
file.write(line)
with open("OutputFile2.txt",'w') as file:
for line in lines[int(len(lines)/2):]:
file.write(line)
making that dynamic would be:
with open("inputFile.txt",'r') as file:
lines = file.readlines()
Batch = 10
end = 0
for i in range(1,Batch + 1):
if i == 1:
start = 0
increase = int(len(lines)/Batch)
end = end + increase
with open("splitText_" + str(i) + ".txt",'w') as file:
for line in lines[start:end]:
file.write(line)
start = end
In Python files are simple iterators. That gives the option to iterate over them multiple times and always continue from the last place the previous iterator got. Keeping this in mind, we can use islice to get the next 300 lines of the file each time in a continuous loop. The tricky part is knowing when to stop. For this we will "sample" the file for the next line and once it is exhausted we can break the loop:
from itertools import islice
lines_per_file = 300
with open("really_big_file.txt") as file:
i = 1
while True:
try:
checker = next(file)
except StopIteration:
break
with open(f"small_file_{i*lines_per_file}.txt", 'w') as out_file:
out_file.write(checker)
for line in islice(file, lines_per_file-1):
out_file.write(line)
i += 1

Categories

Resources