Merging contents of two lists based on a if-loop - python

I have a minor problem while checking for elements in a list:
I have two files with contents something like this
file 1: file2:
47 358 47
48 450 49
49 56 50
I parsed both files into two lists and used the following code to check
# NOTE(review): indentation reconstructed -- the original paste lost it.
# Python 2 code (uses `print >>`); writes every file_2 line whose second
# column matches an entry of file_1 to the already-open `write_in` handle.
for i in file_1:
    for j in file_2:
        j = j.split()
        if i == j[1]:
            x = ' '.join(j)
            print >> write_in, x
I am now trying to get a "0" if the value of file_1 is not there in file_2 — for example, value "48" is not there in file_2, so I need to get the output like below (with only one space in between the two numbers). Also, both conditions should produce only one output file:
output_file:
358 47
0 48
450 49
56 50
I tried using the dictionary approach but I didn't quite get what I wanted (actually I don't know how to use dictionary in python correctly ;)). Any help will be great.

# Read keys from file1 and "value key" pairs from file2.
with open('file1') as f:
    r1 = f.read().split()
with open('file2') as f:
    r2 = f.read().split()
# Map each key (2nd token of every pair) to its value (1st token),
# e.g. {'47': '358', '49': '450', '50': '56'}.
d = dict(zip(r2[1::2], r2[::2]))
# "value key" when the key is known, "0 key" otherwise.
# (Conditional expression instead of the fragile `and/or` idiom.)
output = '\n'.join(d[x] + ' ' + x if x in d else '0 ' + x for x in r1)
# 'w', not 'wb': the data is text, and writing str to a binary-mode
# file raises TypeError on Python 3.
with open('output_file', 'w') as f:
    f.write(output)
Test
>>> file1='47\n48\n49\n50'
>>> file2='358 47\n450 49\n56 50'
>>>
>>> r1=file1.split()
>>> r2=file2.split()
>>>
>>> d=dict(zip(r2[1::2],r2[::2])) # keys from the odd positions, values from the even ones
>>> d
{'47': '358', '50': '56', '49': '450'}
>>>
>>> print '\n'.join(x in d and d[x]+' '+x or '0 '+x for x in r1)
358 47
0 48
450 49
56 50
>>>

You could modify your code quite easily:
# NOTE(review): indentation reconstructed -- the original paste lost it.
# For each key in file_1, keep the last matching "value key" line of
# file_2, or fall back to "0 <key>" when no line matches.
for i in file_1:
    x = None
    for j in file_2:
        j = j.split()
        if i == j[1]:
            x = ' '.join(j)
    if x is None:
        x = ' '.join(['0', i])
Depending on your inputs, the whole task might of course be simplified even further. At the moment, your code has O(n**2) complexity.

Here's a readable solution using a dictionary:
d = {}
for k in file1:
d[k] = 0
for line in file2:
v, k = line.split()
d[k] = v
for k in sorted(d):
print d[k], k

You can try something like:
l1 = open('file1').read().split()
l2 = [line.split() for line in open('file2')]
for x, y in zip(l1, l2):
if x not in y:
print 0, x
print ' '.join(y)
but if you follow your logic, the output should be
358 47
0 48
450 49
0 49
56 50
and not
358 47
0 48
450 49
56 50

def file_process(filename1, filename2):
    """Yield (value, key) pairs for every key listed in filename1, sorted by key.

    filename1 holds one key per line; filename2 holds "value key" lines.
    Keys missing from filename2 keep the default value 0.
    """
    # read first file: every key starts with the value 0
    with open(filename1) as fp:
        adict = {line.rstrip(): 0 for line in fp}
    # read second file as "value key" pairs, overriding the zeroes
    with open(filename2) as fp:
        adict.update(
            # partition returns (pre, sep, post); the [2::-2] slice picks
            # (post, pre) = (key, value) -- see the note below the code
            line.rstrip().partition(" ")[2::-2]
            for line in fp)
    for key in sorted(adict):
        yield adict[key], key
# Write the merged "value key" lines to the output file.
# `with` guarantees the file is flushed and closed even on error
# (the original left the handle to the garbage collector on failure).
with open("output_file", "w") as fp:
    fp.writelines("%s %s\n" % items for items in file_process("file1", "file2"))
str.partition(" ") returns a tuple of (pre-space, space, post-space). By slicing the tuple, starting at item 2 (post-space) and moving by a step of -2, we return a tuple of (post-space, pre-space), which are (key, value) for the dictionary that describes the solution.
PS Um :) I just noticed that my answer is essentially the same as Daniel Stutzbach's.

Related

How to give a condition that if difference between two hexadecimal no is 2 then concatenate two rows?

I have a file which contain two columns, eg
abc.txt
000000008b8c5200 af dg sd jh g1 43 66 23 67
000000008b8c5220 bc bi ub cb ue qi hd 16 72
0000000056fb2620 ad ag sj ha bn bc bh 53 69
0000000056fb2640 ak bh jg bf re 34 16 8g ff
0000000045ab4630 sg fj g3 6g dh w7 28 g7 dg
0000000045ab4650 jb sh jd b7 us vy ys du 89
Here I need to concatenate 2nd row 2nd column with first row first column like this:
bcbiubcbueqihd1672afdgsdjhg143662367
Condition for concatenating:
only when (hexadecimal)difference between 2nd row, 1st column and 1st row, 1st column is 20. For this example it would be:
000000008b8c5220 - 000000008b8c5200 = 20.
0000000056fb2640 - 0000000056fb2620 = 20.
0000000045ab4650 - 0000000045ab4630 = 20.
Similarly for upcoming rows and columns. Write the results to a file with first row and concatenated data like this:
000000008b8c5200 bcbiubcbueqihd1672afdgsdjhg143662367
0000000056fb2620 akbhjgbfre34168gffadagsjhabnbcbh5369
0000000045ab4630 jbshjdb7usvyysdu89sgfjg36gdhw728g7dg
How can I do this?
Here is how you could do it:
with open('input.txt', 'r') as f, open('output.txt', 'w') as g:
    out = []
    # each line -> list of its whitespace-separated columns
    lines = [elem.strip().split() for elem in f]
    # pair consecutive rows: 1&2, 3&4, 5&6, ...
    for line_x, line_y in zip(lines[::2], lines[1::2]):
        num_x = int(line_x[0], base=16)  # hex address -> int
        num_y = int(line_y[0], base=16)
        print(num_x, num_y, num_y - num_x)
        # the question's "difference of 20" is hexadecimal: 0x20 == 32
        if num_y - num_x == 32:
            # first row's address, then second row's data concatenated with
            # the FIRST row's data.
            # BUGFIX: was line_y[1] + line_y[1], duplicating the second row.
            new_row = '\t'.join([line_x[0], line_y[1] + line_x[1]])
            out.append(new_row)
    g.write('\n'.join(out))  # write all new rows to the output file
For the provided example it prints:
2341229056 2341229088 32
1459299872 1459299904 32
1168852528 1168852560 32
Note that the printed differences are 32 in decimal, which is 0x20 in hexadecimal — the question's "difference of 20" is a hex value, so comparing against 32 is what matches the sample rows.
UPDATE
for your changed input you can do it like this:
with open('input.txt', 'r') as f, open('output.txt', 'w') as g:
    out = []
    lines = [elem.strip().split() for elem in f]  # lines will be list of lists
    print(lines[0])  # this is how each line looks
    # ['000000008b8c5200', 'af', 'dg', 'sd', 'jh', 'g1', '43', '66', '23', '67']
    for a, b in zip(lines[::2], lines[1::2]):  # pairs: 1&2, 3&4, 5&6...
        num_a, remaining_a = a[0], ''.join(a[1:])
        num_b, remaining_b = b[0], ''.join(b[1:])
        # BUGFIX: the required difference of "20" is hexadecimal (0x20 == 32);
        # comparing against decimal 20 never matches the sample data, so the
        # output file stayed empty.
        if int(num_b, base=16) - int(num_a, base=16) == 0x20:
            # address first, then second row's data concatenated with first row's
            new_row = '\t'.join([num_a, remaining_b + remaining_a])
            out.append(new_row)
    g.write('\n'.join(out))  # write rows to the new file
If I understand your question is how to make hex str to int (for the subtraction)
you can use the int command with 2nd parameter as the base (16)
>>> int('abc', 16)
2748
>>> int('000000008b8c5220',16) - int('000000008b8c5200',16) == 0x20
True
if your question is how to read the text from the file:
# Read the file into a 2-D list: one list of tokens per (non-empty) line.
with open('filename.txt') as my_file:
    file_data = [line.replace('\n', '').split(' ') for line in my_file if line]
and now you have a 2D array of strs of your data

Python : Sum of numbers in different files

I know it seems my problem already has a solution but it's not quite what I need in the other subjects. So here is it:
I have 200 files where each file has 800 lines. Each line of a file contains 800 numbers. In short, each file has exactly the same format. Let's say, to make it simple, that my files are something like that:
File 1:
28 56 72 50 01
65 41 20 18 00
File 2:
01 32 09 05 42
00 23 14 52 99
What I need to do is, the sum of the numbers placed at the same location in the files, meaning that I need an output file like that:
Output:
29 88 81 55 43
65 64 34 70 99
For now, what I wanted to do is write each line in different files but it would take so much place...
I'm not sure how I can do that. If anyone has any suggestion, I'm open to it. Thanks
First you can load a single file, in order to get the structure of the file(s). This will also handle the case where not all rows have the same number of observations. Then, based on the structure, you iterate over all files and rows in order to add the single values.
further_files = ['file 2']
sums = []
# Load the first file to establish the row/column structure of all files.
with open('file 1') as first_file:
    for row in first_file:
        # BUGFIX: convert the tokens to int here; otherwise x + y below
        # concatenates strings ('28' + '01' -> '2801') instead of adding.
        sums.append([int(value) for value in row.split()])
# Add every remaining file onto the accumulator, cell by cell.
for name in further_files:
    with open(name) as open_file:
        for i, row in enumerate(open_file):
            sums[i] = [x + int(y) for x, y in zip(sums[i], row.split())]
Assuming you know the file format beforehand and have a list of file names, you just iterate over the files and accumulate the sums in a list of lists of the right size:
nrows, ncols = 2, 5  # 800, 800 in your real code
file_names = ["file1.txt", "file2.txt"]
# Accumulator grid of the right size, all zeroes.
sums = [[0] * ncols for _ in range(nrows)]
# Fold every file into the grid, cell by cell.
for name in file_names:
    with open(name) as handle:
        for r, line in enumerate(handle):
            for c, token in enumerate(line.split()):
                sums[r][c] += int(token)
for line_totals in sums:
    print(*line_totals)
# 29 88 81 55 43
# 65 64 34 70 99
Alternatively using numpy.loadtxt:
import numpy as np
# Element-wise sum across all files: each loadtxt call yields one 2-D int
# array, and sum() adds them position by position.
# NOTE(review): relies on `file_names` defined in the previous snippet.
sum(np.loadtxt(file_name, dtype=int) for file_name in file_names)
# array([[ 29, 88, 81, 55, 43],
# [ 65, 64, 34, 70, 99]])
Using numpy
Ex:
import os
import numpy as np

# Accumulate each line index's values across every file in the directory.
result = {}
base_path = r"PATH_TO_FILES"
for filename in os.listdir(base_path):  # iterate over each file
    filename = os.path.join(base_path, filename)
    with open(filename) as infile:  # open file for reading
        for i, line in enumerate(infile):
            if i not in result:
                # first file seen: seed row i with its values
                result[i] = np.array(line.split(), dtype=int)
            else:
                result[i] = result[i] + np.array(line.split(), dtype=int)  # sum lines
for k, v in result.items():
    print(v)
Output:
[29 88 81 55 43]
[65 64 34 70 99]

python; how to write output to text file

With my code, I loop over files and count patterns in files. My code is as follows
from collections import defaultdict
import csv, os, re
from itertools import groupby
import glob
def count_kmers(read, k):
    """Count every k-mer (length-k substring) of *read*.

    NOTE(review): this function has the exact defects discussed in the answer
    below -- it `return`s inside the ``for item in counts`` loop, so it reports
    only ONE k-mer and then exits, and it reads the globals ``basename`` and
    ``sequence`` set by the calling script.
    """
    # NOTE(review): values are ints, so defaultdict(int) would fit better
    # than defaultdict(list); the membership check below hides the mismatch.
    counts = defaultdict(list)
    num_kmers = len(read) - k + 1
    for i in range(num_kmers):
        kmer = read[i:i+k]
        if kmer not in counts:
            counts[kmer] = 0
        counts[kmer] += 1
    for item in counts:
        return (basename, sequence, item, counts[item])
# NOTE(review): indentation reconstructed -- the original paste lost it.
# For each FASTA file: build sequence + reverse complement, count 5-mers,
# and append the result to kmer.out.
for fasta_file in glob.glob('*.fasta'):
    basename = os.path.splitext(os.path.basename(fasta_file))[0]
    with open(fasta_file) as f_fasta:
        # groupby splits the file into header runs ('>'-lines) and sequence runs
        for k, g in groupby(f_fasta, lambda x: x.startswith('>')):
            if k:
                sequence = next(g).strip('>\n')
            else:
                d1 = list(''.join(line.strip() for line in g))
                d2 = ''.join(d1)
                complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                reverse_complement = "".join(complement.get(base, base) for base in reversed(d1))
                d3 = list(''.join(line.strip() for line in reverse_complement))
                d4 = ''.join(d3)
                d5 = (d2 + d4)
                counting = count_kmers(d5, 5)
                with open('kmer.out', 'a') as text_file:
                    # NOTE(review): this raises TypeError -- `counting` is a
                    # tuple, not a str; see the traceback quoted below.
                    text_file.write(counting)
And my output looks like this
1035 1 GAGGA 2
1035 1 CGCAT 1
1035 1 TCCCG 1
1035 1 CTCAT 2
1035 1 CCTGG 2
1035 1 GTCCA 1
1035 1 CATGG 1
1035 1 TAGCC 2
1035 1 GCTGC 7
1035 1 TGCAT 1
The code works fine, but I cannot write my output to a file. I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-190-89e3487da562> in <module>()
37 counting = count_kmers(d5, 5)
38 with open('kmer.out', 'w') as text_file:
---> 39 text_file.write(counting)
TypeError: write() argument must be str, not tuple
What am I doing wrong and how can I solve this problem, to make sure that my code write the output to a txt file?
The original verions of count_kmers() did not contain a return statement, which means it has an implicit return None.
As you assign this to counting all of your errors became self explanatory.
After your edit, the end of the function looked like this:
for item in counts:
return(basename, sequence, item, counts[item])
which returns a tuple with four values. It also exits the function on the first pass through the loop.

Python list in list formatting the output

I have a list in list as follows:
bounding_time = [['58'], ['68']]
v = [['-0.00162439495203'], ['-0.000178892778126'],]
and 58 in bounding_time corresponds to the first item in v, and likewise for 68. I am trying to write to a file in such a way that I should get
58 -0.00162439495203
68 -0.000178892778126
However, with my code, which is:
for bt_new in bounding_time:
bt = ''.join(map(str, bt_new))
print bt
for v_new in v[0]:
print v_new
I am getting
58
68
['-0.00162439495203']['-0.000178892778126']
Is there a way to format these lists to the desired output?
First set up the data:
>>> bounding_time = [['58'], ['68']]
>>> v = [['-0.00162439495203'], ['-0.000178892778126'],]
Now, use zip to iterate over each sublist in each list, accessing the first item in each, and concatenating those strings with 4 empty spaces:
>>> for i, j in zip(bounding_time, v):
... print i[0] + ' ' + j[0]
...
58 -0.00162439495203
68 -0.000178892778126
Or you can ensure you have the same width for the first column with str.ljust.
>>> for i, j in zip(bounding_time, v):
... print i[0].ljust(6) + j[0]
...
58 -0.00162439495203
68 -0.000178892778126

Python loop through two files, do computation, then output 3 files

I have 2 tab delimited files
for example:
file1:
12 23 43 34
433 435 76 76
file2:
123 324 53 65
12 457 54 32
I would like to loop through these 2 files, comparing every line of file1 with file2 and vice versa.
If, for example, the 1st number of 1st line in file1 is the same as the 1st number of 2nd line in file 2:
I would like to put from the 1st line in file1 in a file called output.
then I would like to put all the lines from file1 that didn't find a match in file 2 in a new file
and all the lines from file2 that didn't find a match in file1 in a new file.
so far I have been able to find the matching lines and put them in a file but I'm having trouble putting the lines that didn't match into 2 separate files.
# NOTE(review): indentation reconstructed -- the original paste lost it.
# NOTE(review): the file handles are never closed; output may not be flushed.
one = open(file1, 'r').readlines()
two = open(file2, 'r').readlines()
output = open('output.txt', 'w')
count = 0
list1 = []  # list for lines in file1 that didn't find a match
list2 = []  # list for lines in file2 that didn't find a match
for i in one:
    for j in two:
        columns1 = i.strip().split('\t')
        num1 = int(columns1[0])
        columns2 = j.strip().split('\t')
        num2 = int(columns2[0])
        if num1 == num2:
            count += 1
            output.write(i + j)
        else:
            # NOTE(review): this runs on EVERY mismatching pair, so lines that
            # do match elsewhere still land in list1/list2 -- the exact problem
            # the question asks about.
            list1.append(i)
            list2.append(j)
Problem I have here is with the else part.
Can someone show me the right and better way to do this, I would greatly appreciate.
EDIT: Thanks for the quick responses everyone
The 3 output I would be looking for is:
Output_file1: #Matching results between the 2 files
12 23 43 34 #line from file1
12 457 54 32 #line from file2
Output_file2: #lines from the first file that didn't find a match
433 435 76 76
Output_file3: #lines from the second file that didn't find a match
123 324 53 65
I would suggest that you use the csv module to read your files like so (you might have to mess around with the dialect, see http://docs.python.org/library/csv.html for help:
import csv

# BUGFIX: the built-in dialect is spelled 'excel' -- csv.reader raises an
# "unknown dialect" error for the original 'excell'.
one = csv.reader(open(file1, 'r'), dialect='excel')
two = csv.reader(open(file2, 'r'), dialect='excel')
then you might find it easier to "zip" along the lines of both files at the same time like so (see http://docs.python.org/library/itertools.html#itertools.izip_longest):
import itertools

file_match = open('match', 'w')
file_nomatch1 = open('nomatch1', 'w')
file_nomatch2 = open('nomatch2', 'w')
# Walk both readers in lockstep; the shorter file is padded with "-".
# NOTE(review): izip_longest is Python 2; on Python 3 use itertools.zip_longest.
for i, j in itertools.izip_longest(one, two, fillvalue="-"):
    if i[0] == j[0]:
        file_match.write(str(i) + '\n')
    else:
        file_nomatch1.write(str(i) + '\n')
        file_nomatch2.write(str(j) + '\n')
    # and maybe handle the case where one is "-"
I reread the post and realized you are looking for a match between ANY two lines in both files. Maybe someone will find the above code useful, but it wont solve your particular problem.
I'd suggest using set operation
from collections import defaultdict
def parse(filename):
    """Map each line's leading integer to the list of lines that start with it."""
    result = defaultdict(list)
    for line in open(filename):
        # take the first number and use it as the grouping key
        num = int(line.strip().split(' ')[0])
        result[num].append(line)
    return result
def select(selected, items):
    """Flatten the *items* entries for every key in *selected* into one list."""
    return [line for key in selected for line in items[key]]
one = parse('one.txt')
two = parse('two.txt')
one_s = set(one)
two_s = set(two)
# Set algebra on the first-column keys decides where each line group goes.
intersection = one_s & two_s
one_only = one_s - two_s
two_only = two_s - one_s
# Merge both files' lines per key for the intersection output.
one_two = defaultdict(list)
for e in one:
    one_two[e].extend(one[e])
for e in two:
    one_two[e].extend(two[e])
# BUGFIX(robustness): close the output files deterministically; the original
# open(...).writelines(...) left flushing to the garbage collector.
with open('intersection.txt', 'w') as f:
    f.writelines(select(intersection, one_two))
with open('one_only.txt', 'w') as f:
    f.writelines(select(one_only, one))
with open('two_only.txt', 'w') as f:
    f.writelines(select(two_only, two))
I think that it is not the best way, but it works for me and looks pretty easy to understand:
# Sorry but was not able to check code below
def get_diff(fileObj1, fileObj2):
    """Split two 'key<TAB>...' files into (common, only-in-1, only-in-2) lines.

    Returns three lists: lines whose first column appears in both inputs,
    lines whose key is unique to fileObj1, and lines unique to fileObj2.
    """
    f1Diff = []
    f2Diff = []
    outputData = []
    # x is one row; strip trailing newlines so lines compare cleanly
    f1Data = set(x.strip() for x in fileObj1)
    f2Data = set(x.strip() for x in fileObj2)
    f1Column1 = set(x.split('\t')[0] for x in f1Data)
    f2Column1 = set(x.split('\t')[0] for x in f2Data)
    # BUGFIX: use set difference, not symmetric difference (^) -- with ^ both
    # "only in one file" sets were identical, so f2Diff stayed empty.
    l1Col1Diff = f1Column1 - f2Column1
    l2Col1Diff = f2Column1 - f1Column1
    # BUGFIX: the original referenced the undefined names f2column1, common,
    # l1ColDiff and l2ColDiff, raising NameError on first call.
    commonPart = f1Column1 & f2Column1
    for line in f1Data.union(f2Data):
        lineKey = line.split('\t')[0]
        if lineKey in commonPart:
            outputData.append(line)
        elif lineKey in l1Col1Diff:
            f1Diff.append(line)
        elif lineKey in l2Col1Diff:
            f2Diff.append(line)
    return outputData, f1Diff, f2Diff
# Unpack the three result lists: matches, lines only in file1, lines only in file2.
outputData, file1Missed, file2Missed = get_diff(open(file1, 'r'), open(file2, 'r'))
I think that this code fits your purposes
one = open(file1, 'r').readlines()
two = open(file2, 'r').readlines()
output = open('output.txt', 'w')
# first-column key of every line in each file
first = {x.split('\t')[0] for x in one}
second = {x.split('\t')[0] for x in two}
common = first.intersection(second)
# BUGFIX(py3): filter() returns a lazy iterator on Python 3, so len() and
# indexing below would fail; list() is a no-op on Python 2 and fixes Python 3.
list1 = list(filter(lambda x: not x.split('\t')[0] in common, one))
list2 = list(filter(lambda x: not x.split('\t')[0] in common, two))
res1 = list(filter(lambda x: x.split('\t')[0] in common, one))
res2 = list(filter(lambda x: x.split('\t')[0] in common, two))
count = len(res1)
# Interleave the matching lines of both files into the output.
for x in range(count):
    output.write(res1[x])
    output.write(res2[x])

Categories

Resources