I have three text files:
fileA:
13 abc
123 def
234 ghi
1234 jkl
12 mno
fileB:
12 abc
12 def
34 qwe
43 rty
45 mno
fileC:
12 abc
34 sdg
43 yui
54 poi
54 def
I would like to see what all the values in the 2nd column are matching between the files. The following code works if the 2nd column is already sorted. but if the 2nd column is not sorted, how do i sort the 2nd column and compare the files ?
# Read three tab-separated files and report which second-column values the
# files share (all three, then A&B, then A&C).
# NOTE(review): the files are never closed -- "with open(...)" would be
# safer. The triple nested loop is O(len(A)*len(B)*len(C)); the set-based
# approach suggested below reduces this to roughly linear time.
fileA = open("A.txt",'r')
fileB = open("B.txt",'r')
fileC = open("C.txt",'r')
# Collect each line of every file as a [col1, col2] pair (split on tabs).
listA1 = []
for line1 in fileA:
listA = line1.split('\t')
listA1.append(listA)
listB1 = []
for line1 in fileB:
listB = line1.split('\t')
listB1.append(listB)
listC1 = []
for line1 in fileC:
listC = line1.split('\t')
listC1.append(listC)
# Values present in all three files. The third comparison is redundant:
# key1[1] == key2[1] and key2[1] == key3[1] already implies key3[1] == key1[1].
for key1 in listA1:
for key2 in listB1:
for key3 in listC1:
if key1[1] == key2[1] and key2[1] == key3[1] and key3[1] == key1[1]:
print "Common between three files:",key1[1]
print "Common between file1 and file2 files:"
for key1 in listA1:
for key2 in listB1:
if key1[1] == key2[1]:
print key1[1]
print "Common between file1 and file3 files:"
for key1 in listA1:
for key2 in listC1:
if key1[1] == key2[1]:
print key1[1]
If you just want to sort A1, B1, and C1 by the second column, this is easy:
# Sort in place by the second column. NOTE(review): needs "import operator".
listA1.sort(key=operator.itemgetter(1))
If you don't understand itemgetter, this is the same:
# Equivalent lambda form of the itemgetter(1) key function.
listA1.sort(key=lambda element: element[1])
However, I think a better solution is to just use a set:
# Keep only the second-column values, as sets -- set intersection then
# finds common values without any nested loops.
setA1 = set(element[1] for element in listA1)
setB1 = set(element[1] for element in listB1)
setC1 = set(element[1] for element in listC1)
Or, more simply, don't build the lists in the first place; do this:
# Same idea, but filling the set directly while reading the file,
# skipping the intermediate list entirely.
setA1 = set()
for line1 in fileA:
listA = line1.split('\t')
setA1.add(listA[1])
Either way:
print "Common between file1 and file2 files:"
for key in setA1 & setA2:
print key
To simplify it further, you probably want to refactor the repeated stuff into functions first:
def read_file(path):
    """Return the set of second-column values (TAB-split index 1) of *path*.

    Values keep whatever trailing newline the raw line had, exactly as
    line.split on TAB produces them.
    """
    with open(path) as f:
        return {line.split('\t')[1] for line in f}
# Load the three files' second columns as sets via read_file.
setA1 = read_file('A.txt')
setB1 = read_file('B.txt')
setC1 = read_file('C.txt')
And then you can find further opportunities. For example:
def read_file(path):
    """Read *path* as CSV and return the set of values in column index 1."""
    with open(path) as f:
        return {row[1] for row in csv.reader(f)}
As John Clements points out, you don't even really need all three of them to be sets, just A1, so you could instead do this:
def read_file(path):
    """Lazily yield the value in column index 1 of each CSV row of *path*."""
    with open(path) as f:
        for record in csv.reader(f):
            yield record[1]
setA1 = set(read_file('A.txt'))
iterB1 = read_file('B.txt')
# BUG FIX: the original read 'B.txt' a second time here; the iterator for
# file C must of course come from C.txt.
iterC1 = read_file('C.txt')
The only other change you need is that you have to call intersection instead of using the & operator, so:
# intersection() accepts any iterable, so the generator from read_file can
# be consumed directly without first materialising a second set.
for key in setA1.intersection(iterB1):
I'm not sure this last change is actually an improvement. But in Python 3.3, where the only thing you need to do is change the return set(…) into yield from (…), I probably would do it this way. (Even if the files are huge and have tons of duplicates, so there was a performance cost to it, I'd just stick unique_everseen from the itertools recipes around the read_file calls.)
Related
I have two csv files, each of them are in this format,
file1
zip name score
23431 david 12
23231 rob 45
33441 hary 23
98901 rgrg 55
file2
zip1 name1 score1
23433 david 12
23245 stel 45
33478 hary 23
98988 rob 55
12121 jass 33
and I have a list that has the names, like this
lista = ['harry', 'rob', 'wine', 'david', 'jass']
The final csv file should look like this:
name zip score zip1 score1
harry x x x x
rob 23231 45 98988 55
wine x x x x
david 23431 12 23433 12
jass x x 12121 33
that means, if any name from the list lies in either of the csv files, than we should include it in the new csv file along with its zip and score. Otherwise we should print 'x' in it.
This is what I have done so far:
import csv
# Merge file1/file2 by name into merge_final.csv, writing 'x' for missing
# fields. NOTE(review): this version is broken -- the specific bugs are
# discussed in the answer text that follows the output sample.
with open('file1.csv', 'r') as input1, open('file2.csv', 'r') as input2, open('merge_final.csv', 'w') as output:
writer = csv.writer(output)
reader1 = csv.reader(input1)
# NOTE(review): typo -- "eader2" is defined here but "reader2" is used
# further down, which would raise a NameError.
eader2 = csv.reader(input2)
lista = ['harry', 'rob', 'wine', 'david', 'jass']
# NOTE(review): header typo -- 'score' appears twice; last should be 'score1'.
writer.writerow(['name','zip','score','zip1','score'])
for i in lista:
# NOTE(review): list(reader1) exhausts the csv iterator on the first pass
# through lista, so every later iteration sees an empty list.
for row in list(reader1):
rev = row[1]
if i in rev:
score = row[2]
zip = row[0]
# NOTE(review): without a break, any later non-matching row resets
# score/zip back to 'x' even after a match was found.
else:
score = 'x'
zip = 'x'
for row in list(reader2):
rev = row[1]
if i in rev:
score1 = row[2]
zip1 = row[0]
else:
score1 = 'x'
zip1 = 'x'
writer.writerow([i, score, zip, score1, zip1])
This code is not working as expected. This is the output I got using this code.
name zip score zip1 score1
harry x x x x
rob x x x x
wine x x x x
david x x x x
jass x x x x
Even though there are many common words, only 'x' gets printed in the final merged csv file. I think the problem is with the loops, but I can't seem to figure out the issue.
First, the first call of list(readerX) exhausts the iterator that is the file handle.
Secondly, rev is supposed to be the name already, so check for equality not contains: if name == rev.
Thirdly, you'd mostly get 'x's except for the last names in each file since you iterate the files to the end and only the last row will really matter. You should break the inner loops as soon as you find a name, but set the default values only after you iterated the entire file without finding the name.
Also, it is very bad performance-wise to repeatedly iterate both files. You better load the two files into a permanent data structure with faster lookup like a nested dict with names as keys:
# Build one dict per file keyed by name -> {'zip': ..., 'score': ...} for
# O(1) lookups instead of re-scanning the files for every name.
d1 = {row[1]: {'zip': row[0], 'score': row[2]} for row in reader1}
d2 = {row[1]: {'zip': row[0], 'score': row[2]} for row in reader2}
# {'david': {'zip': 23431, 'score: 12, ...}
for name in lista:
# Only write a row when the name appears in at least one file.
if name in d1 or name in d2:
writer.writerow([
name,
# .get() with a {} default makes missing names fall through to 'x'.
d1.get(name, {}).get('zip', 'x'),
d1.get(name, {}).get('score', 'x'),
d2.get(name, {}).get('zip', 'x'),
d2.get(name, {}).get('score', 'x'),
])
To make your own approach work, change as follows, but note that this has terrible performance for larger data because of the nested loops:
# Corrected version of the questioner's inner loop for file1 (the loop
# over file2 is analogous).
# next(reader1) # skip the header line if necessary
lst1 = list(reader1) # load all the data into a list beforehand ...
for i in lista:
for row in lst1: # ... that you can repeatedly iterate
rev = row[1]
if i == rev: # compare for equality
score = row[2]
zip = row[0]
break # <- you found the name, so end the loop!
else: # note the indentation: this is a for-else-loop, not an if-else
# the else-part is only executed if the for loop was NOT break'ed
score = 'x'
zip = 'x'
Don't you think it would be better to read the two files into a single nested dictionary where the names would be the keys and the values would be dictionaries with the keys 'zip', 'zip1', 'score' and 'score1'?
{'hary' :
{'zip1':33478,
'zip':33441 ,
'score':23,
'score1':23 }
}
Then iterate through the list and print 'x' for whatever the keys are not present
The errors in above code:
The first loop of list(reader1) exhausts the iterator that is the file handle. So when the next iteration for 'lista' starts the reader1 is empty with len=0.
Instead of repeatedly iterating the file handle store data in a list or dictionary.
When the i matches with rev in case of if(i in rev) you are continuing to iterate over rest of the file which causes the value of zip and score to be reset to 'x' as for the next iteration "i in rev" will give False. You need to remove the else part to rectify this. Instead declare zip,score as 'x' just after for i in lista:
How to solve this renaming duplicates problem without resorting to renaming with something unique like "_DUPLICATED_#NO" the names have to be unique when finished, and preferably with iterative numbers denoting number of duplicates
from collections import defaultdict
l = ["hello1","hello2","hello3",
"hello","hello","hello"]
# Count prior occurrences of each name; the -1 start means the first
# occurrence gets no suffix.
tally = defaultdict(lambda:-1)
for i in range(len(l)):
e = l[i]
tally[e] += 1
if tally[e] > 0:
# NOTE(review): the suffix is chosen without checking whether the
# resulting name already exists in l -- which is why "hello" becomes
# "hello1"/"hello2" and collides with the pre-existing entries.
e += str(tally[e])
l[i] = e
print (l)
results:
['hello1', 'hello2', 'hello3', 'hello', 'hello1', 'hello2']
as you can see, the names are not unique
This seems simple enough. You start with a list of filenames:
# Input names; some are already "numbered" and would collide with naive
# suffixes generated for the duplicates of "hello".
l = ["hello1","hello2","hello3",
"hello","hello","hello"]
Then you iterate through them to finished filenames, incrementing a trailing number by 1 if a duplicate is found.
# Map each de-duplicated filename back to the original name it came from.
result = {}
for original_name in l:
    candidate = original_name
    suffix = 1
    # Bump the trailing number until the candidate is not yet taken.
    while candidate in result:
        candidate = original_name + str(suffix)
        suffix += 1
    result[candidate] = original_name
This should leave you with a dictionary like:
{"hello1": "hello1",
"hello2": "hello2",
"hello3": "hello3",
"hello": "hello",
"hello4": "hello",
"hello5": "hello"}
Of course if you don't care about mapping the originals to the duplicate names, you can drop that part.
# Same uniquifying pass, but tracking only the set of used names (no
# mapping back to the originals).
result = set()
for original_name in l:
    candidate = original_name
    suffix = 1
    while candidate in result:
        candidate = original_name + str(suffix)
        suffix += 1
    result.add(candidate)
If you want a list afterward, just cast it that way.
# Materialise the unique names as a list (order is arbitrary for a set).
final = list(result)
Note that if you're creating files, this is exactly what the tempfile module is designed to do.
import tempfile
l = ["hello1","hello2","hello3",
"hello","hello","hello"]
# NamedTemporaryFile appends a random suffix to each prefix, so the
# resulting filenames are guaranteed unique (delete=False keeps them).
fs = [tempfile.NamedTemporaryFile(prefix=fname, delete=False, dir="/some/directory/") for fname in l]
This will not create nicely incrementing filenames, but they are guaranteed unique, and fs will be a list of the (open) file objects rather than a list of names, although NamedTemporaryFile.name will give you the filename.
I need help with a pretty simple exercise I am trying to execute, just syntactically I'm a bit lost
basically I read in a very brief text file containing 15 lines of 3 elements (essentially 2 keys and a value)
put those elements into a dictionary comprised of dictionaries
the 1st dictionary contains location and the 2nd dictionary which is made up of the type of the item and how much it costs for example
gymnasium weights 15
market cereal 5
gymnasium shoes 50
saloon beer 3
saloon whiskey 10
market bread 5
which would result in this
{
'gymnasium': {
'weights': 15,
'shoes': 50
},
'saloon': {
'beer': 3,
'whiskey': 10
}
}
and so on for the other keys
basically I need to loop through this file but I'm struggling to read in the contents as a dict of dicts.
moreover without that portion i cant figure out how to append the inner list to the outer list if an instance of the key in the outer list occurs.
I would like to do this recursively
location_dict = {} #row #name day weight temp
item_dict = {}
for line in file:
line = line.strip()
# NOTE(review): syntax error -- unbalanced brackets and a stray trailing
# backtick; it also reads item_dict['location'] before anything has been
# stored in item_dict.
location_dict[item_dict['location'] = item_dict`
this is a good use for setdefault (or defaultdict)
# setdefault(key1, {}) inserts (and returns) an empty inner dict the first
# time a location is seen; the item is then assigned into it.
# NOTE(review): value is stored as a string here; int(value) may be wanted.
data = {}
for line in file:
key1,key2,value = line.split()
data.setdefault(key1,{})[key2] = value
print data
or based on your comment
from collections import defaultdict
data = defaultdict(lambda:defaultdict(int))
for line in file:
key1,key2,value = line.split()
data[key1][key2] += value
print data
Here is another solution.
yourFile = open("yourFile.txt", "r")
yourText = yourFile.read()
textLines = yourText.split("\n")
locationDict = {}
for line in textLines:
k1, k2, v = line.split(" ")
if k1 not in locationDict.keys():
locationDict[k1] = {}
else:
if k2 not in locationDict[k1].keys():
locationDict[k1][k2] = int(v)
else:
locationDict[k1][k2] += int(v)
print locationDict
Hope it helps!
I have two CSV files that I want to compare one looks like this:
"a" 1 6 3 1 8
"b" 15 6 12 5 6
"c" 7 4 1 4 8
"d" 14 8 12 11 4
"e" 1 8 7 13 12
"f" 2 5 4 13 9
"g" 8 6 9 3 3
"h" 5 12 8 2 3
"i" 5 9 2 11 11
"j" 1 9 2 4 9
So "a" possesses the numbers 1,6,3,1,8 etc. The actual CSV file is 1,000s of lines long so you know for efficiency sake when writing the code.
The second CSV file looks like this:
4
15
7
9
2
I have written some code to import these CSV files into lists in python.
# Load the winning numbers (one per line) and keep handy aliases for the
# first five. NOTE(review): csv.reader yields a *list* per row, so each
# wnN is e.g. ['4'], not the string '4'.
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
wn1 = winningnumbers[0]
wn2 = winningnumbers[1]
wn3 = winningnumbers[2]
wn4 = winningnumbers[3]
wn5 = winningnumbers[4]
print(winningnumbers)
# Load all entries into a list of rows.
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
enl = list(readere)
How would I now search cross reference number 4 so wn1 of CSV file 2 with the first csv file. So that it returns that "b" has wn1 in it. I imported them as a list to see if I could figure out how to do it but just ended up running in circles. I also tried using dict() but had no success.
If I understood you correctly, you want to find the first index (or all indexes) of numbers in entries that are winning. If you want it, you can do that:
# Scan the entries file for the first line whose values all equal the
# winning numbers, tracking the line index manually.
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
winning_number_index = -1 # Default value which we will print if nothing is found
current_index = 0 # Initial index
for line in readere: # Iterate over entries file
all_numbers_match = True # Default value that will be set to False if any of the elements doesn't match with winningnumbers
for i in range(len(line)):
# NOTE(review): winningnumbers[i] is a whole csv row (a list) while
# line[i] is a string, so as written this comparison can never be
# equal -- confirm the intended structure of winningnumbers.
if line[i] != winningnumbers[i]: # If values of current line and winningnumbers with matching indexes are not equal
all_numbers_match = False # Our default value is set to False
break # Exit "for" without finishing
if all_numbers_match == True: # If our default value is still True (which indicates that all numbers match)
winning_number_index = current_index # Current index is written to winning_number_index
break # Exit "for" without finishing
else: # Not all numbers match
current_index += 1
print(winning_number_index)
This will print the index of the first winning number in entries (if you want all the indexes, write about it in the comments).
Note: this is not the optimal code to solve your problem. It's just easier to understand and debug if you're not familiar with Python's more advanced features.
You should probably consider not abbreviating your variables. entries_reader takes just a second more to write and 5 seconds less to understand than readere.
This is the variant that is faster, shorter and more memory efficient, but may be harder to understand:
# Compact variant: enumerate() tracks the line index, and the for-else
# assigns the -1 default only when no line matched.
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
for line_index, line in enumerate(readere):
# all(...) is True only if every column equals the corresponding
# winning number. NOTE(review): xrange is Python 2 only.
if all((line[i] == winningnumbers[i] for i in xrange(len(line)))):
winning_number_index = line_index
break
else:
winning_number_index = -1
print(winning_number_index)
The features that might be unclear are probably enumerate(), all() and using else with for rather than if. Let's go through all of them one by one.
To understand this usage of enumerate, you'll need to understand that syntax:
# Sequence unpacking: a becomes 1 and b becomes 2.
a, b = [1, 2]
Variables a and b will be assigned according values from the list. In this case a will be 1 and b will be 2. Using this syntax we can do that:
# Unpacking also works per element inside a for loop.
for a, b in [[1, 2], [2, 3], ['spam', 'eggs']]:
# do something with a and b
in each iteration, a and b will be 1 and 2, 2 and 3, 'spam' and 'eggs' accordingly.
Let's assume we have a list a = ['spam', 'eggs', 'potatoes']. enumerate() just returns a "list" like that: [(0, 'spam'), (1, 'eggs'), (2, 'potatoes')]. So, when we use it like that,
# enumerate() pairs a running index (starting at 0) with each line.
for line_index, line in enumerate(readere):
# Do something with line_index and line
line_index will be 0, 1, 2, etc.
The all() function accepts a sequence (list, tuple, etc.) and returns True only if every element in it is truthy.
The list comprehension mylist = [line[i] == winningnumbers[i] for i in range(len(line))] builds a list and is similar to the following:
# Expanded equivalent of the comprehension: a list of per-column booleans.
mylist = []
for i in range(len(line)):
mylist.append(line[i] == winningnumbers[i]) # a == b will return True if a is equal to b
So all() will return True only in cases when all the numbers from the entry match the winning numbers.
Code in else section of for is called only when for was not interrupted by break, so in our situation it's good for setting a default index to return.
Having duplicate numbers seems illogical but if you want to get the count of matched numbers for each row regardless of index then makes nums a set and sum the times a number from each row is in the set:
from itertools import islice, imap
import csv
# Python 2 code: imap is itertools.imap (in Python 3, use the built-in map).
with open("in.txt") as f,open("numbers.txt") as nums:
# make a set of all winning nums
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
# iterate over each row and sum how many matches we get
for row in r:
# islice(row, 1, None) skips the label column; summing the booleans
# counts how many of the row's numbers are in the winning set.
print("{} matched {}".format(row[0], sum(n in nums
for n in islice(row, 1, None))))
Which using your input will output:
a matched 0
b matched 1
c matched 2
d matched 1
e matched 0
f matched 2
g matched 0
h matched 1
i matched 1
j matched 2
presuming your file is comma separated and you have a number per line in your numbers file.
If you actually want to know which numbers if any are present then you need to iterate over the number and print each one that is in our set:
from itertools import islice, imap
import csv
# Report each individual winning number found per row (Python 2: imap).
with open("in.txt") as f, open("numbers.txt") as nums:
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
for row in r:
for n in islice(row, 1, None):
if n in nums:
print("{} is in row {}".format(n, row[0]))
print("")
But again, I am not sure having duplicate numbers makes sense.
To group the rows based on how many matches, you can use a dict using the sum as the key and appending the first column value:
from itertools import islice, imap
import csv
from collections import defaultdict
# Group row labels by their match count: results[count] -> [row labels].
with open("in.txt") as f,open("numbers.txt") as nums:
# make a set of all winning nums
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
results = defaultdict(list)
# iterate over each row and sum how many matches we get
for row in r:
results[sum(n in nums for n in islice(row, 1, None))].append(row[0])
results:
defaultdict(<type 'list'>,
{0: ['a', 'e', 'g'], 1: ['b', 'd', 'h', 'i'],
2: ['c', 'f', 'j']})
The keys are numbers match, the values are the rows ids that matched the n numbers.
I have 2 tab delimited files
for example:
file1:
12 23 43 34
433 435 76 76
file2:
123 324 53 65
12 457 54 32
I would like to loop through these 2 files, comparing every line of file1 with file2 and vice versa.
If, for example, the 1st number of 1st line in file1 is the same as the 1st number of 2nd line in file 2:
I would like to put from the 1st line in file1 in a file called output.
then I would like to put all the lines from file1 that didn't find a match in file 2 in a new file
and all the lines from file2 that didn't find a match in file1 in a new file.
so far I have been able to find the matching lines and put them in a file but I'm having trouble putting the lines that didn't match into 2 separate files.
# Compare the first column of every line in file1 against every line in
# file2; matching pairs are written to output.txt.
one=open(file1, 'r').readlines()
two=open(file2, 'r').readlines()
output=open('output.txt', 'w')
count=0
list1=[] #list for lines in file1 that didn't find a match
list2=[] #list for lines in file2 that didn't find a match
for i in one:
for j in two:
columns1=i.strip().split('\t')
num1=int(columns1[0])
columns2=j.strip().split('\t')
num2=int(columns2[0])
if num1==num2:
count+=1
output.write(i+j)
else:
# NOTE(review): this appends i and j on *every* non-matching pair,
# so lines land in list1/list2 many times even when they do have a
# match elsewhere -- the problem the questioner describes.
list1.append(i)
list2.append(j)
Problem I have here is with the else part.
Can someone show me the right and better way to do this, I would greatly appreciate.
EDIT: Thanks for the quick responses everyone
The 3 output I would be looking for is:
Output_file1: #Matching results between the 2 files
12 23 43 34 #line from file1
12 457 54 32 #line from file2
Output_file2: #lines from the first file that didn't find a match
433 435 76 76
Output_file3: #lines from the second file that didn't find a match
123 324 53 65
I would suggest that you use the csv module to read your files like so (you might have to mess around with the dialect, see http://docs.python.org/library/csv.html for help:
import csv
# BUG FIX: the dialect is spelled "excel" (one "l"); passing the unknown
# name "excell" makes csv.reader raise an error at construction time.
one = csv.reader(open(file1, 'r'), dialect='excel')
two = csv.reader(open(file2, 'r'), dialect='excel')
then you might find it easier to "zip" along the lines of both files at the same time like so (see http://docs.python.org/library/itertools.html#itertools.izip_longest):
import itertools
# Walk both files in lockstep, padding the shorter one with "-".
# NOTE(review): Python 2's itertools.izip_longest; Python 3 renamed it
# zip_longest.
file_match = open('match', 'w')
file_nomatch1 = open('nomatch1', 'w')
file_nomatch2 = open('nomatch2', 'w')
for i,j in itertools.izip_longest(one, two, fillvalue="-"):
# Only compares line k of file1 with line k of file2, not all pairs.
if i[0] == j[0]:
file_match.write(str(i)+'\n')
else:
file_nomatch1.write(str(i)+'\n')
file_nomatch2.write(str(j)+'\n')
# and maybe handle the case where one is "-"
I reread the post and realized you are looking for a match between ANY two lines in both files. Maybe someone will find the above code useful, but it won't solve your particular problem.
I'd suggest using set operation
from collections import defaultdict
def parse(filename):
# Map first-column number -> list of the full lines carrying that number.
# NOTE(review): splits on a single space; the question says the files are
# tab-delimited -- confirm the real separator.
result = defaultdict(list)
for line in open(filename):
# take the first number and put it in result
num = int(line.strip().split(' ')[0])
result[num].append(line)
return result
def select(selected, items):
# Collect the lines from items (a num -> lines dict) whose key is in selected.
result = []
for s in selected:
result.extend(items[s])
return result
one = parse('one.txt')
two = parse('two.txt')
# Set algebra over the dicts' keys gives the three groups of first numbers.
one_s = set(one)
two_s = set(two)
intersection = one_s & two_s
one_only = one_s - two_s
two_only = two_s - one_s
# one_two merges both files' lines per number, so the intersection output
# contains the matching lines from *both* files.
one_two = defaultdict(list)
for e in one: one_two[e].extend(one[e])
for e in two: one_two[e].extend(two[e])
open('intersection.txt', 'w').writelines(select(intersection, one_two))
open('one_only.txt', 'w').writelines(select(one_only, one))
open('two_only.txt', 'w').writelines(select(two_only, two))
I think that it is not the best way, but it works for me and looks pretty easy to understand:
# Sorry but was not able to check code below
def get_diff(fileObj1, fileObj2):
    """Split two tab-separated line sources by their first column.

    Returns a tuple (outputData, f1Diff, f2Diff):
      outputData -- lines (from either source) whose first column occurs
                    in both sources
      f1Diff     -- lines whose first column occurs only in fileObj1
      f2Diff     -- lines whose first column occurs only in fileObj2
    Line order within each list is unspecified (sets are used internally).
    """
    f1Diff = []
    f2Diff = []
    outputData = []
    # x is one row; strip removes the trailing newline before comparing
    f1Data = set(x.strip() for x in fileObj1)
    f2Data = set(x.strip() for x in fileObj2)
    f1Column1 = set(x.split('\t')[0] for x in f1Data)
    f2Column1 = set(x.split('\t')[0] for x in f2Data)
    # BUG FIX: the original used ^ (symmetric difference) for both "only in
    # file1" and "only in file2", which produces the SAME set twice; plain
    # set difference is what is meant. Several names were also misspelled
    # (f2column1, common, l1ColDiff/l2ColDiff), so the function could not
    # run as posted.
    l1Col1Diff = f1Column1 - f2Column1
    l2Col1Diff = f2Column1 - f1Column1
    commonPart = f1Column1 & f2Column1
    for line in f1Data.union(f2Data):
        lineKey = line.split('\t')[0]
        if lineKey in commonPart:
            outputData.append(line)
        elif lineKey in l1Col1Diff:
            f1Diff.append(line)
        elif lineKey in l2Col1Diff:
            f2Diff.append(line)
    return outputData, f1Diff, f2Diff
# Drive get_diff with the two input files opened for reading.
outputData, file1Missed, file2Missed = get_diff(open(file1, 'r'), open(file2, 'r'))
I think that this code fits your purposes
# Set-based split: first/second hold each file's first-column values and
# "common" drives filter() to select matched vs unmatched lines.
# NOTE(review): relies on Python 2, where filter() returns a list; in
# Python 3 it returns an iterator, so len(res1) and res1[x] would fail.
one=open(file1, 'r').readlines()
two=open(file2, 'r').readlines()
output=open('output.txt', 'w')
first = {x.split('\t')[0] for x in one}
second = {x.split('\t')[0] for x in two}
common = first.intersection( second )
list1 = filter( lambda x: not x.split('\t')[0] in common, one )
list2 = filter( lambda x: not x.split('\t')[0] in common, two )
res1 = filter( lambda x: x.split('\t')[0] in common, one )
res2 = filter( lambda x: x.split('\t')[0] in common, two )
count = len( res1 )
# NOTE(review): pairs matching lines up positionally -- assumes both
# files contain the same number of matching lines.
for x in range(count):
output.write( res1[x] )
output.write( res2[x] )