Find duplicates of two columns from csv

Find duplicates of two columns from csv - python

I want to find duplicate values of one column and replaced with value of another column of csv which has multiple columns. So first I put two columns from the csv to the dictionary. Then I want to find duplicate values of dictionary that has string values and keys. I tried with solutions of remove duplicates of dictionary but got the error as not hashable or no result. Here is the first part of code.
import csv
from collections import defaultdict
import itertools as it
mydict = {}
index = 0
reader = csv.reader(open(r"computing.csv", "rb"))
for i, rows in enumerate(reader):
if i == 0:
continue
if len(rows) == 0:
continue
k = rows[3].strip()
v = rows[2].strip()
if k in mydict:
mydict[k].append(v)
else:
mydict[k] = [v]
#mydict = hash(frozenset(mydict))
print mydict
d = {}
while True:
try:
d = defaultdict(list)
for k,v in mydict.iteritems():
#d[frozenset(mydict.items())]
d[v].append(k)
except:
continue
writer = csv.writer(open(r"OLD.csv", 'wb'))
for key, value in d.items():
writer.writerow([key, value])

Your question is unclear. So I hope I got it right.
Please give an example of input columns and the desired output columns.
Please give a printout of the error and let us know which line caused the error.
if column1=[1,2,3,1,4] and column2=[a,b,c,d,e] do you want the output to be n_column1=[a,2,3,d,4] and column2 =[1,b,c,d,e]
I imagine the exception was in d[v].append(k) since clearly v is a list. you cannot use a list as a key in a dictionary.
In [1]: x = [1,2,3,1,4]
In [2]: y = ['a','b','c','d','e']
In [5]: from collections import defaultdict
In [6]: d = defaultdict(int)
In [7]: for a in x:
...: d[a] += 1
In [8]: d
Out[8]: defaultdict(<type 'int'>, {1: 2, 2: 1, 3: 1, 4: 1})
In [9]: x2 = []
In [10]: for a,b in zip(x,y):
....: x2.append(a if d[a]==1 else b)
....:
In [11]: x
Out[11]: [1, 2, 3, 1, 4]
In [12]: x2
Out[12]: ['a', 2, 3, 'd', 4]
In that case, I guess if I had to change your code to fit. I'd do something like that:
import csv
from collections import defaultdict
import itertools as it
mydict = {}
index = 0
reader = csv.reader(open(r"computing.csv", "rb"))
histogram = defaultdict(int)
k = []
v = []
for i, rows in enumerate(reader):
if i == 0:
continue
if len(rows) == 0:
continue
k.append(rows[3].strip())
v.append(rows[2].strip())
item = k[-1]
histogram[item] += 1
output_column = []
for first_item, second_item in zip(k,v):
output_column.append(first_item if histogram[first_item]==1 else second_item)
writer = csv.writer(open(r"OLD.csv", 'wb'))
for c1, c2 in zip(output_column, v):
writer.writerow([c1, c2])

Related

searching element of 1-D list in 2-D list

I have two lists, one is of form:
A = ["qww","ewq","ert","ask"]
B = [("qww",2) ,("ert",4) , ("qww",6), ("ewq" , 5),("ewq" , 10),("ewq" , 15),("ask",11)]
I have to process such that final output is
C = A = [("qww",8),("ewq",20),("ert",4),("ask",11)]
for that I written code:
# in code temp_list is of form A
# in code total_list is of form B
# in code final is of form C
def ispresent(key,list):
for qwerty in list:
if qwerty == key:
return 1
else:
return 0
def indexreturn(key,list):
counter = 0
for qwerty in list:
if qwerty != key:
counter = counter + 1
else:
return counter
def mult_indexreturn(key,list):
for i in range(len(list)):
if key == list[i][0]:
return i
final = map(lambda n1, n2: (n1,n2 ), temp_list,[ 0 for _ in range(len(temp_list))])
for object2 in total_list:#****
for object1 in temp_list:
if object2 == object1:
final[ indexreturn(object2,final) ][1] = final[ indexreturn(object2, final) ][1] + object2[mult_indexreturn(object2,total_list)][1]#total_list[ mult_indexreturn(object2,total_list) ][1]
print(final)
it should give output as C type list, but giving nothing
but C = [("qww",0),("ewq",0),("ert",0),("ask",0)]
according to me the main problem is in my looping part ( with **** comment), is there problem with logic or something else.
I gave in lot of codes, so that you can understand how my code working

You can build a dictionary using the method fromkeys() and subsequently you can use the for loop to accumulate integers:
A = ["qww","ewq","ert","ask"]
B = [("qww",2) ,("ert",4) , ("qww",6), ("ewq" , 5),("ewq" , 10),("ewq" , 15),("ask",11)]
C = dict.fromkeys(A, 0)
# {'qww': 0, 'ewq': 0, 'ert': 0, 'ask': 0}
for k, v in B:
C[k] += v
C = list(C.items())
# [('qww', 8), ('ewq', 30), ('ert', 4), ('ask', 11)]

Try this:
from collections import defaultdict
result = defaultdict(int)
for i in A:
result[i] = sum([j[1] for j in B if j[0] == i])
then tuple(result.items()) will be your out put.
Or you can do it in just one line:
result = tuple({i:sum([j[1] for j in B if j[0] == i]) for i in A}.items())

Using collection.defaultdict
Ex:
from collections import defaultdict
A = ["qww","ewq","ert","ask"]
B = [("qww",2) ,("ert",4) , ("qww",6), ("ewq" , 5),("ewq" , 10),("ewq" , 15),("ask",11)]
result = defaultdict(int)
for key, value in B:
if key in A: #Check if Key in A.
result[key] += value #Add Value.
print(result)
Output:
defaultdict(<type 'int'>, {'qww': 8, 'ert': 4, 'ewq': 30, 'ask': 11})

Replace values in Python dict

I have 2 files, The first only has 2 columns
A 2
B 5
C 6
And the second has the letters as a first column.
A cat
B dog
C house
I want to replace the letters in the second file with the numbers that correspond to them in the first file so I would get.
2 cat
5 dog
6 house
I created a dict from the first and read the second. I tried a few things but none worked. I can't seem to replace the values.
import csv
with open('filea.txt','rU') as f:
reader = csv.reader(f, delimiter="\t")
for i in reader:
print i[0] #reads only first column
a_data = (i[0])
dictList = []
with open('file2.txt', 'r') as d:
for line in d:
elements = line.rstrip().split("\t")[0:]
dictList.append(dict(zip(elements[::1], elements[0::1])))
for key, value in dictList.items():
if value == "A":
dictList[key] = "cat"

The issue appears to be on your last lines:
for key, value in dictList.items():
if value == "A":
dictList[key] = "cat"
This should be:
for key, value in dictList.items():
if key in a_data:
dictList[a_data[key]] = dictList[key]
del dictList[key]
d1 = {'A': 2, 'B': 5, 'C': 6}
d2 = {'A': 'cat', 'B': 'dog', 'C': 'house', 'D': 'car'}
for key, value in d2.items():
if key in d1:
d2[d1[key]] = d2[key]
del d2[key]
>>> d2
{2: 'cat', 5: 'dog', 6: 'house', 'D': 'car'}
Notice that this method allows for items in the second dictionary which don't have a key from the first dictionary.
Wrapped up in a conditional dictionary comprehension format:
>>> {d1[k] if k in d1 else k: d2[k] for k in d2}
{2: 'cat', 5: 'dog', 6: 'house', 'D': 'car'}
I believe this code will get you your desired result:
with open('filea.txt', 'rU') as f:
reader = csv.reader(f, delimiter="\t")
d1 = {}
for line in reader:
if line[1] != "":
d1[line[0]] = int(line[1])
with open('fileb.txt', 'rU') as f:
reader = csv.reader(f, delimiter="\t")
reader.next() # Skip header row.
d2 = {}
for line in reader:
d2[line[0]] = [float(i) for i in line[1:]]
d3 = {d1[k] if k in d1 else k: d2[k] for k in d2}

You could use dictionary comprehension:
d1 = {'A':2,'B':5,'C':6}
d2 = {'A':'cat','B':'dog','C':'house'}
In [23]: {d1[k]:d2[k] for k in d1.keys()}
Out[23]: {2: 'cat', 5: 'dog', 6: 'house'}

If the two dictionaries are called a and b, you can construct a new dictionary this way:
composed_dict = {a[k]:b[k] for k in a}
This will take all the keys in a, and read the corresponding values from a and b to construct a new dictionary.
Regarding your code:
The variable a_data has no purpose. You read the first file, pront the first column, and do nothing else with the data in it
zip(elements[::1], elements[0::1]) will just construct pairs like [1,2,3] -> [(1,1),(2,2),(3,3)], I think that's not what you want
After all you have a list of dictionaries, and at the last line you just put strings in that list. I think that is not intentional.

import re
d1 = dict()
with open('filea.txt', 'r') as fl:
for f in fl:
key, val = re.findall('\w+', f)
d1[key] = val
d2 = dict()
with open('file2.txt', 'r') as fl:
for f in fl:
key, val = re.findall('\w+', f)
d2[key] = val
with open('file3.txt', 'wb') as f:
for k, v in d1.items():
f.write("{a}\t{b}\n".format(a=v, b=d2[k]))

Python Removing duplicates ( and not keeping them) in a list

Say I have:
x=[a,b,a,b,c,d]
I want a way to get
y=[c,d]
I have managed to do it with count:
for i in x:
if x.count(i) == 1:
unique.append(i)
The problem is, this is very slow for bigger lists, help?

First use a dict to count:
d = {}
for i in x:
if i not in d:
d[i] = 0
d[i] += 1
y = [i for i, j in d.iteritems() if j == 1]

x=["a","b","a","b","c","d"]
from collections import Counter
print([k for k,v in Counter(x).items() if v == 1])
['c', 'd']
Or to guarantee the order create the Counter dict first then iterate over the x list doing lookups for the values only keeping k's that have a value of 1:
x = ["a","b","a","b","c","d"]
from collections import Counter
cn = Counter(x)
print([k for k in x if cn[k] == 1])
So one pass over x to create the dict and another pass in the comprehension giving you an overall 0(n) solution as opposed to your quadratic approach using count.
The Counter dict counts the occurrences of each element:
In [1]: x = ["a","b","a","b","c","d"]
In [2]: from collections import Counter
In [3]: cn = Counter(x)
In [4]: cn
Out[4]: Counter({'b': 2, 'a': 2, 'c': 1, 'd': 1})
In [5]: cn["a"]
Out[5]: 2
In [6]: cn["b"]
Out[6]: 2
In [7]: cn["c"]
Out[7]: 1
Doing cn[k] returns the count for each element so we only end up keeping c and d.

The best way to do this is my using the set() function like this:
x=['a','b','a','b','c','d']
print list(set(x))
As the set() function returns an unordered result. Using the sorted() function, this problem can be solved like so:
x=['a','b','a','b','c','d']
print list(sorted(set(x)))

Problems with Python Dictionarys and nested Lists

I am trying to create a dictionary that has a nested list inside of it.
The goal would be to have it be:
key : [x,y,z]
I am pulling the information from a csv file and counting the number of times a certain key shows up in each column. However I am getting the below error
> d[key][i] = 1
KeyError: 'owner'
Where owner is the title of my column.
if __name__ == '__main__':
d = {}
with open ('sample.csv','r') as f:
reader = csv.reader(f)
for i in range(0,3):
for row in reader:
key = row[0]
if key in d:
d[key][i] +=1
else:
d[key][i] = 1
for key,value in d.iteritems():
print key,value
What do I tweak in this loop to have it create a key if it doesn't exist and then add to it if it does?

The problem is, that you try to use a list ([i]) where no list is.
So you have to replace
d[key][i] = 1
with
d[key] = [0,0,0]
d[key][i] = 1
This would first create the list with three entries (so you can use [0], [1] and [2] afterward without error) and then assigns one to the correct entry in the list.

You can use defaultdict:
from collections import defaultdict
ncols = 3
d = defaultdict(lambda: [0 for i in range(ncols)])

Use a try, catch block to append a list to the new key, then increment as needed
if __name__ == '__main__':
d = {}
with open ('sample.csv','r') as f:
reader = csv.reader(f)
for i in xrange(0,3):
for row in reader:
key = row[i]
try: d[key][i] += 1
except KeyError:
d[key] = [0, 0, 0]
d[key][i] = 1
for key,value in d.iteritems():
print key,value

Using defaultdict and Counter you can come up with a dict that allows you to easily measure how many times a key appeared in a position (in this case 1st, 2nd or 3rd, by the slice)
csv = [
['a','b','c','d'],
['e','f','g', 4 ],
['a','b','c','d']
]
from collections import Counter, defaultdict
d = defaultdict(Counter)
for row in csv:
for idx, value in enumerate(row[0:3]):
d[value][idx] += 1
example usage:
print d
print d['a'][0] #number of times 'a' has been found in the 1st position
print d['b'][2] #number of times 'b' found in the 3rd position
print d['f'][1] #number of times 'f' found in 2nd position
print [d['a'][n] for n in xrange(3)] # to match the format requested in your post
defaultdict(<class 'collections.Counter'>, {'a': Counter({0: 2}), 'c': Counter({2: 2}), 'b': Counter({1: 2}), 'e': Counter({0: 1}), 'g': Counter({2: 1}), 'f': Counter({1: 1})})
2
0
1
[2, 0, 0]
Or put into a function:
def occurrences(key):
return [d[key][n] for n in xrange(3)]
print occurrences('a') # [2, 0, 0]

make dictionary from csv file columns

i am new to the concept of dictionaries in python.
I have a csv file with multiple columns and i want to create a dictionary such that keys are taken from 1st column and values from the second and a key:value pair is made for all rows of those two columns.
The code is as follows:
if __name__=="__main__":
reader = csv.reader(open("file.csv", "rb"))
for rows in reader:
k = rows[0]
v = rows[1]
mydict = {k:v}
print (mydict)
problem: The output returned is only for "last" or "bottom most" row of the first two columns i.e. {'12654':'18790'}. i want the dictionary to contain all 100 rows of the first two columns in this format. How to do that? can i run some loop on the row numbers for the first two columns to do that...i dont know how.

if __name__=="__main__":
mydict = {}
reader = csv.reader(open("file.csv", "rb"))
for rows in reader:
k = rows[0]
v = rows[1]
mydict[k] = v
print mydict
Here:
mydict = {k:v}
You were making new dictionary in every iteration, and the previous data has been lost.
Update:
You can make something like this:
mydict = {}
L = [(1, 2), (2, 4), (1, 3), (3, 2), (3, 4)]
for el in L:
k, v = el
if not k in mydict:
mydict[k] = [v]
else:
mydict[k].append(v)
print mydict
>>>
{1: [2, 3], 2: [4], 3: [2, 4]}
This way, each value of the same key will be stored
Your code will be:
if __name__=="__main__":
mydict = {}
reader = csv.reader(open("file.csv", "rb"))
for i, rows in enumerate(reader):
if i == 0: continue
k = rows[0]
v = rows[1]
if not k in mydict:
mydict[k] = [v]
else:
mydict[k].append(v)
print mydict
Update2: You mean?
for k, v in mydict.items():
print "%s: %s" % (k, v)
>>>
1: [2, 3]
2: [4]
3: [2, 4]
Update3:
This should work:
if __name__=="__main__":
mydict = {}
reader = csv.reader(open("file.csv", "rb"))
for i, rows in enumerate(reader):
if i == 0: continue
k = rows[0]
v = rows[1]
if not k in mydict:
mydict[k] = [v]
else:
mydict[k].append(v)
print mydict

You are creating a new dict and overwriting the old one each iteration. #develerx's answer fixes this problem. I just wanted to point an easier way, using dict comprehensions:
Assuming the csv file contains two columns.
if __name__=="__main__":
reader = csv.reader(open("file.csv", "rb"))
my_dict = {k: v for k, v in reader}
print mydict
If you are using older version(older than 2.7 I think), you can't use dict comprehensions, just use the dict function then:
my_dict = dict((k, v) for k, v in reader)
Edit: And I just thought that; my_dict = dict(reader) could also work.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Find duplicates of two columns from csv - python

Related

searching element of 1-D list in 2-D list

Replace values in Python dict

Python Removing duplicates ( and not keeping them) in a list

Problems with Python Dictionarys and nested Lists

make dictionary from csv file columns

Categories

Resources