Joining CSV fields - python

I need to convert
Name | Org
a | 5
a | 6
b | 5
c | 7
into
Name | Org
a | 5,6
b | 5
c | 7
my first attempt was with this code
while i < len(nameColumn):
if nameColumn[i] not in resultC1:
resultC1.append(nameColumn[i])
while l < len(nameColumn):
if nameColumn[l] == nameColumn[i]:
tempdata += organizationColumn[l] + ','
l += 1
resultC2.append(tempdata[:-1])
tempdata = ''
k += 1
i += 1
which just ends up with the result
Name | Org
a |
b |
c |
Any help would be greatly appreciated. I haven't had any luck finding anything on this yet. I'm reading the data from a .csv file into lists, working with that data, and storing the results in resultC1 and resultC2.

Here's a solution using collections.OrderedDict:
import csv
from collections import OrderedDict

# Maps each Name to the list of Orgs seen for it, in first-seen order.
data = OrderedDict()

# newline='' leaves line-ending handling to the csv module, as its docs require.
with open('test.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; there is nothing to group.
            continue
        data.setdefault(line[0], []).append(line[1])

for k, v in data.items():
    print(k, '|', ', '.join(v))
An OrderedDict remembers the order in which its keys were first inserted. The keys are the Names, and each value is the list of all Orgs associated with that Name.
Output:
a | 5, 6
b | 5
c | 7
If your csv has different delimiters from comma, then you'll have to specify that delimiter. I've assumed commas in my example.
Here's a much simpler solution with pandas:
In [443]: df.head()
Out[443]:
Name Org
0 a 5
1 a 6
2 b 5
3 c 7
In [445]: for k, v in df.groupby('Name').apply(lambda x: list(x['Org'])).iteritems():
...: print(k, '|', ', '.join(map(str, v)))
...:
a | 5, 6
b | 5
c | 7

Use OrderedDict calling setdefault with an empty list, and the csv module:
import csv
from collections import OrderedDict

# Maps each name to the list of organizations read for it, in first-seen order.
organizations = OrderedDict()

# newline='' leaves line-ending handling to the csv module, as its docs require.
with open(filename, newline='') as infile:
    for name, org in csv.reader(infile, delimiter='|'):
        # setdefault creates the empty list the first time a name is seen.
        organizations.setdefault(name, []).append(org)
Then you can write the dictionary:
# newline='' stops the text layer from translating csv.writer's own "\r\n"
# row terminator a second time (which yields "\r\r\n" on Windows).
with open(filename, 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='|')
    for name, orgs in organizations.items():
        # One row per name: the name, then all of its orgs comma-joined.
        writer.writerow([name, ','.join(orgs)])

Assuming you're starting with the two arrays implied in your sample code, I'd go with something like this:
from collections import defaultdict

nameColumn = ['a', 'a', 'b', 'c']
organizationColumn = ["5", "6", "5", "7"]

# Group every organization under its name; defaultdict(list) supplies the
# empty list the first time a name is seen.
merged = defaultdict(list)
for name, org in zip(nameColumn, organizationColumn):
    merged[name].append(org)

# Join the grouped values so each line reads "a | 5,6" rather than showing
# the list repr.
for k, v in merged.items():
    print(f"{k} | {','.join(v)}")

The solution using itertools.groupby() function:
import csv
import itertools

# newline='' leaves line-ending handling to the csv module, as its docs require.
with open('yourfile.csv', 'r', newline='') as f:
    reader = csv.reader(f, delimiter='|', skipinitialspace=True)
    head = next(reader)  # header line
    # groupby() only merges adjacent rows, so the rows are sorted first.
    items = [list(g) for k, g in itertools.groupby(sorted(reader), key=lambda x: x[0])]

fmt = '{0[0]:<5} | {0[1]:^5}'  # format spec
print(fmt.format(head))
for item in items:
    # join() handles a one-row group exactly like a multi-row group, so no
    # special case for single rows is needed.
    print(fmt.format([item[0][0], ','.join(i[1] for i in item)]))
The output:
Name | Org
a | 5,6
b | 5
c | 7

Here is another solution that can be generic to have delimiter for input and output file.
def parseData(fileName, delimiter):
    """Group the second field of every line by the line's first field.

    Args:
        fileName: Path of the delimited text file to read.
        delimiter: String that separates the fields on each line.

    Returns:
        A dict mapping each first-field value to the list of second-field
        values found for it, in file order.
    """
    dictionary = {}
    with open(fileName, 'r') as iFile:
        for line in iFile:
            # A blank line has no second field; skip it instead of raising
            # IndexError on row[1].
            if not line.strip():
                continue
            row = line.split(delimiter)
            dictionary.setdefault(row[0], []).append(row[1].replace('\n', ''))
    ## print for debugging purpose
    print(dictionary)
    return dictionary
def writeData(fileName, odelimiter, idelimiter, dictionary):
    """Write each key and its joined values as one line of a text file.

    Args:
        fileName: Path of the output file; it is opened in 'w' mode.
        odelimiter: String written between the key and its values.
        idelimiter: String written between the individual values.
        dictionary: Mapping of key -> list of string values.
    """
    with open(fileName, 'w') as oFile:
        for key, values in dictionary.items():
            # join() puts the separator only between values, so it stays
            # correct for a separator of any length (slicing off one trailing
            # character was only right for single-character separators).
            data = idelimiter.join(values)
            ## print for debugging purpose
            print(key, data)
            oFile.write(key + odelimiter + data + "\n")
## main
# Read the pipe-delimited input and group second-field values by first field,
# then write one line per key with its values joined by commas.
dictionary=parseData('inputPipe.txt', "|")
writeData('output.txt', "|", ",", dictionary)
inputPipe.txt
a|5
a|6
b|5
c|7
output.txt
a|5,6
b|5
c|7
Sample Run
{'a': ['5', '6'], 'b': ['5'], 'c': ['7']}
a 5,6
b 5
c 7

Related

Replace values in Python dict

I have 2 files, The first only has 2 columns
A 2
B 5
C 6
And the second has the letters as a first column.
A cat
B dog
C house
I want to replace the letters in the second file with the numbers that correspond to them in the first file so I would get.
2 cat
5 dog
6 house
I created a dict from the first and read the second. I tried a few things but none worked. I can't seem to replace the values.
import csv
with open('filea.txt','rU') as f:
reader = csv.reader(f, delimiter="\t")
for i in reader:
print i[0] #reads only first column
a_data = (i[0])
dictList = []
with open('file2.txt', 'r') as d:
for line in d:
elements = line.rstrip().split("\t")[0:]
dictList.append(dict(zip(elements[::1], elements[0::1])))
for key, value in dictList.items():
if value == "A":
dictList[key] = "cat"
The issue appears to be on your last lines:
for key, value in dictList.items():
if value == "A":
dictList[key] = "cat"
This should be:
for key, value in dictList.items():
if key in a_data:
dictList[a_data[key]] = dictList[key]
del dictList[key]
d1 = {'A': 2, 'B': 5, 'C': 6}
d2 = {'A': 'cat', 'B': 'dog', 'C': 'house', 'D': 'car'}

# Iterate over a snapshot of the items: inserting and deleting keys while
# iterating the live dict raises RuntimeError on Python 3.
for key, value in list(d2.items()):
    if key in d1:
        # Re-key the entry under the number that d1 maps this letter to.
        d2[d1[key]] = value
        del d2[key]
>>> d2
{2: 'cat', 5: 'dog', 6: 'house', 'D': 'car'}
Notice that this method allows for items in the second dictionary which don't have a key from the first dictionary.
Wrapped up in a conditional dictionary comprehension format:
>>> {d1[k] if k in d1 else k: d2[k] for k in d2}
{2: 'cat', 5: 'dog', 6: 'house', 'D': 'car'}
I believe this code will get you your desired result:
import csv

# First file: letter -> number. Rows whose second column is empty are ignored.
# The 'rU' mode was removed in Python 3.11; newline='' is what the csv module
# expects instead.
with open('filea.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter="\t")
    d1 = {}
    for line in reader:
        if line[1] != "":
            d1[line[0]] = int(line[1])

# Second file: letter -> list of floats taken from the remaining columns.
with open('fileb.txt', 'r', newline='') as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader)  # Skip header row.
    d2 = {}
    for line in reader:
        d2[line[0]] = [float(i) for i in line[1:]]

# Re-key d2 by the number from d1; keys missing from d1 are kept unchanged.
d3 = {d1.get(k, k): v for k, v in d2.items()}
You could use dictionary comprehension:
d1 = {'A':2,'B':5,'C':6}
d2 = {'A':'cat','B':'dog','C':'house'}
In [23]: {d1[k]:d2[k] for k in d1.keys()}
Out[23]: {2: 'cat', 5: 'dog', 6: 'house'}
If the two dictionaries are called a and b, you can construct a new dictionary this way:
composed_dict = {a[k]:b[k] for k in a}
This will take all the keys in a, and read the corresponding values from a and b to construct a new dictionary.
Regarding your code:
The variable a_data has no purpose. You read the first file, print the first column, and do nothing else with the data in it.
zip(elements[::1], elements[0::1]) will just construct pairs like [1,2,3] -> [(1,1),(2,2),(3,3)], I think that's not what you want
After all you have a list of dictionaries, and at the last line you just put strings in that list. I think that is not intentional.
import re

# Compiled once; the raw string keeps \w a regex class rather than an invalid
# string escape.
_WORD = re.compile(r'\w+')

# First file: letter -> number (both kept as strings).
d1 = dict()
with open('filea.txt', 'r') as fl:
    for line in fl:
        key, val = _WORD.findall(line)
        d1[key] = val

# Second file: letter -> word.
d2 = dict()
with open('file2.txt', 'r') as fl:
    for line in fl:
        key, val = _WORD.findall(line)
        d2[key] = val

# Text mode: the rows written below are str, which a binary ('wb') handle
# rejects on Python 3.
with open('file3.txt', 'w') as f:
    for k, v in d1.items():
        f.write("{a}\t{b}\n".format(a=v, b=d2[k]))

Python Dictionary to a specific format in .csv

I have a dictionary in python in this way
my_dict = {'1':['a','b','c'], '2':['d','e','f']}
and I want to write a CSV file in which it is displayed as follows:
1, a b c
2, d e f
because it is parsed by another application in this specific format.
Is there any way to do it?
This is a way to do it:
my_dict = {'1': ['a', 'b', 'c'], '2': ['d', 'e', 'f']}

# One "key, v1 v2 v3" line per entry, newline-separated, with no trailing
# newline after the last line.
with open('data.csv', 'w') as f:
    f.write(
        '\n'.join(
            "%s, %s" % (key, ' '.join(letters))
            for key, letters in my_dict.items()
        )
    )
import csv
from operator import itemgetter

my_dict = {'1': ['a', 'b', 'c'], '2': ['d', 'e', 'f']}

with open('data.csv', 'w') as f:
    a = csv.writer(f, delimiter=',', lineterminator='\n')
    # Rows are emitted in key order; the leading space on the second field
    # produces the "1, a b c" layout.
    a.writerows(
        [key, ' ' + ' '.join(letters)]
        for key, letters in sorted(my_dict.items(), key=itemgetter(0))
    )
data.csv
1, a b c
2, d e f

File Columns to Dictionary

Hello I have a file comprising 4 columns,
a 1 45 test
b 2 42 test
c 3 64 test
I wish to read this file to a dictionary such that column 3 is the key and column 1 is the value, i.e.,
d = {45:'a', 42:'b', 64:'c'}
Keep it simple:
>>>
>>> d = dict()
>>> with open('test.txt') as f:
for line in f:
val, foo, key, bar = line.split()
d[key] = val
>>> d
{'64': 'c', '45': 'a', '42': 'b'}
>>>
Use the csv module to parse the file. Change the delimiter parameter to whatever is the delimiter in your input file. I have assumed it to be tabs.
import csv

# Maps column 3 (key) to column 1 (value); both are kept as the strings read
# from the file.
d = {}

# newline='' leaves line-ending handling to the csv module. The with-block
# closes the file on exit, so no explicit close() is needed afterwards.
with open('your-input-file', 'r', newline='') as input_file:
    csv_reader = csv.reader(input_file, delimiter='\t')
    for row in csv_reader:
        d[row[2]] = row[0]

Find duplicates of two columns from csv

I want to find duplicate values in one column of a multi-column CSV and replace them with the value from another column. First I load the two columns into a dictionary. Then I want to find the duplicate values in that dictionary, whose keys and values are strings. I tried several "remove duplicates from a dictionary" solutions, but they either raised a "not hashable" error or produced no result. Here is the first part of the code.
import csv
from collections import defaultdict
import itertools as it
mydict = {}
index = 0
reader = csv.reader(open(r"computing.csv", "rb"))
for i, rows in enumerate(reader):
if i == 0:
continue
if len(rows) == 0:
continue
k = rows[3].strip()
v = rows[2].strip()
if k in mydict:
mydict[k].append(v)
else:
mydict[k] = [v]
#mydict = hash(frozenset(mydict))
print mydict
d = {}
while True:
try:
d = defaultdict(list)
for k,v in mydict.iteritems():
#d[frozenset(mydict.items())]
d[v].append(k)
except:
continue
writer = csv.writer(open(r"OLD.csv", 'wb'))
for key, value in d.items():
writer.writerow([key, value])
Your question is unclear. So I hope I got it right.
Please give an example of input columns and the desired output columns.
Please give a printout of the error and let us know which line caused the error.
if column1=[1,2,3,1,4] and column2=[a,b,c,d,e] do you want the output to be n_column1=[a,2,3,d,4] and column2 =[1,b,c,d,e]
I imagine the exception was raised in d[v].append(k), since v is clearly a list and you cannot use a list as a key in a dictionary.
In [1]: x = [1,2,3,1,4]
In [2]: y = ['a','b','c','d','e']
In [5]: from collections import defaultdict
In [6]: d = defaultdict(int)
In [7]: for a in x:
...: d[a] += 1
In [8]: d
Out[8]: defaultdict(<type 'int'>, {1: 2, 2: 1, 3: 1, 4: 1})
In [9]: x2 = []
In [10]: for a,b in zip(x,y):
....: x2.append(a if d[a]==1 else b)
....:
In [11]: x
Out[11]: [1, 2, 3, 1, 4]
In [12]: x2
Out[12]: ['a', 2, 3, 'd', 4]
In that case, if I had to change your code to fit, I'd do something like this:
import csv
from collections import defaultdict
import itertools as it

# How many times each value of column 4 occurs.
histogram = defaultdict(int)
k = []  # column 4 values, in file order
v = []  # column 3 values, in file order

# Text mode with newline='' is what the csv module needs on Python 3; the
# with-block also guarantees the input file is closed.
with open(r"computing.csv", newline='') as in_file:
    reader = csv.reader(in_file)
    next(reader, None)  # skip the header row; tolerate an empty file
    for rows in reader:
        if len(rows) == 0:
            continue
        k.append(rows[3].strip())
        v.append(rows[2].strip())
        histogram[k[-1]] += 1

# Keep a column-4 value when it is unique; otherwise substitute the column-3
# value from the same row.
output_column = [
    first_item if histogram[first_item] == 1 else second_item
    for first_item, second_item in zip(k, v)
]

with open(r"OLD.csv", 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    for c1, c2 in zip(output_column, v):
        writer.writerow([c1, c2])

Comparing values between 2 CSV files and writing to a 3rd CSV file

I am trying to compare values of a particular column between 2 csv. I tried the following code for the same. However, I am not getting any output and no error too. Please help me with this
with open("File1.csv", "rb") as in_file1, open("File2.csv", "rb") as in_file2,open("File3.csv", "wb") as out_file:
reader1 = csv.reader(in_file1)
reader2 = csv.reader(in_file2)
writer = csv.writer(out_file)
for row2 in reader2:
for row1 in reader1:
if row2[0] == row1[0]:
row2[1] = row1[1]
writer.writerow(row2)
Here is how the data looks like:
File 1
A 100
B 200
C 300
D 400
E 500
File 2
A
C
E
E
E
D
File 3 (Should be)
A 100
C 300
E 500
E 500
E 500
D 400
File1.csv is a mapping. Read it first and store it in a dictionary. Then iterate over File2.csv and write it to File3.csv together with the value retrieved from the mapping dictionary.
The following code works for your example:
import csv

# File1.csv is the mapping: every row is "<key> <value>", so the reader's rows
# can be fed straight into dict(). Text mode with newline='' is what the csv
# module needs on Python 3 (binary modes make the reader fail there).
with open("File1.csv", newline='') as in_file1:
    d = dict(csv.reader(in_file1, delimiter=' '))

with open("File2.csv", newline='') as in_file2, \
        open("File3.csv", "w", newline='') as out_file:
    writer = csv.writer(out_file, delimiter=' ')
    for rec in csv.reader(in_file2, delimiter=' '):
        if not rec:
            # csv.reader yields [] for blank lines; there is no key to look up.
            continue
        # A key missing from File1.csv still raises KeyError, so bad input
        # is reported rather than silently dropped.
        writer.writerow((rec[0], d[rec[0]]))
Just for an illustration, d looks like this:
{'A': '100', 'B': '200', 'C': '300', 'D': '400', 'E': '500'}
The values are strings (not integers), but this is not a problem, since we are just printing them into a file.
Why not simply use it this way:
lookup = {}

# Build the key -> value mapping from the non-empty lines of file1; each line
# is split on whitespace into a (key, value) pair.
with open('file1', 'r') as f:
    lookup = dict(entry.split() for entry in f.read().split('\n') if entry)

# Copy each stripped line of file2 to the output, followed by its mapped value.
with open('file2', 'r') as file2, open('out', 'w') as out:
    for raw in file2:
        line = raw.strip()
        out.write("%s %s\n" % (line, lookup[line]))
I don't see a point using csv here

Categories

Resources