I am trying to join two CSV files where the key is the value of the first column.
There is no header.
The files have different numbers of lines and columns.
The order of file a must be preserved.
file a:
john,red,34
andrew,green,18
tonny,black,50
jack,yellow,27
phill,orange,45
kurt,blue,29
mike,pink,61
file b:
tonny,driver,new york
phill,scientist,boston
desired result:
john,red,34
andrew,green,18
tonny,black,50,driver,new york
jack,yellow,27
phill,orange,45,scientist,boston
kurt,blue,29
mike,pink,61
I have examined all the related threads, and I am sure some of you will mark this question as a duplicate, but I simply have not found a solution yet.
I grabbed a dictionary-based solution, but this approach does not preserve the line order from file 'a'.
import csv
from collections import defaultdict

with open('a.csv') as f:
    r = csv.reader(f, delimiter=',')
    dict1 = {}
    for row in r:
        dict1.update({row[0]: row[1:]})

with open('b.csv') as f:
    r = csv.reader(f, delimiter=',')
    dict2 = {}
    for row in r:
        dict2.update({row[0]: row[1:]})

result = defaultdict(list)
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result[key].append(value)
I would also like to avoid putting these CSV files into a database such as SQLite, or using the pandas module.
Thanks in advance.
Something like
import csv
from collections import OrderedDict
with open('b.csv', 'rb') as f:
    r = csv.reader(f)
    dict2 = {row[0]: row[1:] for row in r}

with open('a.csv', 'rb') as f:
    r = csv.reader(f)
    dict1 = OrderedDict((row[0], row[1:]) for row in r)

result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result.setdefault(key, []).extend(value)

with open('ab_combined.csv', 'wb') as f:
    w = csv.writer(f)
    for key, value in result.iteritems():
        w.writerow([key] + value)
produces
john,red,34
andrew,green,18
tonny,black,50,driver,new york
jack,yellow,27
phill,orange,45,scientist,boston
kurt,blue,29
mike,pink,61
(Note that I didn't bother protecting against the case where dict2 has a key which isn't in dict1-- that's easily added if you like.)
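For what it's worth, a minimal sketch of one such guard, ignoring any b.csv key with no counterpart in a.csv (that choice of behaviour is my assumption, not part of the answer above):

# dict1 (ordered, from a.csv) and dict2 (from b.csv) as built above
result = OrderedDict()
for key, value in dict1.iteritems():
    result[key] = list(value)
for key, value in dict2.iteritems():
    if key in result:  # skip keys that only exist in b.csv
        result[key].extend(value)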
My output currently looks like this:
[{'date': '20140206', 'exchange': 'cme', 'total_bytes': '15400000'},
{'date': '20140206', 'exchange': 'phlx', 'total_bytes': '14100000'},
{'date': '20140206', 'exchange': 'phlx', 'total_bytes': '13800000'},
{'date': '20140207', 'exchange': 'cme', 'total_bytes': '15800000'},
{'date': '20140207', 'exchange': 'cme', 'total_bytes': '14200000'},
{'date': '20140207', 'exchange': 'phlx', 'total_bytes': '24100000'}]
But I need it to look more like this:
date,exchange,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
As of right now I have multiple lines for the same date; I would like to group them so that there are no duplicate entries, i.e. only one phlx entry for the 7th (adding both byte values together).
Here is my code:
import csv
import pprint
endresult = []
# write csv_input to a csv file
with open('csv_input.csv','w') as file:
    for line in csv_input:
        file.write(line)

# manipulate text - remove the 0001 from the host name to get just the initials - ex. cme
text = open("csv_input.csv", "r")
text = ''.join([i for i in text]).replace("0001", "")

x = open("csv_input.csv","w")
x.writelines(text)
x.close()

# read csv file created and add column names
with open('csv_input.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        endresult.append({
            'date': row['date'],
            'exchange': row['host'],
            'total_bytes': row['bytes']})
        #print(row)

with open('last.csv', 'w', newline='') as txt_file:
    fieldnames = ['date','exchange','total_bytes']
    csv_dict_writer = csv.DictWriter(txt_file, fieldnames=fieldnames)
    csv_dict_writer.writeheader()
    for result in endresult:
        csv_dict_writer.writerow(result)

pprint.pprint(endresult)
There are a number of libraries shipped with Python that can help here.
operator.itemgetter efficiently extracts values for a sort key.
itertools.groupby groups iterated lines by similar keys.
csv.DictReader and csv.DictWriter read and write CSV data by named columns.
Note input.csv below is your original CSV data.
import csv
from operator import itemgetter
from itertools import groupby

# Build a sort key by primary/secondary sort value
sorter = itemgetter('date','host')

# Read all the data
with open('input.csv','r',newline='') as fin:
    r = csv.DictReader(fin)
    data = sorted(r,key=sorter)

# build output lines grouped by the sort key
lines = []
for (date,host),group in groupby(data,sorter):
    lines.append({'date' : date,
                  'host' : host[:-4],
                  'total_bytes' : sum(int(row['bytes']) for row in group)})

# generate output
with open('output.csv','w',newline='') as fout:
    w = csv.DictWriter(fout,fieldnames='date host total_bytes'.split())
    w.writeheader()
    w.writerows(lines)
output.csv:
date,host,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
Also, if your input data is already sorted appropriately, the code can be simplified to skip reading the entire data set into memory and sorting it, and instead process the file line by line (groupby only groups consecutive rows, which is why unsorted input must be sorted first). Reading the whole file into memory could be impractical for a large amount of data.
Note the use of .writerow() which takes a single dict vs. .writerows() which takes a list of dict.
import csv
from operator import itemgetter
from itertools import groupby

sorter = itemgetter('date','host')

with open('input.csv','r',newline='') as fin, \
     open('output.csv','w',newline='') as fout:
    r = csv.DictReader(fin)
    w = csv.DictWriter(fout,fieldnames='date host total_bytes'.split())
    w.writeheader()
    for (date,host),group in groupby(r,sorter):
        w.writerow({'date' : date,
                    'host' : host[:-4],
                    'total_bytes' : sum(int(row['bytes']) for row in group)})
Read in your original csv file as 'test.csv'
Group similar 'date' & 'exchange' values using a defaultdict, with 'total_bytes' added to a list.
Split the keys and values out into a list of lists, and sum 'total_bytes'
Write the list of lists to a csv file.
from collections import defaultdict
import csv

# read the csv
with open('test.csv', 'r') as f:
    data = list(csv.DictReader(f))

# extract and group information from data
dd = defaultdict(list)
for d in data:
    date, proc, _, _, tb = d.values()  # extract the values of interest
    proc = proc.split('_')[0]
    dd[f'{date}_{proc}'].append(tb)

# add dd keys to a list, sum dd values and add to the list
csv_list = [['date', 'exchange', 'total_bytes']]
for k, v in dd.items():
    d, e = k.split('_')
    tb = sum(map(int, v))
    csv_list.append([d, e, tb])

# write the list of lists to a file
with open('new_csv.csv', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerows(csv_list)
# view of csv file
date,exchange,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
If you're not allowed to use any imports
# read in the file
with open('test.csv', 'r') as f:
    data = [row.strip().split(',') for row in f.readlines()]

# add list values of interest from data to a dict
dd = dict()
for i, d in enumerate(data):
    if i > 0:  # skip the header row
        date, proc, _, _, tb = d
        proc = proc.split('_')[0]
        key = f'{date}_{proc}'
        if not dd.get(key):  # if the key doesn't exist
            dd[key] = [tb]   # create the key-value pair
        else:
            dd[key].append(tb)

# add dd keys to a list, sum dd values and add to the list
csv_list = [['date', 'exchange', 'total_bytes']]
for k, v in dd.items():
    d, e = k.split('_')
    tb = sum(map(int, v))
    csv_list.append([d, e, tb])

# write the list of lists to a file without the csv module
with open('new_csv.csv', 'w', newline='') as f:
    for row in csv_list:
        f.write(','.join(map(str, row)) + '\n')
I'd like to group data in a .csv file. My data is like the following:
code,balance
CN,999.99
CN,1.01
LS,177.77
LS,69.42
LA,200.43
WO,100
I would like to group the items by code and sum up the balances of the like codes. Desired output would be:
code,balance
CN,1001
LS,247.19
...
I was originally using pandas for this task, but I will not be able to install that library on the server.
mydata = pd.read_csv('./tmp/temp.csv')
out = mydata.groupby('code').sum()
Solutions would preferably be compatible with Python 2.6.
I apologize if this is a duplicate, the other posts seem to be grouping differently.
I would also like to avoid doing this in a
    if code = x
        add balance to x_total
kind of way.
MY SOLUTION:
import csv
from collections import defaultdict

def groupit():
    groups = defaultdict(list)
    with open('tmp.csv') as fd:
        reader = csv.DictReader(fd)
        for row in reader:
            groups[row['code']].append(float(row['balance']))
    total = {key: sum(groups[key]) for key in groups}
    # flatten the dict repr into code,balance lines
    total = str(total)
    total = total.replace(' ', '')
    total = total.replace('{', '')
    total = total.replace('}', '')
    total = total.replace("'", '')
    total = total.replace(',', '\n')
    total = total.replace(':', ',')
    with open('out.csv', 'w+') as outfile:
        outfile.write('code,balance\n')
        outfile.write(total)
Python > 2.6:
from collections import defaultdict
import csv

groups = defaultdict(list)
with open('text.txt') as fd:
    reader = csv.DictReader(fd)
    for row in reader:
        groups[row['code']].append(float(row['balance']))

totals = {key: sum(groups[key]) for key in groups}
print(totals)
This outputs:
{'CN': 1001.0, 'LS': 247.19, 'LA': 200.43, 'WO': 100.0}
Python = 2.6:
from collections import defaultdict
import csv

groups = defaultdict(list)
with open('text.txt') as fd:
    reader = csv.DictReader(fd)
    for row in reader:
        groups[row['code']].append(float(row['balance']))

totals = dict((key, sum(groups[key])) for key in groups)
print(totals)
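The asker ultimately wants a CSV file rather than a printed dict. A minimal sketch of writing totals out (my addition, not part of the answer above; the filename and sorting are assumptions), kept Python 2.6-compatible:

# Assumes `totals` from the snippet above, e.g. {'CN': 1001.0, ...}
with open('out.csv', 'w') as outfile:
    outfile.write('code,balance\n')
    for code in sorted(totals):
        outfile.write('%s,%s\n' % (code, totals[code]))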
Here is how I would go about it:
with open("data.csv", 'r') as f:
data = f.readlines()
result = {}
for val in range(1, len(data)-1):
x = data[val].split(",")
if x[0] not in result:
result[x[0]] = float(x[1].replace('\n', ""))
else:
result[x[0]] = result[x[0]] + float(x[1].replace('\n', ""))
The result dictionary will have the values of interest, which can then be saved as CSV.
import csv

with open('mycsvfile.csv', 'wb') as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, result.keys())
    w.writeheader()
    w.writerow(result)
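Note that DictWriter here uses the codes themselves as column headers and writes a single wide row of totals. If you want the two-column code,balance layout from the question, a small sketch of my own (Python 3; the header names are taken from the question):

import csv

# one code,balance row per key instead of one wide row
with open('mycsvfile.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['code', 'balance'])
    for code, balance in sorted(result.items()):
        w.writerow([code, balance])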
Hope this helps :)
I have a Python dictionary of dictionaries with data stored that I need to write to a CSV file.
The problem I'm having is that some of the dictionaries from the file I have read don't contain any information for that particular ID, so my CSV file columns are not lined up properly.
example
d["first1"]["title"] = founder
d["first1"]["started"] = 2005
d["second1"]["title"] = CEO
d["second1"]["favcolour"] = blue
and so when I use the following code:

for key, value in d.iteritems():
    ln = [key]
    for ikey, ivalue in value.iteritems():
        ln.append(ikey)
        ln.extend([v for v in ivalue])
    writer.writerow(ln)

my CSV file will have all the information, but "started" and "favcolour" end up in the same column; I want each column to contain only one field.
Thanks all in advance
Here's a suggestion:
d = {"first1": {"title": 'founder', "started": 2005}, "second1": {"title": 'CEO', "favcolour": 'blue'}}
columns = []
output = []
for key, value in d.iteritems():
for ikey, ivalue in value.iteritems():
if ikey not in columns:
columns.append(ikey)
ln = []
for col in columns:
if col not in value:
ln.append('')
else:
ln.append(value[col])
output.append(ln)
with open('file', 'w') as fl:
csv_writer = csv.writer(fl)
csv_writer.writerow(columns)
for ln in output:
print ln
csv_writer.writerow(ln)
file:
started,title,favcolour
2005,founder
,CEO,blue
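One caveat with the single pass above: rows built before a column is first discovered stay short, which is why the 2005,founder line has no third cell. A two-pass sketch of my own that pads earlier rows with empty strings, under the same sample data:

import csv

d = {"first1": {"title": 'founder', "started": 2005},
     "second1": {"title": 'CEO', "favcolour": 'blue'}}

# first pass: collect every column name across all sub-dicts
columns = []
for value in d.values():
    for ikey in value:
        if ikey not in columns:
            columns.append(ikey)

# second pass: build fixed-width rows, '' for missing columns
with open('file', 'w') as fl:
    csv_writer = csv.writer(fl)
    csv_writer.writerow(columns)
    for value in d.values():
        csv_writer.writerow([value.get(col, '') for col in columns])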
If it doesn't need to be human-readable, you can alternatively use pickle:
import pickle

# Write:
with open('filename.pickle', 'wb') as handle:
    pickle.dump(d, handle)

# Read:
with open('filename.pickle', 'rb') as handle:
    d = pickle.load(handle)
You can use the DictWriter class in csv to easily append what would be a sparse dictionary into a CSV. The only caveat is you need to know all the possible fields at the beginning.
import csv

data = { "first": {}, "second": {} }
data["first"]["title"] = "founder"
data["first"]["started"] = 2005
data["second"]["title"] = "CEO"
data["second"]["favcolour"] = "blue"

fieldNames = set()
for d in data:
    for key in data[d].keys():
        # Add all possible keys to fieldNames; because fieldNames is
        # a set, you can't have duplicate values
        fieldNames.add(key)

with open('csvFile.csv', 'w') as csvfile:
    # Initialize DictWriter with the list of fieldNames.
    # You can sort fieldNames to whatever order you wish the CSV
    # headers to be in.
    writer = csv.DictWriter(csvfile, fieldnames=list(fieldNames))
    # Add Header to the CSV file
    writer.writeheader()
    # Iterate through all sub-dictionaries
    for d in data:
        # Add the sub-dictionary to the csv file
        writer.writerow(data[d])
Pandas works really well for things like this, so if it's an option, I would recommend it.
import pandas as pd

# not necessary, but for me it's usually easier to work with a list of dicts than dicts
my_list = [my_dict[key] for key in my_dict]

# When you pass a list of dictionaries to the pandas DataFrame class, it will take care of
# alignment issues for you, but if you're wanting to do something specific
# with None values, you will need to further manipulate the frame
df = pd.DataFrame(my_list)
df.to_csv('file_path_to_save_to')  # pass index=False to omit the row index column
I have a csv file "input.csv" which has the following data:
UID,BID,R
U1,B1,4
U1,B2,3
U2,B1,2
I want the above to look like the following dictionary, grouped by UID as the key with BID and R as a nested dictionary value:
{"U1":{"B1":4, "B2": 3}, "U2":{"B1":2}}
I have the below code:

import csv
from collections import defaultdict

new_data_dict = defaultdict(str)
with open("input.csv", 'r') as data_file:
    data = csv.DictReader(data_file, delimiter=",")
    headers = next(data)
    for row in data:
        new_data_dict[row["UID"]] += {row["BID"]:int(row["R"])}
The above throws an obvious error of
TypeError: cannot concatenate 'str' and 'dict' objects
Is there a way to do this?
Using the regular dict() you can use get() to initialize a new sub-dict and fill it afterwards.
import csv

new_data_dict = {}
with open("data.csv", 'r') as data_file:
    data = csv.DictReader(data_file, delimiter=",")
    for row in data:
        item = new_data_dict.get(row["UID"], dict())
        item[row["BID"]] = int(row["R"])
        new_data_dict[row["UID"]] = item

print new_data_dict
Also, your call to next(data) was superfluous: DictReader detects and strips the header row automatically, so that call actually discards the first data row.
This is a more efficient version that actually uses defaultdict:

from collections import defaultdict

new_data_dict = defaultdict(dict)
with open("input.csv", 'r') as data_file:
    data_file.next()  # skip the header line
    for row in data_file:
        row = row.strip().split(",")
        new_data_dict[row[0]][row[1]] = int(row[2])
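With the sample input.csv above, this yields the desired structure (a quick check of my own; key ordering may vary on older Pythons):

print dict(new_data_dict)
# {'U1': {'B1': 4, 'B2': 3}, 'U2': {'B1': 2}}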
I am attempting to merge two CSV files based on a specific field in each file.
file1.csv
id,attr1,attr2,attr3
1,True,7,"Purple"
2,False,19.8,"Cucumber"
3,False,-0.5,"A string with a comma, because it has one"
4,True,2,"Nope"
5,True,4.0,"Tuesday"
6,False,1,"Failure"
file2.csv
id,attr4,attr5,attr6
2,"python",500000.12,False
5,"program",3,True
3,"Another string",-5,False
This is the code I am using:
import csv
from collections import OrderedDict

with open('file2.csv','r') as f2:
    reader = csv.reader(f2)
    fields2 = next(reader,None)  # Skip headers
    dict2 = {row[0]: row[1:] for row in reader}

with open('file1.csv','r') as f1:
    reader = csv.reader(f1)
    fields1 = next(reader,None)  # Skip headers
    dict1 = OrderedDict((row[0], row[1:]) for row in reader)

result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result.setdefault(key, []).extend(value)

with open('merged.csv', 'wb') as f:
    w = csv.writer(f)
    for key, value in result.iteritems():
        w.writerow([key] + value)
I get output like this, which merges appropriately, but does not have the same number of attributes for all rows:
1,True,7,Purple
2,False,19.8,Cucumber,python,500000.12,False
3,False,-0.5,"A string with a comma, because it has one",Another string,-5,False
4,True,2,Nope
5,True,4.0,Tuesday,program,3,True
6,False,1,Failure
file2 will not have a record for every id in file1. I'd like the output to have empty fields from file2 in the merged file. For example, id 1 would look like this:
1,True,7,Purple,,,
How can I add the empty fields to records that don't have data in file2 so that all of my records in the merged CSV have the same number of attributes?
If we're not using pandas, I'd refactor to something like
import csv
from collections import OrderedDict

filenames = "file1.csv", "file2.csv"
data = OrderedDict()
fieldnames = []
for filename in filenames:
    with open(filename, "rb") as fp:  # python 2
        reader = csv.DictReader(fp)
        fieldnames.extend(reader.fieldnames)
        for row in reader:
            data.setdefault(row["id"], {}).update(row)

fieldnames = list(OrderedDict.fromkeys(fieldnames))
with open("merged.csv", "wb") as fp:
    writer = csv.writer(fp)
    writer.writerow(fieldnames)
    for row in data.itervalues():
        writer.writerow([row.get(field, '') for field in fieldnames])
which gives
id,attr1,attr2,attr3,attr4,attr5,attr6
1,True,7,Purple,,,
2,False,19.8,Cucumber,python,500000.12,False
3,False,-0.5,"A string with a comma, because it has one",Another string,-5,False
4,True,2,Nope,,,
5,True,4.0,Tuesday,program,3,True
6,False,1,Failure,,,
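(A Python 3 adaptation of the above, for reference; the only changes are text-mode opens with newline='' and .values() in place of .itervalues():)

import csv
from collections import OrderedDict

filenames = "file1.csv", "file2.csv"
data = OrderedDict()
fieldnames = []
for filename in filenames:
    with open(filename, newline="") as fp:  # python 3
        reader = csv.DictReader(fp)
        fieldnames.extend(reader.fieldnames)
        for row in reader:
            data.setdefault(row["id"], {}).update(row)

fieldnames = list(OrderedDict.fromkeys(fieldnames))
with open("merged.csv", "w", newline="") as fp:
    writer = csv.writer(fp)
    writer.writerow(fieldnames)
    for row in data.values():
        writer.writerow([row.get(field, '') for field in fieldnames])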
For comparison, the pandas equivalent would be something like
import pandas as pd

df1 = pd.read_csv("file1.csv")
df2 = pd.read_csv("file2.csv")
merged = df1.merge(df2, on="id", how="outer").fillna("")
merged.to_csv("merged.csv", index=False)
which is much simpler to my eyes, and means you can spend more time dealing with your data and less time reinventing wheels.
You can use pandas to do this:

import pandas

csv1 = pandas.read_csv('file1.csv')
csv2 = pandas.read_csv('file2.csv')
merged = csv1.merge(csv2, on='id', how='left').fillna('')
merged.to_csv("output.csv", index=False)

I haven't tested this yet, but it should put you on the right track until I can try it out. The code is quite self-explanatory: first you import the pandas library so that you can use it. Then using pandas.read_csv you read the 2 csv files and use the merge method to merge them. The on parameter specifies which column should be used as the "key", and how='left' keeps every row from file1 even when file2 has no match (the default inner join would drop those rows); fillna('') turns the resulting gaps into the empty fields you want. Finally, the merged csv is written to output.csv.
Use dict of dict then update it. Like this:
import csv
from collections import OrderedDict

with open('file2.csv','r') as f2:
    reader = csv.reader(f2)
    lines2 = list(reader)

with open('file1.csv','r') as f1:
    reader = csv.reader(f1)
    lines1 = list(reader)

dict1 = {row[0]: dict(zip(lines1[0][1:], row[1:])) for row in lines1[1:]}
dict2 = {row[0]: dict(zip(lines2[0][1:], row[1:])) for row in lines2[1:]}

#merge
updatedDict = OrderedDict()
mergedAttrs = OrderedDict.fromkeys(lines1[0][1:] + lines2[0][1:], "?")
for id, attrs in dict1.iteritems():
    d = mergedAttrs.copy()
    d.update(attrs)
    updatedDict[id] = d
for id, attrs in dict2.iteritems():
    updatedDict[id].update(attrs)

#out
with open('merged.csv', 'wb') as f:
    w = csv.writer(f)
    for id, rest in sorted(updatedDict.iteritems()):
        w.writerow([id] + rest.values())