Turning Dictionary into CSV format - python

My output currently looks like this:
[{'date': '20140206', 'exchange': 'cme', 'total_bytes': '15400000'},
{'date': '20140206', 'exchange': 'phlx', 'total_bytes': '14100000'},
{'date': '20140206', 'exchange': 'phlx', 'total_bytes': '13800000'},
{'date': '20140207', 'exchange': 'cme', 'total_bytes': '15800000'},
{'date': '20140207', 'exchange': 'cme', 'total_bytes': '14200000'},
{'date': '20140207', 'exchange': 'phlx', 'total_bytes': '24100000'}]
But I need it to look more like this:
date,exchange,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
As of right now I have multiple lines for the same date. I would like to group them so that there are no duplicate entries, i.e. only one phlx entry for the 7th (adding both byte values).
Here is my code:
import csv
import pprint

endresult = []

# write csv_input to a csv file
with open('csv_input.csv', 'w') as file:
    for line in csv_input:
        file.write(line)

# manipulate text - remove the 0001 from the host name to get just the initials - ex. cme
text = open("csv_input.csv", "r")
text = ''.join([i for i in text]).replace("0001", "")
x = open("csv_input.csv", "w")
x.writelines(text)
x.close()

# read csv file created and add column names
with open('csv_input.csv', 'r') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        endresult.append({
            'date': row['date'],
            'exchange': row['host'],
            'total_bytes': row['bytes']})
        #print(row)

with open('last.csv', 'w', newline='') as txt_file:
    fieldnames = ['date', 'exchange', 'total_bytes']
    csv_dict_writer = csv.DictWriter(txt_file, fieldnames=fieldnames)
    csv_dict_writer.writeheader()
    for result in endresult:
        csv_dict_writer.writerow(result)

pprint.pprint(endresult)

There are a number of libraries shipped with Python that can help here.
operator.itemgetter efficiently extracts values for a sort key.
itertools.groupby groups iterated rows by a common key.
csv.DictReader and csv.DictWriter read and write CSV data by named columns.
Note: input.csv below is your original CSV data.
import csv
from operator import itemgetter
from itertools import groupby

# Build a sort key by primary/secondary sort value
sorter = itemgetter('date', 'host')

# Read all the data
with open('input.csv', 'r', newline='') as fin:
    r = csv.DictReader(fin)
    data = sorted(r, key=sorter)

# build output lines grouped by the sort key
lines = []
for (date, host), group in groupby(data, sorter):
    lines.append({'date': date,
                  'host': host[:-4],
                  'total_bytes': sum(int(row['bytes']) for row in group)})

# generate output
with open('output.csv', 'w', newline='') as fout:
    w = csv.DictWriter(fout, fieldnames='date host total_bytes'.split())
    w.writeheader()
    w.writerows(lines)
output.csv:
date,host,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
Also, if your input data is already sorted appropriately, the code can be simplified to process the file line by line, skipping the step of reading the entire data set into memory and sorting it. Reading the whole file into memory could be impractical for a large amount of data.
Note the use of .writerow(), which takes a single dict, vs. .writerows(), which takes a list of dicts.
import csv
from operator import itemgetter
from itertools import groupby

sorter = itemgetter('date', 'host')

with open('input.csv', 'r', newline='') as fin, \
     open('output.csv', 'w', newline='') as fout:
    r = csv.DictReader(fin)
    w = csv.DictWriter(fout, fieldnames='date host total_bytes'.split())
    w.writeheader()
    for (date, host), group in groupby(r, sorter):
        w.writerow({'date': date,
                    'host': host[:-4],
                    'total_bytes': sum(int(row['bytes']) for row in group)})

Read in your original csv file as 'test.csv'.
Group similar 'date' and 'exchange' values using a defaultdict, appending each 'total_bytes' value to a list.
Split the keys and values out into a list of lists, summing 'total_bytes'.
Write the list of lists to a csv file.
from collections import defaultdict
import csv

# read the csv
with open('test.csv', 'r') as f:
    data = list(csv.DictReader(f))

# extract and group information from data
dd = defaultdict(list)
for d in data:
    date, proc, _, _, tb = d.values()  # extract the values of interest
    proc = proc.split('_')[0]
    dd[f'{date}_{proc}'].append(tb)

# add dd keys to a list, sum dd values and add to the list
csv_list = [['date', 'exchange', 'total_bytes']]
for k, v in dd.items():
    d, e = k.split('_')
    tb = sum(map(int, v))
    csv_list.append([d, e, tb])

# write the list of lists to a file
with open('new_csv.csv', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerows(csv_list)
new_csv.csv:
date,exchange,total_bytes
20140206,cme,15400000
20140206,phlx,27900000
20140207,cme,30000000
20140207,phlx,24100000
If you're not allowed to use any imports:
# read in the file
with open('test.csv', 'r') as f:
    data = [row.strip().split(',') for row in f]

# add list values of interest from data to a dict
dd = dict()
for i, d in enumerate(data):
    if i > 0:  # skip the header row
        date, proc, _, _, tb = d
        proc = proc.split('_')[0]
        key = f'{date}_{proc}'
        if not dd.get(key):  # if the key doesn't exist
            dd[key] = [tb]   # create the key-value pair
        else:
            dd[key].append(tb)

# add dd keys to a list, sum dd values and add to the list
csv_list = [['date', 'exchange', 'total_bytes']]
for k, v in dd.items():
    d, e = k.split('_')
    tb = sum(map(int, v))
    csv_list.append([d, e, tb])

# write the list of lists to a file, joining fields by hand
# since the csv module is off-limits here
with open('new_csv.csv', 'w') as f:
    for row in csv_list:
        f.write(','.join(map(str, row)) + '\n')

Related

Extract various information

Overview
I would like to extract various information, like name, date and address, from a 2-column csv file before writing to another csv file.
Conditions
Extract Name from the first row, as it will always be the first row.
Extract Date by regex (is there regex in Python?) in ##/##/#### format.
Extract Address by the constant keyword 'road'.
Example dummy CSV source data, viewed from Excel:
ID,DATA
88888,DADDY
88888,2/06/2016
88888,new issac road
99999,MUMMY
99999,samsung road
99999,12/02/2016
Desired CSV outcome
ID,Name,Address,DATE
88888,DADDY,new issac road,2/06/2016
99999,MUMMY,samsung road,12/02/2016
What I have so far:
import csv
from collections import defaultdict

columns = defaultdict(list)  # each value in each column is appended to a list

with open('dummy_data.csv') as f:
    reader = csv.DictReader(f)  # read rows into a dictionary format
    for row in reader:  # read a row as {column1: value1, column2: value2,...}
        for (k, v) in row.items():  # go over each column name and value
            columns[k].append(v)  # append the value into the appropriate list
                                  # based on column name k

uniqueidstatement = columns['receipt_id']
print uniqueidstatement

resultFile = open("wtf.csv", 'wb')
wr = csv.writer(resultFile, dialect='excel')
wr.writerow(uniqueidstatement)
You can group the sections by ID, and from each group you can determine which is the date and which is the address with some simple logic.
import csv
from itertools import groupby
from operator import itemgetter

with open("test.csv") as f, open("out.csv", "w") as out:
    reader = csv.reader(f)
    next(reader)
    writer = csv.writer(out)
    writer.writerow(["ID", "NAME", "ADDRESS", "DATE"])
    groups = groupby(reader, key=itemgetter(0))
    for k, v in groups:
        id_, name = next(v)
        add_date_1, add_date_2 = next(v)[1], next(v)[1]
        date, add = (add_date_1, add_date_2) if "road" in add_date_2 else (add_date_2, add_date_1)
        writer.writerow([id_, name, add, date])
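As for the side question in the conditions: yes, Python has regular expressions, via the re module. A minimal sketch (my addition, assuming the ##/##/#### format described above) that could stand in for the "road" keyword check:

import re

# matches dates like 2/06/2016 or 12/02/2016: 1-2 digits / 1-2 digits / 4 digits
date_pattern = re.compile(r'^\d{1,2}/\d{1,2}/\d{4}$')

print(bool(date_pattern.match('2/06/2016')))       # True
print(bool(date_pattern.match('new issac road')))  # False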

How would I write to a CSV file from a python dictionary of dictionaries while some values are empty

I have a Python dictionary of dictionaries with data stored that I need to write to a CSV file.
The problem I'm having is that some of the dictionaries from the file I have read don't contain any information for that particular ID, so my CSV file columns are not lined up properly.
example
d["first1"]["title"] = founder
d["first1"]["started"] = 2005
d["second1"]["title"] = CEO
d["second1"]["favcolour"] = blue
And so when I use the following code:
for key, value in d.iteritems():
    ln = [key]
    for ikey, ivalue in value.iteritems():
        ln.append(ikey)
        ln.extend([v for v in ivalue])
    writer.writerow(ln)
my CSV file has all the information, but "started" and "favcolour" end up in the same column; I want each column to contain values for only one field.
Thanks all in advance
Here's a suggestion:
d = {"first1": {"title": 'founder', "started": 2005}, "second1": {"title": 'CEO', "favcolour": 'blue'}}
columns = []
output = []
for key, value in d.iteritems():
for ikey, ivalue in value.iteritems():
if ikey not in columns:
columns.append(ikey)
ln = []
for col in columns:
if col not in value:
ln.append('')
else:
ln.append(value[col])
output.append(ln)
with open('file', 'w') as fl:
csv_writer = csv.writer(fl)
csv_writer.writerow(columns)
for ln in output:
print ln
csv_writer.writerow(ln)
file:
started,title,favcolour
2005,founder
,CEO,blue
If it doesn't need to be human-readable, you can alternatively use pickle:
import pickle

# Write:
with open('filename.pickle', 'wb') as handle:
    pickle.dump(d, handle)

# Read:
with open('filename.pickle', 'rb') as handle:
    d = pickle.load(handle)
You can use the DictWriter class in csv to easily append what would be a sparse dictionary into a CSV. The only caveat is you need to know all the possible fields at the beginning.
import csv

data = {"first": {}, "second": {}}
data["first"]["title"] = "founder"
data["first"]["started"] = 2005
data["second"]["title"] = "CEO"
data["second"]["favcolour"] = "blue"

fieldNames = set()
for d in data:
    for key in data[d].keys():
        # Add all possible keys to fieldNames; because fieldNames is
        # a set, you can't have duplicate values
        fieldNames.add(key)

with open('csvFile.csv', 'w') as csvfile:
    # Initialize DictWriter with the list of fieldNames.
    # You can sort fieldNames into whatever order you wish the CSV
    # headers to be in.
    writer = csv.DictWriter(csvfile, fieldnames=list(fieldNames))
    # Add the header to the CSV file
    writer.writeheader()
    # Iterate through all sub-dictionaries
    for d in data:
        # Add the sub-dictionary to the csv file
        writer.writerow(data[d])
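One follow-up worth knowing: DictWriter fills in missing keys for you, and its restval parameter (which defaults to an empty string) controls what gets written for fields absent from a given row. For example, to mark missing values explicitly ('N/A' here is just an illustrative placeholder):

writer = csv.DictWriter(csvfile, fieldnames=list(fieldNames), restval='N/A')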
Pandas works really well for things like this, so if it's an option, I would recommend it.
import pandas as pd

# not necessary, but for me it's usually easier to work with a list of dicts than a dict of dicts
my_list = [my_dict[key] for key in my_dict]

# When you pass a list of dictionaries to the pandas DataFrame class, it will take care of
# alignment issues for you, but if you're wanting to do something specific
# with None values, you will need to further manipulate the frame
df = pd.DataFrame(my_list)
df.to_csv('file_path_to_save_to')
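For example, if you want the missing cells written as something other than an empty field, to_csv's na_rep parameter controls the fill value (a sketch; 'missing' is just an illustrative placeholder):

df.to_csv('file_path_to_save_to', na_rep='missing')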

How do I convert a text file into a dictionary and print it?

My file is formatted into three columns of numbers:
2 12345 1.12345
1 54321 1.54321
3 12345 1.12345
I would like to have Python use the first two columns as keys and use the third column as the values. The file is large, meaning that I can't format it by hand. So how do I have Python automatically convert my large file into a dictionary?
Here is my code:
with open ('file name.txt' 'r') as f:
    rows = ( line.split('\t') for line in f )
    d = { row[0]:row[:3] for row in rows}
    print(d)
The output prints the numbers diagonally all over the place. How do I format it properly?
Banana, you're close.
You need a comma separating the arguments of open.
You want to assign the third member of row, i.e. row[2].
You need to decide how to group the first two members of row into a hashable key. Making a tuple out of them, i.e. (row[0],row[1]) works.
Print the dictionary line by line.
Try:
with open('filename.txt', 'r') as f:
    rows = (line.split('\t') for line in f)
    d = {(row[0], row[1]): row[2] for row in rows}
    for key in d.keys():
        print(key, d[key])
I'm not sure exactly how you want the keys laid out. Regardless, you should use the csv module, with '\t' as your delimiter.
import csv

with open('data.txt') as file:
    tsvfile = csv.reader(file, delimiter='\t')
    d = {"{},{}".format(row[0], row[1]): row[2] for row in tsvfile}
    print(d)
Prints out:
{'3,12345': '1.12345', '1,54321': '1.54321', '2,12345': '1.12345'}
Alternatively, you have this:
with open('data.txt') as file:
    tsvfile = csv.reader(file, delimiter='\t')
    d = {}
    for row in tsvfile:
        d[row[0]] = row[2]
        d[row[1]] = row[2]
    print(d)
Prints out:
{'54321': '1.54321', '3': '1.12345', '1': '1.54321', '12345': '1.12345', '2': '1.12345'}
You should try:
import pprint

d = {}
with open('file name.txt', 'r') as f:
    for line in f:
        row = line.strip().split('\t')  # strip so values don't keep a trailing newline
        if len(row) == 3:
            d[(row[0], row[1])] = row[2]

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(d)
First of all, your slicing is wrong. You can get the first two columns with line[:2] and the third with line[2].
Also, you don't need to build your rows in a separate data structure; you can use unpacking and map within a dict comprehension:
with open('ex.txt') as f:
    d = {tuple(i): j.strip() for *i, j in map(lambda line: line.split('\t'), f)}
    print(d)
result :
{('2', '12345'): '1.12345', ('3', '12345'): '1.12345', ('1', '54321'): '1.54321'}
Note that since *i collects a list, and lists are unhashable objects, you cannot use it as your dictionary key directly, so you convert it to a tuple.
And if you want to preserve the order you can use collections.OrderedDict, passing it the key-value pairs directly (on Python versions before 3.7, a plain dict comprehension would not itself guarantee insertion order):

from collections import OrderedDict

with open('ex.txt') as f:
    d = OrderedDict((tuple(i), j.strip()) for *i, j in map(lambda line: line.split('\t'), f))
    print(d)
OrderedDict([(('2', '12345'), '1.12345'), (('1', '54321'), '1.54321'), (('3', '12345'), '1.12345')])

How do I merge two CSV files based on field and keep same number of attributes on each record?

I am attempting to merge two CSV files based on a specific field in each file.
file1.csv
id,attr1,attr2,attr3
1,True,7,"Purple"
2,False,19.8,"Cucumber"
3,False,-0.5,"A string with a comma, because it has one"
4,True,2,"Nope"
5,True,4.0,"Tuesday"
6,False,1,"Failure"
file2.csv
id,attr4,attr5,attr6
2,"python",500000.12,False
5,"program",3,True
3,"Another string",-5,False
This is the code I am using:
import csv
from collections import OrderedDict

with open('file2.csv', 'r') as f2:
    reader = csv.reader(f2)
    fields2 = next(reader, None)  # Skip headers
    dict2 = {row[0]: row[1:] for row in reader}

with open('file1.csv', 'r') as f1:
    reader = csv.reader(f1)
    fields1 = next(reader, None)  # Skip headers
    dict1 = OrderedDict((row[0], row[1:]) for row in reader)

result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result.setdefault(key, []).extend(value)

with open('merged.csv', 'wb') as f:
    w = csv.writer(f)
    for key, value in result.iteritems():
        w.writerow([key] + value)
I get output like this, which merges appropriately, but does not have the same number of attributes for all rows:
1,True,7,Purple
2,False,19.8,Cucumber,python,500000.12,False
3,False,-0.5,"A string with a comma, because it has one",Another string,-5,False
4,True,2,Nope
5,True,4.0,Tuesday,program,3,True
6,False,1,Failure
file2 will not have a record for every id in file1. I'd like the output to have empty fields from file2 in the merged file. For example, id 1 would look like this:
1,True,7,Purple,,,
How can I add the empty fields to records that don't have data in file2 so that all of my records in the merged CSV have the same number of attributes?
If we're not using pandas, I'd refactor to something like
import csv
from collections import OrderedDict

filenames = "file1.csv", "file2.csv"
data = OrderedDict()
fieldnames = []

for filename in filenames:
    with open(filename, "rb") as fp:  # python 2
        reader = csv.DictReader(fp)
        fieldnames.extend(reader.fieldnames)
        for row in reader:
            data.setdefault(row["id"], {}).update(row)

fieldnames = list(OrderedDict.fromkeys(fieldnames))

with open("merged.csv", "wb") as fp:
    writer = csv.writer(fp)
    writer.writerow(fieldnames)
    for row in data.itervalues():
        writer.writerow([row.get(field, '') for field in fieldnames])
which gives
id,attr1,attr2,attr3,attr4,attr5,attr6
1,True,7,Purple,,,
2,False,19.8,Cucumber,python,500000.12,False
3,False,-0.5,"A string with a comma, because it has one",Another string,-5,False
4,True,2,Nope,,,
5,True,4.0,Tuesday,program,3,True
6,False,1,Failure,,,
For comparison, the pandas equivalent would be something like
import pandas as pd

df1 = pd.read_csv("file1.csv")
df2 = pd.read_csv("file2.csv")
merged = df1.merge(df2, on="id", how="outer").fillna("")
merged.to_csv("merged.csv", index=False)
which is much simpler to my eyes, and means you can spend more time dealing with your data and less time reinventing wheels.
You can use pandas to do this:
import pandas

csv1 = pandas.read_csv('file1.csv')
csv2 = pandas.read_csv('file2.csv')
merged = csv1.merge(csv2, on='id', how='outer')
merged.to_csv("output.csv", index=False)
I haven't tested this yet, but it should put you on the right track until I can try it out. The code is quite self-explanatory: first you import the pandas library so that you can use it. Then, using pandas.read_csv, you read the 2 csv files and use the merge method to merge them. The on parameter specifies which column should be used as the "key", and how='outer' keeps the ids that appear in only one of the files. Finally, the merged csv is written to output.csv.
Use a dict of dicts, then update it, like this:
import csv
from collections import OrderedDict

with open('file2.csv', 'r') as f2:
    reader = csv.reader(f2)
    lines2 = list(reader)

with open('file1.csv', 'r') as f1:
    reader = csv.reader(f1)
    lines1 = list(reader)

dict1 = {row[0]: dict(zip(lines1[0][1:], row[1:])) for row in lines1[1:]}
dict2 = {row[0]: dict(zip(lines2[0][1:], row[1:])) for row in lines2[1:]}

# merge
updatedDict = OrderedDict()
mergedAttrs = OrderedDict.fromkeys(lines1[0][1:] + lines2[0][1:], "?")
for id, attrs in dict1.iteritems():
    d = mergedAttrs.copy()
    d.update(attrs)
    updatedDict[id] = d
for id, attrs in dict2.iteritems():
    updatedDict[id].update(attrs)

# out
with open('merged.csv', 'wb') as f:
    w = csv.writer(f)
    for id, rest in sorted(updatedDict.iteritems()):
        w.writerow([id] + rest.values())

Python Joining csv files where key is first column value

I am trying to join two csv files where the key is the value of the first column.
There is no header.
The files have different numbers of lines and columns.
Order of file a must be preserved.
file a:
john,red,34
andrew,green,18
tonny,black,50
jack,yellow,27
phill,orange,45
kurt,blue,29
mike,pink,61
file b:
tonny,driver,new york
phill,scientist,boston
desired result:
john,red,34
andrew,green,18
tonny,black,50,driver,new york
jack,yellow,27
phill,orange,45,scientist,boston
kurt,blue,29
mike,pink,61
I have examined all the related threads, and I am sure some of you are going to mark this question as a duplicate, but I simply have not found a solution yet.
I grabbed a dictionary-based solution, but that approach does not satisfy the "preserve the line order of file a" condition.
import csv
from collections import defaultdict

with open('a.csv') as f:
    r = csv.reader(f, delimiter=',')
    dict1 = {}
    for row in r:
        dict1.update({row[0]: row[1:]})

with open('b.csv') as f:
    r = csv.reader(f, delimiter=',')
    dict2 = {}
    for row in r:
        dict2.update({row[0]: row[1:]})

result = defaultdict(list)
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result[key].append(value)
I would also like to avoid putting these csv files into a database like sqlite, or using the pandas module.
Thanks in advance
Something like
import csv
from collections import OrderedDict

with open('b.csv', 'rb') as f:
    r = csv.reader(f)
    dict2 = {row[0]: row[1:] for row in r}

with open('a.csv', 'rb') as f:
    r = csv.reader(f)
    dict1 = OrderedDict((row[0], row[1:]) for row in r)

result = OrderedDict()
for d in (dict1, dict2):
    for key, value in d.iteritems():
        result.setdefault(key, []).extend(value)

with open('ab_combined.csv', 'wb') as f:
    w = csv.writer(f)
    for key, value in result.iteritems():
        w.writerow([key] + value)
produces
john,red,34
andrew,green,18
tonny,black,50,driver,new york
jack,yellow,27
phill,orange,45,scientist,boston
kurt,blue,29
mike,pink,61
(Note that I didn't bother protecting against the case where dict2 has a key which isn't in dict1; that's easily added if you like, as in the sketch below.)
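A minimal sketch of that guard (my addition, assuming ids that appear only in b.csv should be silently ignored) would replace the combining loop with:

result = OrderedDict()
for key, value in dict1.iteritems():
    result[key] = list(value)
for key, value in dict2.iteritems():
    if key in result:  # skip ids from b.csv that never appear in a.csv
        result[key].extend(value)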
