Parsing string in csv into dictionary - python

I have a csv file in the following format
mod, id
128, 2pmk|5svq|3ar7|5xw6|5ncq|5a3s|2gvd|1i5d
574, 3zjt
0A, 4wb3|4wb2|4r8i
0C, 1r3o|4wb3|4wb2|2gq6|2gq4|2gq5|4r8i|2gpm|2g32|2gq7
0G, 1r3o|4wb3|4wb2|2gq6|2gq4|2gq5|4r8i|2gpm|2g32|2gq7
0U, 1r3o|4wb3|4wb2|2gq6|2gq4|2gq5|4r8i|2gpm|2g32|2gq7
I wanted to convert the information into a dictionary of key and values where the key would be id's [from a separate list] and values would be all the mod present in the id. I've written the following code which I think is wrong
import csv
id_list = ['1r3o', '4wb2', '1kmk']
n = {}
with open('test6.csv', mode='rU') as infile:
reader = csv.reader(infile)
for elem1 in id_list:
for row in reader:
identifier = row[1].split('|')
for elem2 in identifier:
while elem1 == elem2:
n[elem1] = row[0]
print n
If there is no mapping between the id from the list and mod, I want the string 'None' appended to the dictionary value. The desired output is shown below:
{
'4wb2': ['OA', 'OC', 'OG', 'OU'],
'1r3o': ['OC', 'OG', 'OU'],
'1kmk': ['None']
}
Any help is appreciated. Thank you

import csv
id_list = ['1r3o', '4wb2', '1kmk']
n = {}
mapping = {}
with open('test6.csv', mode='rU') as infile:
reader = csv.reader(infile)
for row in reader:
mod, ids = row
for id in ids.split('|'):
if id not in mapping.keys():
mapping[id] = set()
mapping[id].add(mod)
for id in id_list:
values = list(mapping.get(id, []))
if not values:
values = ['None']
n[id] = values
print n

I know this question already has an accepted answer, but I would like to share with you another approach using dictionary comprehensions and lambdas.
import csv
id_list = ['1r3o', '4wb2', '1kmk', 'foo', 'bar', '3zjt']
# Read the content of the file
csv_content = []
with open('test6.csv', mode='rU') as file:
for row in csv.reader(file):
csv_content.append([row[0], row[1]])
# Collect the required data
mapped = { id: map(lambda f: f[0], filter(lambda r: id in r[1], csv_content)) for id in id_list }
# Add 'None' on empty results
results = dict(map(lambda item: (item[0], ['None'] if len(item[1]) == 0 else item[1]), mapped.iteritems()))
print(results)

Related

CSV without unique headers to list of dicts with unique keys

I have a CSV file with headers on row 0. The headers are often unique but sometimes they are not, for "comments" in this example. For each of several comments, the header is "Comment".
The problem with my function that makes dicts from CSVs is that it only returns the last column of Comment.
def csv_to_list_with_dicts(csvfile):
with open(csvfile) as f:
list_of_issues = [{k: v for k, v in row.items()}
for row in csv.DictReader(f, skipinitialspace=True)]
return list_of_issues
My CSV file columns are like this:
User;ID;Comment;Comment;Comment
If one of the headers is repeating, I need to add an index to make it unique (like Comment1;Comment2 without changing the CSV) in the dict or all comments included under just Comment.
This did return just the way I wanted. Just tweaked yours a small bit Happy Ahmad! HUGE THANKS!!! <3
def csv_to_list_with_dicts(csvfile):
with open(csvfile, "r") as file:
keys = file.readline().split(",")
alteredKeys = []
for eachKey in keys:
counter = 0
while(eachKey in alteredKeys):
counter += 1
eachKey = eachKey[:len(eachKey)-(0 if counter == 1 else 1)] + str(counter)
alteredKeys.append(eachKey)
list_of_issues = []
reader = csv.reader(file, delimiter=',', skipinitialspace=True)
for eachLine in reader:
eachIssue = dict()
columnIndex = 0
for eachColumn in eachLine:
if columnIndex < len(alteredKeys):
eachIssue[alteredKeys[columnIndex]] = eachColumn
columnIndex += 1
list_of_issues.append(eachIssue)
return list_of_issues
In this solution, I use an alterKey list that changes any repeated key in the header by adding an index at its end. Then, I iterate on the other lines of the CSV file and make a dictionary from each one.
def csv_to_list_with_dicts(csvfile):
with open(csvfile, "r") as file:
keys = file.readline().split(";")
alteredKeys = []
for eachKey in keys:
counter = 0
while(eachKey in alteredKeys):
counter += 1
eachKey = eachKey[:len(eachKey)-(0 if counter == 1 else 1)] + str(counter)
alteredKeys.append(eachKey)
list_of_issues = []
for eachLine in file:
eachIssue = dict()
columnIndex = 0
for eachColumn in eachLine.split(";")
if columnIndex < len(alteredKeys):
eachIssue[alteredKeys[columnIndex]] = eachColumn
columnIndex += 1
list_of_issues.append(eachIssue)
return list_of_issues
It woujld be fairly easy to write code that will automatically generate unique keys for you by simply keeping track of those already seen and generating a unique name for any encountered that conflicted with one before it. Checking for that would be quick if those seen were kept in a set which features fast membership testing.
For example, assume this was in a CSV file named non-unique.csv:
User;ID;Comment;Comment;Comment
Jose;1138;something1;something2;something3
Gene;2907;abc;def;ghi
Guido;6450;jkl;mno;pqr
Code:
import csv
def csv_to_list_with_dicts(csv_filename):
# Read the first row of the csv file.
with open(csv_filename, encoding='utf-8', newline='') as csv_file:
reader = csv.reader(csv_file, delimiter=';', skipinitialspace=True)
names = next(reader) # Header row.
# Create list of unique fieldnames for the namee in the header row.
seen = set()
fieldnames = []
for i, name in enumerate(names):
if name in seen:
name = f'_{i}'
else:
seen.add(name)
fieldnames.append(name)
# Read entire file and make each row a dictionary with keys based on the fieldnames.
with open(csv_filename, encoding='utf-8', newline='') as csv_file:
reader = csv.DictReader(csv_file, fieldnames=fieldnames, delimiter=';',
skipinitialspace=True)
next(reader) # Ignore header row.
return list(reader)
results = csv_to_list_with_dicts('non-unique.csv')
from pprint import pprint
pprint(results, sort_dicts=False, width=120)
Results:
[{'User': 'Jose', 'ID': '1138', 'Comment': 'something1', '_3': 'something2', '_4': 'something3'},
{'User': 'Gene', 'ID': '2907', 'Comment': 'abc', '_3': 'def', '_4': 'ghi'},
{'User': 'Guido', 'ID': '6450', 'Comment': 'jkl', '_3': 'mno', '_4': 'pqr'}]

appending row data from a dictionary seems to 'leak'

I'm iterating through a large collection (1000+) of JSON files using python 3.6 using the following code, see below. It all seems straight-forward enough but I am finding that some columns are being held over (and therefore repeated) in subsequent lines. I thought that using the .get() method would account for KeyErrors, but it's not working as I expected.
def make_place_table(row):
with open('place_table.csv', 'a', newline='') as results:
writer = csv.writer(results)
writer.writerow(row)
row = []
for filename in glob.glob('/rawjson/*.json'):
with open(filename) as filename:
data = json.load(filename)
#
has_error = bool('error' in data.keys())
has_place_results = bool('place_results' in data.keys())
if(has_error):
row=[]
row.append(data['search_metadata']['id'])
row.append(data['error'])
make_error_table(row)
else:
if(has_place_results):
results = {key:value for key,value in data.items() if key in 'place_results' }
place_results.update(results['place_results'])
has_reviews = bool('user_reviews' in place_results.keys())
has_extensions = bool('extensions' in place_results.keys())
row = []
row.append(data['search_metadata']['id'])
row.append(place_results.get('title', 'n/a'))
row.append(place_results['gps_coordinates']['latitude'])
row.append(place_results['gps_coordinates']['longitude'])
row.append(place_results.get('rating', 'n/a'))
row.append(place_results.get('reviews', 'n/a'))
row.append(place_results.get('price', 'n/a'))
row.append(place_results.get('address', 'n/a'))
row.append(place_results.get('phone', 'n/a'))
row.append(place_results['website'])
amenity_type = [name for name in place_results['type']]
row.append(amenity_type)
row.append(place_results["hours"][3])
row.append(place_results["hours"][4])
row.append(place_results["hours"][5])
row.append(place_results["hours"][6])
row.append(place_results["hours"][0])
row.append(place_results["hours"][1])
row.append(place_results["hours"][2])
make_place_table(row)
row=[]

Checking a CSV for the existence of a similar value in Python

Consider the following CSV:
date,description,amount
14/02/2020,march contract,-99.00
15/02/2020,april contract,340.00
16/02/2020,march contract,150.00
17/02/2020,april contract,-100.00
What I'd like to do is:
Iterate through all of the rows
Total the amounts of lines which have the same description
Return the last line which has that newly-calculated amount
Applied to the above example, the CSV would look like this:
16/02/2020,march contract,51.00
17/02/2020,april contract,240.00
So far, I've tried nesting csv.reader()s inside of each other and I'm not getting the result I am wanting.
I'd like to achieve this without any libraries and/or modules.
Here is the code I have so far, where first_row is each row in the CSV and second_row is the iteration of looking for matching descriptions:
csv_reader = csv.reader(report_file)
for first_row in csv_reader:
description_index = 5
amount_index = 13
print(first_row)
for second_row in csv_reader:
if second_row is not first_row:
print(first_row[description_index] == second_row[description_index])
if first_row[description_index] == second_row[description_index]:
first_row[amount_index] = float(first_row[amount_index]) + float(second_row[amount_index])
This will work:
import csv
uniques = {} # dictionary to store key/value pairs
with open(report_file, newline='') as f:
reader = csv.reader(f, delimiter=',')
next(reader, None) # skip header row
for data in reader:
date = data[0]
description = data[1]
if description in uniques:
cumulative_total = uniques[description][0]
uniques[description] = [cumulative_total+float(data[2]), date]
else:
uniques[description] = [float(data[2]), date]
# print output
for desc, val in uniques.items():
print(f'{val[0]}, {desc}, {val[1]}')
I know that you've asked for a solution without pandas, but you'll save yourself a lot of time if you use it:
df = pd.read_csv(report_file)
totals = df.groupby(df['description']).sum()
print(totals)
I suggest you should use pandas, it'll be efficient.
or if you still want to go with your way then this will help.
import csv
with open('mycsv.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
value_dict = {}
line_no = 0
for row in csv_reader:
if line_no == 0:
line_no += 1
continue
cur_date = row[0]
cur_mon = row[1]
cur_val = float(row[2])
if row[1] not in value_dict.keys():
value_dict[cur_mon] = [cur_date, cur_val]
else:
old_date, old_val = value_dict[cur_mon]
value_dict[cur_mon] = [cur_date, (old_val + cur_val)]
line_no += 1
for key, val_list in value_dict.items():
print(f"{val_list[0]},{key},{val_list[1]}")
Output:
16/02/2020,march contract,51.0
17/02/2020,april contract,240.0
Mark this as answer if it helps you.
working with dictionary makes it easy to access values
import csv
from datetime import datetime
_dict = {}
with open("test.csv", "r") as f:
reader = csv.reader(f, delimiter=",")
for i, line in enumerate(reader):
if i==0:
headings = [line]
else:
if _dict.get(line[1],None) is None:
_dict[line[1]] = {
'date':line[0],
'amount':float(line[2])
}
else:
if datetime.strptime(_dict.get(line[1]).get('date'),'%d/%m/%Y') < datetime.strptime(line[0],'%d/%m/%Y'):
_dict[line[1]]['date'] = line[0]
_dict[line[1]]['amount'] = _dict[line[1]]['amount'] + float(line[2])
Here your _dict will contain unique description and values
>>> print(_dict)
{'march contract': {'date': '16/02/2020', 'amount': 51.0},
'april contract': {'date': '17/02/2020', 'amount': 240.0}}
convert to list and add headings
headings.extend([[value['date'],key,value['amount']] for key,value in _dict.items()])
>>>print(headings)
[['date', 'description', 'amount'],['16/02/2020', 'march contract', 51.0], ['17/02/2020', 'april contract', 240.0]]
save list to csv
with open("out.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerows(headings)
You can also use itertools.groupby and sum() for this if you don't mind outputting in sorted form.
from datetime import datetime
from itertools import groupby
import csv
with open(report_file, 'r') as f:
reader = csv.reader(f)
lst = list(reader)[1:]
sorted_input = sorted(lst, key=lambda x : (x[1], datetime.strptime(x[0],'%d/%m/%Y'))) #sort by description and date
groups = groupby(sorted_input, key=lambda x : x[1])
for k,g in groups:
rows = list(g)
total = sum(float(row[2]) for row in rows)
print(f'{rows[-1][0]},{k},{total}') #print last date, description, total
Output:
17/02/2020,april contract,240.0
16/02/2020,march contract,51.0

Summing values from duplicate keys in a CSV file without panda

I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open(t1file,'rb') as f:
reader = csv.reader(f)
headers = reader.next()
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
if row[headers.index('type')] == "Equity":
new_qualifier = qualifier.split("_")[0]
if party in data.keys():
if cp in data.keys():
if new_qualifier in data.keys():
data[party][cp][new_qualifier] += float(amount)
else:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
else:
data[cp] = cp
else:
data[party] = party
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
Very rusty with python apologize if it's glaringly obivous but any insights as to what i'm doing wrong ?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
from collections import defaultdict
PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3
data = defaultdict(int)
with open('del-me.csv') as f:
lines = [l.strip() for l in f.readlines()]
for idx, line in enumerate(lines):
if idx > 0:
fields = line.split(',')
party = fields[PARTY_IDX]
cp = fields[CP_IDX]
qualifier = fields[QUALIFIER_IDX]
qualifier = qualifier[:qualifier.find('_')]
key = ','.join([party, cp, qualifier])
amount = int(fields[AMOUNT_IDX])
data[key] += amount
with open('out.csv', 'w') as f:
for k, v in data.items():
f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv as csv
headers = ["valuation_date","party_group_name","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open('test_data.csv','rt', encoding='utf-8') as f:
reader = csv.reader(f)
headers = next(reader)
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
if row[headers.index('type')] == "Equity":
new_qualifier = qualifier.split("_")[0]
if party in data.keys():
cp_ = data[party]
if cp in cp_.keys():
qualifier_ = data[party][cp]
if new_qualifier in qualifier_.keys():
data[party][cp][new_qualifier] += float(amount)
else:
data[party][cp][qualifier][amount] = {}
else:
data[cp] = {}
else:
data[party] = {}
data[party][cp] = {}
data[party][cp][qualifier.split("_")[0]] = float(amount)
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp():
d = {}
with open(t1file,'rb') as f:
for line in f:
if 'party' in line:
continue # skip header
key, value = line.rsplit(',', 1) # split once at the end
d[key] = d[key] + int(value) if key in d else int(value)
You can do it like this:
from csv import DictReader, DictWriter
map_dic = dict()
with open('test1.csv', 'r') as fr:
csv_reader = DictReader(fr, delimiter=',')
for line in csv_reader:
key = '{}_{}_{}'.format(line['party'], line['cp'], line['qualifier'])
if key not in map_dic.keys():
map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
else:
map_dic[key]['amount'] = map_dic[key]['amount'] + int(line['amount'])
with open('test2.csv', 'w') as csvfile:
writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
writer.writeheader()
for key, data in map_dic.items():
writer.writerow(data)

csv.DictReader delimiter inside a csv field with multiple quotes

I have the following example csv, that I am reading with:
f = StringIO(response.encode('utf-8'))
reader = csv.DictReader(f, quotechar='"', delimiter=';', quoting=csv.QUOTE_ALL, skipinitialspace=True)
example csv:
id;name;community;owner;owns;description;uuid
3c;NP;NoProb;NoP;Text;text_with_no_issues;
3c;NP;NoProb;NoP;TextText;text_with_no_issues2;
1A;fooo;barr;Bar;TEXT1;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
1A;fooo;barr;Bar;TEXT2;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT3;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT4;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
the uuid column is empty in all cases.
within the "reader" there are multiple entries with the same 'name' and 'id' which I am "merging", but in lines like the last four (1A,2B) I am hitting an issue because of the ";" delimiter in the description.
Even with quotechar='"' and quoting=csv.QUOTE_ALL the description column gets spitted by the delimiter and goes to the next column (uuid) and to a "None" column which corrupts my data.
Any idea how to solve this one ?
P.S. for the merge logic I am using two variants:
##############################################################
name_index = []
result = []
for line in reader:
idx = line["name"]
if idx not in name_index:
name_index.append(idx)
result.append(line)
else:
idx_curr_dict = result[name_index.index(idx)]
merge_entries = [idx_curr_dict, line]
placeholder = {}
for key in idx_curr_dict:
placeholder[key] = ", ".join(list(set(d[key] for d in merge_entries if d[key] != "" and d[key])))
result[name_index.index(idx)] = placeholder
##############################################################
and a bit slower one, but not that complicated:
##############################################################
data = [line for line in reply] # Deplete the iterator
unique_names = set([line['name'] for line in data]) # List of unique names
column_names = [key for key in data[0] if key != 'name' and key != 'uuid'] # all other useful columns
result = []
for name in unique_names:
same_named_lines = [line for line in data if line['name'] == name]
unique_line = {'name': name}
for column in column_names:
value = ", ".join(set([line[column] for line in same_named_lines]))
unique_line[column] = value
result.append(unique_line)
##############################################################
Thanks a lot in advance!

Categories

Resources