Related
Consider the following CSV:
date,description,amount
14/02/2020,march contract,-99.00
15/02/2020,april contract,340.00
16/02/2020,march contract,150.00
17/02/2020,april contract,-100.00
What I'd like to do is:
Iterate through all of the rows
Total the amounts of lines which have the same description
Return the last line which has that newly-calculated amount
Applied to the above example, the CSV would look like this:
16/02/2020,march contract,51.00
17/02/2020,april contract,240.00
So far, I've tried nesting csv.reader()s inside of each other and I'm not getting the result I am wanting.
I'd like to achieve this without any libraries and/or modules.
Here is the code I have so far, where first_row is each row in the CSV and second_row is the iteration of looking for matching descriptions:
csv_reader = csv.reader(report_file)
for first_row in csv_reader:
description_index = 5
amount_index = 13
print(first_row)
for second_row in csv_reader:
if second_row is not first_row:
print(first_row[description_index] == second_row[description_index])
if first_row[description_index] == second_row[description_index]:
first_row[amount_index] = float(first_row[amount_index]) + float(second_row[amount_index])
This will work:
import csv
uniques = {} # dictionary to store key/value pairs
with open(report_file, newline='') as f:
reader = csv.reader(f, delimiter=',')
next(reader, None) # skip header row
for data in reader:
date = data[0]
description = data[1]
if description in uniques:
cumulative_total = uniques[description][0]
uniques[description] = [cumulative_total+float(data[2]), date]
else:
uniques[description] = [float(data[2]), date]
# print output
for desc, val in uniques.items():
print(f'{val[0]}, {desc}, {val[1]}')
I know that you've asked for a solution without pandas, but you'll save yourself a lot of time if you use it:
df = pd.read_csv(report_file)
totals = df.groupby(df['description']).sum()
print(totals)
I suggest you should use pandas, it'll be efficient.
or if you still want to go with your way then this will help.
import csv
with open('mycsv.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
value_dict = {}
line_no = 0
for row in csv_reader:
if line_no == 0:
line_no += 1
continue
cur_date = row[0]
cur_mon = row[1]
cur_val = float(row[2])
if row[1] not in value_dict.keys():
value_dict[cur_mon] = [cur_date, cur_val]
else:
old_date, old_val = value_dict[cur_mon]
value_dict[cur_mon] = [cur_date, (old_val + cur_val)]
line_no += 1
for key, val_list in value_dict.items():
print(f"{val_list[0]},{key},{val_list[1]}")
Output:
16/02/2020,march contract,51.0
17/02/2020,april contract,240.0
Mark this as answer if it helps you.
working with dictionary makes it easy to access values
import csv
from datetime import datetime
_dict = {}
with open("test.csv", "r") as f:
reader = csv.reader(f, delimiter=",")
for i, line in enumerate(reader):
if i==0:
headings = [line]
else:
if _dict.get(line[1],None) is None:
_dict[line[1]] = {
'date':line[0],
'amount':float(line[2])
}
else:
if datetime.strptime(_dict.get(line[1]).get('date'),'%d/%m/%Y') < datetime.strptime(line[0],'%d/%m/%Y'):
_dict[line[1]]['date'] = line[0]
_dict[line[1]]['amount'] = _dict[line[1]]['amount'] + float(line[2])
Here your _dict will contain unique description and values
>>> print(_dict)
{'march contract': {'date': '16/02/2020', 'amount': 51.0},
'april contract': {'date': '17/02/2020', 'amount': 240.0}}
convert to list and add headings
headings.extend([[value['date'],key,value['amount']] for key,value in _dict.items()])
>>>print(headings)
[['date', 'description', 'amount'],['16/02/2020', 'march contract', 51.0], ['17/02/2020', 'april contract', 240.0]]
save list to csv
with open("out.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerows(headings)
You can also use itertools.groupby and sum() for this if you don't mind outputting in sorted form.
from datetime import datetime
from itertools import groupby
import csv
with open(report_file, 'r') as f:
reader = csv.reader(f)
lst = list(reader)[1:]
sorted_input = sorted(lst, key=lambda x : (x[1], datetime.strptime(x[0],'%d/%m/%Y'))) #sort by description and date
groups = groupby(sorted_input, key=lambda x : x[1])
for k,g in groups:
rows = list(g)
total = sum(float(row[2]) for row in rows)
print(f'{rows[-1][0]},{k},{total}') #print last date, description, total
Output:
17/02/2020,april contract,240.0
16/02/2020,march contract,51.0
I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open(t1file,'rb') as f:
reader = csv.reader(f)
headers = reader.next()
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
if row[headers.index('type')] == "Equity":
new_qualifier = qualifier.split("_")[0]
if party in data.keys():
if cp in data.keys():
if new_qualifier in data.keys():
data[party][cp][new_qualifier] += float(amount)
else:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
else:
data[cp] = cp
else:
data[party] = party
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
Very rusty with python apologize if it's glaringly obivous but any insights as to what i'm doing wrong ?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
from collections import defaultdict
PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3
data = defaultdict(int)
with open('del-me.csv') as f:
lines = [l.strip() for l in f.readlines()]
for idx, line in enumerate(lines):
if idx > 0:
fields = line.split(',')
party = fields[PARTY_IDX]
cp = fields[CP_IDX]
qualifier = fields[QUALIFIER_IDX]
qualifier = qualifier[:qualifier.find('_')]
key = ','.join([party, cp, qualifier])
amount = int(fields[AMOUNT_IDX])
data[key] += amount
with open('out.csv', 'w') as f:
for k, v in data.items():
f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv as csv
headers = ["valuation_date","party_group_name","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
with open('test_data.csv','rt', encoding='utf-8') as f:
reader = csv.reader(f)
headers = next(reader)
for row in reader:
party = row[headers.index('party')]
cp = row[headers.index('cp')]
qualifier = row[headers.index('qualifier')]
amount = row[headers.index('amount')]
if row[headers.index('type')] == "Equity":
new_qualifier = qualifier.split("_")[0]
if party in data.keys():
cp_ = data[party]
if cp in cp_.keys():
qualifier_ = data[party][cp]
if new_qualifier in qualifier_.keys():
data[party][cp][new_qualifier] += float(amount)
else:
data[party][cp][qualifier][amount] = {}
else:
data[cp] = {}
else:
data[party] = {}
data[party][cp] = {}
data[party][cp][qualifier.split("_")[0]] = float(amount)
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp():
d = {}
with open(t1file,'rb') as f:
for line in f:
if 'party' in line:
continue # skip header
key, value = line.rsplit(',', 1) # split once at the end
d[key] = d[key] + int(value) if key in d else int(value)
You can do it like this:
from csv import DictReader, DictWriter
map_dic = dict()
with open('test1.csv', 'r') as fr:
csv_reader = DictReader(fr, delimiter=',')
for line in csv_reader:
key = '{}_{}_{}'.format(line['party'], line['cp'], line['qualifier'])
if key not in map_dic.keys():
map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
else:
map_dic[key]['amount'] = map_dic[key]['amount'] + int(line['amount'])
with open('test2.csv', 'w') as csvfile:
writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
writer.writeheader()
for key, data in map_dic.items():
writer.writerow(data)
I'm working on a Python script that scrapes data from an Excel doc, then writes the output to a .csv.
I was able to grab the data and get it to write to the .csv, but all of the data goes into the first column.
I need the bar data to go into the 4th and the foo to go into the 5th column, so I tried to use csv.reader to select the row, and this runs without error but doesn't actually write to the .csv file.
Here's my code:
import xlrd
import csv
###Grab the data
def get_row_values(workSheet, row):
to_return = []
num_cells = myWorksheet.ncols - 1
curr_cell = -1
while curr_cell < num_cells:
curr_cell += 1
cell_value = myWorksheet.cell_value(row, curr_cell)
to_return.append(cell_value)
return to_return
file_path = 'map_test.xlsx'
output = []
output_bar = []
output_foo = []
myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('Sheet1')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
print len(column_names)
while curr_row < num_rows:
curr_row += 1
row = myWorksheet.row(curr_row)
this_row = get_row_values(myWorksheet, curr_row)
x = 0
while x <len(this_row):
if this_row[x] == 'x':
output.append([this_row[0], column_names[x]])
output_bar.append([column_names[x]])
output_foo.append([this_row[0]])
print output
myData = [["number", "name", "version", "bar",
"foo"]]
##### Next section is the code in question, it
####doesn't error out, but won't write to the .csv######
myFile = open("test123.csv", "w")
writer = csv.writer(myFile)
with open('test123.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for row in reader:
row[5] = myFile.readline()
writer.writerows(output_foo)
row[4] = myFile.readline()
writer.writerows(outpu_bar)
#####This successfully writes to the csv, but
#####all data to first column#####
# myFile = open('test123.csv', 'w')
# with myFile:
# writer = csv.writer(myFile)
# writer.writerows(myData)
# #writer.writerows(output)
# writer.writerows(output_foo)
# writer.writerows(output_bar)
x += 1
print ("CSV Written")
I have a .csv file with some data that i would like to change.
It looks like this:
item_name,item_cost,item_priority,item_required,item_completed
item 1,11.21,2,r
item 2,411.21,3,r
item 3,40.0,1,r,c
My code runs most of what i need but i am unsure of how to write back on my .csv to produce this result
item_name,item_cost,item_priority,item_required,item_completed
item 1,11.21,2,x
item 2,411.21,3,r
item 3,40.0,1,r,c
My code:
print("Enter the item number:")
line_count = 0
marked_item = int(input())
with open("items.csv", 'r') as f:
reader = csv.DictReader(f, delimiter=',')
for line in reader:
if line["item_required"] == 'r':
line_count += 1
if marked_item == line_count:
new_list = line
print(new_list)
for key, value in new_list.items():
if value == "r":
new_list['item_required'] = "x"
print(new_list)
with open("items.csv", 'a') as f:
writer = csv.writer(f)
writer.writerow(new_list.values())
There are several problems here
you're using a DictReader, which is good to read data, but not as good to read and write data as the original file, since dictionaries do not ensure column order (unless you don't care, but most of the time people don't want columns to be swapped). I just read the title, find the index of the column title, and use this index in the rest of the code (no dicts = faster)
when you write you append to the csv. You have to delete old contents, not append. And use newline='' or you get a lot of blank lines (python 3) or "wb" (python 2)
when you read, you need to store all values, not only the one you want to change, or you won't be able to write back all the data (since you're replacing the original file)
when you modify, you do overcomplex stuff I just replaced by a simple replace in list at the given index (after all you want to change r to x at a given row)
Here's the fixed code taking all aforementioned remarks into account
EDIT: added the feature you request after: add a c after x if not already there, extending the row if needed
import csv
line_count = 0
marked_item = int(input())
with open("items.csv", 'r') as f:
reader = csv.reader(f, delimiter=',')
title = next(reader) # title
idx = title.index("item_required") # index of the column we target
lines=[]
for line in reader:
if line[idx] == 'r':
line_count += 1
if marked_item == line_count:
line[idx] = 'x'
# add 'c' after x (or replace if column exists)
if len(line)>idx+1: # check len
line[idx+1] = 'c'
else:
line.append('c')
lines.append(line)
with open("items.csv", 'w',newline='') as f:
writer = csv.writer(f,delimiter=',')
writer.writerow(title)
writer.writerows(lines)
Using pandas:
import pandas as pd
df = pd.read_csv("items.csv")
print("Enter the item number:")
marked_item = int(input())
df.set_value(marked_item - 1, 'item_required', 'x')
# This is the extra feature you required:
df.set_value(marked_item - 1, 'item_completed', 'c')
df.to_csv("items.csv", index = False)
Result when marked_item = 1:
item_name,item_cost,item_priority,item_required,item_completed
item 1,11.21,2,x,c
item 2,411.21,3,r,
item 3,40.0,1,r,c
Note that according to RFC4180 you should keep the trailing commas.
I guess this should do the trick:
Open a file which can read and written to update it (use "+r" for that)
instead of opening it again write it right there using csvfilewriter, which we create at the start.
file.py
import csv
fieldnames = ["item_name","item_cost","item_priority","item_required","item_completed"]
csvfile = open("items.csv", 'r+')
csvfilewriter = csv.DictWriter(csvfile, fieldnames=fieldnames,dialect='excel', delimiter=',')
csvfilewriter.writeheader()
print("Enter the item number:")
line_count = 0
marked_item = int(input())
with open("items.csv", 'r') as f:
reader = csv.DictReader(f, delimiter=',')
for line in reader:
if line["item_required"] == 'r':
line_count += 1
if marked_item == line_count:
new_list = line
print(new_list)
for key, value in new_list.items():
if value == "r":
new_list['item_required'] = "x"
print(new_list)
csvfilewriter.writerow(new_list)
If you don't want to update the csv but want to write a new one, below is the code:
import csv
fieldnames = ["item_name","item_cost","item_priority","item_required","item_completed"]
csvfile = open("items_new.csv", 'w')
csvfilewriter = csv.DictWriter(csvfile, fieldnames=fieldnames,dialect='excel', delimiter=',')
csvfilewriter.writeheader()
print("Enter the item number:")
line_count = 0
marked_item = int(input())
with open("items.csv", 'r') as f:
reader = csv.DictReader(f, delimiter=',')
for line in reader:
if line["item_required"] == 'r':
line_count += 1
if marked_item == line_count:
new_list = line
print(new_list)
for key, value in new_list.items():
if value == "r":
new_list['item_required'] = "x"
print(new_list)
csvfilewriter.writerow(new_list)
else:
csvfilewriter.writerow(line)
I am trying to get my function to work out the average, write it to a file and sort it using python. This is my code:
def average_score(filename):
with open(filename) as Class:
reader = c.reader(Class,delimiter=",")
for row in reader:
people = []
people.append(row[0])
user, *scores = row
average = sum([int(score) for score in scores]) / len(scores)
a = open(filename,"a").writer(Class)
data = [[average]]
a.writerows(data)
people.append(score)
count = count+1
list11.insert(count,people)
sort=sorted(list11, key = o.itemgetter(4), reverse = False)
for eachline in sort:
print( eachline)
csv file:
kieran,3,10,7
ben,4,8,5
ethan,9,1,4
oliver,7,2,3
Something like this should work
def average_score(filename):
averages = {}
with open(filename) as fd:
reader = c.reader(fd, delimiter=",")
for row in reader:
user, *scores = row
# the user has no score
if len(scores) == 0:
continue
averages[user] = sum([int(score) for score in scores]) / len(scores)
sorted_averages = sorted(averages.items(), key = o.itemgetter(1), reverse = False)
# This writes the average to the file, remove if necessary
with open(filename, 'a') as fd:
for item in sorted_averages:
fd.write("{}: {}\n".format(item[0], item[1]))
# This prints out to screen, remove if necessary
for item in sorted_averages:
print("{}: {}".format(item[0], item[1]))