I have a problem appending data for missing rows in a csv file: I am reading rows from a csv file for each customer and appending their data to lists. Each customer needs to have the same IDs that are highlighted in green in the example image. If the next customer doesn't have rows for all of the needed IDs, I still need to append 0 values to the lists for those missing rows, so the customer highlighted in yellow ends up with the same number of values appended to the data lists as the one in green.
I am trying to read each row and compare its ID with the list of all possible IDs that I created, but I always get stuck on the first ID, and I am not sure whether it is the right approach to re-read the previous row until its ID equals the ID from the list of possible IDs (I do this to add the missing row's data to the list). Please let me know if you have any suggestions.
Note: if we take into consideration only the column with IDs, for these two customers I would like the list to look like this: list_with_ids = [410, 409, 408, 407, 406, 405, 403, 402, **410, 409, 408, 407, 406, 405, 403, 402**]. So I am looking for a way, once I am on row 409 of the yellow customer, to first append the needed ID 410 and only then 409, and so forth, and likewise to append the two missing IDs 403 and 402 at the end.
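To make the intent concrete, here is a toy sketch of the padding behaviour I am after (the names and values below are made up for illustration, not my real data):
all_ids = [410, 409, 408, 407, 406, 405, 403, 402]

# ids that actually have rows in the csv, per customer
# (the "yellow" customer is missing 410, 403 and 402)
rows_per_customer = {
    'cust_green':  {410, 409, 408, 407, 406, 405, 403, 402},
    'cust_yellow': {409, 408, 407, 406, 405},
}

list_with_ids = []
list_quantity = []
for customer, present_ids in rows_per_customer.items():
    for needed_id in all_ids:
        list_with_ids.append(needed_id)
        if needed_id in present_ids:
            list_quantity.append(1)   # placeholder for the real quantity from that row
        else:
            list_quantity.append(0)   # missing row -> pad with 0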
Code:
def write_data(workbook):
    [...]
    # Lists.
    list_cust = []
    list_quantity = []  # from Some_data columns
    # Get the start row in the csv file.
    for row in range(worksheet.nrows):
        base_id = str(410)
        value = worksheet.cell(row, 1).value
        start = str(value)
        if base_id[0] == start[0]:
            num_of_row_for_id = row
    # Append the first id.
    first_cust = str(worksheet.cell(num_of_row_for_id, 0).value)
    list_cust.append(first_cust)
    # Needed to count id's.
    count = 0
    # List with all needed id's for each customer.
    # instead of ... - all id's in green from the image.
    all_ids = [....]
    # Get data.
    for row in range(worksheet.nrows):
        next_id = str(worksheet.cell(num_of_row_for_id, 1).value)
        cust = str(worksheet.cell(num_of_row_for_id, 0).value)
        # Append id to the list.
        list_cust.append(cust)
        # Needed to separate rows for each customer.
        if list_cust[len(list_cust)-1] == list_cust[len(list_cust)-2]:
            # Get data: I read columns to get data.
            # Let's say I read col 4 to 21.
            for col_num in range(3, 20):
                # Here is the problem: ############################
                if next_id != all_ids[count]:
                    list_quantity.append(0)
                if next_id == all_ids[count]:
                    qty = worksheet.cell(num_of_row_for_id, col_num).value
                    list_quantity.append(qty)
        # Get the next row in reverse order.
        num_of_row_for_id -= 1
        # Increment count for id's index.
        if list_cust[len(list_cust)-1] == list_cust[len(list_cust)-2]:
            # 8 possible id's.
            if count < 7:
                count += 1
            else:
                count = 0
Consider the following data wrangling with list comprehensions and loops, using the following input containing random data columns:
Input Data
# Cust ID Data1 Data2 Data3 Data4 Data5
# 2011 62,404 0.269101238 KPT 0.438881697 UAX 0.963170513
# 2011 62,405 0.142397746 XYD 0.51668728 PTQ 0.761695425
# 2011 62,406 0.782342616 QCN 0.259141256 FNX 0.870971924
# 2011 62,407 0.221750017 EIU 0.358439487 MAN 0.13633062
# 2011 62,408 0.097509568 CRU 0.410058705 BFK 0.680228327
# 2011 62,409 0.322871333 LAC 0.489425167 GUX 0.449476844
# 919 62,403 0.371461633 PUR 0.626146074 KWX 0.525711736
# 919 62,404 0.384859932 AJZ 0.223408599 JSU 0.914916663
# 919 62,405 0.020630503 SFY 0.260778598 VUU 0.213559498
# 919 62,406 0.952425138 EBI 0.59595738 ZYU 0.283794413
# 919 62,407 0.410368534 BTT 0.252698401 FFY 0.41080646
# 919 62,408 0.553390336 GMA 0.846309022 BIN 0.049852419
# 919 62,409 0.193437955 NBB 0.877311494 XQX 0.080656637
Python code
import csv

i = 0
data = []

# READ CSV AND CAPTURE HEADERS AND DATA
with open('Input.csv', 'r') as f:
    rdr = csv.reader(f)
    for line in rdr:
        if i == 0:
            headers = line
        else:
            line[1] = int(line[1].replace(',', ''))
            data.append(line)
        i += 1

# CREATE NEEDED LISTS
cust_list = list(set([i[0] for i in data]))
id_list = [62402, 62403, 62404, 62405, 62406, 62407, 62408, 62409, 62410]

# CAPTURE MISSING IDS BY CUSTOMER
for c in cust_list:
    currlist = [d[1] for d in data if d[0] == c]
    missingids = [i for i in id_list if i not in currlist]
    for m in missingids:
        data.append([c, m, '', '', '', '', ''])

# WRITE DATA TO NEW CSV IN SORTED ORDER
with open('Output.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(headers)
    for c in cust_list:
        for i in sorted(id_list, reverse=True):
            for d in data:
                if d[0] == c and d[1] == i:
                    wtr.writerow(d)
Output Data
Also consider Python third-party modules such as pandas, the data analysis package, or even an SQL solution using pyodbc, since Windows' built-in Jet/ACE SQL engine can query CSV files directly.
You will notice, below and in the previous solution, that quite a bit of handling is needed to remove the thousands separators in the ID column, since the modules read those values as strings at first. If you remove such commas from the original csv file, you can reduce the lines of code.
Pandas (with left merge on two dataframes)
import pandas as pd

df = pd.read_csv('Input.csv')

cust_list = df['Cust'].unique()
id_list = [62402, 62403, 62404, 62405, 62406, 62407, 62408, 62409, 62410]

ids = pd.DataFrame({'Cust': [int(c) for i in id_list for c in cust_list],
                    'ID': [int(i) for i in id_list for c in cust_list]})

df['ID'] = df['ID'].str.replace(',', '').astype(int)

df = ids.merge(df, on=['Cust', 'ID'], how='left').\
         sort_values(['Cust', 'ID'], ascending=[True, False])

df.to_csv('Output_pandas.csv', index=False)
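As an aside that is not part of the answer above: read_csv can strip the thousands separators itself, which would make the str.replace line unnecessary. A sketch on the same Input.csv:
import pandas as pd

# thousands=',' parses "62,404" directly as the integer 62404
df = pd.read_csv('Input.csv', thousands=',')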
PyODBC (works only for Windows machines using left join on two csv files)
import csv
import pyodbc

conn = pyodbc.connect(r'Driver=Microsoft Access Text Driver (*.txt, *.csv);' + \
                      r'DBQ=C:\Path\To\CSV\Files;Extensions=asc,csv,tab,txt;',
                      autocommit=True)
cur = conn.cursor()

cust_list = [i[0] for i in cur.execute("SELECT DISTINCT c.Cust FROM Input.csv c")]
id_list = [62402, 62403, 62404, 62405, 62406, 62407, 62408, 62409, 62410]
cur.close()

# Build the scaffold csv with every (Cust, ID) combination.
with open('ID_list.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(['Cust', 'ID'])
    for item in [[int(c), int(i)] for c in cust_list for i in id_list]:
        wtr.writerow(item)

# Re-write the input without the thousands separators in the ID column.
i = 0
with open('Input.csv', 'r') as f1, open('Input_without_commas.csv', 'w') as f2:
    rdr = csv.reader(f1)
    wtr = csv.writer(f2, lineterminator='\n')
    for line in rdr:
        if i > 0:
            line[1] = int(line[1].replace(',', ''))
        wtr.writerow(line)
        i += 1

strSQL = "SELECT i.Cust, i.ID, c.Data1, c.Data2, c.Data3, c.Data4, c.Data5 " +\
         " FROM ID_list.csv i" +\
         " LEFT JOIN Input_without_commas.csv c" +\
         " ON i.Cust = c.Cust AND i.ID = c.ID" +\
         " ORDER BY i.Cust, i.ID DESC"

cur = conn.cursor()
with open('Output_sql.csv', 'w') as f:
    wtr = csv.writer(f, lineterminator='\n')
    wtr.writerow(['Cust', 'ID', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5'])
    for i in cur.execute(strSQL):
        wtr.writerow(i)
cur.close()
conn.close()
Output (for both above solutions)
Related
I have two csv files and I need Python code that does a vlookup: match the values, take only the needed column, and create a new csv file. I know it can be done with pandas, but I need to do this without pandas or any 3rd-party tools.
INPUT 1 csv file
ID NAME SUBJECT
1 Raj CS
2 Allen PS
3 Bradly DP
4 Tim FS
INPUT 2 csv file
ID COUNTRY TIME
2 USA 1:00
4 JAPAN 14:00
1 ENGLAND 5:00
3 CHINA 0.00
OUTPUT csv file
ID NAME SUBJECT COUNTRY
1 Raj CS ENGLAND
2 Allen PS USA
3 Bradly DP CHINA
4 Tim FS JAPAN
There's probably a more efficient way to do it, but basically: create a nested dictionary (using the ID as the key) with the other column names and their values under the ID key. Then, as you iterate through each file, the dictionary is updated on the ID key.
Finally, put the rows together into a list and write them to file:
input_files = ['C:/test/input_1.csv', 'C:/test/input_2.csv']
lookup_column_name = 'ID'

output_dict = {}
for file in input_files:
    file = open(file, 'r')
    header = {}
    # Read each line in the csv
    for idx, line in enumerate(file.readlines()):
        # If it's the first line, store as the header
        if idx == 0:
            header = line.split(',')
            # Get the index value of the lookup column from the list of headers
            header_dict = {idx: x.strip() for idx, x in enumerate(header)}
            lookup_column_idx = dict((v, k) for k, v in header_dict.items())[lookup_column_name]
            continue

        line_split = line.split(',')
        # Initialize the dictionary by lookup column
        if line_split[lookup_column_idx] not in output_dict.keys():
            output_dict[line_split[lookup_column_idx]] = {}
        # If not the lookup column, then add the other column and data to the dictionary
        for idx, value in enumerate(line_split):
            if idx != lookup_column_idx:
                output_dict[line_split[lookup_column_idx]].update({header_dict[idx]: value})

# Create a list of the rows that will be written to file under the correct columns
rows = []
for k, v in output_dict.items():
    header = [lookup_column_name] + list(v.keys())
    row = [k] + [output_dict[k][x].strip() for x in header if x != lookup_column_name]
    row = ','.join(row) + '\n'
    rows.append(row)

# Final list of rows, beginning with the header
output_lines = [','.join(header) + '\n'] + rows

# Writing to file
output = open('C:/test/output.csv', 'w')
output.writelines(output_lines)
output.close()
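For what it's worth, and not part of the answer above, the same idea can be written a little more compactly by letting csv.DictReader and csv.DictWriter handle the header bookkeeping (the file paths and the ID column name are taken from the snippet above):
import csv

input_files = ['C:/test/input_1.csv', 'C:/test/input_2.csv']
lookup_column_name = 'ID'

merged = {}        # ID -> combined row dict
fieldnames = []    # output columns, in the order they are first seen

for path in input_files:
    with open(path, newline='') as f:
        reader = csv.DictReader(f)
        for name in reader.fieldnames:
            if name not in fieldnames:
                fieldnames.append(name)
        for row in reader:
            # merge the columns of every file under the shared ID key
            merged.setdefault(row[lookup_column_name], {}).update(row)

with open('C:/test/output.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(merged.values())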
To do this without pandas (and assuming you know the structure of your data and that it fits in memory), you can iterate through the csv file and store the results in a dictionary whose entries map each ID to the other information you want to keep.
You can do this for both csv files and then join them manually by iterating over the keys of the dictionary.
import csv

input1 = r'.\file1.csv'
input2 = r'.\file2.csv'

with open(input1, 'r', encoding='utf-8-sig') as inputlist:
    with open(input2, 'r', encoding='utf-8-sig') as inputlist1:
        with open(r'.\output.csv', 'w', newline='', encoding='utf-8-sig') as output:
            reader = csv.reader(inputlist)
            reader2 = csv.reader(inputlist1)
            writer = csv.writer(output)

            # Build a lookup table from the second file: first column -> second column
            dict1 = {}
            for xl in reader2:
                dict1[xl[0]] = xl[1]

            # Append the matching value (or "N/A") to each row of the first file
            for i in reader:
                if i[2] in dict1:
                    i.append(dict1[i[2]])
                    writer.writerow(i)
                else:
                    i.append("N/A")
                    writer.writerow(i)
So I have a file that looks like this:
name,number,email,job1,job2,job3,job4
I need to convert it to one that looks like this:
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
How would I do this in Python?
As said in a comment, you can use pandas to read, write, and manipulate a csv file.
Here is one example of how you can solve your problem with pandas in Python:
import pandas as pd
# df = pd.read_csv("filename.csv") # read csv file from disk
# comment out the line below when reading from disk
df = pd.DataFrame([['ss','0152','ss#','student','others']],columns=['name','number','email','job1','job2'])
print(df)
The output of this line is:
name number email job1 job2
0 ss 0152 ss# student others
Now we need to know how many columns there are:
x = len(df.columns)
print(x)
It stores the number of columns in x:
5
Now let's create an empty DataFrame with columns = [name, number, email, job]:
c = pd.DataFrame(columns=['name','number','email','job'])
print(c)
output:
Columns: [name, number, email, job]
Index: []
Now we loop from column index 3 to the last column and concatenate each slice with our empty DataFrame:
for i in range(3, x):
    df1 = df.iloc[:, 0:3].copy()   # take the first 3 columns
    df2 = df.iloc[:, [i]].copy()   # take the i-th column
    df1['job'] = df2               # add the i-th column to df1 as 'job'
    c = pd.concat([df1, c])        # concat df1 and c
print(c)
output:
name number email job
0 ss 0152 ss# others
0 ss 0152 ss# student
DataFrame c now has your desired output. You can save it using:
c.to_csv('output.csv')
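As an aside that is not part of the answer above: pandas also has a built-in wide-to-long reshape, melt, which collapses the loop into a single call. A sketch on the same example frame:
import pandas as pd

df = pd.DataFrame([['ss', '0152', 'ss#', 'student', 'others']],
                  columns=['name', 'number', 'email', 'job1', 'job2'])

# melt turns every jobN column into its own row, keeping name/number/email fixed
long_df = pd.melt(df, id_vars=['name', 'number', 'email'], value_name='job')
long_df = long_df.drop(columns=['variable'])  # drop the job1/job2 label column
print(long_df)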
Let's assume this is the dataframe:
import pandas as pd
df = pd.DataFrame(columns=['name','number','email','job1','job2','job3','job4'])
df = df.append({'name':'jon', 'number':123, 'email':'smth#smth.smth', 'job1':'a','job2':'b','job3':'c','job4':'d'},ignore_index=True)
We define a new dataframe:
new_df = pd.DataFrame(columns=['name','number','email','job'])
Now, we loop over the old one to split it based on the jobs. I assume you have 4 jobs to split:
for i, row in df.iterrows():
    for job in range(1, 5):
        job_col = "job" + str(job)
        new_df = new_df.append({'name': row['name'], 'number': row['number'], 'email': row['email'], 'job': row[job_col]}, ignore_index=True)
You can use the csv module and Python's unpacking syntax to get the data from the input file and write it to the output file.
import csv

with open('input.csv', newline='') as infile, open('output.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    # Skip header row, if necessary
    next(reader)
    # Use sequence unpacking to get the fixed variables and
    # an arbitrary number of "jobs".
    for name, number, email, *jobs in reader:
        for job in jobs:
            writer.writerow([name, number, email, job])
Below:
with open('input.csv') as f_in:
    lines = [l.strip() for l in f_in.readlines()]

with open('output.csv', 'w') as f_out:
    for idx, line in enumerate(lines):
        if idx > 0:
            fields = line.split(',')
            for idx in range(3, len(fields)):
                f_out.write(','.join(fields[:3]) + ',' + fields[idx] + '\n')
input.csv
header row
name,number,email,job1,job2,job3,job4
name1,number1,email1,job11,job21,job31,job41
output.csv
name,number,email,job1
name,number,email,job2
name,number,email,job3
name,number,email,job4
name1,number1,email1,job11
name1,number1,email1,job21
name1,number1,email1,job31
name1,number1,email1,job41
I have two csv files:
csv1
csv2
(*note: the headers can differ)
csv1 has a single column and csv2 has 5 columns.
Now, column 1 of csv1 has some matching values in column 2 of csv2.
My concern is how I can write a csv where column 1 of csv1 does not have MATCHING VALUES in column 2 of csv2.
I have attached three files: csv1, csv2, and the expected output.
Expected Output:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
5,jlh,antriskh,ASDA,AD
CSV 1:
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh
CSV 2:
ProfileID,id,name,class ,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD
I tried converting them into dictionaries and matching csv1 keys to csv2 values, but it is not working as expected:
def read_csv1(filename):
    prj_structure = {}
    f = open(filename, "r")
    data = f.read()
    f.close()
    lst = data.split("\n")
    prj = ""
    for i in range(0, len(lst)):
        val = lst[i].split(",")
        if len(val) > 0:
            prj = val[0]
            if prj != "":
                if prj not in prj_structure.keys():
                    prj_structure[prj] = []
                prj_structure[prj].append([val[1], val[2], val[3], val[4]])
    return prj_structure

def read_csv2(filename):
    prj_structure = {}
    f = open(filename, "r")
    data = f.read()
    f.close()
    lst = data.split("\n")
    prj = ""
    for i in range(0, len(lst)):
        val = lst[i].split(",")
        if len(val) > 0:
            prj = val[0]
            if prj != "":
                if prj not in prj_structure.keys():
                    prj_structure[prj] = []
                prj_structure[prj].append([val[0]])
    return prj_structure

csv1_data = read_csv1("csv1.csv")
csv2_data = read_csv2("csv2.csv")

for k, v in csv1_data.items():
    for ks, vs in csv2_data.items():
        if k == vs[0][0]:
            # here it is not working
            sublist = []
            sublist.append(k)
Use the DictReader from the csv package.
import csv

f1 = open('csv1.csv')
csv_1 = csv.DictReader(f1)
f2 = open('csv2.csv')
csv_2 = csv.DictReader(f2)

first_dict = {}
for row in csv_1:
    first_dict[row['name']] = row
f1.close()

f_out = open('output.csv', 'w')
csv_out = csv.DictWriter(f_out, csv_2.fieldnames)
csv_out.writeheader()

for second_row in csv_2:
    if second_row['name'] in first_dict:
        first_row = first_dict[second_row['name']]
        if first_row['id'] != second_row['id']:
            csv_out.writerow(second_row)

f2.close()
f_out.close()
If you have the option, I have always found pandas to be a great tool for importing and manipulating CSV files.
import pandas as pd

# Read in both the CSV files
df_1 = pd.DataFrame(pd.read_csv('csv1.csv'))
df_2 = pd.DataFrame(pd.read_csv('csv2.csv'))

# Iterate over both DataFrames and if any id's from df_2 match
# df_1, remove them from df_2
for num1, row1 in df_1.iterrows():
    for num2, row2 in df_2.iterrows():
        if row1['id'] == row2['id']:
            df_2.drop(num2, inplace=True)

df_2.head()
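As a side note that is not from the answer above, the same filter can usually be written without the nested iterrows loops by using isin; a sketch with the same file and column names:
import pandas as pd

df_1 = pd.read_csv('csv1.csv')
df_2 = pd.read_csv('csv2.csv')

# keep only the rows of df_2 whose id never appears in df_1
df_2 = df_2[~df_2['id'].isin(df_1['id'])]
print(df_2.head())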
For any kind of csv processing, the built-in csv module makes most of the error-prone processing trivial. Given your example values, the following code should produce the desired results. I use comprehensions to do the filtering.
import csv
import io
# example data, as StringIO that will behave like file objects
raw_csv_1 = io.StringIO('''\
id,name
10927,prince
109582,kabir
f546416,rahul
g44674,saini
r7341,antriskh''')
raw_csv_2 = io.StringIO('''\
ProfileID,id,name,class,rollnumber
1,lkha,prince,sfasd,DAS
2,hgfhfk,kabir,AD,AD
3,f546416,rahul,AD,FF
44,g44674,saini,DD,FF
5,jlh,antriskh,ASDA,AD''')
# in your actual data, you would use actual file objects instead, like
# with open('location/of/your/csv_1') as file_1:
# raw_csv_1 = file_1.read()
# with open('location/of/your/csv_2') as file_2:
# raw_csv_2 = file_2.read()
Then we need to transform them into csv.reader objects:
csv_1 = csv.reader(raw_csv_1)
next(csv_1) # consume once to skip the header
csv_2 = csv.reader(raw_csv_2)
header = next(csv_2) # consume once to skip the header, but store it
Last but not least, materialize both readers into lists, collect the ids of the first csv in a set to use as a lookup table, filter the second csv with it, and write it back as 'result.csv' into your file system:
vals_1 = list(csv_1)
vals_2 = list(csv_2)

skip_keys = {id_ for id_, name in vals_1}
result = [row for row in vals_2 if row[1] not in skip_keys]

# at this point, result contains
# [['1', 'lkha', 'prince', 'sfasd', 'DAS'],
#  ['2', 'hgfhfk', 'kabir', 'AD', 'AD'],
#  ['5', 'jlh', 'antriskh', 'ASDA', 'AD']]

with open('result.csv', 'w', newline='') as result_file:
    csv.writer(result_file).writerows([header] + result)
I am new to Python and I prepared a script that will modify the following csv file accordingly:
1) Each row that contains multiple Gene entries separated by the /// such as:
C16orf52 /// LOC102725138 1.00551
should be transformed to:
C16orf52 1.00551
LOC102725138 1.00551
2) The same gene may have different ratio values
AASDHPPT 0.860705
AASDHPPT 0.983691
and we want to keep only the pair with the highest ratio value (delete the pair AASDHPPT 0.860705)
Here is the script I wrote but it does not assign the correct ratio values to the genes:
import csv
import pandas as pd

with open('2column.csv', 'rb') as f:
    reader = csv.reader(f)
    a = list(reader)

gene = []
ratio = []

for t in range(len(a)):
    if '///' in a[t][0]:
        s = a[t][0].split('///')
        gene.append(s[0])
        gene.append(s[1])
        ratio.append(a[t][1])
        ratio.append(a[t][1])
    else:
        gene.append(a[t][0])
        ratio.append(a[t][1])
    gene[t] = gene[t].strip()

newgene = []
newratio = []

for i in range(len(gene)):
    g = gene[i]
    r = ratio[i]
    if g not in newgene:
        newgene.append(g)
        for j in range(i+1, len(gene)):
            if g == gene[j]:
                if ratio[j] > r:
                    r = ratio[j]
        newratio.append(r)

for i in range(len(newgene)):
    print newgene[i] + '\t' + newratio[i]

if len(newgene) > len(set(newgene)):
    print 'missionfailed'
Thank you very much for any help or suggestion.
Try this:
with open('2column.csv') as f:
    lines = f.read().splitlines()

new_lines = {}
for line in lines:
    cols = line.split(',')
    for part in cols[0].split('///'):
        part = part.strip()
        if not part in new_lines:
            new_lines[part] = cols[1]
        else:
            if float(cols[1]) > float(new_lines[part]):
                new_lines[part] = cols[1]

import csv

with open('clean_2column.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for k, v in new_lines.items():
        writer.writerow([k, v])
First of all, if you're importing Pandas, know that you have I/O Tools to read CSV files.
So first, let's import the file that way:
import pandas as pd
df = pd.read_csv('2column.csv')
Then, you can extract the indexes where you have your '///' pattern:
l = list(df[df['Gene Symbol'].str.contains('///')].index)
Then, you can create your new rows:
for i in l:
    for sub in df['Gene Symbol'][i].split('///'):
        df = df.append(pd.DataFrame([[sub, df['Ratio(ifna vs. ctrl)'][i]]], columns=df.columns))
Then, drop the old ones:
df = df.drop(df.index[l])
Then, I'll do a little trick to remove your lowest duplicate values. First, I'll sort them by 'Ratio(ifna vs. ctrl)', then I'll drop all the duplicates but the first one:
df = df.sort('Ratio(ifna vs. ctrl)', ascending=False).drop_duplicates('Gene Symbol', keep='first')
If you want to keep your sorting by Gene Symbol and reset indexes to have simpler ones, simply do:
df = df.sort('Gene Symbol').reset_index(drop=True)
If you want to re-export your modified data to your csv, do:
df.to_csv('2column.csv')
EDIT: I edited my answer to correct syntax errors; I've tested this solution with your csv and it worked perfectly :)
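One caveat that is not from the answer above: DataFrame.sort was deprecated and later removed from pandas, so on recent versions the equivalent calls would be (same column names assumed):
df = df.sort_values('Ratio(ifna vs. ctrl)', ascending=False).drop_duplicates('Gene Symbol', keep='first')
df = df.sort_values('Gene Symbol').reset_index(drop=True)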
This should work.
It uses the dictionary suggestion of Peter.
import csv

with open('2column.csv', 'r') as f:
    reader = csv.reader(f)
    original_file = list(reader)

# gets rid of the header
original_file = original_file[1:]

# create an empty dictionary
genes_ratio = {}

# loop over every row in the original file
for row in original_file:
    gene_name = row[0]
    gene_ratio = row[1]

    # check if /// is in the string; if so, split the string
    if '///' in gene_name:
        gene_names = gene_name.split('///')

        # loop over all the resulting components
        for gene in gene_names:
            # check if the component is in the dictionary
            # if not in dictionary, set value to gene_ratio
            if gene not in genes_ratio:
                genes_ratio[gene] = gene_ratio
            # if in dictionary, compare value in dictionary to gene_ratio
            # if dictionary value is smaller, overwrite value
            elif genes_ratio[gene] < gene_ratio:
                genes_ratio[gene] = gene_ratio
    else:
        if gene_name not in genes_ratio:
            genes_ratio[gene_name] = gene_ratio
        elif genes_ratio[gene_name] < gene_ratio:
            genes_ratio[gene_name] = gene_ratio

# loop over dictionary and print gene names and their ratio values
for key in genes_ratio:
    print key, genes_ratio[key]
I would like to adapt the post here (Parse CSV file and aggregate the values) to sum multiple columns instead of just one.
So for these data:
CITY,AMOUNT,AMOUNT2,AMOUNTn
London,20,21,22
Tokyo,45,46,47
London,55,56,57
New York,25,26,27
How can I get this:
CITY,AMOUNT,AMOUNT2,AMOUNTn
London,75,77,79
Tokyo,45,46,47
New York,25,26,27
I will have several thousand columns eventually, and unfortunately I cannot use the pandas package for this task. Here is the code I have; it just aggregates all three AMOUNT columns into one, which is not what I am after:
from __future__ import division
import csv
from collections import defaultdict

def default_factory():
    return [0, None, None, 0]

reader = csv.DictReader(open('test_in.txt'))
cities = defaultdict(default_factory)

for row in reader:
    headers = [r for r in row.keys()]
    headers.remove('CITY')
    for i in headers:
        amount = int(row[i])
        cities[row["CITY"]][0] += amount
        max = cities[row["CITY"]][1]
        cities[row["CITY"]][1] = amount if max is None else amount if amount > max else max
        min = cities[row["CITY"]][2]
        cities[row["CITY"]][2] = amount if min is None else amount if amount < min else min
        cities[row["CITY"]][3] += 1

for city in cities:
    cities[city][3] = cities[city][0] / cities[city][3]  # calculate mean

with open('test_out.txt', 'wb') as myfile:
    writer = csv.writer(myfile, delimiter="\t")
    writer.writerow(["CITY", "AMOUNT", "AMOUNT2", "AMOUNTn", "max", "min", "mean"])
    writer.writerows([city] + cities[city] for city in cities)
Thank you for any help
Here is one way using itertools.groupby.
import StringIO
import csv
import itertools
data = """CITY,AMOUNT,AMOUNT2,AMOUNTn
London,20,21,22
Tokyo,45,46,47
London,55,56,57
New York,25,26,27"""
# I use StringIO to create a file like object for demo purposes
f = StringIO.StringIO(data)
fieldnames = f.readline().strip().split(',')
key = lambda x: x[0] # the first column will be a grouping key
# rows must be sorted by city before passing to itertools.groupby
rows_sorted = sorted(csv.reader(f), key=key)
outfile = StringIO.StringIO('')
writer = csv.DictWriter(outfile, fieldnames=fieldnames, lineterminator='\n')
writer.writeheader()
for city, rows in itertools.groupby(rows_sorted, key=key):
    # remove city column for aggregation, convert to ints
    rows = [[int(x) for x in row[1:]] for row in rows]
    agg = [sum(column) for column in zip(*rows)]
    writer.writerow(dict(zip(fieldnames, [city] + agg)))
print outfile.getvalue()
# CITY,AMOUNT,AMOUNT2,AMOUNTn
# London,75,77,79
# New York,25,26,27
# Tokyo,45,46,47
Here is how I would do it.
import csv
from StringIO import StringIO
data = '''CITY,AMOUNT,AMOUNT2,AMOUNTn
London,20,21,22
Tokyo,45,46,47
London,55,56,57,99
New York,25,26,27'''
file_ = StringIO(data)
reader = csv.reader(file_)
headers = next(reader)
rows = {}
def add(col1, col2):
    l = len(col1)
    for i, n in enumerate(col2):
        if i >= l:
            col1.extend(col2[i:])
            break
        col1[i] += n
    return col1

for row in reader:
    key = row[0]
    nums = map(int, row[1:])
    if key in rows:
        rows[key] = add(rows[key], nums)
    else:
        rows[key] = map(int, nums)
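The snippet above stops after building the rows dictionary; a minimal sketch of writing the totals back out, reusing the same headers and rows names (the output file name here is only an example):
with open('test_out.csv', 'w') as f_out:
    writer = csv.writer(f_out, lineterminator='\n')
    writer.writerow(headers)
    for city in rows:
        # each value is the list of per-column sums for that city
        writer.writerow([city] + rows[city])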