I am currently struggling with dictionaries of lists.
Given a dictionary like that:
GO_list = {'Seq_A': ['GO:1234', 'GO:2345', 'GO:3456'],
'Seq_B': ['GO:7777', 'GO:8888']}
Now I want to write this dictionary to a CSV file as
follows:
EDIT: I have added the whole function to give more information.
def map_GI2GO(gilist, mapped, gi_to_go):
    """Look up the GO entry for each sequence and save the result as TSV.

    Args:
        gilist: Path to a CSV file; column 0 is a sequence ID and column 1
            the GI assigned to that sequence.
        mapped: Path of the tab-separated output file. One row is written per
            sequence that has a match: the sequence ID followed by its GO
            entries.
        gi_to_go: Path to a comma-separated mapping file; column 0 is a GI
            and column 1 the GO field for that GI.

    Returns:
        None. The result is only written to ``mapped``.

    Only the first mapping row found for a GI is used. Rows with fewer than
    two fields in either input file are ignored.
    """
    with open(gilist, newline='') as infile:
        # read GI list into dictionary: sequence ID -> GI
        GI_list = {
            row[0]: row[1] for row in csv.reader(infile) if len(row) >= 2
        }

    # Index the mapping file once. A csv reader can only be walked a single
    # time, so scanning it again for every sequence would find nothing after
    # the first pass.
    first_go_by_gi = {}
    with open(gi_to_go, newline='') as mapping:
        for row in csv.reader(mapping, delimiter=','):
            if len(row) >= 2:
                # setdefault keeps the first row seen for each GI
                first_go_by_gi.setdefault(row[0], row[1])

    GO_list = defaultdict(list)  # sequence ID -> list of GO entries
    for seq_id, gi in GI_list.items():
        # Compare whole identifiers; two different GIs usually share digits,
        # so a character-level comparison would match almost anything.
        if gi in first_go_by_gi:
            GO_list[seq_id].append(first_go_by_gi[gi])

    # save mapped SeqIDs plus GOs; newline='' lets csv control line endings
    with open(mapped, 'w', newline='') as outfile:
        looked_up_go = csv.writer(
            outfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for key, val in GO_list.items():
            looked_up_go.writerow([key] + val)
However this gives me the following output:
Seq_A,GO:1234;GO2345;GO:3456
Seq_B,GO:7777;GO:8888
I would prefer to have the list entries in separate columns,
separated by a defined delimiter. I am having a hard time getting
rid of the ';' characters, which are apparently separating the list entries.
Any ideas are welcome.
If I were you I would try out itertools izip_longest to match up columns of varying length...
from csv import writer
from itertools import zip_longest

GO_list = {
    'Seq_A': ['GO:1234', 'GO:2345', 'GO:3456'],
    'Seq_B': ['GO:7777', 'GO:8888'],
}

# One column per sequence: the title row holds the keys, and zip_longest pads
# the shorter GO lists with None, which csv writes as an empty cell.
# newline='' lets the csv module control line endings itself.
with open("test.csv", "w", newline="") as csvfile:
    wr = writer(csvfile)
    wr.writerow(list(GO_list))  # writes title row
    wr.writerows(zip_longest(*GO_list.values()))
Related
Even though this might sound like a repeated question, I have not found a solution. I have a large .csv file that looks like this:
prot_hit_num,prot_acc,prot_desc,pep_res_before,pep_seq,pep_res_after,ident,country
1,gi|21909,21 kDa seed protein [Theobroma cacao],A,ANSPV,L,F40,EB
1,gi|21909,21 kDa seed protein [Theobroma cacao],A,ANSPVL,D,F40,EB
1,gi|21909,21 kDa seed protein [Theobroma cacao],L,SSISGAGGGGLA,L,F40,EB
1,gi|21909,21 kDa seed protein [Theobroma cacao],D,NYDNSAGKW,W,F40,EB
....
The aim is to slice this .csv file into multiple smaller .csv files according to the last two columns ('ident' and 'country').
I have used a code from an answer in a previous post and is the following:
csv_contents = []
with open(outfile_path4, newline='') as fin:
    dict_reader = csv.DictReader(fin)  # default delimiter is comma
    fieldnames = dict_reader.fieldnames  # save for writing
    csv_contents.extend(dict_reader)  # gather all rows into a list (of dicts)

group_key = op.itemgetter('prot_desc', 'ident', 'country')

# input to itertools.groupby must be sorted by the grouping value
sorted_csv_contents = sorted(csv_contents, key=group_key)
for groupkey, groupdata in it.groupby(sorted_csv_contents, key=group_key):
    # groupkey is a tuple; '{:s}' rejects tuples on Python 3, so join its parts.
    slice_name = 'slice_{:s}.csv'.format('_'.join(groupkey))
    with open(outfile_path5 + slice_name, 'w', newline='') as fou:
        dict_writer = csv.DictWriter(fou, fieldnames=fieldnames)
        dict_writer.writeheader()  # keep each slice readable on its own
        dict_writer.writerows(groupdata)
However, I need that my output .csv's just contain the column 'pep_seq', a desired output like:
pep_seq
ANSPV
ANSPVL
SSISGAGGGGLA
NYDNSAGKW
What can I do?
Your code was almost correct; it just needed the fieldnames to be set correctly and extrasaction='ignore' to be passed. This tells the DictWriter to write only the fields you specify:
import csv
import itertools
import operator
import os

outfile_path4 = 'input.csv'
# Must be a folder, not a file path: the slice file names are joined onto it.
outfile_path5 = 'my_output_folder'

csv_contents = []
with open(outfile_path4, newline='') as fin:
    dict_reader = csv.DictReader(fin)  # default delimiter is comma
    fieldnames = dict_reader.fieldnames  # full header of the input file
    csv_contents.extend(dict_reader)  # gather all rows into a list (of dicts)

group = ['prot_desc', 'ident', 'country']
group_key = operator.itemgetter(*group)

# input to itertools.groupby must be sorted by the grouping value
sorted_csv_contents = sorted(csv_contents, key=group_key)

os.makedirs(outfile_path5, exist_ok=True)
for groupkey, groupdata in itertools.groupby(sorted_csv_contents, key=group_key):
    # groupkey is a tuple; '{:s}' rejects tuples on Python 3, so join its parts.
    slice_name = 'slice_{:s}.csv'.format('_'.join(groupkey))
    with open(os.path.join(outfile_path5, slice_name), 'w', newline='') as fou:
        # extrasaction='ignore' drops every field that is not listed here.
        dict_writer = csv.DictWriter(
            fou, fieldnames=['pep_seq'], extrasaction='ignore')
        dict_writer.writeheader()
        dict_writer.writerows(groupdata)
This will give you an output csv file containing:
pep_seq
ANSPV
ANSPVL
SSISGAGGGGLA
NYDNSAGKW
The following would output a csv file per country containing only the field you need.
You could always add another step to group by the second field you need I think.
import csv
from collections import defaultdict

# Maps each country value to the list of pep_seqs found for that country.
csv_rows_by_country = defaultdict(list)

with open('in.csv', newline='') as csv_in:
    csv_reader = csv.reader(csv_in)
    # Skip the header row so it is not written out as a country of its own.
    next(csv_reader, None)
    for row in csv_reader:
        # column 7 is the country, column 4 the pep_seq
        csv_rows_by_country[row[7]].append(row[4])

for country, pep_seqs in csv_rows_by_country.items():
    # create a csv output file for each country and write the pep_seqs into it.
    with open('out_%s.csv' % (country, ), 'w', newline='') as csv_out:
        csv_writer = csv.writer(csv_out)
        csv_writer.writerows([pep_seq] for pep_seq in pep_seqs)
I have a CSV file whose columns hold specific values that I read into specific places in a dictionary; each row is a separate instance of data that makes up one full dictionary. I read a row in and then use its data to compute certain values, process some of the inputs, etc., before moving on to the next row. My question is: if I have a header that specifies the names of the columns (Key1 versus Key 3A, etc.), can I use that information to avoid the somewhat drawn-out code I am currently using (below)?
# Text mode already translates every newline style on Python 3; the 'U' flag
# is rejected from Python 3.11 on. newline='' is what the csv module expects.
with open(input_file, newline='') as controlFile:
    reader = csv.reader(controlFile)
    next(reader, None)  # skip the headers
    for row in reader:
        # Grabbing all the necessary inputs
        inputDict = {
            "key1": row[0],
            "key2": row[1],
            "key3": {
                "A": row[2],
                "B": row[3],
                "C": row[4],
                "D": row[5],
                "E": row[6],
            },
            "Key4": {
                "F": row[7],
                "G": float(row[8]),  # the only numeric input
                "H": row[9],
            },
        }
If you use a DictReader, you can improve your code a bit:
Create an object which operates like a regular reader but maps the
information read into a dict whose keys are given by the optional
fieldnames parameter. The fieldnames parameter is a sequence whose
elements are associated with the fields of the input data in order.
These elements become the keys of the resulting dictionary. If the
fieldnames parameter is omitted, the values in the first row of the
csvfile will be used as the fieldnames.
So, if we utilize that:
import csv
import string

results = []
# (sub-key letter, column position) pairs:
# columns 2-6 feed key3 as A-E, columns 7-9 feed key4 as F-H.
mappings = [
    [(string.ascii_uppercase[i - 2], i) for i in range(2, 7)],
    [(string.ascii_uppercase[i - 2], i) for i in range(7, 10)],
]

with open(input_file, newline='') as control_file:
    reader = csv.DictReader(control_file)
    # DictReader rows are keyed by header name, not by position, so the
    # column positions in `mappings` are translated through the header.
    names = reader.fieldnames or []
    for row in reader:
        row_data = {}
        row_data['key1'] = row['key1']
        row_data['key2'] = row['key2']
        row_data['key3'] = {k: row[names[v]] for k, v in mappings[0]}
        row_data['key4'] = {k: row[names[v]] for k, v in mappings[1]}
        results.append(row_data)
yes you can.
import csv

# `infile` holds the path; the open handle gets its own name so the path is
# not rebound to a file object that is closed once the block ends.
with open(infile, newline='') as handle:
    reader = csv.DictReader(handle)  # first row supplies the dict keys
    for row in reader:
        print(row)
Take a look at this piece of code.
# The first row supplies the field names. The next() builtin works on both
# Python 2.6+ and Python 3, unlike the .next() method.
fields = next(csv_data)
for row in csv_data:
    # pair every value with its column name
    parsed_data.append(dict(zip(fields, row)))
I have a csv file whose structure is like this:
Year-Sem,Course,Studentid,Score
201001,CS301,100,363
201001,CS301,101,283
201001,CS301,102,332
201001,CS301,103,254
201002,CS302,101,466
201002,CS302,102,500
Here each year is divided into two semesters - 01 (for fall) and 02 (for spring) - and the data covers the years 2008 through 2014 (a total of 14 semesters). What I want to do is form a dictionary where course and studentid become the key and their respective scores, ordered by year-sem, become the values. So the output should be something like this for each student:
[(studentid,course):(year-sem1 score,year-sem2 score,...)]
I first tried to make a dictionary of [(studentid,course):(score)] using this code but I get error as IndexError: list index out of range:
# Attempt from the question: build a (studentid, course) -> score mapping from
# file1.csv and write it to file2.csv. Kept verbatim because the surrounding
# text discusses the errors it produces.
# NOTE(review): csv.excel_tab splits on tab characters, so each comma-separated
# line comes back as a single field and rows[2] below raises IndexError.
# NOTE(review): the header line is not skipped, so it would be treated as data.
with open('file1.csv', mode='rU') as infile:
reader = csv.reader(infile,dialect=csv.excel_tab)
with open('file2.csv', mode='w') as outfile:
writer = csv.writer(outfile)
# NOTE(review): a dict comprehension keeps one value per key, so a repeated
# (studentid, course) pair retains only its last score.
mydict = {(rows[2],rows[1]): rows[3] for rows in reader}
# NOTE(review): iterating a dict yields its keys, so only the keys are written.
writer.writerows(mydict)
When I was not using dialect=csv.excel_tab and rU then I was getting error as _csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?.
How can I resolve this error and form the dictionary with structure [(studentid,course):(year-sem1 score,year-sem2 score,...)] that I had mentioned in my post above?
The dialect you've chosen seems to be wrong. csv.excel_tab uses the tabulator character as delimiter. For your data, the default dialect should work.
You got the error message about newlines earlier because of the missing U in the rU mode.
# Python 3 text mode already handles every newline style, and the 'U' flag is
# rejected from Python 3.11 on. newline='' is what the csv module expects.
with open(r"test.csv", newline="") as csv_file:
    reader = csv.reader(csv_file)  # default dialect: comma-separated
    for row in reader:
        print(row)
This example seems to work for me (Python 3).
If you have repeating keys you need to store the values in some container, if you want the data ordered you will need to use an OrderedDict:
import csv
from collections import OrderedDict

# newline='' on both files lets the csv module control line endings itself.
with open("in.csv", newline="") as infile, \
        open('file2.csv', mode='w', newline="") as outfile:
    d = OrderedDict()
    reader, writer = csv.reader(infile), csv.writer(outfile)
    next(reader, None)  # skip header (tolerates an empty file)
    # choose whatever column names you want
    writer.writerow(["id-crse", "score"])
    # unpack the values from each row
    for yr, cre, stid, scr in reader:
        # use id and course as keys and append scores
        d.setdefault("{} {}".format(stid, cre), []).append(scr)
    # iterate over the dict keys and values and write each new row
    for k, v in d.items():
        writer.writerow([k] + v)
Which will give you something like:
id-crse,score
100 CS301,363
101 CS301,283
102 CS301,332
103 CS301,254
101 CS302,466
102 CS302,500
In your own code you would only store the last value for the key, you also only write the keys using writer.writerows(mydict) as you are just iterating over the keys of the dict, not the keys and values. If the data is not all in chronological order you will have to call sorted on the reader object using itemgetter:
# Column 0 is Year-Sem, so sorting on it yields the rows in chronological order.
for yr, cre, stid, scr in sorted(reader, key=operator.itemgetter(0)):
    ...
I have a csv file like this:
pos,place
6696,266835
6698,266835
938,176299
940,176299
941,176299
947,176299
948,176299
949,176299
950,176299
951,176299
770,272944
2751,190650
2752,190650
2753,190650
I want to convert it to a dictionary like the following:
{266835:[6696,6698],176299:[938,940,941,947,948,949,950,951],190650:[2751,2752,2753]}
And then, fill the missing numbers in the range in the values:
{266835:[6696,6697,6698],176299:[938,939,940,941,942,943,944,945,946,947,948,949,950,951],190650:[2751,2752,2753]}
So far I have tried to build the dictionary using the solution suggested here, but it overwrites the old value with the new one.
Any help would be great.
Here is a function that i wrote for converting csv2dict
def csv2dict(filename):
    """Read a two-column CSV file into a dictionary.

    The first line is treated as a header and skipped. Every remaining row
    contributes ``second column -> first column``. When a key occurs on
    several rows, the value from the last such row wins.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        dict mapping second-column strings to first-column strings.
    """
    import csv

    with open(filename, newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader, None)  # ignore first line
        # Rows with fewer than two fields (e.g. blank lines) are skipped
        # instead of raising IndexError.
        return {rows[1]: rows[0] for rows in reader if len(rows) >= 2}
Easiest is to use collections.defaultdict() with a list:
import csv
from collections import defaultdict

data = defaultdict(list)

with open(inputfilename, newline='') as infh:
    reader = csv.reader(infh)
    next(reader, None)  # skip the header
    for col1, col2 in reader:
        data[col2].append(int(col1))
        if len(data[col2]) > 1:
            # Fill the gaps between the smallest and largest value seen so far.
            # list(...) keeps the value appendable: on Python 3 a bare range()
            # is not a list and has no append().
            data[col2] = list(range(min(data[col2]), max(data[col2]) + 1))
This also expands the ranges on the fly as you read the data.
Based on what you have tried -
import csv
from collections import defaultdict

arch_dict = defaultdict(list)

# open archive reader; the with-block closes the file when reading is done
with open("myfile.csv", newline="") as myFile:
    archive = csv.reader(myFile, delimiter=',')
    next(archive, None)  # skip the header row
    for row in archive:
        # group the first-column values under their second-column key
        arch_dict[row[1]].append(row[0])

print(arch_dict)
print arch_dict
I have a text file containing key-value pairs, with the last two key-value pairs containing JSON-like objects that I would like to split out into columns and write with the other values, using the keys as column headings. The first three rows of the data file input.txt look like this:
InnerDiameterOrWidth::0.1,InnerHeight::0.1,Length2dCenterToCenter::44.6743867864386,Length3dCenterToCenter::44.6768028159989,Tag::<NULL>,{StartPoint::7858.35924983374[%2C]1703.69341358077[%2C]-3.075},{EndPoint::7822.85045874375[%2C]1730.80294308742[%2C]-3.53962362760298}
InnerDiameterOrWidth::0.1,InnerHeight::0.1,Length2dCenterToCenter::57.8689351603823,Length3dCenterToCenter::57.8700464193429,Tag::<NULL>,{StartPoint::7793.52927597915[%2C]1680.91224357457[%2C]-3.075},{EndPoint::7822.85045874375[%2C]1730.80294308742[%2C]-3.43363070193163}
InnerDiameterOrWidth::0.1,InnerHeight::0.1,Length2dCenterToCenter::68.7161350545728,Length3dCenterToCenter::68.7172034962765,Tag::<NULL>,{StartPoint::7858.35924983374[%2C]1703.69341358077[%2C]-3.075},{EndPoint::7793.52927597915[%2C]1680.91224357457[%2C]-3.45819643838485}
and we eventually came up with something that worked, but there must be a much better way:
import csv

with open('input.txt', newline='') as fin, \
        open('output.csv', 'w', newline='') as fout:
    reader = csv.reader(fin)
    writer = csv.writer(fout)
    # Tracked explicitly so the header is still written when the file starts
    # with blank lines.
    header_written = False
    for line in reader:
        mysplit = [item.split('::') for item in line if item.strip()]
        if not mysplit:  # blank line
            continue
        keys, vals = zip(*mysplit)
        # The last two values look like 'x[%2C]y[%2C]z}': drop the closing
        # brace, then split on the encoded comma.
        start_vals = vals[-2].rstrip('}').split('[%2C]')
        end_vals = vals[-1].rstrip('}').split('[%2C]')
        a = list(keys[0:-2])
        a.extend(['start1', 'start2', 'start3', 'end1', 'end2', 'end3'])
        b = list(vals[0:-2])
        b.extend([start_vals[0], start_vals[1], start_vals[2]])
        b.extend([end_vals[0], end_vals[1], end_vals[2]])
        if not header_written:
            # first data line: write header
            writer.writerow(a)
            header_written = True
        writer.writerow(b)
which produces the output file output.csv that looks like this
InnerDiameterOrWidth,InnerHeight,Length2dCenterToCenter,Length3dCenterToCenter,Tag,start1,start2,start3,end1,end2,end3
0.1,0.1,44.6743867864386,44.6768028159989,<NULL>,7858.35924983374,1703.69341358077,-3.075,7822.85045874375,1730.80294308742,-3.53962362760298
0.1,0.1,57.8689351603823,57.8700464193429,<NULL>,7793.52927597915,1680.91224357457,-3.075,7822.85045874375,1730.80294308742,-3.43363070193163
0.1,0.1,68.7161350545728,68.7172034962765,<NULL>,7858.35924983374,1703.69341358077,-3.075,7793.52927597915,1680.91224357457,-3.45819643838485
We don't want to write code like this in the future.
What is the best way to read data like this?
I'd use:
from itertools import chain
import csv
# Columns whose value expands into several output columns, with the names of
# those output columns.
_header_translate = {
    'StartPoint': ('start1', 'start2', 'start3'),
    'EndPoint': ('end1', 'end2', 'end3'),
}


def header(col):
    """Return the output column name(s) for one raw ``key::value`` column.

    Surrounding braces are stripped and everything from the first '::' on is
    discarded. Keys listed in ``_header_translate`` are replaced by their
    expanded names; any other key is returned as a one-element tuple.
    """
    name = col.strip('{}').split('::', 1)[0]
    return _header_translate.get(name, (name,))
def cleancolumn(col):
    """Return the value(s) of one raw ``key::value`` column as a list.

    Surrounding braces are stripped, everything up to the first '::' is
    discarded, and the remainder is split on the encoded comma '[%2C]'.
    A column without '::' has no value and yields a single empty string
    instead of raising IndexError.
    """
    _key, _sep, value = col.strip('{}').partition('::')
    return value.split('[%2C]')
def chainedmap(func, row):
    """Apply ``func`` to every item of ``row`` and flatten the results.

    ``func`` must return an iterable for each item; the items of all those
    iterables are returned, in order, as one flat list.
    """
    return list(chain.from_iterable(map(func, row)))
with open('input.txt', newline='') as fin, \
        open('output.csv', 'w', newline='') as fout:
    reader = csv.reader(fin)
    writer = csv.writer(fout)
    header_written = False
    for row in reader:
        if not row:  # csv.reader yields an empty list for a blank line
            continue
        if not header_written:
            # first data row: write the header built from the same columns
            writer.writerow(chainedmap(header, row))
            header_written = True
        writer.writerow(chainedmap(cleancolumn, row))
The cleancolumn function takes any of your columns and returns a list (possibly with only one value) after removing the braces, removing everything before the first :: and splitting on the embedded 'comma'. By using itertools.chain.from_iterable() we turn the series of lists generated from the columns into one list again for the csv writer.
When handling the first line we generate one header row from the same columns, replacing the StartPoint and EndPoint headers with the 6 expanded headers.