So I have a Python dictionary with protein sequences and their ids. I wanted to convert that dictionary to a CSV file to upload it as a dataset to fine-tune a transformer. However, when I create the CSV it appears with the dictionary shape (key-value, key-value...).
What I want is the CSV to have one key and, on the next line, its value, repeating that shape consecutively. Is there a way to add a \n or something like that so each key and its value appear on their own lines?
Shape of the dictionary:
{'NavAb:/1126': 'TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVAISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI', 'Shaker:/1656': 'SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIPYFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL', .....
What I want in the CSV:
protein id
protein sequence
protein id
protein sequence
.....
The code I have for the moment:
def parse_file(input_file):
    """Parse a FASTA-format file object into {sequence_id: sequence}.

    input_file: an iterable of lines (e.g. an open file handle).  Header
    lines start with '>'; the lines that follow hold the (possibly wrapped)
    sequence, with '-' gap characters that are stripped out.

    Returns a dict mapping each header (without the leading '>') to its
    concatenated, gap-free sequence.
    """
    parsed_seqs = {}
    curr_seq_id = None
    curr_seq = []
    # BUG FIX: the original iterated the global `newfile` instead of the
    # `input_file` argument, so the parameter was silently ignored.
    for line in input_file:
        line = line.strip()
        line = line.replace('-', '')
        if line.startswith(">"):
            # Flush the previous record before starting a new one.
            if curr_seq_id is not None:
                parsed_seqs[curr_seq_id] = ''.join(curr_seq)
            curr_seq_id = line[1:]
            curr_seq = []
            continue
        curr_seq.append(line)
    # Flush the final record.  BUG FIX: guard against a completely empty
    # input, for which the original stored a spurious None key.
    if curr_seq_id is not None:
        parsed_seqs[curr_seq_id] = ''.join(curr_seq)
    return parsed_seqs
# Open the FASTA file and parse it into {id: sequence}.
# NOTE(review): the handle is never closed; a `with` block would be safer.
newfile = open("/content/drive/MyDrive/Colab Notebooks/seqs.fasta")
parsed_seqs = parse_file(newfile)
# This is the code the question is about: DictWriter treats every dict KEY
# as a separate column, so the whole dictionary lands on a single row
# (key-value, key-value, ...) instead of one id/sequence pair per row.
with open('sequences.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, parsed_seqs.keys())
    w.writeheader()
    w.writerow(parsed_seqs)
The shape I want:
New shape:
To get CSV output with 2 columns, one for Protein ID and one for Protein Sequence, you can do this.
# Sample data: {protein id: protein sequence}.
parsed_seqs = {
    'NavAb:/1126': 'TNIVESS',
    'Shaker:/1656': 'SSQAARVV'
}

column_names = ["Protein ID", "Protein Sequence"]

# newline='' is required by the csv module so it controls line endings itself.
with open('sequences.csv', 'w', newline='') as f:
    # BUG FIX: the original passed `column_names` as csv.writer's second
    # positional argument, which is the *dialect*, not the header row --
    # the list was silently ignored.  The header is written via writerow().
    w = csv.writer(f)
    w.writerow(column_names)
    # dict.items() yields (id, sequence) pairs -> one two-column row each.
    w.writerows(parsed_seqs.items())
Output:
Protein ID,Protein Sequence
NavAb:/1126,TNIVESS
Shaker:/1656,SSQAARVV
As an aside, the csv.DictWriter class works well when you have a list of dictionaries, where each dictionary is structured like {"column1": "value1", "column2": "value2"}. For example
# Demonstration of csv.DictWriter: it expects a *list* of dicts, one dict
# per output row, each shaped like {"column": "value"}.
parsed_seqs = [
    {"ID": "NavAb", "Seq": "TINVESS"},
    {"ID": "Shaker", "Seq": "SSQAARVV"}
]

with open("sequences.fa", "wt", newline="") as out_file:
    writer = csv.DictWriter(out_file, ["ID", "Seq"])
    writer.writeheader()
    for record in parsed_seqs:
        writer.writerow(record)
I created these for loops, and at the end I append the value of a dict to a list, but every time I do the append it appends the same value to my list, and I don't know why.
My code:
# Question code (broken): builds a list of key-translated JSON records, but
# every element ends up being the SAME dict object.
data_list = []
with open(arquivo, encoding="utf-8") as f:
    for jsonObj in f:                 # one JSON object per line (JSON-lines file)
        data = json.loads(jsonObj)
        count = count + 1             # NOTE(review): `count` is never initialized before this
        for item in data:
            if item in de:            # `de`/`para` map old key names to new ones by index
                indice = de.index(item)
                data_new[para[indice]] = data[item]
            else:
                data_new[item] = data[item]
        # BUG: `data_new` is created once, outside this loop, so the same dict
        # object is appended on every iteration; later updates mutate every
        # element already in the list.  Fix: `data_new = {}` at the top of
        # each iteration (or append a copy).
        data_list.append(data_new)
print(data_list)
Expected:
[{"a": 1}, {"b":2}]
Real result:
[{"a": 1}, {"a":1}]
My file is something like that:
{"Line1":"value","Key1":0}
{"Line2":"value","Key2":0}
{"Line3":"value","Key3":0}
And one thing that I discovered: if I write to a file instead of calling data_list.append(), it works, at the same indentation. So I changed data_list.append(data_new) to call write_file(data_new):
# As quoted in the question; NOTE(review): the `def` keyword is missing, and
# the body dumps `data` (a global) rather than its `data_new` parameter --
# presumably a transcription slip; verify against the fuller version below.
write_file(data_new):
    with open('test.json', 'a') as f:
        json.dump(data, f)
        f.write('\n')
Obs.:
printing and debugging I can see the value of data_list (a list), and on the first append the value is correct:
{"Line1":"value","Key1":0}
the second time through the loop the value of data_new is "Line2" (correct), but when I do data_list.append(data_new) it changes my whole array to the very last line read, and so on:
{"Line2":"value","Key2":0}
{"Line2":"value","Key2":0}
_
def write_file(data):
    """Append *data* as one JSON document plus a newline (JSON-lines style)."""
    with open('data-transfer-out-teste.json', 'a', encoding='utf-8') as out:
        out.write(json.dumps(data))
        out.write('\n')
def main():
    # Load the key-translation config: `de` holds the old key names, `para`
    # the corresponding new names (matched by index).
    config = ler_config()
    de = config["de"]
    para = config["para"]
    data_new = {}
    arquivo = "entrada.json"
    with open(arquivo, encoding="utf-8") as f:
        for jsonObj in f:
            data = json.loads(jsonObj)
            count = count + 1   # NOTE(review): read before assignment -> UnboundLocalError
            # item = keys in the file that was read
            # indice = position of the key in the "de" array
            for item in data:
                if item in de:
                    indice = de.index(item)
                    data_new[para[indice]] = data[item]
                else:
                    data_new[item] = data[item]
            # NOTE(review): `data_new` is shared across iterations; writing it
            # to disk immediately (instead of appending the dict to a list)
            # is why this variant "works" despite the aliasing.
            write_file(data_new)
I may be wrong, but it seems that you need to add an indentation here:
data_list = []
with open(arquivo, encoding="utf-8") as f:
    for jsonObj in f:
        data = json.loads(jsonObj)
        # BUG FIX: create a FRESH dict for every input line.  Indenting the
        # append alone is not enough -- re-using one `data_new` object means
        # every list element is the same dict, mirroring the last line read.
        data_new = {}
        count = count + 1   # NOTE(review): `count` must be initialized by the caller/context
        for item in data:
            if item in de:
                indice = de.index(item)
                data_new[para[indice]] = data[item]
            else:
                data_new[item] = data[item]
        data_list.append(data_new)  # inside the loop: one append per input line
print(data_list)
I have a problem. Every time I do this it will just replace the last user's wallet.
# Question code: `data` starts as a fresh empty dict on every invocation, so
# dumping it afterwards overwrites the file with only the newest user's wallet.
data = {}
usern = ctx.message.author   # ctx: command context defined elsewhere (discord bot)
usern = str(usern)
data[usern] = []
data[usern].append({
    "money": 0
})
Got an answer for it:
# As quoted, this fragment is garbled: the `else:` below has no matching
# `if` (the answer's actual `if isfile('config.json'):` guard appears
# further down), and the wallet-creation block is duplicated.
with open('config.json', 'r') as infile:
    data = json.load(infile) # load from existing
data1 = data
usern = ctx.message.author
usern = str(usern)
data1[usern] = []
data1[usern].append({
    "money": 0
})
else: # no file, start from scratch
    data = {}
data = {}
usern = ctx.message.author
usern = str(usern)
data[usern] = []
data[usern].append({
    "money": 0
})
# Persist the (merged) wallet data back to disk.
with open('config.json', 'w') as outfile:
    json.dump(data, outfile)
Check whether the file exists & load the existing json structure from it. Change the beginning part of your program to this:
from os.path import isfile

# Start from an empty wallet store, then overwrite it with the saved state
# when a config file already exists on disk.
data = {}
if isfile('config.json'):
    with open('config.json', 'r') as infile:
        data = json.load(infile)
I have a large dataset that looks like the following
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
I would like to output :
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
Here is my python code so far:
headers = ["valuation_date","party_group_name","type","party_name","cp_group_name","cp_name","qualifier","amount"]
data = {}
# Question code (Python 2: 'rb' mode plus reader.next()).  The nesting below
# tests party/cp/new_qualifier against the TOP-LEVEL dict instead of the
# nested levels, which is what ultimately triggers the TypeError quoted
# underneath this snippet.
with open(t1file,'rb') as f:
    reader = csv.reader(f)
    headers = reader.next()   # Python 2 idiom; use next(reader) on Python 3
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            new_qualifier = qualifier.split("_")[0]
            if party in data.keys():
                if cp in data.keys():               # NOTE(review): should check data[party]
                    if new_qualifier in data.keys():    # NOTE(review): should check data[party][cp]
                        data[party][cp][new_qualifier] += float(amount)
                    else:
                        # data[party] holds the STRING assigned below, so indexing
                        # it with a str key raises "string indices must be integers".
                        data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
                else:
                    data[cp] = cp
            else:
                data[party] = party   # stores a string, not a nested dict
When I run the above code I get the following error:
data[party][cp][qualifier][amount] = data[party][cp][new_qualifier][amount]
TypeError: string indices must be integers, not str
Very rusty with Python — apologies if it's glaringly obvious, but any insights as to what I'm doing wrong?
Thanks !
you can use pandas.drop_duplicates to drop duplicates of multiple columns and combine it with pandas.groupby() & sum to get the desired result
>>>import pandas as pd
>>>#read file using pandas.read_csv()
>>>df
party cp qualifier amount
0 ABC DEF GOOGLE_2 100
1 ABC DEF GOOGLE_2 200
2 GHI JKL FACEBOOK_1 500
3 GHI JKL FACEBOOK_1 -600
>>>df['Total'] = df.groupby(['party','cp','qualifier'])['amount'].transform('sum')
>>>print(df.drop_duplicates(subset=['party','cp','qualifier'], keep='last'))
party cp qualifier amount Total
1 ABC DEF GOOGLE_2 200 300
3 GHI JKL FACEBOOK_1 -600 -100
Below
from collections import defaultdict

# Column positions in the input CSV.
PARTY_IDX = 0
CP_IDX = 1
QUALIFIER_IDX = 2
AMOUNT_IDX = 3

# Totals keyed by the "party,cp,qualifier-prefix" string.
data = defaultdict(int)

with open('del-me.csv') as f:
    lines = [l.strip() for l in f.readlines()]
    for idx, line in enumerate(lines):
        if idx > 0:  # skip the header row
            fields = line.split(',')
            party = fields[PARTY_IDX]
            cp = fields[CP_IDX]
            qualifier = fields[QUALIFIER_IDX]
            # BUG FIX: the original used qualifier[:qualifier.find('_')],
            # which chops off the LAST character when no '_' is present
            # (str.find returns -1).  split keeps the whole name in that case.
            qualifier = qualifier.split('_')[0]
            key = ','.join([party, cp, qualifier])
            amount = int(fields[AMOUNT_IDX])
            data[key] += amount

# Emit one "party,cp,qualifier,total" line per aggregated key.
with open('out.csv', 'w') as f:
    for k, v in data.items():
        f.write('{},{}\n'.format(k, v))
del-me.csv
party,cp,qualifier,amount
ABC,DEF,GOOGLE_2,100
ABC,DEF,GOOGLE_2,200
GHI,JKL,FACEBOOK_1,500
GHI,JKL,FACEBOOK_1,-600
out.csv
ABC,DEF,GOOGLE,300
GHI,JKL,FACEBOOK,-100
You have already enough answers, but let me correct your own code to help you derive the answer and understand the original issue:
import csv as csv

# Aggregate Equity amounts into data[party][cp][qualifier-prefix].
# (The original pre-seeded `headers` with a hard-coded list that was
# immediately overwritten by next(reader); the real names come from the file.)
data = {}
with open('test_data.csv','rt', encoding='utf-8') as f:
    reader = csv.reader(f)
    headers = next(reader)
    for row in reader:
        party = row[headers.index('party')]
        cp = row[headers.index('cp')]
        qualifier = row[headers.index('qualifier')]
        amount = row[headers.index('amount')]
        if row[headers.index('type')] == "Equity":
            # Strip the trailing "_N" suffix: GOOGLE_2 -> GOOGLE.
            new_qualifier = qualifier.split("_")[0]
            # BUG FIX: the original left wrong-level branches behind
            # (data[cp] = {} and data[party][cp][qualifier][amount] = {}),
            # which corrupt the structure for some row orderings.
            # setdefault() creates/walks the party -> cp levels in one step
            # and works for every ordering of the input rows.
            cp_dict = data.setdefault(party, {}).setdefault(cp, {})
            cp_dict[new_qualifier] = cp_dict.get(new_qualifier, 0.0) + float(amount)
print(data)
This gives you
{'ABC': {'DEF': {'GOOGLE': 300.0}}, 'GHI': {'JKL': {'FACEBOOK': -100.0}}}
The problem was how you were populating your dictionary and how you were accessing it.
In order to simplify things, you might use just one key for the dict which is composed out of the identifying parts of a given line.
You might have to extract values by the header names like you already did. The following is based on the specified input. rsplit is used to split the string once at the end in order to use the party,cp,qualifier combination as a key and extract the amount.
def sumUp(path=None):
    """Sum the trailing amount column per unique "party,cp,qualifier" prefix.

    path: CSV file to read; defaults to the module-level `t1file` so the
    original zero-argument call sites keep working.

    Returns {"party,cp,qualifier": total_amount}.
    """
    if path is None:
        path = t1file
    d = {}
    # BUG FIX: open in text mode -- the original used 'rb', and mixing bytes
    # lines with str operations ('party' in line, rsplit(',', 1)) raises
    # TypeError on Python 3.
    with open(path, 'rt') as f:
        for line in f:
            if 'party' in line:
                continue  # skip header
            key, value = line.rsplit(',', 1)  # split once at the end
            d[key] = d[key] + int(value) if key in d else int(value)
    # BUG FIX: the original never returned the accumulated dict.
    return d
You can do it like this:
from csv import DictReader, DictWriter

# Aggregate rows that share the same (party, cp, qualifier) combination.
map_dic = dict()
# BUG FIX: newline='' is what the csv docs require on files handed to the
# csv module; without it (the original omitted it on the output file),
# spurious blank lines appear between rows on Windows.
with open('test1.csv', 'r', newline='') as fr:
    csv_reader = DictReader(fr, delimiter=',')
    for line in csv_reader:
        # Composite key identifying one output row.
        key = '{}_{}_{}'.format(line['party'], line['cp'], line['qualifier'])
        if key not in map_dic.keys():
            # First sighting: store the row with a numeric amount.
            map_dic[key] = {'party': line['party'], 'cp': line['cp'], 'qualifier': line['qualifier'], 'amount': int(line['amount'])}
        else:
            # Repeat sighting: add to the running total.
            map_dic[key]['amount'] = map_dic[key]['amount'] + int(line['amount'])

with open('test2.csv', 'w', newline='') as csvfile:
    writer = DictWriter(csvfile, fieldnames=['party', 'cp', 'qualifier', 'amount'])
    writer.writeheader()
    for key, data in map_dic.items():
        writer.writerow(data)
I have a text file that has following structure:
mom:2
dad:3
mom:4
dad:2
me:4
And I need to make a dictionary that would display each name only once, but adding the numeric values together. In this case it would look like this:
{'dad': 5, 'me': 4, 'mom': 6}
How I should approach this problem?
I've tried
d = {}
try:
    # NOTE(review): as quoted, `file.txt` is an attribute lookup on a bare
    # name `file` (NameError), not the string "file.txt" -- the filename
    # needs quotes.  The handle is also never closed.
    file = open(file.txt, "r")
    for line in file:
        (a, b) = line.split(":")
        # BUG: plain assignment overwrites the previous value for a repeated
        # name; accumulating needs  d[a] = d.get(a, 0) + float(b).
        d[a] = float(b)
except IOError:
    print()   # swallows the error, printing only a blank line
# Read "name:number" lines and total the numbers per name.
d = {}
with open('file.txt', 'r') as f:
    for raw_line in f:
        if raw_line == '\n':   # ignore blank lines
            continue
        parts = raw_line.strip().split(':')
        d[parts[0]] = d.get(parts[0], 0) + int(parts[1])