So I have a Python dictionary with protein sequences and their ids. I wanted to convert that dictionary to a CSV file to upload it as a dataset to fine-tune a transformer. However, when I create the CSV it appears with the dictionary shape (key-value, key-value...).
What I want is for the CSV to have a key on one line and its value on the next line, repeating that pattern for every entry. Is there a way to add a \n or something like that so each key and its value are laid out this way?
Shape of the dictionary:
{'NavAb:/1126': 'TNIVESSFFTKFIIYLIVLNGITMGLETSKTFMQSFGVYTTLFNQIVITIFTIEIILRIYVHRISFFKDPWSLFDFFVVAISLVPTSSGFEILRVLRVLRLFRLVTAVPQMRKI', 'Shaker:/1656': 'SSQAARVVAIISVFVILLSIVIFCLETLEDEVPDITDPFFLIETLCIIWFTFELTVRFLACPLNFCRDVMNVIDIIAIIPYFITTLNLLRVIRLVRVFRIFKLSRHSKGLQIL', .....
What I want in the CSV:
protein id
protein sequence
protein id
protein sequence
.....
The code I have for the moment:
def parse_file(input_file):
    """Parse FASTA-formatted lines into a ``{sequence_id: sequence}`` dict.

    Args:
        input_file: Any iterable of text lines (an open file, a list of
            strings, ...). A line starting with ``>`` begins a new record and
            the text after ``>`` becomes its id; every other line is treated
            as sequence data for the current record.

    Returns:
        A dict mapping each id to its sequence, with continuation lines
        concatenated and gap characters (``-``) removed.
    """
    parsed_seqs = {}
    curr_seq_id = None
    curr_seq = []
    # Iterate over the argument rather than a module-level file handle so the
    # function works with whatever the caller passes in.
    for line in input_file:
        line = line.strip()
        if line.startswith(">"):
            if curr_seq_id is not None:
                parsed_seqs[curr_seq_id] = "".join(curr_seq)
            # Keep the id verbatim: gap removal applies to sequence data only.
            curr_seq_id = line[1:]
            curr_seq = []
            continue
        curr_seq.append(line.replace("-", ""))
    # Flush the final record; with no header at all there is nothing to store.
    if curr_seq_id is not None:
        parsed_seqs[curr_seq_id] = "".join(curr_seq)
    return parsed_seqs
import csv

# Open the FASTA file in a context manager so the handle is closed as soon as
# parsing has finished.
with open("/content/drive/MyDrive/Colab Notebooks/seqs.fasta") as newfile:
    parsed_seqs = parse_file(newfile)

# NOTE: DictWriter treats every dictionary key as a column name, so this
# writes one header row containing all ids followed by a single row holding
# all sequences.
with open('sequences.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, parsed_seqs.keys())
    w.writeheader()
    w.writerow(parsed_seqs)
The shape I want:
New shape:
To get CSV output with 2 columns, one for Protein ID and one for Protein Sequence, you can do this.
import csv

parsed_seqs = {
    'NavAb:/1126': 'TNIVESS',
    'Shaker:/1656': 'SSQAARVV',
}
column_names = ["Protein ID", "Protein Sequence"]

with open('sequences.csv', 'w', newline='') as f:
    # csv.writer's second positional parameter is the dialect, not the
    # header, so the column names are written explicitly as the first row.
    w = csv.writer(f)
    w.writerow(column_names)
    # Each (key, value) pair of the dict becomes one two-column row.
    w.writerows(parsed_seqs.items())
Output:
Protein ID,Protein Sequence
NavAb:/1126,TNIVESS
Shaker:/1656,SSQAARVV
As an aside, the csv.DictWriter class works well when you have a list of dictionaries, where each dictionary is structured like {"column1": "value1", "column2": "value2"}. For example
# csv.DictWriter consumes one dictionary per output row; the dictionary keys
# must match the field names handed to the writer.
parsed_seqs = [
    {"ID": "NavAb", "Seq": "TINVESS"},
    {"ID": "Shaker", "Seq": "SSQAARVV"},
]

field_names = ["ID", "Seq"]
with open("sequences.fa", "wt", newline="") as out_file:
    dict_writer = csv.DictWriter(out_file, field_names)
    dict_writer.writeheader()
    for record in parsed_seqs:
        dict_writer.writerow(record)
Related
I have a CSV file with headers on row 0. The headers are often unique but sometimes they are not, for "comments" in this example. For each of several comments, the header is "Comment".
The problem with my function that makes dicts from CSVs is that it only returns the last column of Comment.
def csv_to_list_with_dicts(csvfile):
    """Read *csvfile* and return its rows as a list of plain dicts.

    The first row supplies the keys (via ``csv.DictReader``) and whitespace
    directly after a delimiter is skipped. When a header name occurs more
    than once, only the value of its last column survives in each dict.
    """
    list_of_issues = []
    with open(csvfile) as f:
        for row in csv.DictReader(f, skipinitialspace=True):
            list_of_issues.append(dict(row))
    return list_of_issues
My CSV file columns are like this:
User;ID;Comment;Comment;Comment
If one of the headers is repeating, I need to add an index to make it unique (like Comment1;Comment2 without changing the CSV) in the dict or all comments included under just Comment.
This did return just the way I wanted. Just tweaked yours a small bit Happy Ahmad! HUGE THANKS!!! <3
def csv_to_list_with_dicts(csvfile, delimiter=","):
    """Read a CSV file into a list of dicts, renaming repeated headers.

    The first occurrence of a header keeps its name; every later occurrence
    gets the smallest positive integer suffix that makes it unique
    (``Comment``, ``Comment1``, ``Comment2``, ...).

    Args:
        csvfile: Path of the CSV file to read.
        delimiter: Field separator; defaults to ``","``.

    Returns:
        One dict per data row. Cells beyond the header width are dropped and
        short rows simply lack the missing keys.
    """
    # newline="" lets the csv module do its own line-ending handling.
    with open(csvfile, "r", newline="") as file:
        reader = csv.reader(file, delimiter=delimiter, skipinitialspace=True)
        # Read the header through the csv reader too, so the last column name
        # does not keep the trailing newline of the header line.
        keys = next(reader, [])
        alteredKeys = []
        taken = set()
        for eachKey in keys:
            candidate = eachKey
            counter = 0
            # Always derive the candidate from the original name so suffixes
            # stay correct beyond nine duplicates.
            while candidate in taken:
                counter += 1
                candidate = eachKey + str(counter)
            taken.add(candidate)
            alteredKeys.append(candidate)
        # zip() stops at the shorter of header and row.
        list_of_issues = [dict(zip(alteredKeys, eachLine)) for eachLine in reader]
    return list_of_issues
In this solution, I use an alteredKeys list that makes any repeated key in the header unique by appending an index to it. Then I iterate over the remaining lines of the CSV file and build a dictionary from each one.
def csv_to_list_with_dicts(csvfile, delimiter=";"):
    """Read a delimited text file into a list of dicts with unique keys.

    The first line is the header. A header name that repeats gets the
    smallest positive integer suffix that makes it unique (``Comment``,
    ``Comment1``, ``Comment2``, ...). Fields are split on *delimiter* only;
    no quoting rules are applied.

    Args:
        csvfile: Path of the file to read.
        delimiter: Field separator; defaults to ``";"``.

    Returns:
        One dict per non-blank data line. Cells beyond the header width are
        dropped and short lines simply lack the missing keys.
    """
    with open(csvfile, "r") as file:
        # Strip the line ending first so the last header name is clean.
        keys = file.readline().rstrip("\r\n").split(delimiter)
        alteredKeys = []
        taken = set()
        for eachKey in keys:
            candidate = eachKey
            counter = 0
            # Always derive the candidate from the original name so suffixes
            # stay correct beyond nine duplicates.
            while candidate in taken:
                counter += 1
                candidate = eachKey + str(counter)
            taken.add(candidate)
            alteredKeys.append(candidate)
        list_of_issues = []
        for eachLine in file:
            eachLine = eachLine.rstrip("\r\n")
            if not eachLine:
                continue  # Skip blank lines instead of emitting bogus rows.
            # zip() stops at the shorter of header and row.
            list_of_issues.append(dict(zip(alteredKeys, eachLine.split(delimiter))))
    return list_of_issues
It would be fairly easy to write code that automatically generates unique keys for you by keeping track of the names already seen and generating a unique name for any that conflicts with an earlier one. Checking for that is quick if the names seen so far are kept in a set, which features fast membership testing.
For example, assume this was in a CSV file named non-unique.csv:
User;ID;Comment;Comment;Comment
Jose;1138;something1;something2;something3
Gene;2907;abc;def;ghi
Guido;6450;jkl;mno;pqr
Code:
import csv
def csv_to_list_with_dicts(csv_filename):
    """Read a ``;``-delimited CSV file into a list of dicts with unique keys.

    The header row provides the keys. A header name that was already seen in
    an earlier column is replaced by ``_<column index>`` (zero-based), so the
    values of repeated columns are all kept.

    Args:
        csv_filename: Path of the UTF-8 encoded CSV file.

    Returns:
        One dict per data row, or an empty list for an empty file.
    """
    with open(csv_filename, encoding='utf-8', newline='') as csv_file:
        header_reader = csv.reader(csv_file, delimiter=';', skipinitialspace=True)
        names = next(header_reader, None)  # Header row.
        if names is None:
            return []  # Empty file: no header, hence no rows.

        # Create a list of unique fieldnames for the names in the header row.
        seen = set()
        fieldnames = []
        for i, name in enumerate(names):
            if name in seen:
                name = f'_{i}'
            else:
                seen.add(name)
            fieldnames.append(name)

        # The header line has already been consumed from csv_file, so this
        # reader starts at the first data row; no second pass is needed.
        reader = csv.DictReader(csv_file, fieldnames=fieldnames, delimiter=';',
                                skipinitialspace=True)
        return list(reader)
from pprint import pprint

# Load the sample file, then print the rows without re-sorting the keys of
# each dict.
results = csv_to_list_with_dicts('non-unique.csv')
pprint(results, sort_dicts=False, width=120)
Results:
[{'User': 'Jose', 'ID': '1138', 'Comment': 'something1', '_3': 'something2', '_4': 'something3'},
{'User': 'Gene', 'ID': '2907', 'Comment': 'abc', '_3': 'def', '_4': 'ghi'},
{'User': 'Guido', 'ID': '6450', 'Comment': 'jkl', '_3': 'mno', '_4': 'pqr'}]
I have the following example csv, that I am reading with:
f = StringIO(response.encode('utf-8'))
# The description column escapes embedded quotes with a backslash (\").
# The csv module only honours that when escapechar is set; without it the
# first \" ends the quoted field and every later ';' splits the value.
reader = csv.DictReader(
    f,
    quotechar='"',
    delimiter=';',
    escapechar='\\',
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
)
example csv:
id;name;community;owner;owns;description;uuid
3c;NP;NoProb;NoP;Text;text_with_no_issues;
3c;NP;NoProb;NoP;TextText;text_with_no_issues2;
1A;fooo;barr;Bar;TEXT1;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
1A;fooo;barr;Bar;TEXT2;"\"text\"\"None\"\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT3;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
2B;BAR;foo;Bar;TEXT4;"\"text\"\"None\"\";text\"\"TEXT\"\"text\";text\"\"TEXT\"\"text\"";
the uuid column is empty in all cases.
within the "reader" there are multiple entries with the same 'name' and 'id' which I am "merging", but in lines like the last four (1A,2B) I am hitting an issue because of the ";" delimiter in the description.
Even with quotechar='"' and quoting=csv.QUOTE_ALL, the description column gets split at the delimiter and spills into the next column (uuid) and into a "None" column, which corrupts my data.
Any idea how to solve this one ?
P.S. for the merge logic I am using two variants:
##############################################################
# Position of each name's entry in `result`, so lookups are O(1) instead of
# scanning a list on every row.
name_index = {}
# Distinct non-empty values seen so far, per merged name and per column, kept
# in first-seen order so the joined text is deterministic and free of repeats.
merged_values = {}
result = []
for line in reader:
    idx = line["name"]
    if idx not in name_index:
        # First row for this name: keep it untouched.
        name_index[idx] = len(result)
        result.append(line)
        continue

    position = name_index[idx]
    idx_curr_dict = result[position]
    if idx not in merged_values:
        # Seed the per-column value lists from the first row of this name.
        merged_values[idx] = {
            key: [value] if value else []
            for key, value in idx_curr_dict.items()
        }
    per_column = merged_values[idx]

    placeholder = {}
    for key in idx_curr_dict:
        values = per_column[key]
        candidate = line[key]
        # Empty or missing cells contribute nothing to the merged value.
        if candidate and candidate not in values:
            values.append(candidate)
        placeholder[key] = ", ".join(values)
    result[position] = placeholder
##############################################################
and a bit slower one, but not that complicated:
##############################################################
# Materialise the iterator once; the rows are scanned again for every name.
data = list(reply)
# Distinct values of the 'name' column.
unique_names = {row['name'] for row in data}
# Every column except the grouping key and the uuid column.
column_names = [key for key in data[0] if key not in ('name', 'uuid')]
result = []
for name in unique_names:
    matching_rows = [row for row in data if row['name'] == name]
    unique_line = {'name': name}
    # Collapse each column to the distinct values found among the rows that
    # share this name.
    unique_line.update(
        (column, ", ".join({row[column] for row in matching_rows}))
        for column in column_names
    )
    result.append(unique_line)
##############################################################
Thanks a lot in advance!
I have a JSON file with values in [] brackets as shown. I am trying to create a [key:value] dictionary that shows the [id_value : text_value.]
{"id":6127465, "users":{"name":[{"dr_info":[28,37],"text":"trees"}],"favorites":[]}}
{"id":9285628, "users":{"name":[{"dr_info":[16,24],"text":"grass"}, {"id_info":[30,34],"text":"trees"}],"favorites":[]}}
{"id":7625927, "users":{"name":[{"dr_info":[18,23],"text":"grass"}],"favorites":[], "type" : "photo"}}
{"id":8725946, "users":{"name":[{"dr_info":[23,33],"text":"grass"}, {"id_info":[37,41],"text":"trees"}],"favorites":[]}}
Taking as an example the first two JSON lines above. The output for the dictionary would be :
[6127465 : 'trees']
[9285628 : 'grass' , 'trees'] and so on.
Here is what I have coded so far but I can't get the values very well.
dict={}
with open(fileName, 'r') as file_to_read:
for line in file_to_read:
data = json.loads(line)
json_tree = objectpath.Tree(data)
json.dumps(data, ensure_ascii=True)
dict[json_tree.execute('$.id')] = json_tree.execute('$.users.name.text')
return dict
New edit. (Answer)
dict={}
with open(fileName, 'r') as file_to_read:
for line in file_to_read:
data = json.loads(line)
json_tree = objectpath.Tree(data)
json.dumps(data)
dict[json_tree.execute('$.id')] = list(json_tree.execute('$.users.name.text'))
return dict
New edit : Answer
dict={}
with open(fileName, 'r') as file_to_read:
for line in file_to_read:
data = json.loads(line)
json_tree = objectpath.Tree(data)
json.dumps(data)
dict[json_tree.execute('$.id')] = list(json_tree.execute('$.users.name.text'))
return dict
I want to edit a CSV file using a value read from another JSON file of mine, in Python 2.7.
my csv is : a.csv
a,b,c,d
,10,12,14
,11,14,15
my json file is a.json
{"a":20}
I want column 'a' of the CSV to be matched against the keys in the JSON file. If there is a match, the value should be copied from the JSON file into that column of my CSV file, so that the final CSV file looks like this:
a,b,c,d
20,10,12,14
20,11,14,15
Till now I what I have tried is
fileCSV = open('a.csv', 'a')
fileJSON = open('a.json', 'r')
jsonData = fileJSON.json()
for k in range(jsonData):
for i in csvRow:
for j in jsonData.keys():
if i == j:
if self.count == 0:
self.data = jsonData[j]
self.count = 1
else:
self.data = self.data + "," + jsonData[j]
self.count = 0
fileCSV.write(self.data)
fileCSV.write("\n")
k += 1
fileCSV.close()
print("File created successfully")
I will be really thankful if anyone can help me for this.
please ignore any syntactical and indentation error.
Thank You.
Some basic string parsing will get you here.. I wrote a script which works for the simple scenario which you refer to.
check if this solves your problem:
import json
from collections import OrderedDict
def list_to_csv(listdat):
    """Join the items of *listdat* into one comma-separated string.

    Every item is converted with ``str()``. No quoting or escaping is
    applied, so an item that itself contains a comma is not protected.

    Args:
        listdat: Iterable of values for one output row.

    Returns:
        The comma-separated text, or ``""`` for an empty input.
    """
    # str.join builds the result in one pass and avoids a local named `csv`.
    return ",".join(str(val) for val in listdat)
csvfile = "csvfile.csv"
outcsvfile = "outcsvfile.csv"
jsonfile = "jsonfile.json"

with open(csvfile, encoding='UTF-8') as a_file:
    # Blank lines carry no data and would otherwise turn into broken rows.
    lines = [line.strip() for line in a_file if line.strip()]

# An empty input file yields no columns and no rows instead of an IndexError.
columns = lines[0].split(",") if lines else []
data = lines[1:]

whole_data = []
for row in data:
    fields = row.split(",")
    # zip() pairs header and cells positionally and stops at the shorter one,
    # so a short row no longer raises IndexError.
    whole_data.append(OrderedDict(zip(columns, fields)))

with open(jsonfile) as json_file:
    jsondata = json.load(json_file)

# Every key of the JSON object overwrites that column in every row.
for key, value in jsondata.items():
    for each_row in whole_data:
        each_row[key] = value

# The result goes to a separate file, which is truncated on every run.
with open(outcsvfile, mode='w', encoding='UTF-8') as b_file:
    b_file.write(list_to_csv(columns) + '\n')
    for row_data in whole_data:
        # Only the original CSV columns are written; a cell missing from a
        # short row becomes an empty field rather than the text "None".
        row_list = [row_data.get(ecolumn, "") for ecolumn in columns]
        b_file.write(list_to_csv(row_list) + '\n')
CSV output is not written to the source file but to a different file.
The output file is also always truncated and written, hence the 'w' mode.
I would recommend using csv.DictReader and csv.DictWriter classes which will read into and out of python dicts. This would make it easier to modify the dict values that you read in from the JSON file.
I've tried to put together a solution from similar questions but have failed miserably. I just don't know enough about Python yet :(
I have an inputlist containing elements in a particular order ex: ["GRE", "KIN", "ERD", "KIN"]
I have a datafile containing the elements, plus other data ex:
"ERD","Data","Data"...
"KIN","Data","Data"...
"FAC","Data","Data"...
"GRE","Data","Data"...
I need to create an outputlist that contains the lines from the datafile in the order they appear in the inputlist.
The code below returns the outputlist in the order the lines appear in the datafile, which is not the intended behavior... :-\
with open(inputfile, 'r') as f:
    names = [line.strip() for line in f]

# A set gives constant-time membership tests while scanning the data file.
wanted = set(names)

outputlist = []
with open(datafile, 'r') as f:
    for line in f:
        name = line.split(',')[0]
        # The first field is quoted in the data file; drop the surrounding
        # quote characters before comparing it with the requested names.
        if name[1:-1] in wanted:
            outputlist.append(line)

# NOTE: the lines are collected in data-file order, not in the order of
# `names`. The context manager guarantees the output is flushed and closed.
with open(outputfile, 'w') as output:
    output.writelines(outputlist)
How can I have it return the list in the proper order? Thanks in advance for your help :-)
Edit
Thanks to Oscar, this is the solution I implemented:
# Locations of the building data, the ordered list of names, and the result.
datafile = 'C:\\testing\\bldglist.txt'
inputfile = 'C:\\testing\\inputlist.txt'
outputfile = "C:\\testing\\output.txt"

# One entry per line of the input file, with surrounding whitespace removed.
with open(inputfile, 'r') as f:
    inputlist = list(map(str.strip, f))
def outputList(inputlist, datafile, outputfile):
    """Write data-file lines to *outputfile* in the order given by *inputlist*.

    Each stripped line of *datafile* is indexed by its text before the first
    comma; when a key repeats, the last line wins. The output starts with a
    fixed header row, followed by one line per entry of *inputlist*.

    Raises:
        KeyError: If an entry of *inputlist* matches no data-file line.
    """
    with open(datafile, 'r') as source:
        stripped_lines = (raw.strip() for raw in source)
        lines_by_key = {text.partition(',')[0]: text for text in stripped_lines}
    with open(outputfile, 'w') as target:
        target.write('"Abbrev","Xcoord","Ycoord"\n')
        for key in inputlist:
            target.write(lines_by_key[key] + '\n')
# Write the data-file lines to outputfile in the order listed in inputlist.
outputList(inputlist, datafile, outputfile)
This is the easy solution. It reads the entire data file into memory as a dictionary mapping the first field of each line to that line. It's then easy to write the lines in the right order.
If the file is very large (gigabytes) or you don't have a lot of memory, there are other ways. But they're not nearly as nice.
I haven't tested this.
import csv
# Map the first field of every data row to the full row.
data = {}
# newline="" is what the csv module expects for files it reads or writes.
with open(datafile, newline="") as f:
    for line in csv.reader(f):
        if not line:
            continue  # csv.reader yields [] for blank lines.
        data[line[0]] = line
with open(outputfile, "w", newline="") as f:
    # Keep the file handle and the writer under separate names.
    writer = csv.writer(f)
    # Emit the rows in the order requested by inputlist.
    for entry in inputlist:
        writer.writerow(data[entry])
Assuming a data file with this format:
"ERD","Data","Data"...
"KIN","Data","Data"...
"FAC","Data","Data"...
"GRE","Data","Data"...
Try this solution:
def outputList(inputlist, datafile, outputfile):
    """Write data-file lines to *outputfile* in the order given by *inputlist*.

    Each non-blank line of *datafile* is indexed by its text before the first
    comma; when a key repeats, the last line wins.

    Args:
        inputlist: Keys in the desired output order. They must match the
            first field exactly, including any surrounding quotes.
        datafile: Path of the file to read.
        outputfile: Path of the file to write (overwritten).

    Raises:
        KeyError: If an entry of *inputlist* matches no data-file line.
    """
    d = {}
    with open(datafile, 'r') as f:
        for line in f:
            # Strip the line ending as well: a final line without a newline
            # would otherwise run into whatever is written after it.
            line = line.strip()
            if not line:
                continue  # Blank lines carry no key.
            key = line.split(',')[0]
            d[key] = line
    with open(outputfile, 'w') as f:
        for key in inputlist:
            f.write(d[key] + '\n')
Use it like this:
# The keys keep their surrounding double quotes because they are compared
# with the raw first field of each data-file line.
outputList(['"GRE"', '"KIN"', '"ERD"', '"KIN"'],
'/path/to/datafile',
'/path/to/outputfile')
It will write the output file with the expected order.
1) Create a list with the elements you wish to map to. In this case, ["GRE", "KIN", "ERD", "FAC"]
2) Read the file and map (using a dictionary of lists) the first elements.
3) Output to a file.
import csv
# Keys in the order they should be reported.
out_index = ["GRE", "KIN", "ERD", "FAC"]

# Map each first field to the list of all remaining fields seen for it.
d = {}
with open('/Users/andrew/bin/SO/abcd.txt', 'r') as fr:
    for e in csv.reader(fr):
        if not e:
            continue  # csv.reader yields [] for blank lines.
        d.setdefault(e[0], []).extend(e[1:])

for i in out_index:
    # %-formatting with print() yields the same text on Python 2 and 3.
    print('%s :' % i)
    # A key that never occurred in the file prints only its heading.
    for e in d.get(i, []):
        print('  %s' % e)
Given this example data:
"ERD","Data-a1","Data-a2"
"KIN","Data-b1","Data-b2"
"FAC","Data-c1","Data-c2"
"GRE","Data-d1","Data-d2"
"ERD","Data-a3","Data-a4"
"GRE","Data-d3","Data-d4"
Output:
GRE :
Data-d1
Data-d2
Data-d3
Data-d4
KIN :
Data-b1
Data-b2
ERD :
Data-a1
Data-a2
Data-a3
Data-a4
FAC :
Data-c1
Data-c2
Done!