Working with nested dictionaries and formatting for display - python

I have a partial answer from here Construct a tree from list os file paths (Python) - Performance dependent
My specific problem requires me to go from
this
dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file3 10
dir3/file4 10
dir3/file5 10
To
dir/ **50**
    dir2/ **30**
        file2
        file3
    file
    file3
dir3/ **20**
    file4
    file5
Basically, the numbers at the end are the file sizes, and I have been trying to figure out how to roll the size of all the files up into their parent directories for display.
Edit:
import re

r = re.compile(r'(.+\t)(\d+)')

def prettify(d, indent=0):
    for key, value in d.iteritems():
        ss = 0
        if key == FILE_MARKER:  # FILE_MARKER is the files sentinel from mac's answer (not shown here)
            if value:
                for each in value:
                    mm = r.match(each)
                    ss += int(mm.group(2))
                    print ' ' * indent + each
                print ' ' * indent + format_size(ss)  # <-- the line I added; format_size (not shown) formats the size for display
        else:
            print ' ' * indent + str(key)
            if isinstance(value, dict):
                addSizes(value, indent+1)  # NOTE: addSizes is not defined in this snippet; presumably it should recurse into prettify
            else:
                print ' ' * (indent+1) + str(value)
This is mac's answer from the link above, which I edited to use a regular expression.
The solutions that occurred to me all involved creating a new dict or adding an inner function.
I have lost my whole day on this and wish I had asked for help earlier. Please help.

Not the most elegant thing in the world, but this should get you where you need to be. You'll need to change the tree creation function to deal with whatever form of input you are getting. Once the tree is generated it's just using a recursive tree traversal to form the output.
import re

input_dirs = """dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file 10
dir3/file4 10
dir3/file5 10
dir/dir2/dir4/file2 10"""

def create_file_tree(input_string):
    dir_dict = {}
    for file_path in input_string.split('\n'):
        path_list = re.sub('/', ' ', file_path).split()
        path_list[-1] = int(path_list[-1])
        path_dict = dir_dict
        final_item = ""
        for item in path_list[:-1]:
            parent_dict = path_dict
            last_item = item
            path_dict = path_dict.setdefault(item, {})
        parent_dict[last_item] = path_list[-1]
    return dir_dict

def pretty_file_tree(file_tree):
    def traverse(sub_dict, indent=0, total=0):
        string_out = ""
        indent += 1
        for key in sorted(sub_dict.keys()):
            if type(sub_dict[key]) == dict:
                sub_total = traverse(sub_dict[key], indent, 0)
                total += sub_total[0]
                string_out += ' '*indent + key + ' ' + '**' + str(sub_total[0]) + '**' + '\n' + sub_total[1]
            else:
                string_out += ' '*indent + key + '\n'
                total += sub_dict[key]
        return total, string_out

    output_string = traverse(file_tree)
    print(output_string[1])

pretty_file_tree(create_file_tree(input_dirs))
Sorry it doesn't follow the code you posted, but I'd begun producing this before the edit...

As you process the input, build a string with placeholders (%d) for the numbers, then print out the string.
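A minimal sketch of that idea, reusing create_file_tree from the answer above; the render_with_placeholders helper, its two-space indent, and the **%d** markers are only illustrative, not from either answer:

def render_with_placeholders(tree, indent=0, out=None, totals=None):
    # Walk the tree once, emitting "%d" where each directory total belongs and
    # reserving a slot in `totals`; fill the slot once the subtree has been summed.
    if out is None:
        out, totals = [], []
    total = 0
    for name, value in sorted(tree.items()):
        if isinstance(value, dict):
            out.append(' ' * indent + name + '/ **%d**')
            slot = len(totals)
            totals.append(0)            # placeholder slot for this directory
            sub_total = render_with_placeholders(value, indent + 2, out, totals)[2]
            totals[slot] = sub_total
            total += sub_total
        else:                           # a file: value is its size
            out.append(' ' * indent + name)
            total += value
    return out, totals, total

lines, totals, _ = render_with_placeholders(create_file_tree(input_dirs))
# the placeholders appear in the same (pre-order) order the totals were collected
print('\n'.join(lines) % tuple(totals))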


TreeView to JSON in Python

[Edit: apparently this file looks similar to h5 format]
I am trying to extract metadata from a file with a .dm3 extension using HyperSpy in Python. I am able to get all the data, but it is saved as a tree view; I need the data in JSON. I tried to write my own parser to convert it, which worked for most cases but then failed:
TreeView data generated
Is there a library or package I can use to convert the tree view to JSON in Python?
My parser:
import os
import hyperspy.api as hs  # as used in the answers below (provides hs.load)

def writearray(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '[')
    for char in k[1]:
        file.write(char)
    file.write(']')

def writenum(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + k[1])

def writestr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '"' + k[1] + '"')

def startnew(file, string):
    file.write('"' + string + '":' + '{\n')

def closenum(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + k[1] + '\n')
    file.write('},\n')

def closestr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '"' + k[1] + '"' + '\n')
    file.write('},\n')

def closearr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '[')
    for char in k[1]:
        file.write(char)
    file.write(']\n')
    file.write('},\n')

def strfix(string):
    temp = ''
    for char in string:
        if char != ' ':
            temp += char
    return temp

def writethis(file, string):
    stripped = strfix(string)
    if "=" in stripped:
        temp = stripped.split("=")
        if ',' in temp[1]:
            writearray(file, stripped)
        elif temp[1].isdigit() or temp[1].isdecimal():
            writenum(file, stripped)
        else:
            writestr(file, stripped)

def createMetaData(dm3file):
    txtfile = os.path.splitext(dm3file)[0] + '.txt'
    jsonfile = os.path.splitext(dm3file)[0] + '.json'
    s = hs.load(dm3file)
    s.original_metadata.export(txtfile)
    file1 = open(txtfile, 'r', encoding="utf-8")
    Lines = file1.readlines()
    k = []
    for line in Lines:
        k.append(line)
    L = []
    for string in k:
        temp = ''
        for char in string:
            if char.isalpha() or char.isdigit() or char == '=' or char == ' ' or char == '<' or char == '>' or char == ',' or char == '.' or char == '-' or char == ':':
                temp += char
        L.append(temp)
    file2 = open(jsonfile, 'w', encoding="utf-8")
    file2.write('{\n')
    for i in range(0, len(L) - 1):
        currentspaces = len(L[i]) - len(L[i].lstrip())
        nextspaces = len(L[i + 1]) - len(L[i + 1].lstrip())
        sub = nextspaces - currentspaces
        if i != len(L) - 2:
            if (sub == 0):
                writethis(file2, L[i])
                if '=' in L[i]:
                    file2.write(',\n')
                else:
                    file2.write('\n')
            elif sub > 0:
                startnew(file2, L[i])
            else:
                if sub == -3:
                    writethis(file2, L[i])
                    file2.write('\n},\n')
                elif sub == -7:
                    writethis(file2, L[i])
                    file2.write('\n}\n},\n')
        else:
            writethis(file2, L[i])
            file2.write('\n}\n}\n}\n}')
    file1.close()
    os.remove(txtfile)
I wrote a parser for the tree-view format:
from ast import literal_eval
from collections import abc

from more_itertools import peekable

def parse_literal(x: str):
    try:
        return literal_eval(x)
    except Exception:
        return x.strip()

def _treeview_parse_list(lines: peekable) -> list:
    list_as_dict = {}
    for line in (x.strip() for x in lines):
        raw_k, raw_v = line.split(' = ')
        list_as_dict[int(raw_k.split()[-1][1:-1])] = parse_literal(raw_v)
        peek = lines.peek(None)
        if '╚' in line or (peek is not None and '├' in peek):
            break
    list_as_list = [None] * (max(list_as_dict) + 1)
    for idx, v in list_as_dict.items():
        list_as_list[idx] = v
    return list_as_list

def _treeview_parse_dict(lines: peekable) -> dict:
    node = {}
    for line in (x.strip() for x in lines):
        if ' = ' in line:
            raw_k, raw_v = line.split(' = ')
            node[raw_k.split()[-1]] = parse_literal(raw_v)
        elif '<list>' in line:
            node[line.split()[-2]] = _treeview_parse_list(lines)
        else:
            try:
                idx = line.index('├')
            except ValueError:
                idx = line.index('└')
            peek = lines.peek(None)
            if peek is not None and '├' in peek and idx == peek.index('├'):
                node[line.split()[-1]] = {}
            else:
                node[line.split()[-1]] = _treeview_parse_dict(lines)
        if '└' in line:
            break
    return node

def treeview_to_dict(lines: abc.Iterable) -> dict:
    return _treeview_parse_dict(peekable(lines))
Usage:
with open('meta.txt') as f:
    d = treeview_to_dict(f)
You can obtain the metadata as a JSON file using Python's built-in json library:
import json

with open('meta.txt') as txt_file:
    with open('meta.json', 'w') as json_file:
        json.dump(treeview_to_dict(txt_file), json_file, indent=4)
I've added indent=4 to make the JSON file more human-readable, so that you can verify it against the original format. As far as I can tell they match up in a sensible way.
As I've written this, it uses the third-party more_itertools.peekable class. If you can't use more_itertools, it shouldn't be too hard to implement that functionality yourself, or just refactor the code so that it is no longer necessary to look ahead.
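For example, a bare-bones stand-in could look like this (a sketch, not the real more_itertools class; it only supports iteration plus the peek(default) call used above):

class SimplePeekable:
    # Minimal substitute for more_itertools.peekable (illustrative only).
    _MISSING = object()

    def __init__(self, iterable):
        self._it = iter(iterable)
        self._cached = self._MISSING

    def __iter__(self):
        return self

    def __next__(self):
        if self._cached is not self._MISSING:
            value, self._cached = self._cached, self._MISSING
            return value
        return next(self._it)

    def peek(self, default=_MISSING):
        # Return the next item without consuming it; return `default` if exhausted.
        if self._cached is self._MISSING:
            try:
                self._cached = next(self._it)
            except StopIteration:
                if default is self._MISSING:
                    raise
                return default
        return self._cached

treeview_to_dict would then call _treeview_parse_dict(SimplePeekable(lines)) instead of using peekable.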
License:
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to https://unlicense.org
A more straightforward approach is to use the as_dictionary method to convert the metadata to a Python dictionary, which you can then convert to JSON.
import hyperspy.api as hs
s = hs.load('file.dm3')
metadata_dictionary = s.original_metadata.as_dictionary()
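To get that dictionary into a JSON file, the standard library's json module is enough. A minimal sketch; the output file name is just an example, and default=str is an assumption to cope with metadata values that aren't natively JSON-serializable:

import json

with open('file.json', 'w') as json_file:
    # default=str is a fallback for values json cannot serialize directly (an assumption)
    json.dump(metadata_dictionary, json_file, indent=4, default=str)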
A different approach is to use the new RosettaSciIO library, which has been split out from HyperSpy, to extract the metadata; for more information, see the documentation at https://hyperspy.org/rosettasciio/

How to select sequences using substrings from several dictionaries

I have four dictionaries that contain substrings:
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
and a fasta file:
>MN00153:75:000H37WNG:1:11102:8051:1085
NNNNNNNNCCCFFFNNNNNGGGTTTNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
>MN00153:75:000H37WNG:1:11102:16389:1090
NNNHHHFFFNNNNNNNOOOPPPNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
I want to select sequences if two substrings are from specific dictionaries. The order of substrings is not important.
In other words, I want my code to select reads if one substring is from fw1 and another is from the fw2_rc dictionary, OR one substring is from fw1_rc and another is from the fw2 dictionary.
This is my code; it selects the correct reads but repeats the output many times:
from Bio import SeqIO

count = 0
with open('file.fasta', 'r') as f:
    for record in SeqIO.parse(f, 'fasta'):
        for k1, Fw1 in fw1.items():
            for k2, Fw1_rc in fw1_rc.items():
                for k3, Fw2 in fw2.items():
                    for k4, Fw2_rc in fw2_rc.items():
                        if Fw1 in record.seq and Fw2_rc in record.seq:
                            pos1 = record.seq.find(Fw1) + 1
                            pos2 = record.seq.find(Fw2_rc) + 1
                            if pos1 < pos2:
                                distance = pos2 - pos1
                            if pos1 > pos2:
                                distance = pos1 - pos2
                            print("sample_2")
                            print(record.id)
                            print(record.seq)
                            print(str(k1) + " : " + str(Fw1) + " - The position is " + str(pos1))
                            print(str(k4) + " : " + str(Fw2_rc) + " - The position is " + str(pos2))
                            print('\n')
                        if Fw1_rc in record.seq and Fw2 in record.seq:
                            pos1 = record.seq.find(Fw1_rc) + 1
                            pos2 = record.seq.find(Fw2) + 1
                            if pos1 < pos2:
                                distance = pos2 - pos1
                            if pos1 > pos2:
                                distance = pos1 - pos2
                            print(record.id)
                            print(record.seq)
                            print(str(k2) + " : " + str(Fw1_rc) + " - The position is " + str(pos1))
                            print(str(k3) + " : " + str(Fw2) + " - The position is " + str(pos2))
                            print('\n')
                        count += 1

print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))
The desired output should be:
sample_2
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF - The position is 7
PLAU_fw2_rc : rrreee - The position is 18
sample_2
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff - The position is 1
MIF_fw2 : KKKZZZ - The position is 14
The total number of reads that have both 21nt protein-specific sequences is 2
I wasn't able to get the counts, and I reversed the key/value items in the dictionaries to allow lookups (which otherwise wouldn't have been possible for your desired results).
Also, I wasn't able to use Bio, so I just read from a text file, but my code could easily be changed to use the Bio seq and id.
import re

fw1 = {'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc = {'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc = dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2 = {'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc = {'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc = dict(zip(fw2_rc.values(), fw2_rc.keys()))

one_upcase = '(' + '|'.join(fw1.keys()) + ')'
one_locase = '(' + '|'.join(fw1_rc.keys()) + ')'
two_upcase = '(' + '|'.join(fw2.keys()) + ')'
two_locase = '(' + '|'.join(fw2_rc.keys()) + ')'

with open('f1.txt', 'r') as f:
    _id = ''
    count = 0
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            _id = line[1:]
        else:
            if match := re.search(f'(?=.*{one_upcase})(?=.*{two_locase})', line):
                print(_id)
                print(line)
                for item in match.groups():
                    idx = 1 + line.index(item)
                    if item.isupper():
                        print(fw1[item], ': ', end='')
                    else:
                        print(fw2_rc[item], ': ', end='')
                    print(item, 'The position is', idx)
                print()
            elif match := re.search(f'(?=.*{one_locase})(?=.*{two_upcase})', line):
                print(_id)
                print(line)
                for item in match.groups():
                    idx = 1 + line.index(item)
                    if item.isupper():
                        print(fw2[item], ': ', end='')
                    else:
                        print(fw1_rc[item], ': ', end='')
                    print(item, 'The position is', idx)
                print()
The output matches your output:
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF The position is 7
PLAU_fw2_rc : rrreee The position is 18
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff The position is 1
MIF_fw2 : KKKZZZ The position is 14
UPDATE
Here is a solution that does not use regular expressions.
(Also, I installed Bio to use it for this solution.)
from Bio import SeqIO

# make the values (fasta substrings) the keys in each dict; the original keys become values
fw1 = {'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc = {'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc = dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2 = {'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc = {'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc = dict(zip(fw2_rc.values(), fw2_rc.keys()))

# store fasta substrings in lists
one_upcase = list(fw1.keys())
one_locase = list(fw1_rc.keys())
two_upcase = list(fw2.keys())
two_locase = list(fw2_rc.keys())

with open('fasta.txt', 'r') as f:
    count = 0
    for record in SeqIO.parse(f, 'fasta'):
        _id = record.id
        seq = record.seq
        last = False
        for token_fw1 in one_upcase:
            if last == True:
                break
            for token_fw2_rc in two_locase:
                if token_fw1 in seq and token_fw2_rc in seq:
                    print(_id)
                    print(seq)
                    print(fw1[token_fw1], ':', token_fw1,
                          'in position', str(1+seq.index(token_fw1)))
                    print(fw2_rc[token_fw2_rc], ':', token_fw2_rc,
                          'in position', str(1+seq.index(token_fw2_rc)))
                    print()
                    last = True
                    break
        for token_fw1_rc in one_locase:
            if last == True:
                break
            for token_fw2 in two_upcase:
                if token_fw1_rc in seq and token_fw2 in seq:
                    print(_id)
                    print(seq)
                    print(fw1_rc[token_fw1_rc], ':', token_fw1_rc,
                          'in position', str(1+seq.index(token_fw1_rc)))
                    print(fw2[token_fw2], ':', token_fw2,
                          'in position', str(1+seq.index(token_fw2)))
                    print()
                    last = True
                    break
I didn't include a count here, as I didn't know what it was you wanted to count.
I reversed the dictionaries (I think you created them the wrong way round), so that the values (the fasta substrings) became the keys and the keys became the values. This permitted lookups in my solution:
print(fw1_rc[token_fw1_rc]) etc. and print(fw2_rc[token_fw2_rc]) etc.
(two of the four places this lookup is done).
Also, note that I use "token" here to mean the fasta substring.
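If the count you are after is the one in the question's desired output (the number of reads containing one of the two required combinations), here is a minimal sketch of how it could be tracked, reusing the reversed, substring-keyed dictionaries from above; the has_pair helper and the inline example sequences are only illustrative:

def has_pair(seq, first, second):
    # first/second are the reversed dicts, so their keys are the substrings
    return any(s in seq for s in first) and any(s in seq for s in second)

count = 0
for seq in ("NNNNNNCCCFFFNNNNNrrreeeNNNNNNN", "cccfffNNNNNNNKKKZZZNNNNNNN"):
    if has_pair(seq, fw1, fw2_rc) or has_pair(seq, fw1_rc, fw2):
        count += 1
print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))

In the SeqIO loop above, seq would be str(record.seq).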

Removing rows from a pandas dataframe while iterating through it

I have the following python script. In it, I am iterating through a CSV file which has rows and rows of loyalty cards. In many cases, there is more than one entry per card. I am currently looping through each row, then using loc to find all other instances of the card in the current row, so I can combine them together to post to an API. What I'd like to do however, is when that post is done, remove all the rows I've just merged, so that way the iteration doesn't hit them again.
That's the part I'm stuck on. Any ideas? Essentially I want to remove all the rows in card_list from csv before I go to the next iteration, so that even though there might be five rows with the same card number, I only process that card once. I tried using
csv = csv[csv.card != row.card]
at the end of the loop, thinking it might regenerate the dataframe without any rows whose card matches the one just processed, but it didn't work.
import urllib3
import json
import pandas as pd
import os
import time
import pyfiglet
from datetime import datetime
import array as arr

for row in csv.itertuples():
    dt = datetime.now()
    vouchers = []
    if minutePassed(time.gmtime(lastrun)[4]):
        print('Getting new token...')
        token = get_new_token()
        lastrun = time.time()
    print('processing ' + str(int(row.card)))
    card_list = csv.loc[csv['card'] == int(row.card)]
    print('found ' + str(len(card_list)) + ' vouchers against this card')
    for row in card_list.itertuples():
        print('appending card ' + str(int(row.card)) + ' voucher ' + str(row.voucher))
        vouchers.append(row.voucher)
    print('vouchers, ', vouchers)
    encoded_data = json.dumps({
        "store_id": row.store,
        "transaction_id": "11111",
        "card_number": int(row.card),
        "voucher_instance_ids": vouchers
    })
    print(encoded_data)
    number += 1
    r = http.request('POST', lcs_base_path + 'customer/auth/redeem-commit', body=encoded_data, headers={'x-api-key': api_key, 'Authorization': 'Bearer ' + token})
    response_data = json.loads(r.data)
    if (r.status == 200):
        print(str(dt) + ' ' + str(number) + ' done. processing card:' + str(int(row.card)) + ' voucher:' + str(row.voucher) + ' store:' + str(row.store) + ' status: ' + response_data['response_message'] + ' request:' + response_data['lcs_request_id'])
    else:
        print(str(dt) + ' ' + str(number) + 'done. failed to commit ' + str(int(row.card)) + ' voucher:' + str(row.voucher) + ' store:' + str(row.store) + ' status: ' + response_data['message'])
        new_row = {'card': row.card, 'voucher': row.voucher, 'store': row.store, 'error': response_data['message']}
        failed_csv = failed_csv.append(new_row, ignore_index=True)
        failed_csv.to_csv(failed_csv_file, index=False)
    csv = csv[csv.card != row.card]

print('script completed')
print(str(len(failed_csv)) + ' failed vouchers will be saved to failed_commits.csv')
print("--- %s seconds ---" % (time.time() - start_time))
First rule of thumb: never mutate what you are iterating over. Also, I think itertuples is the wrong tool here. Let's use groupby instead:
for card, card_list in csv.groupby('card'):
    # card_list now contains all the rows that have a specific card,
    # exactly like `card_list` in your code
    print('processing', card)
    print('found', len(card_list), 'vouchers against this card')

    # again, `itertuples` is overkill -- REMOVE IT
    # for row in card_list.itertuples():

    encoded_data = json.dumps({
        "store_id": card_list['store'].iloc[0],              # same as `row.store`
        "transaction_id": "11111",
        "card_number": int(card),
        "voucher_instance_ids": list(card_list['voucher'])   # same as `vouchers`
    })
    # ... other code
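For illustration, here is the same groupby pattern as a tiny self-contained demo; the column names come from the question, the data is made up:

import pandas as pd

csv = pd.DataFrame({
    'card':    [111, 111, 222, 111, 222],
    'voucher': [1, 2, 3, 4, 5],
    'store':   ['A', 'A', 'B', 'A', 'B'],
})

for card, card_list in csv.groupby('card'):
    # one iteration per distinct card, so no row is ever processed twice
    print('processing', card)
    print('  store:', card_list['store'].iloc[0])
    print('  vouchers:', list(card_list['voucher']))

Because each card appears exactly once as a group, there is no longer any need to drop rows from the dataframe while iterating.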

dict to list, and compare lists python

I have made a function that counts how many times each word is used in a file, that is, the word frequency. Right now the function can calculate the total number of words and show me the seven most common words and how many times they are used. Now I want to compare the first file, whose word frequencies I have analyzed, with another file containing the most common words used in the English language, to see if any of the words match.
What I have come up with is to make lists from the two files and then compare them with each other. But the code I wrote for this doesn't give me any output; any idea how I can solve this?
def CountWords():
    filename = input('What is the name of the textfile you want to open?: ')
    if filename == "alice" or "alice-ch1.txt" or " ":
        file = open("alice-ch1.txt", "r")
        print('You want to open alice-ch1.txt')
        wordcount = {}
        for word in file.read().split():
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1
        wordcount = {k.lower(): v for k, v in wordcount.items()}
        print(wordcount)
        sum = 0
        for val in wordcount.values():
            sum += val
        print('The total amount of words in Alice adventures in wonderland: ' + str(sum))
        sortList = sorted(wordcount.values(), reverse=True)
        most_freq_7 = sortList[0:7]
        #print (most_freq_7)
        print('Totoro says: The 7 most common words in Alice Adventures in Wonderland:')
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[0])] + " " + str(most_freq_7[0]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[1])] + " " + str(most_freq_7[1]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[2])] + " " + str(most_freq_7[2]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[3])] + " " + str(most_freq_7[3]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[4])] + " " + str(most_freq_7[4]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[5])] + " " + str(most_freq_7[5]))
        print(list(wordcount.keys())[list(wordcount.values()).index(most_freq_7[6])] + " " + str(most_freq_7[6]))

        file_common = open("common-words.txt", "r")
        commonwords = []
        contents = file_common.readlines()
        for i in range(len(contents)):
            commonwords.append(contents[i].strip('\n'))
        print(commonwords)

        # From here is the code where I need to find out how to compare the lists:
        alice_keys = wordcount.keys()
        result = set(filter(set(alice_keys).__contains__, commonwords))
        newlist = list()
        for elm in alice_keys:
            if elm not in result:
                newlist.append(elm)
        print('Here are the similar words: ' + str(newlist))  # Why doesn't this show?
    else:
        print('I am sorry, that filename does not exist. Please try again.')
I'm not in front of an interpreter so my code might be slightly off. But try something more like this.
from collections import Counter

with open("some_file_with_words") as f_file:
    # .split() so we count words, not individual characters
    counter = Counter(f_file.read().split())
top_seven = counter.most_common(7)

with open("commonwords") as f_common:
    common_words = f_common.read().split()

for word, count in top_seven:
    if word in common_words:
        print("your word " + word + " is in the most common words! It appeared " + str(count) + " times!")

.split(",") separating every character of a string

At some point the program takes the user's text input and separates the text at its commas, and then I ",".join it again into a txt file. The idea is to have a list with all the comma-separated information.
The problem is that, apparently, when I ",".join it, it separates every single character with commas: if I've got the string info1,info2 it separates correctly into info1 | info2, but when joining it back again it ends up like i,n,f,o,1,,,i,n,f,o,2, which is very inconvenient, since the program later reads the text back from the txt file to show it to the user. Can anyone help me with that?
categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'a')
categories.write(BookCategory + '\n')
categories.close()
categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'r')
categoryList = categories.readlines()
categories.close()

for category in BookCategory.split(','):
    for readCategory in lastReadCategoriesList:
        if readCategory.split(',')[0] == category.strip():
            count = int(readCategory.split(',')[1])
            count += 1
            i = lastReadCategoriesList.index(readCategory)
            lastReadCategoriesList[i] = category.strip() + "," + str(count).strip()
            isThere = True
    if not isThere:
        lastReadCategoriesList.append(category.strip() + ",1")
    isThere = False

lastReadCategories = open('c:/digitalLibrary/' + connectedUser + '/lastReadCategories.txt', 'w')
for category in lastReadCategoriesList:
    if category.split(',')[0] != "" and category != "":
        lastReadCategories.write(category + '\n')
lastReadCategories.close()

global finalList
finalList.append({"Title":BookTitle + '\n', "Author":AuthorName + '\n', "Borrowed":IsBorrowed + '\n', "Read":readList[len(readList)-1], "BeingRead":readingList[len(readingList)-1], "Category":BookCategory + '\n', "Collection":BookCollection + '\n', "Comments":BookComments + '\n'})
finalList = sorted(finalList, key=itemgetter('Title'))

for i in range(len(finalList)):
    categoryList[i] = finalList[i]["Category"]
    toAppend = (str(i + 1) + ".").ljust(7) + finalList[i]['Title'].strip()
    s.append(toAppend)

categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'w')
for i in range(len(categoryList)):
    categories.write(",".join(categoryList[i]))
categories.close()
You should pass ",".join() a list; you are passing in a single string instead.
Strings are sequences too, so ",".join() treats every character as a separate element:
>>> ','.join('Hello world')
'H,e,l,l,o, ,w,o,r,l,d'
>>> ','.join(['Hello', 'world'])
'Hello,world'
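Applied to the code above: categoryList[i] is already a single comma-separated string, so joining it is what inserts the extra commas. A sketch of the fix (not a tested drop-in patch) is to split each entry into a real list first, or simply write the string back unchanged:

categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'w')
for entry in categoryList:
    # entry is a string like "info1,info2\n"; split it into a real list,
    # then join that list instead of joining the string character by character
    parts = [part.strip() for part in entry.split(',')]
    categories.write(",".join(parts) + '\n')
categories.close()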
