Only getting the last result from the XML file - Python

Here is my code. I tried to use the print function to check, and I have tagged what I found next to the code using #:
def file(entry):
    file_name = str(entry)
    if file_name.endswith('.xml'):
        tree = ET.parse(file_name)
        root = tree.getroot()
        for i in range(len(root)):
            in_text = str(root[i][5].text).lower()
            print(in_text)  # here I still get all data
    elif file_name.endswith('.json'):
        with open(file_name) as f:
            j_text = json.load(f)
        in_text = (j_text['text']).lower()
    else:
        root_error = tk.Tk()
        root_error.title('Error !')
        canvas_error = tk.Canvas(root_error, height=10, width=100)
        canvas_error.pack()
        label_error = tk.Label(root_error, text='file type dont support')
        label_error.pack()
        root_error.mainloop()
    remove_digits = str.maketrans('', '', digits)
    res = in_text.translate(remove_digits)
    print(res)  # here I get only the last one
    token_text = sent_tokenize(res)
    sent_string = '\n'.join(token_text)
    removed_pun = str(sent_string).translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(str(removed_pun))
    result = [i for i in tokens if not i in stop_words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in result]
    lemmatizer = WordNetLemmatizer()
    final_text = ' '.join([lemmatizer.lemmatize(w) for w in stemmed])
    lower_label_out['text'] = final_text
But when I use only this code:
tree = ET.parse('books.xml')
root = tree.getroot()
for i in range(len(root)):
    print(root[i][5].text)
I get all the data. I don't understand why the function only gives me the last record. How can I fix it?

As written in the comment, your problem is that you overwrite the label['text'] value in each iteration. With the new indentation, you just shifted the problem from the out_text variable to the label['text'] variable. If you want to get a list of all out_texts, I'd suggest doing the following:
out_text = []
for i in range(len(root)):
    # in each iteration, append the new string to the list
    out_text.append(str(root[i][0].text))
label_out['text'] = out_text
In each iteration, the value of str(root[i][0].text) is appended to the list, which is finally assigned to label_out['text'].
However, I'd suggest you look into how for loops work in Python, as you could write the same statement as follows:
out_text = []
for ro in root:
    out_text.append(str(ro[0].text))
label_out['text'] = out_text
The reason the print() statement works is that you put it inside the for loop, so each time the code passes through it, the current value is printed to the screen.
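As a side note beyond the original answer: a Tkinter label displays a string, so if out_text is a list you may want to join it before assigning. A minimal sketch, assuming root and label_out exist as in the question:
out_text = []
for ro in root:
    out_text.append(str(ro[0].text))

# join the collected strings so the label shows one entry per line
label_out['text'] = '\n'.join(out_text)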

The last line in your for loop is mis-indented, so it only shows the last element.
Try changing it to:
for i in range(len(root)):
    out_text = str(root[i][0].text)
    label_out['text'] = out_text  # note the new indentation
and see if it works.
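Applied back to the XML branch of the original file() function, the same fix would collect every record's text before the cleaning steps run. A minimal sketch (not the full function), using the books.xml file from the question:
import xml.etree.ElementTree as ET

tree = ET.parse('books.xml')
root = tree.getroot()

# collect the text of every record instead of overwriting in_text each pass
texts = [str(child[5].text).lower() for child in root]
in_text = '\n'.join(texts)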

Related

Print from High to Low occurrences of Dictionary in Python

I have code that counts every word in a file and how many times each word occurred.
filename = "test.txt"
output = []
with open(filename) as f:
content = f.readlines()
content = [x.strip() for x in content]
wordlist = {}
for line in content:
for entry in line.split():
word = entry.replace('.', '')
word = word.replace(',', '')
word = word.replace('!', '')
word = word.replace('?', '')
if word not in wordlist:
wordlist[word] = 1
else:
wordlist[word] = wordlist[word] + 1
print(wordlist)
However, when I print this, I am not able to specify that it should go from high to low occurrences.
Here is a test file.
hello my friend. hello sir.
How do I print such that it looks like
hello: 2 (newline)
my: 1
etc?
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
# print(filepath.exists())
with open(filepath) as f:
    content = f.readlines()
word_list = sum((
    (s.strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])
for key, value in Counter(word_list).items():
    print(f'{key} : {value}')
In Python 3.7 and up, dicts preserve insertion order, so we can sort the dictionary items by value and then insert them into a new dict.
Use:
print(dict(sorted(wordlist.items(), key=lambda x: -x[1])))
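As an addition beyond the answers above: collections.Counter can also return the counts already sorted from highest to lowest via its most_common() method. A minimal sketch using the expected counts from the test file:
from collections import Counter

wordlist = Counter({'hello': 2, 'my': 1, 'friend': 1, 'sir': 1})

# most_common() yields (word, count) pairs sorted by count, descending
for word, count in wordlist.most_common():
    print(f'{word}: {count}')
# hello: 2
# my: 1
# friend: 1
# sir: 1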

Use of For loop in processing directory contents in Python

I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words and prefixing each found word with a user-defined tag. My code is as follows.
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
data = data.lower()
modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}
format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format
data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
with open(filename, "w") as f:
    f.write(str(data))
print(data)  # This is just added in order to check on screen all files
             # are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (1 out of 10 in this case). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
Regards
My speculation is your code is only showing the last file because it's not indented properly to have all relevant code within the for loop.
Try with this indentation:
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
    data = data.lower()
    modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
    personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                          "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
    approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                    "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
    plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                     "cogitate":1, "contemplate":1, "deliberate":1}
    format_modal = "<555>{} ".format
    format_attribute = "<666>{} ".format
    format_app_adaptor = "<777>{} ".format
    format_plaus_shield = "<888>{} ".format
    data = " ".join(format_modal(word) if word in modals else word for word in data.split())
    data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
    data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
    data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
    print(data)  # This is just added in order to check on screen all files
                 # are being processed.
Assuming all of your code is supposed to be in your for loop: you are overwriting your text file, therefore it looks like only your last run is working:
# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))
change to:
# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))
This will append to your text file and will not overwrite previously added data with the data from the last loop.
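A third option, not in either answer above: leave the source files untouched and write each tagged result to its own output file. A minimal sketch; the '_tagged' suffix is made up for illustration:
import os

def write_tagged(filename, data):
    # e.g. 'C:/ACC_Tag_Test/notes.txt' -> 'C:/ACC_Tag_Test/notes_tagged.txt'
    base, ext = os.path.splitext(filename)
    with open(base + '_tagged' + ext, 'w') as out:
        out.write(data)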

Stopword not removing one word

I want to remove 'dan' in the filtering process, but it didn't work.
Here is my code:
for row in readCSV:
    _word = []
    username = row[0]
    date = row[1]
    text = row[2].lower()
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = replaceMultiple(text, ["!","#","#","$","%","^","&","*","(",
                                  ")","_","-","+","=","{","}","[","]",
                                  "\\","/",",",".","?","<",">",":",";",
                                  "'",'"',"~","0","1","2","3","4","5","6","7","8","9"], '')
    text = text.strip()
    nltk_tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words("indonesian"))
    stop_words_new = ['aku','dan','duh','hhhmmm','thn','nih','tgl',
                      'hai','jazz','bro','broo','msh','']
    new_stopwords_list = stop_words.union(stop_words_new)
Words in stop_words_new are removed, except 'dan'.
Why?
The code should not be working because you are joining a set with a list. Try making stop_words_new a set instead of a list.
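For completeness, a minimal sketch of the filtering step itself, since the snippet in the question stops after building new_stopwords_list (the token list and the small stop_words set here are made up for illustration):
stop_words = {'yang', 'di', 'ke'}  # stand-in for set(stopwords.words("indonesian"))
stop_words_new = {'aku', 'dan', 'duh', 'hhhmmm', 'thn', 'nih', 'tgl',
                  'hai', 'jazz', 'bro', 'broo', 'msh', ''}
new_stopwords_list = stop_words.union(stop_words_new)

nltk_tokens = ['saya', 'dan', 'kamu', 'aku', 'pergi']
# keep only tokens that are not in the combined stopword set; 'dan' is dropped
filtered = [w for w in nltk_tokens if w not in new_stopwords_list]
print(filtered)  # ['saya', 'kamu', 'pergi']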

Using Binary Search for Spelling Check

I am trying to use binary search to check the spelling of words in a file, and print out the words that are not in the dictionary. But as of now, most of the correctly spelled words are being printed as misspelled (words that cannot be found in the dictionary).
The dictionary file is also a text file, and it looks like:
abactinally
abaction
abactor
abaculi
abaculus
abacus
abacuses
Abad
abada
Abadan
Abaddon
abaddon
abadejo
abadengo
abadia
Code:
def binSearch(x, nums):
    low = 0
    high = len(nums) - 1
    while low <= high:
        mid = (low + high) // 2
        item = nums[mid]
        if x == item:
            print(nums[mid])
            return mid
        elif x < item:
            high = mid - 1
        else:
            low = mid + 1
    return -1

def main():
    print("This program performs a spell-check in a file")
    print("and prints a report of the possibly misspelled words.\n")
    # get the sequence of words from the file
    fname = input("File to analyze: ")
    text = open(fname, 'r').read()
    for ch in '!"#$%&()*+,-./:;<=>?#[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    words = text.split()
    # import dictionary from file
    fname2 = input("File of dictionary: ")
    dic = open(fname2, 'r').read()
    dic = dic.split()
    # perform binary search for misspelled words
    misw = []
    for w in words:
        m = binSearch(w, dic)
        if m == -1:
            misw.append(w)
Your binary search works perfectly! You don't seem to be removing all special characters, though.
Testing your code (with a sentence of my own):
def main():
    print("This program performs a spell-check in a file")
    print("and prints a report of the possibly misspelled words.\n")
    text = 'An old mann gathreed his abacus, and ran a mile. His abacus\n ran two miles!'
    for ch in '!"#$%&()*+,-./:;<=>?#[\\]^_`{|}~':
        text = text.replace(ch, ' ')
    words = text.lower().split(' ')
    dic = ['a', 'abacus', 'an', 'and', 'arranged', 'gathered', 'his', 'man', 'mile', 'miles', 'old', 'ran', 'two']
    # perform binary search for misspelled words
    misw = []
    for w in words:
        m = binSearch(w, dic)
        if m == -1:
            misw.append(w)
    print(misw)
prints as output ['mann', 'gathreed', '', '', 'abacus\n', '']
Those extra empty strings '' come from the punctuation that you replaced with spaces. The \n (a line break) is a little more problematic, as it is something you definitely see in external text files but is not intuitive to account for. Instead of for ch in '!"#$%&()*+,-./:;<=>?#[\\]^_`{|}~':, just check whether every character .isalpha(). Try this:
def main():
    ...
    text = 'An old mann gathreed his abacus, and ran a mile. His abacus\n ran two miles!'
    for ch in text:
        if not ch.isalpha() and not ch == ' ':
            # we want to keep spaces, or else we'd only have one word in our entire text
            text = text.replace(ch, '')  # replace with empty string (basically, remove)
    words = text.lower().split(' ')
    # import dictionary
    dic = ['a', 'abacus', 'an', 'and', 'arranged', 'gathered', 'his', 'man', 'mile', 'miles', 'old', 'ran', 'two']
    # perform binary search for misspelled words
    misw = []
    for w in words:
        m = binSearch(w, dic)
        if m == -1:
            misw.append(w)
    print(misw)
Output:
This program performs a spell-check in a file
and prints a report of the possibly misspelled words.
['mann', 'gathreed']
Hope this was helpful! Feel free to comment if you need clarification or something doesn't work.
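As an aside beyond the original answer: the standard library's bisect module implements the same binary search, so the lookup can also be written without a hand-rolled loop. A minimal, self-contained sketch:
import bisect

def in_dictionary(word, sorted_words):
    # bisect_left returns the insertion point for word; the word is
    # present only if that position already holds an equal entry
    i = bisect.bisect_left(sorted_words, word)
    return i < len(sorted_words) and sorted_words[i] == word

dic = ['a', 'abacus', 'an', 'and', 'his', 'man', 'mile', 'old', 'ran', 'two']
print(in_dictionary('abacus', dic))  # True
print(in_dictionary('mann', dic))    # False
Like binSearch, this requires the word list to be sorted.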

Loading a classifier using Pickle?

I am trying to run a sentiment analysis. I have managed to use Naive Bayes through nltk to classify a corpus of negative and positive tweets. However, I do not want to go through the process of training this classifier every time I run this program, so I tried to use pickle to save it and then load it into a different script. However, when I try to run the script it returns the error NameError: name 'classifier' is not defined, although I thought it was defined through def load_classifier().
The code I have at the moment is below:
import nltk, pickle
from nltk.corpus import stopwords

customstopwords = ['']

p = open('xxx', 'r')
postxt = p.readlines()
n = open('xxx', 'r')
negtxt = n.readlines()

neglist = []
poslist = []
for i in range(0, len(negtxt)):
    neglist.append('negative')
for i in range(0, len(postxt)):
    poslist.append('positive')

postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
taggedtweets = postagged + negtagged

tweets = []
for (word, sentiment) in taggedtweets:
    word_filter = [i.lower() for i in word.split()]
    tweets.append((word_filter, sentiment))

def getwords(tweets):
    allwords = []
    for (words, sentiment) in tweets:
        allwords.extend(words)
    return allwords

def getwordfeatures(listoftweets):
    wordfreq = nltk.FreqDist(listoftweets)
    words = wordfreq.keys()
    return words

wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in stopwords.words('english')]
wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in customstopwords]

def feature_extractor(doc):
    docwords = set(doc)
    features = {}
    for i in wordlist:
        features['contains(%s)' % i] = (i in docwords)
    return features

training_set = nltk.classify.apply_features(feature_extractor, tweets)

def load_classifier():
    f = open('my_classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close()
    return classifier

while True:
    input = raw_input('I hate this film')
    if input == 'exit':
        break
    elif input == 'informfeatures':
        print classifier.show_most_informative_features(n=30)
        continue
    else:
        input = input.lower()
        input = input.split()
        print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n'

p.close()
n.close()
Any help would be great; the script seems to make it to the print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n' line before returning the error...
Well, you have declared and defined the load_classifier() method but never called it and assigned a variable with its result. That means that by the time execution reaches the print '\nSentiment is ...' line, there is no variable named classifier, so the execution throws an exception.
Add the line classifier = load_classifier() just before the while loop (without any indentation).
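Concretely, the change could look like this (a minimal sketch in the question's own Python 2 style; everything above the while loop stays as it is):
training_set = nltk.classify.apply_features(feature_extractor, tweets)

# load the pickled classifier once, before the loop that uses it
classifier = load_classifier()

while True:
    input = raw_input('I hate this film')
    # ... rest of the loop unchanged ...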
