error while make wikicorpus.txt NotImplementedError - python

I want to make a dictionary from wiki but i got this error and i dont know what exactly the error mean, this the code:
! wget https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2
-----------------------------------------------------------------------------------------
from gensim.corpora import WikiCorpus
wiki = WikiCorpus("idwiki-latest-pages-articles.xml.bz2", lemmatize=False, dictionary={})
with open("wiki-id-formatted.txt", 'w', encoding="utf8") as output:
counter = 0
for text in wiki.get_texts():
output.write(' '.join(text)+"\n")
counter = counter + 1
if counter > 200000:
break
and this the error
NotImplementedError Traceback (most recent call last)
<ipython-input-38-1b4f97b88e9f> in <module>()
1 # create txt file for spell check dictionary
----> 2 wiki = WikiCorpus("idwiki-latest-pages-articles.xml.bz2", lemmatize=False, dictionary={})
3
4 with open("wiki-id-formatted.txt", 'w', encoding="utf8") as output:
5 counter = 0
/usr/local/lib/python3.7/dist-packages/gensim/corpora/wikicorpus.py in __init__(self, fname, processes, lemmatize, dictionary, metadata, filter_namespaces, tokenizer_func, article_min_tokens, token_min_len, token_max_len, lower, filter_articles)
618 if lemmatize is not None:
619 raise NotImplementedError(
--> 620 'The lemmatize parameter is no longer supported. '
621 'If you need to lemmatize, use e.g. <https://github.com/clips/pattern>. '
622 'Perform lemmatization as part of your tokenization function and '
NotImplementedError: The lemmatize parameter is no longer supported. If you need to lemmatize, use e.g. <https://github.com/clips/pattern>. Perform lemmatization as part of your tokenization function and pass it as the tokenizer_func parameter to this initializer.

Related

sequence item 0: expected str instance, tuple found(2)

I analyzed the data in the precedent and tried to use topic modeling. Here is a
syntax I am using:
According to the error, I think it means that the string should go in when
joining, but the tuple was found. I don't know how to fix this part.
class FacebookAccessException(Exception): pass
def get_profile(request, token=None):
...
response = json.loads(urllib_response)
if 'error' in response:
raise FacebookAccessException(response['error']['message'])
access_token = response['access_token'][-1]
return access_token
#Join the review
word_list = ",".join([",".join(i) for i in sexualhomicide['tokens']])
word_list = word_list.split(",")
This is Error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\Users\Public\Documents\ESTsoft\CreatorTemp\ipykernel_13792\3474859476.py in <module>
1 #Join the review
----> 2 word_list = ",".join([",".join(i) for i in sexualhomicide['tokens']])
3 word_list = word_list.split(",")
C:\Users\Public\Documents\ESTsoft\CreatorTemp\ipykernel_13792\3474859476.py in <listcomp>(.0)
1 #Join the review
----> 2 word_list = ",".join([",".join(i) for i in sexualhomicide['tokens']])
3 word_list = word_list.split(",")
TypeError: sequence item 0: expected str instance, tuple found
This is print of 'sexual homicide'
print(sexualhomicide['cleaned_text'])
print("="*30)
print(twitter.pos(sexualhomicide['cleaned_text'][0],Counter('word')))
I can't upload the results of this syntax. Error occurs because it is classified as spam during the upload process.

Python OOP error calling function within function

I just started learning OOP and was trying to create a class
but apperently i am not able to call the fuction within function
class WordPic:
def __init__(self,filename,outputname):
self.skipped = ["was","in","the","have","think","these","we","as"]
self.filename = filename
self.outputname = outputname
self.txt_freq = {}
def get_frequancy(self):
with open (self.file_location,"r") as f:
lines = f.read().lower()
splited_lines = lines.split()
for line in splited_lines:
if line not in self.skipped and line.isalpha():
line = line[0].upper() + line[1:]
if line not in self.txt_freq:
self.txt_freq[line] = 1
else:
self.txt_freq[line] += 1
return self.txt_freq
def create_pic(self):
cloud = wordcloud.WordCloud(background_color="white")
cloud.generate_from_frequencies(self.txt_freq)
cloud.to_file("{}.jpg".format(self.outputname))
def create(self):
get_frequancy(self)
create_pic(self)
print("created")
wc = WordPic("try.txt","done")
wc.create()
the error that i encounter is
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [190], in <cell line: 2>()
1 wc= WordPic("try.txt","done")
----> 2 wc.create()
Input In [188], in WordPic.create(self)
28 def create(self):
---> 29 get_frequancy(self)
30 create_pic(self)
31 print("created")
NameError: name 'get_frequancy' is not defined
i am not able to find my way around if anyone can help. thank you
get_frequancy is not a nonlocal variable; it's a class attribute. It has to be accessed as such. (The same goes for create_pic.)
def create(self):
self.get_frequancy()
self.create_pic()
print("created")
(While WordPic.get_frequancy(self) would be sufficient in the example shown, calling instance methods like this runs into problems once you start taking inheritance into account.)

Pytest `pytest.raises(ValueError)` does not seem to detect a `ValueError`

EDIT. The issue was that everytime I would import the function, it would not changed with updates. For this I needed to do
import sys, importlib
importlib.reload(sys.modules['foo'])
from foo import bar
And it started working
I am trying to write a test using Pytest to detect a ValueError if a json file passed into a function is invalid. However, when I follow the example, the test doesn't detect that the ValueError was raised.
This is the function I want to test
import pytest
import json
def read_file(input_file):
try:
with open(input_file, "r", encoding='utf-8') as reader:
pre_input_data = json.load(reader)
except ValueError:
raise ValueError
And this is my test function
def test_read_file():
with pytest.raises(ValueError):
read_file("invalidJsonFile.json")
If I just run the original function, it raises the ValueError
read_file("invalidJsonFile.json")
Invalid json file: Expecting value: line 1 column 1 (char 0)
However, when I run the test, it says it did not get a ValueError
test_read_file()
Invalid json file: Expecting value: line 1 column 1 (char 0)
---------------------------------------------------------------------------
Failed Traceback (most recent call last)
<ipython-input-47-c42b81670a67> in <module>()
----> 1 test_read_file()
2 frames
<ipython-input-46-178e6c645f01> in test_read_file()
1 def test_read_file():
2 with pytest.raises(Exception):
----> 3 read_file("invalidJsonFile.json")
/usr/local/lib/python3.6/dist-packages/_pytest/python_api.py in __exit__(self, *tp)
727 __tracebackhide__ = True
728 if tp[0] is None:
--> 729 fail(self.message)
730 self.excinfo.__init__(tp)
731 suppress_exception = issubclass(self.excinfo.type, self.expected_exception)
/usr/local/lib/python3.6/dist-packages/_pytest/outcomes.py in fail(msg, pytrace)
115 """
116 __tracebackhide__ = True
--> 117 raise Failed(msg=msg, pytrace=pytrace)
118
119
Failed: DID NOT RAISE <class 'Exception'>
Are you sure you're running the same code you sent here? because in a stack trace it looks like you're reading a different file (which could be valid and then no exception will be raised, if it's empty for example).
----> 3 read_file("sampleData.csv")
Also, you do not need to except ValueError just to raise ValueError, when you use pytest.raises(ValueError): pytest will check if the exception is instanceof ValueError.

rdd.first() does not give an error but rdd.collect() does

I am working in pyspark and have the following code, where I am processing tweet and making an RDD with the user_id and text. Below is the code
"""
# Construct an RDD of (user_id, text) here.
"""
import json
def safe_parse(raw_json):
try:
json_object = json.loads(raw_json)
if 'created_at' in json_object:
return json_object
else:
return;
except ValueError as error:
return;
def get_usr_txt (line):
tmp = safe_parse (line)
return ((tmp.get('user').get('id_str'),tmp.get('text')));
usr_txt = text_file.map(lambda line: get_usr_txt(line))
print (usr_txt.take(5))
and the output looks okay (as shown below)
[('470520068', "I'm voting 4 #BernieSanders bc he doesn't ride a CAPITALIST PIG adorned w/ #GoldmanSachs $. SYSTEM RIGGED CLASS WAR "), ('2176120173', "RT #TrumpNewMedia: .#realDonaldTrump #America get out & #VoteTrump if you don't #VoteTrump NOTHING will change it's that simple!\n#Trump htt…"), ('145087572', 'RT #Libertea2012: RT TODAY: #Colorado’s leading progressive voices to endorse #BernieSanders! #Denver 11AM - 1PM in MST CO State Capitol…'), ('23047147', '[VID] Liberal Tears Pour After Bernie Supporter Had To Deal With Trump Fans '), ('526506000', 'RT #justinamash: .#tedcruz is the only remaining candidate I trust to take on what he correctly calls the Washington Cartel. ')]
However, as soon as I do
print (usr_txt.count())
I get an error like below
Py4JJavaError Traceback (most recent call last)
<ipython-input-60-9dacaf2d41b5> in <module>()
8 usr_txt = text_file.map(lambda line: get_usr_txt(line))
9 #print (usr_txt.take(5))
---> 10 print (usr_txt.count())
11
/usr/local/spark/python/pyspark/rdd.py in count(self)
1054 3
1055 """
-> 1056 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1057
1058 def stats(self):
What am I missing? Is the RDD not created properly? or there is something else? how do I fix it?
You have returned None from safe_parse method when there is no created_at element in the parsed json line or when there is an error in parsing. This created error while getting elements from the parsed jsons in (tmp.get('user').get('id_str'),tmp.get('text')). That caused the error to occur
The solution is to check for None in get_usr_txt method
def get_usr_txt (line):
tmp = safe_parse(line)
if(tmp != None):
return ((tmp.get('user').get('id_str'),tmp.get('text')));
Now the question is why print (usr_txt.take(5)) showed the result and print (usr_txt.count()) caused the error
Thats because usr_txt.take(5) considered only the first five rdds and not the rest and didn't have to deal with None datatype.

NodeBox error for a verb in python

I downloaded the package http://nodebox.net/code/index.php/Linguistics#verb_conjugation
I'm getting an error even when I tried to get a tense of a verb .
import en
print en.is_verb('use')
#prints TRUE
print en.verb.tense('use')
KeyError Traceback (most recent call last)
/home/cse/version2_tense.py in <module>()
----> 1
2
3
4
5
/home/cse/en/__init__.pyc in tense(self, word)
124
125 def tense(self, word):
--> 126 return verb_lib.verb_tense(word)
127
128 def is_tense(self, word, tense, negated=False):
/home/cse/en/verb/__init__.pyc in verb_tense(v)
175
176 infinitive = verb_infinitive(v)
--> 177 a = verb_tenses[infinitive]
178 for tense in verb_tenses_keys:
179 if a[verb_tenses_keys[tense]] == v:
KeyError: ''
The reason you are getting this error is because there is a mistake in the ~/Library/Application Support/NodeBox/en/verb/verb.txt file they are using to create the dictionary.
use is the infinitive form, however, "used" is entered as the infinitive.
at line 5857:
used,,,uses,,using,,,,,used,used,,,,,,,,,,,,
should be:
use,,,uses,,using,,,,,used,used,,,,,,,,,,,,
after editing and saving the file:
import en
print en.is_verb("use")
print en.verb.infinitive('use')
print en.verb.tense('use')
gives:
True
use
infinitive
extra:
import en
print 'use %s' % en.verb.tense("use")
print 'uses %s' % en.verb.tense("uses")
print 'using %s' % en.verb.tense('using')
print 'used %s' % en.verb.tense('used')
use infinitive
uses 3rd singular present
using present participle
used past

Categories

Resources