I have an issue with my function. The design is to aggregate word tokens into dictionaries.
This is the code:
def preprocess(texts):
    case = truecase.get_true_case(texts)
    doc = nlp(case)
    return doc

def summarize_texts(texts):
    doc = preprocess(texts)
    actions = {}
    entities = {}
    for token in doc:
        if token.pos_ == "VERB":
            actions[token.lemma_] = actions.get(token.text, 0) + 1
    for token in doc.ents:
        entities[token.label_] = [token.text]
    return {
        'actions': actions,
        'entities': entities
    }
The problem I am having is the function works as expected for a single input:
summarize_texts("Play something by Billie Holiday")
{'actions': {'play': 1}, 'entities': {'PERSON': ['Billie']}}
but the objective is to be able to pass a list (or a CSV file) through it and have it aggregate everything.
When I try:
docs = [
    "Play something by Billie Holiday",
    "Set a timer for five minutes",
    "Play it again, Sam"
]
summarize_texts(docs)
I get the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-200347d5cac5> in <module>()
4 "Play it again, Sam"
5 ]
----> 6 summarize_texts(docs)
5 frames
<ipython-input-16-08c879553d6e> in summarize_texts(texts)
1 def summarize_texts(texts):
----> 2 doc = preprocess(texts)
3 actions = {}
4 entities = {}
5 for token in doc:
<ipython-input-12-fccf767830b1> in preprocess(texts)
1 def preprocess (texts):
----> 2 case = truecase.get_true_case(texts)
3 doc = nlp(case)
4 return doc
/usr/local/lib/python3.6/dist-packages/truecase/__init__.py in get_true_case(sentence, out_of_vocabulary_token_option)
14 return get_truecaser().get_true_case(
15 sentence,
---> 16 out_of_vocabulary_token_option=out_of_vocabulary_token_option)
/usr/local/lib/python3.6/dist-packages/truecase/TrueCaser.py in get_true_case(self, sentence, out_of_vocabulary_token_option)
97 as-is: Returns OOV tokens as is
98 """
---> 99 tokens = self.tknzr.tokenize(sentence)
100
101 tokens_true_case = []
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/casual.py in tokenize(self, text)
293 """
294 # Fix HTML character entities:
--> 295 text = _replace_html_entities(text)
296 # Remove username handles
297 if self.strip_handles:
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/casual.py in _replace_html_entities(text, keep, remove_illegal, encoding)
257 return "" if remove_illegal else match.group(0)
258
--> 259 return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
260
261
TypeError: expected string or bytes-like object
I expect to get the output:
{'actions': {'play': 2, 'set': 1}, 'entities': {'PERSON': ['Billie', 'Sam'], 'TIME': ['five minutes']}}
Not sure what's wrong with my function syntax.
Looks like your problem is that truecase.get_true_case(texts) expects a string (or bytes-like) argument, and you're passing it a list of strings.
You'll need to iterate through texts and preprocess each item in the list separately:
def preprocess(text):
    case = truecase.get_true_case(text)
    doc = nlp(case)
    return doc

def summarize_texts(texts):
    actions = {}
    entities = {}
    for text in texts:
        doc = preprocess(text)
        for token in doc:
            if token.pos_ == "VERB":
                # count by lemma; the original looked up token.text,
                # so repeated verbs never accumulated past 1
                actions[token.lemma_] = actions.get(token.lemma_, 0) + 1
        for ent in doc.ents:
            # append rather than overwrite, so entities accumulate across texts
            entities.setdefault(ent.label_, []).append(ent.text)
    return {
        'actions': actions,
        'entities': entities
    }
Try using a for loop over texts before calling preprocess:
for i in texts:
    doc = preprocess(i)
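Since the question also mentions reading from a CSV file, here is a minimal sketch of loading the texts first (assuming a hypothetical file texts.csv with a column named text; adjust both to your data):
```
import pandas as pd

# hypothetical file and column name; change to match your CSV
texts = pd.read_csv("texts.csv")["text"].dropna().tolist()
print(summarize_texts(texts))
```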
I want to use stanza for tokenizing, POS tagging and parsing some text I have, but it keeps giving me this error. I've tried changing the way I call it but nothing happens. Any ideas?
My code (here I iterate through a list of lists of text and apply stanza to each one):
t = time()

data_stanza = []
for text in data:
    stz = apply_stanza(text[0])
    data_stanza.append(stz)

print('Time to run: {} mins'.format(round((time() - t) / 60, 2)))
This is the function I use to apply stanza to each text:
nlp = stanza.Pipeline('pt')

def apply_stanza(text):
    doc = nlp(text)
    All = []
    for sent in doc.sentences:
        for word in sent.words:
            All.append((word.id, word.text, word.lemma, word.upos, word.feats, word.head, word.deprel))
    return All
The error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-7ac303eec8e8> in <module>
3 data_staza = []
4 for text in data:
----> 5 stz = apply_stanza(text[0])
6 data_stanza.append(stz)
7
<ipython-input-16-364c3ac30f32> in apply_stanza(text)
2
3 def apply_stanza(text):
----> 4 doc = nlp(text)
5 All = []
6 for sent in doc.sentences:
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in __call__(self, doc)
174 assert any([isinstance(doc, str), isinstance(doc, list),
175 isinstance(doc, Document)]), 'input should be either str, list or Document'
--> 176 doc = self.process(doc)
177 return doc
178
~\anaconda3\lib\site-packages\stanza\pipeline\core.py in process(self, doc)
168 for processor_name in PIPELINE_NAMES:
169 if self.processors.get(processor_name):
--> 170 doc = self.processors[processor_name].process(doc)
171 return doc
172
~\anaconda3\lib\site-packages\stanza\pipeline\mwt_processor.py in process(self, document)
31 preds = []
32 for i, b in enumerate(batch):
---> 33 preds += self.trainer.predict(b)
34
35 if self.config.get('ensemble_dict', False):
~\anaconda3\lib\site-packages\stanza\models\mwt\trainer.py in predict(self, batch, unsort)
77 self.model.eval()
78 batch_size = src.size(0)
---> 79 preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
80 pred_seqs = [self.vocab.unmap(ids) for ids in preds] # unmap to tokens
81 pred_seqs = utils.prune_decoded_seqs(pred_seqs)
~\anaconda3\lib\site-packages\stanza\models\common\seq2seq_model.py in predict(self, src, src_mask, pos, beam_size)
259 done = []
260 for b in range(batch_size):
--> 261 is_done = beam[b].advance(log_probs.data[b])
262 if is_done:
263 done += [b]
~\anaconda3\lib\site-packages\stanza\models\common\beam.py in advance(self, wordLk, copy_indices)
82 # bestScoresId is flattened beam x word array, so calculate which
83 # word and beam each score came from
---> 84 prevK = bestScoresId / numWords
85 self.prevKs.append(prevK)
86 self.nextYs.append(bestScoresId - prevK * numWords)
RuntimeError: Integer division of tensors using div or / is no longer supported, and in a future release div will perform
true division as in Python 3. Use true_divide or floor_divide (// in Python) instead.
EDIT: It turns out after all that it was an error with the mwt module of the stanza pipeline, so I just specified not to use it.
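A minimal sketch of that workaround (the exact processor list is an assumption and may vary with your stanza version and language model):
```
import stanza

# build the pipeline without the 'mwt' processor that raised the error
nlp = stanza.Pipeline('pt', processors='tokenize,pos,lemma,depparse')
```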
Use // for division instead of /.
The traceback shows the failing division is inside stanza's own beam search (stanza/models/common/beam.py), at prevK = bestScoresId / numWords. With floor division that line becomes prevK = bestScoresId // numWords, which is what newer PyTorch versions require for integer tensors.
Floor division (//) floors the result to the largest integer less than or equal to the true quotient.
Use torch.true_divide(dividend, divisor) or numpy.true_divide(dividend, divisor) instead.
For example, torch.true_divide(3, 4) returns 0.75.
https://pytorch.org/docs/stable/generated/torch.true_divide.html
https://numpy.org/doc/stable/reference/generated/numpy.true_divide.html
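A quick sketch contrasting the two suggested operations on integer tensors:
```
import torch

a = torch.tensor(3)
b = torch.tensor(4)

print(torch.true_divide(a, b))   # tensor(0.7500), true division
print(torch.floor_divide(a, b))  # tensor(0), floor division, same as a // b
```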
I'm trying to translate part of the SQuAD 1.1 dataset to Sinhalese. I don't know whether I can use the JSON file straight into translation.
What I tried so far is making a little dataframe of the SQuAD dataset and trying to translate that as a demo to myself. But I got different errors. Below is the error I'm getting now. Can you help me fix that error, or tell me a better way to complete my task using Python?
```
import googletrans
from googletrans import Translator
import os
from google.cloud import translate_v2 as translate
os.environ['GOOGLE_APPLICATION_CREDENTIALS']=r"C:\Users\Sathsara\Documents\Python Learning\Translation test\translationAPI\flash-medley-278816-b2012b874797.json"
# create a translator object
translator = Translator()
# use translate method to translate a string - by default, the destination language is english
translated = translator.translate('I am Sathsara Rasantha',dest='si')
# the translate method returns an object
print(translated)
# obtain translated string by using attribute .text
translated.text
import pandas as pd
translate_example = pd.read_json("example2.json")
translate_example
contexts = []
questions = []
answers_text = []
answers_start = []
for i in range(translate_example.shape[0]):
    topic = translate_example.iloc[i,0]['paragraphs']
    for sub_para in topic:
        for q_a in sub_para['qas']:
            questions.append(q_a['question'])
            answers_start.append(q_a['answers'][0]['answer_start'])
            answers_text.append(q_a['answers'][0]['text'])
            contexts.append(sub_para['context'])
df = pd.DataFrame({"context":contexts, "question": questions, "answer_start": answers_start, "text": answers_text})
df
df=df.loc[0:2,:]
df
# make a deep copy of the data frame
df_si = df.copy()
# translate columns' name using rename function
df_si.rename(columns=lambda x: translator.translate(x).text, inplace=True)
df_si.columns
translations = {}
for column in df_si.columns:
    # unique elements of the column
    unique_elements = df_si[column].unique()
    for element in unique_elements:
        # add translation to the dictionary
        translations[element] = translator.translate(element,dest='si').text
print(translations)
# modify all the terms of the data frame by using the previously created dictionary
df_si.replace(translations, inplace = True)
# check translation
df_si.head()
```
This is the error I get:
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) <ipython-input-24-f55a5ca59c36> in <module>
> 5 for element in unique_elements:
> 6 # add translation to the dictionary
> ----> 7 translations[element] = translator.translate(element,dest='si').text
> 8
> 9 print(translations)
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in translate(self,
> text, dest, src)
> 170
> 171 origin = text
> --> 172 data = self._translate(text, dest, src)
> 173
> 174 # this code will be updated when the format is changed.
>
> ~\Anaconda3\lib\site-packages\googletrans\client.py in
> _translate(self, text, dest, src)
> 73 text = text.decode('utf-8')
> 74
> ---> 75 token = self.token_acquirer.do(text)
> 76 params = utils.build_params(query=text, src=src, dest=dest,
> 77 token=token)
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in do(self, text)
> 199 def do(self, text):
> 200 self._update()
> --> 201 tk = self.acquire(text)
> 202 return tk
>
> ~\Anaconda3\lib\site-packages\googletrans\gtoken.py in acquire(self,
> text)
> 144 a = []
> 145 # Convert text to ints
> --> 146 for i in text:
> 147 val = ord(i)
> 148 if val < 0x10000:
>
> TypeError: 'numpy.int64' object is not iterable
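The traceback shows translator.translate being handed a numpy.int64: the answer_start column is numeric, and googletrans can only tokenize strings. A minimal sketch of one way around it (not from the original thread): translate only the string values and leave the numbers untouched.
```
translations = {}
for column in df_si.columns:
    for element in df_si[column].unique():
        # googletrans can only tokenize text, so skip non-string values
        if isinstance(element, str):
            translations[element] = translator.translate(element, dest='si').text
```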
def words_to_indices(inverse_vocabulary, words):
    return [inverse_vocabulary[word] for word in words]

if __name__ == "__main__":
    vocabulary = open("G:\clickbait-detector-master\data/vocabulary.txt").read().split("\n")
    inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))

    clickbait = open("G:\clickbait-detector-master\data/clickbait.preprocessed.txt").read().split("\n")
    clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)

    genuine = open("G:\clickbait-detector-master\data/genuine.preprocessed.txt").read().split("\n")
    genuine = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in genuine], maxlen=SEQUENCE_LENGTH)
My error is:
KeyError Traceback (most recent call last)
<ipython-input-6-692b7e251048> in <module>()
25
26 clickbait = open("G:\clickbait-detector-master\data/clickbait.preprocessed.txt").read().split("\n")
---> 27 clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
28
29 genuine = open("G:\clickbait-detector-master\data/genuine.preprocessed.txt").read().split("\n")
<ipython-input-6-692b7e251048> in <listcomp>(.0)
25
26 clickbait = open("G:\clickbait-detector-master\data/clickbait.preprocessed.txt").read().split("\n")
---> 27 clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
28
29 genuine = open("G:\clickbait-detector-master\data/genuine.preprocessed.txt").read().split("\n")
<ipython-input-6-692b7e251048> in words_to_indices(inverse_vocabulary, words)
16
17 def words_to_indices(inverse_vocabulary, words):
---> 18 return [inverse_vocabulary[word] for word in words]
19
20 if __name__ == "__main__":
<ipython-input-6-692b7e251048> in <listcomp>(.0)
16
17 def words_to_indices(inverse_vocabulary, words):
---> 18 return [inverse_vocabulary[word] for word in words]
19
20 if __name__ == "__main__":
KeyError: 'C'
I don't know how you want to handle missing values, but the error is telling you that it comes from this:
def words_to_indices(inverse_vocabulary, words):
    return [inverse_vocabulary[word] for word in words]
Specifically, the issue is inverse_vocabulary[word], which throws a KeyError when the key you provide does not exist; in this case the key throwing the error is 'C'.
Since I don't know how you want to handle keys that don't exist, I will show you one way to handle the situation so that the error is not thrown:
def words_to_indices(inverse_vocabulary, words):
    return [inverse_vocabulary.get(word, '{} does not exist'.format(word)) for word in words]
In which case the output of words_to_indices should look something like this:
['val1', 'val2', 'val3', 'C does not exist', etc...]
You'll have to modify it to do what you want, however.
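Since the indices are fed to sequence.pad_sequences, another sketch maps unknown words to a reserved index so the result stays numeric (assuming index 0 is free to act as an out-of-vocabulary marker; that is an assumption about this vocabulary, not a fact from the question):
```
UNK_INDEX = 0  # assumed reserved out-of-vocabulary index

def words_to_indices(inverse_vocabulary, words):
    # unknown words map to UNK_INDEX, keeping the output numeric for pad_sequences
    return [inverse_vocabulary.get(word, UNK_INDEX) for word in words]
```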
I'm using Python 3.2.2 on Windows 7. This is part of my code; it reads from an Excel file. But when I run the code, it just prints from 0 to 10 and gives "TypeError: 'float' object is not iterable".
pages = [i for i in range(0, 19634)]

for page in pages:
    x = df.loc[page, ["id"]]
    x = x.values
    x = str(x)[2:-2]

    text = df.loc[page, ["rev"]]

    def remove_punct(text):
        text = ''.join([ch.lower() for ch in text if ch not in exclude])
        tokens = re.split('\W+', text)
        tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
        removetable = str.maketrans('', '', '1234567890')
        out_list = [s.translate(removetable) for s in tokens1]
        str_list = list(filter(None, out_list))
        line = [i for i in str_list if len(i) > 1]
        return line

    s = df.loc[page, ["rev"]].apply(lambda x: remove_punct(x))

    with open('FileNamex.csv', 'a', encoding="utf-8") as f:
        s.to_csv(f, header=False)

    print(s)
This is the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-54-c71f66bdaca6> in <module>()
33 return line
34
---> 35 s=df.loc[page,["rev"]].apply(lambda x:remove_punct(x))
36
37 with open('FileNamex.csv', 'a', encoding="utf-8") as f:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
3190 else:
3191 values = self.astype(object).values
-> 3192 mapped = lib.map_infer(values, f, convert=convert_dtype)
3193
3194 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src\inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-54-c71f66bdaca6> in <lambda>(x)
33 return line
34
---> 35 s=df.loc[page,["rev"]].apply(lambda x:remove_punct(x))
36
37 with open('FileNamex.csv', 'a', encoding="utf-8") as f:
<ipython-input-54-c71f66bdaca6> in remove_punct(text)
22
23 def remove_punct(text):
---> 24 text=''.join([ch.lower() for ch in text if ch not in exclude])
25 tokens = re.split('\W+', text)
26 tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
TypeError: 'float' object is not iterable
Thanks for any help!
You are trying to apply a function that iterates over text (whatever it is), and you call it with a float value.
Floats cannot be iterated. You could use text = str(text) to convert any input to text first, but looking at your code I hesitate to say that would make sense.
You can check if you are handling a float like this:
def remove_punct(text):
    if isinstance(text, float):
        pass  # do something sensible with floats here
        return  # something sensible
    text = ''.join([ch.lower() for ch in text if ch not in exclude])
    tokens = re.split('\W+', text)
    tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
    removetable = str.maketrans('', '', '1234567890')
    out_list = [s.translate(removetable) for s in tokens1]
    str_list = list(filter(None, out_list))
    line = [i for i in str_list if len(i) > 1]
    return line
You can either tackle floats via isinstance, or get inspiration from "In Python, how do I determine if an object is iterable?" on how to detect whether you were given an iterable at all. You need to handle non-iterables differently.
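A quick sketch of that detection approach (Python 3.3+, just to illustrate the linked idea):
```
from collections.abc import Iterable

print(isinstance("some text", Iterable))  # True: strings can be iterated
print(isinstance(3.14, Iterable))         # False: floats cannot
```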
I am trying to import Twitter data saved as a text file and use the keyword function to designate columns that would show the details.
I have used this code in ipython3 notebook:
# definition for collecting keyword
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
The next cell has the following code:
#adding column
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The error I get is as follows:
AttributeError Traceback (most recent
call last)
<ipython-input-35-b172c4e07d29> in <module>()
1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
/usr/lib/python3/dist-packages/pandas/core/series.py in apply(self,
func, convert_dtype, args, **kwds)
2292 else:
2293 values = self.asobject
-> 2294 mapped = lib.map_infer(values, f,
convert=convert_dtype)
2295
2296 if len(mapped) and isinstance(mapped[0], Series):
pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()
<ipython-input-35-b172c4e07d29> in <lambda>(tweet)
1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet:
word_in_text('Trade', tweet))
<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
AttributeError: 'float' object has no attribute 'lower'
Update: I was able to reproduce your error. The field text might be missing in some of your tweets.
from pandas.io.json import json_normalize
tweet_data = [{'text': "let's trade!", 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
I get the error with:
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
Output:
>> AttributeError: 'float' object has no attribute 'lower'
If I feed tweet_data with the 'text' key present in every record, I don't get the error. So that would be one option. Another option would be to ignore NaN cases in your lambda.
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet) if type(tweet) == str else False)
This way you get the correct output:
>>> tweets
           text lang place.country created_at coordinates user.location  Trade
0  let's trade!   en            uk        now         x,y          here   True
1           NaN   en            uk        now         z,w         there  False
This is old content, left here for completeness.
Somehow you are passing a float instead of the text to your word_in_text method. I've tried a simple example of what you want to achieve:
import pandas as pd
import re
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
tweets = pd.DataFrame(['Hello, I like to trade', 'Trade', 'blah blah', 'Nice tradeoff here!'], columns=['text'])
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The output is:
>>> tweets
                     text  Trade
0  Hello, I like to trade   True
1                   Trade   True
2               blah blah  False
3     Nice tradeoff here!   True
Also, for this sort of task, you can always use the Pandas' str built-in contains method. This code will give you the same result as the example above:
tweets['Trade'] = tweets['text'].str.contains("Trade", case=False) == True
I guess you want to check for 'exact word' matching, meaning "Nice tradeoff here!" shouldn't be identified as containing the word. You can also solve this problem:
tweets['Trade_[strict]'] = tweets['text'].str.contains(r"Trade\b.*", case=False) == True
The output being:
>>> tweets
                     text  Trade  Trade_[strict]
0  Hello, I like to trade   True            True
1                   Trade   True            True
2               blah blah  False           False
3     Nice tradeoff here!   True           False
Plus, I tried your json_normalize method with 'fake' data and it also worked. Make sure your data doesn't have any floats in the text column instead of str.
from pandas.io.json import json_normalize
tweet_data = [{'text': '0', 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'text': 'Trade', 'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
And applying your method worked.
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
ERROR:
<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
You need to check whether the text parameter is of type str, so either check it with if/else as shown in the answer by @Guiem Bosch.
Otherwise, simply convert the text parameter first:
text = str(text).lower()
Hope this helps.
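Another option (a sketch, not from the original answers) is to clean the column once up front so every value is a string before any apply:
```
# replace NaN (which pandas stores as float) with empty strings so .lower() always works
tweets['text'] = tweets['text'].fillna('').astype(str)
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
```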