I am getting a KeyError while running my code - python

from keras.preprocessing import sequence  # assumed: `sequence` here is Keras' preprocessing module

SEQUENCE_LENGTH = 20  # placeholder; use whatever value the project actually defines

def words_to_indices(inverse_vocabulary, words):
    return [inverse_vocabulary[word] for word in words]

if __name__ == "__main__":
    vocabulary = open("G:/clickbait-detector-master/data/vocabulary.txt").read().split("\n")
    inverse_vocabulary = dict((word, i) for i, word in enumerate(vocabulary))

    clickbait = open("G:/clickbait-detector-master/data/clickbait.preprocessed.txt").read().split("\n")
    clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)

    genuine = open("G:/clickbait-detector-master/data/genuine.preprocessed.txt").read().split("\n")
    genuine = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in genuine], maxlen=SEQUENCE_LENGTH)
My error is:
KeyError Traceback (most recent call last)
<ipython-input-6-692b7e251048> in <module>()
25
26 clickbait = open("G:\clickbait-detector-master\data/clickbait.preprocessed.txt").read().split("\n")
---> 27 clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
28
29 genuine = open("G:\clickbait-detector-master\data/genuine.preprocessed.txt").read().split("\n")
<ipython-input-6-692b7e251048> in <listcomp>(.0)
25
26 clickbait = open("G:\clickbait-detector-master\data/clickbait.preprocessed.txt").read().split("\n")
---> 27 clickbait = sequence.pad_sequences([words_to_indices(inverse_vocabulary, sentence.split()) for sentence in clickbait], maxlen=SEQUENCE_LENGTH)
28
29 genuine = open("G:\clickbait-detector-master\data/genuine.preprocessed.txt").read().split("\n")
<ipython-input-6-692b7e251048> in words_to_indices(inverse_vocabulary, words)
16
17 def words_to_indices(inverse_vocabulary, words):
---> 18 return [inverse_vocabulary[word] for word in words]
19
20 if __name__ == "__main__":
<ipython-input-6-692b7e251048> in <listcomp>(.0)
16
17 def words_to_indices(inverse_vocabulary, words):
---> 18 return [inverse_vocabulary[word] for word in words]
19
20 if __name__ == "__main__":
KeyError: 'C'

I don't know how you want to handle missing values but the error is telling you that it comes from this:
def words_to_indices(inverse_vocabulary, words):
    return [inverse_vocabulary[word] for word in words]
Specifically, the issue is inverse_vocabulary[word], which will throw a KeyError when the key you provide does not exist; in this case the key throwing the error is 'C'.
Since I don't know how you want to handle keys that don't exist, I will show you one way to handle this situation so that the error does not get thrown:
def words_to_indices(inverse_vocabulary, words):
    # dict.get returns the second argument instead of raising KeyError
    return [inverse_vocabulary.get(word, '{} does not exist'.format(word)) for word in words]
In which case the output of words_to_indices should look something like this:
['val1', 'val2', 'val3', 'C does not exist', etc...]
You'll have to modify it to do what you want, however.
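If you are feeding these indices into a model, a string placeholder will break pad_sequences, so a common alternative (a sketch, assuming you reserve one integer index for out-of-vocabulary words) is:

def words_to_indices(inverse_vocabulary, words, unknown_index=0):
    # map out-of-vocabulary words to a reserved index instead of raising;
    # 0 here is a stand-in - pick an index actually reserved in your vocabulary
    return [inverse_vocabulary.get(word, unknown_index) for word in words]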

Related

how to apply a cleaning function to a whole column and not only one row? AttributeError: 'Series' object has no attribute 'split'

How do I apply the function to a column with many rows and not only the first row? It works for df["Text"][0], for example, but not on all the rows. What can I use instead of split, and how do I apply it?
Currently I get the list of words of only the first cell (which is a long text). I need the lists for all the rows.
I get this error:
AttributeError Traceback (most recent call last)
<ipython-input-400-30a976cb67ef> in <module>
73
74
---> 75 new_data=Clean_stop_words(df["Text"])
76
77 print(new_data, end=",")
<ipython-input-400-30a976cb67ef> in Clean_stop_words(data)
54 # for i in range(0:500):
55 # data=data.apply(str)
---> 56 data_split = data.split(' ')
57 #print(data_split)
58 # for word in data:
~\anaconda\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5128 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5129 return self[name]
-> 5130 return object.__getattribute__(self, name)
5131
5132 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'split'
def Clean_stop_words(data):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    new_data = []
    #print(data)
    # for i in range(0:500):
    #     data = data.apply(str)
    data_split = data.split(' ')
    #print(data_split)
    # for word in data:
    #     print(word)
    for word in data_split:
        np.char.lower(word)
        #print(data_split)
        word = re.sub('[^A-Za-z0-9]+', '', word)
    for word in data_split:
        if word not in stop_words:
            word1 = stemmer.stem(word)
            #print(word1)
            new_data.append(word1)
    symbols = "!\"#$%&()*+-./:;<=>?#[\]^_`{|}~\n"
    for i in symbols:
        new_data = np.char.replace(new_data, i, ' ')
    return new_data

new_data = Clean_stop_words(df["Text"])
#new_data = Clean_stop_words(df["Text"][0])
print(new_data, end=",")
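One straightforward way to run the cleaner on every row instead of one cell is to let pandas hand each cell to the function with Series.apply. A minimal sketch, assuming df["Text"] holds strings (the str() call guards against NaN cells, which arrive as floats):

new_data = df["Text"].apply(lambda cell: Clean_stop_words(str(cell)))
print(new_data.tolist())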

Error when looping through a function that aggregates text values

I have an issue with my function. The design is to aggregate word tokens into dictionaries.
This is the code:
def preprocess(texts):
    case = truecase.get_true_case(texts)
    doc = nlp(case)
    return doc

def summarize_texts(texts):
    doc = preprocess(texts)
    actions = {}
    entities = {}
    for token in doc:
        if token.pos_ == "VERB":
            actions[token.lemma_] = actions.get(token.text, 0) + 1
    for token in doc.ents:
        entities[token.label_] = [token.text]
    return {
        'actions': actions,
        'entities': entities
    }
The problem I am having is the function works as expected for a single input:
summarize_texts("Play something by Billie Holiday")
{'actions': {'play': 1}, 'entities': {'PERSON': ['Billie']}}
but the objective is to be able to pass a list or CSV file through it and have it aggregate everything.
When I try:
docs = [
    "Play something by Billie Holiday",
    "Set a timer for five minutes",
    "Play it again, Sam"
]
summarize_texts(docs)
I get the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-18-200347d5cac5> in <module>()
4 "Play it again, Sam"
5 ]
----> 6 summarize_texts(docs)
5 frames
<ipython-input-16-08c879553d6e> in summarize_texts(texts)
1 def summarize_texts(texts):
----> 2 doc = preprocess(texts)
3 actions = {}
4 entities = {}
5 for token in doc:
<ipython-input-12-fccf767830b1> in preprocess(texts)
1 def preprocess (texts):
----> 2 case = truecase.get_true_case(texts)
3 doc = nlp(case)
4 return doc
/usr/local/lib/python3.6/dist-packages/truecase/__init__.py in get_true_case(sentence, out_of_vocabulary_token_option)
14 return get_truecaser().get_true_case(
15 sentence,
---> 16 out_of_vocabulary_token_option=out_of_vocabulary_token_option)
/usr/local/lib/python3.6/dist-packages/truecase/TrueCaser.py in get_true_case(self, sentence, out_of_vocabulary_token_option)
97 as-is: Returns OOV tokens as is
98 """
---> 99 tokens = self.tknzr.tokenize(sentence)
100
101 tokens_true_case = []
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/casual.py in tokenize(self, text)
293 """
294 # Fix HTML character entities:
--> 295 text = _replace_html_entities(text)
296 # Remove username handles
297 if self.strip_handles:
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/casual.py in _replace_html_entities(text, keep, remove_illegal, encoding)
257 return "" if remove_illegal else match.group(0)
258
--> 259 return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
260
261
TypeError: expected string or bytes-like object
I expect to get the output:
{'actions': {'play': 2, 'set': 1}, 'entities': {'PERSON': ['Billie', 'Sam'], 'TIME': ['five minutes']}}
Not sure what's wrong with my function syntax.
Looks like your problem is that truecase.get_true_case(texts) expects to receive a string/bytes-like argument, and you're passing it a list of strings.
You'll need to iterate through texts and preprocess each item in the list separately:
def preprocess(text):
    case = truecase.get_true_case(text)
    doc = nlp(case)
    return doc

def summarize_texts(texts):
    actions = {}
    entities = {}
    for text in texts:
        doc = preprocess(text)
        for token in doc:
            if token.pos_ == "VERB":
                # count by lemma so "Play" and "play" land in the same bucket
                actions[token.lemma_] = actions.get(token.lemma_, 0) + 1
        for token in doc.ents:
            # append instead of assign, so entities from later texts are kept
            entities.setdefault(token.label_, []).append(token.text)
    return {
        'actions': actions,
        'entities': entities
    }
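With those fixes, calling the function on the example list should aggregate across all three sentences. A quick usage check (the expected output is taken from the question, so treat it as approximate):

docs = [
    "Play something by Billie Holiday",
    "Set a timer for five minutes",
    "Play it again, Sam"
]
print(summarize_texts(docs))
# {'actions': {'play': 2, 'set': 1}, 'entities': {'PERSON': ['Billie', 'Sam'], 'TIME': ['five minutes']}}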
Try using a for loop over texts before calling preprocess:

for i in texts:
    doc = preprocess(i)

problem: TypeError: 'float' object is not iterable

I'm using Python 3.2.2 on Windows 7. This is part of my code; it reads from an Excel file. But when I run the code it just prints from 0 to 10 and gives "TypeError: 'float' object is not iterable".
Thanks for any help!
pages = [i for i in range(0, 19634)]
for page in pages:
    x = df.loc[page, ["id"]]
    x = x.values
    x = str(x)[2:-2]
    text = df.loc[page, ["rev"]]

    def remove_punct(text):
        text = ''.join([ch.lower() for ch in text if ch not in exclude])
        tokens = re.split('\W+', text)
        tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
        removetable = str.maketrans('', '', '1234567890')
        out_list = [s.translate(removetable) for s in tokens1]
        str_list = list(filter(None, out_list))
        line = [i for i in str_list if len(i) > 1]
        return line

    s = df.loc[page, ["rev"]].apply(lambda x: remove_punct(x))
    with open('FileNamex.csv', 'a', encoding="utf-8") as f:
        s.to_csv(f, header=False)
    print(s)
This is the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-54-c71f66bdaca6> in <module>()
33 return line
34
---> 35 s=df.loc[page,["rev"]].apply(lambda x:remove_punct(x))
36
37 with open('FileNamex.csv', 'a', encoding="utf-8") as f:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
3190 else:
3191 values = self.astype(object).values
-> 3192 mapped = lib.map_infer(values, f, convert=convert_dtype)
3193
3194 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src\inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-54-c71f66bdaca6> in <lambda>(x)
33 return line
34
---> 35 s=df.loc[page,["rev"]].apply(lambda x:remove_punct(x))
36
37 with open('FileNamex.csv', 'a', encoding="utf-8") as f:
<ipython-input-54-c71f66bdaca6> in remove_punct(text)
22
23 def remove_punct(text):
---> 24 text=''.join([ch.lower() for ch in text if ch not in exclude])
25 tokens = re.split('\W+', text)
26 tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
TypeError: 'float' object is not iterable
You are trying to apply a function that iterates over text (whatever it is) - and you call it with a float value.
Floats cannot be iterated. You can use text = str(text) to convert any input to text first - but looking at your code I hesitate to say that would make sense.
You can check if you are handling a float like this:
def remove_punct(text):
    if isinstance(text, float):
        pass    # do something sensible with floats here
        return  # something sensible
    text = ''.join([ch.lower() for ch in text if ch not in exclude])
    tokens = re.split('\W+', text)
    tex = " ".join([wn.lemmatize(word) for word in tokens if word not in stopword])
    removetable = str.maketrans('', '', '1234567890')
    out_list = [s.translate(removetable) for s in tokens]  # `tokens1` in your original is undefined; `tokens` is presumably meant
    str_list = list(filter(None, out_list))
    line = [i for i in str_list if len(i) > 1]
    return line
You can either tackle floats via isinstance, or get inspiration from
"In Python, how do I determine if an object is iterable?" on how to detect whether you were given an iterable. Either way, you need to handle non-iterables differently.
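Before patching the function, it is worth confirming where the floats come from; in pandas they are usually NaN cells. A quick diagnostic sketch, assuming df is your DataFrame and "rev" is the text column:

# see which Python types the column actually holds
print(df["rev"].map(type).value_counts())

# pragmatic fixes: drop the missing rows, or coerce everything to str
df = df.dropna(subset=["rev"])
# df["rev"] = df["rev"].astype(str)  # alternative that keeps the rows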

twitter 'float' object has no attribute 'lower'

I am trying to import Twitter data saved as a text file and use the keyword function for designating columns that would show the details.
I have used this code in an IPython 3 notebook:
#definition for collecting keyword.
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
The next cell has the following code:
#adding column
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The error I get is as follows:
AttributeError Traceback (most recent call last)
<ipython-input-35-b172c4e07d29> in <module>()
1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
/usr/lib/python3/dist-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
2292 else:
2293 values = self.asobject
-> 2294 mapped = lib.map_infer(values, f, convert=convert_dtype)
2295
2296 if len(mapped) and isinstance(mapped[0], Series):
pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()
<ipython-input-35-b172c4e07d29> in <lambda>(tweet)
1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
AttributeError: 'float' object has no attribute 'lower'
Update: I was able to reproduce your error. The field text might be missing in some of your tweets.
from pandas.io.json import json_normalize
tweet_data = [{'text': "let's trade!", 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
I get the error with:
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
Output:
>> AttributeError: 'float' object has no attribute 'lower'
If I feed tweet_data with the 'text' key everywhere I don't get the error, so that would be one option. Another option would be to ignore NaN cases in your lambda.
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet) if type(tweet) == str else False)
This way you get the correct output:
>>> tweets
           text lang place.country created_at coordinates user.location  Trade
0  let's trade!   en            uk        now         x,y          here   True
1           NaN   en            uk        now         z,w         there  False
This is old content, left here for completeness.
Somehow you are passing a float instead of the text to your word_in_text method. I've tried a simple example of what you want to achieve:
import pandas as pd
import re

def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

tweets = pd.DataFrame(['Hello, I like to trade', 'Trade', 'blah blah', 'Nice tradeoff here!'], columns=['text'])
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The output is:
>>> tweets
                     text  Trade
0  Hello, I like to trade   True
1                   Trade   True
2               blah blah  False
3     Nice tradeoff here!   True
Also, for this sort of task, you can always use pandas' built-in str.contains method. This code will give you the same result as the example above:
tweets['Trade'] = tweets['text'].str.contains("Trade", case=False) == True
I guess you want to check for 'exact word' matching, meaning "Nice tradeoff here!" shouldn't be identified as containing the word. You can also solve this problem:
tweets['Trade_[strict]'] = tweets['text'].str.contains(r"Trade\b.*", case=False) == True
The output being:
>>> tweets
                     text  Trade  Trade_[strict]
0  Hello, I like to trade   True            True
1                   Trade   True            True
2               blah blah  False           False
3     Nice tradeoff here!   True           False
Plus, I tried your json_normalize approach with 'fake' data and it also worked. Make sure your data doesn't have any float in the text column instead of str.
from pandas.io.json import json_normalize
tweet_data = [{'text': '0', 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'text': 'Trade', 'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
And applying your method worked.
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
ERROR:
<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
You need to check whether the text parameter is of type str. So either check it with if/else as shown in the answer by @Guiem Bosch,
or simply convert the text parameter first:
text = str(text).lower()
Hope this helps.
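Another option along the same lines (a sketch, assuming the floats are NaN values from missing tweets): fill the gaps once, up front, so every cell is already a str by the time word_in_text sees it.

tweets['text'] = tweets['text'].fillna('')
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))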

seqmining: how to calculate frequency of a sequence on python

I'm trying to use pymining in Python to generate frequent sequences from my dataset. The code below appears to work well:
from pymining import seqmining
seqs = ( 'caabc', 'abcb', 'cabc', 'abbca')
freq_seqs = seqmining.freq_seq_enum(seqs, 2)
sorted(freq_seqs)
However, when I want to use it with my dataset:
import numpy as np
import pandas as pd
from pymining import seqmining
def importdata():
    filename = pd.read_csv('C:/Users/asus/Desktop/memoire/sequences-code.csv', sep=';', header=None)

data = importdata()
seqs = data
freq_seqs = seqmining.freq_seq_enum(seqs, 2)
sorted(freq_seqs)
I get this error:
TypeError: 'NoneType' object is not iterable
This is the full traceback:
TypeError Traceback (most recent call last)
<ipython-input-4-19e2af14465a> in <module>()
8 data=importdata()
9 seqs = data
---> 10 freq_seqs = seqmining.freq_seq_enum(seqs, 2)
11 sorted(freq_seqs)
12
~\Anaconda3\lib\site-packages\pymining\seqmining.py in freq_seq_enum(sequences, min_support)
9 '''
10 freq_seqs = set()
---> 11 _freq_seq(sequences, tuple(), 0, min_support, freq_seqs)
12 return freq_seqs
13
~\Anaconda3\lib\site-packages\pymining\seqmining.py in _freq_seq(sdb, prefix, prefix_support, min_support, freq_seqs)
16 if prefix:
17 freq_seqs.add((prefix, prefix_support))
---> 18 locally_frequents = _local_freq_items(sdb, prefix, min_support)
19 if not locally_frequents:
20 return
~\Anaconda3\lib\site-packages\pymining\seqmining.py in _local_freq_items(sdb, prefix, min_support)
28 items = defaultdict(int)
29 freq_items = []
---> 30 for entry in sdb:
31 visited = set()
32 for element in entry:
TypeError: 'NoneType' object is not iterable
The simplest change you can make is to get rid of importdata, which is just a wrapper around pd.read_csv. Note that importdata also never returns anything: a function without a return statement returns None, which is exactly why seqs ends up as None and freq_seq_enum complains that a 'NoneType' object is not iterable. Try:
filename = 'C:/Users/asus/Desktop/memoire/sequences-code.csv'
data = pd.read_csv(filename, sep=';', header=None)
Let me know if that helps.
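If a new error shows up after that, note that freq_seq_enum expects an iterable of sequences, like the tuple of strings in the first example, while pd.read_csv returns a DataFrame. A sketch of the conversion, assuming each row of the first CSV column holds one sequence (adjust the column index to your file):

seqs = data[0].astype(str).tolist()
freq_seqs = seqmining.freq_seq_enum(seqs, 2)
print(sorted(freq_seqs))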
