I have a question about a problem with cleaning text for my NLP model. I don't know why I get this error: AttributeError: 'list' object has no attribute 'split'.
Below is my df['Text'].sample(5):
26278 [RT, #davidsirota:, subset, people, website, t...
63243 [RT, #jmartNYT:, The, presses, Team, Biden, As...
61059 [RT, #caitoz:, BREAKING:, Biden, nominate, "Li...
43160 [RT, #K_JeanPierre:, I, profoundly, honored, P...
Name: Text, dtype: object
Below is my code:
def tokenizer(text):
    tokenized = [w for w in text.split() if w not in stopset]
    return tokenized

df['Text'] = df['Text'].apply(tokenizer)
def remove_emoji(string):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_nonwords(Text):
    if re.findall('\d', Text):
        return ''
    else:
        return Text

def clean_text(Text):
    text = ' '.join([i for i in Text.split() if i not in stopset])
    text = ' '.join([stem.stem(word) for word in Text.split()])
    return Text

df['text2'] = df['Text'].apply(clean_text)
Could someone help me?
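A likely cause, for readers: the earlier df['Text'].apply(tokenizer) already replaced every cell with a list of tokens, so clean_text later receives a list and fails on .split(). A minimal sketch of a cleaning function that works on the already-tokenized column (assuming stopset and stem are the stopword set and stemmer from the question):
def clean_tokens(tokens):
    # tokens is already a list because df['Text'] was tokenized above,
    # so iterate over it directly instead of calling .split()
    kept = [t for t in tokens if t not in stopset]
    return [stem.stem(t) for t in kept]

df['text2'] = df['Text'].apply(clean_tokens)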
I'm trying to remove random words from text in a column with nltk.
Here is my code:
import pandas as pd
import random
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

df = pd.read_excel("Output_Summarization/OUTPUT_ocr_OPENAIGOOD.xlsx", usecols=["Open_AI_Text"])
for index, row in df.iterrows():
    words = word_tokenize(row["Open_AI_Text"])
    word_to_remove = random.choice(words)
    new_text = row["Open_AI_Text"].replace(word_to_remove, "")
    df.at[index, "Open_AI_Text"] = new_text

df.to_excel("Texte_Trou.xlsx", index=False)
Then I get this error:
TypeError                                 Traceback (most recent call last)
<ipython-input-51-3cb2fde32407> in <module>
     11 for index, row in df.iterrows():
     12     # tokenize the row's text into individual words
---> 13     words = word_tokenize(row["Open_AI_Text"])
     14
     15     # pick a random word to remove

/usr/local/lib/python3.6/site-packages/nltk/tokenize/__init__.py in word_tokenize(text, language, preserve_line)
    126     :type preserve_line: bool
    127     """
--> 128     sentences = [text] if preserve_line else sent_tokenize(text, language)
    129     return [token for sent in sentences
    130             for token in _treebank_word_tokenizer.tokenize(sent)]

/usr/local/lib/python3.6/site-packages/nltk/tokenize/__init__.py in sent_tokenize(text, language)
     93     """
     94     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 95     return tokenizer.tokenize(text)
     96
     97 # Standard word tokenizer.

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in tokenize(self, text, realign_boundaries)
   1235         Given a text, returns a list of the sentences in that text.
   1236         """
-> 1237         return list(self.sentences_from_text(text, realign_boundaries))
   1238
   1239     def debug_decisions(self, text):

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in sentences_from_text(self, text, realign_boundaries)
   1283         follows the period.
   1284         """
-> 1285         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1286
   1287     def _slices_from_text(self, text):

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in span_tokenize(self, text, realign_boundaries)
   1274         if realign_boundaries:
   1275             slices = self._realign_boundaries(text, slices)
-> 1276         return [(sl.start, sl.stop) for sl in slices]
   1277
   1278     def sentences_from_text(self, text, realign_boundaries=True):

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in <listcomp>(.0)
   1274         if realign_boundaries:
   1275             slices = self._realign_boundaries(text, slices)
-> 1276         return [(sl.start, sl.stop) for sl in slices]
   1277
   1278     def sentences_from_text(self, text, realign_boundaries=True):

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _realign_boundaries(self, text, slices)
   1314         """
   1315         realign = 0
-> 1316         for sl1, sl2 in _pair_iter(slices):
   1317             sl1 = slice(sl1.start + realign, sl1.stop)
   1318             if not sl2:

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _pair_iter(it)
    310     """
    311     it = iter(it)
--> 312     prev = next(it)
    313     for el in it:
    314         yield (prev, el)

/usr/local/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _slices_from_text(self, text)
   1287     def _slices_from_text(self, text):
   1288         last_break = 0
-> 1289         for match in self._lang_vars.period_context_re().finditer(text):
   1290             context = match.group() + match.group('after_tok')
   1291             if self.text_contains_sentbreak(context):

TypeError: expected string or bytes-like object
I tried replacing my variable with a list, but that didn't work. How can I solve this issue?
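For readers: this TypeError usually means word_tokenize received something that is not a string, for example a NaN cell that pandas read from the Excel file as a float. A minimal sketch of a guard, reusing the column and file names from the question:
import random
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

nltk.download('punkt')

df = pd.read_excel("Output_Summarization/OUTPUT_ocr_OPENAIGOOD.xlsx", usecols=["Open_AI_Text"])

for index, row in df.iterrows():
    text = row["Open_AI_Text"]
    # skip cells that are not non-empty strings (e.g. NaN floats) instead of tokenizing them
    if not isinstance(text, str) or not text.strip():
        continue
    words = word_tokenize(text)
    word_to_remove = random.choice(words)
    df.at[index, "Open_AI_Text"] = text.replace(word_to_remove, "")

df.to_excel("Texte_Trou.xlsx", index=False)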
I was trying to translate tweet text using deep-translator, but I found some issues.
Before translating the texts, I did some text pre-processing such as cleaning, removing emoji, etc. These are the defined pre-processing functions:
def deEmojify(text):
    regrex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
        "]+", re.UNICODE)
    return regrex_pattern.sub(r'', text)
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # remove hashtags
    text = re.sub(r'RT[\s]', '', text)  # remove RT
    text = re.sub(r"http\S+", '', text)  # remove links
    text = re.sub(r"[!@#$]", '', text)  # remove !, @, # and $ characters
    text = re.sub(r'[0-9]+', '', text)  # remove numbers
    text = text.replace('\n', ' ')  # replace newlines with spaces
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove all punctuation
    text = text.strip(' ')  # strip spaces from both ends of the text
    return text
def casefoldingText(text):  # convert all characters in the text to lower case
    text = text.lower()
    return text

def tokenizingText(text):  # tokenize, i.e. split the string into a list of tokens
    text = word_tokenize(text)
    return text

def filteringText(text):  # remove stopwords from a token list
    listStopwords = set(stopwords.words('indonesian'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text):  # reduce each word to its stem by removing affixes
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text

def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate_batch(text)
    return text
And here's the translate function:
def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate(text)
    return text
This is an example of the expected result (the text is in Indonesian):
text = '#jshuahaee Ketemu agnes mo lagii😍😍'
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result :
After cleaning ==> Ketemu agnes mo lagii😍😍
After emoji ==> Ketemu agnes mo lagii
After case folding ==> ketemu agnes mo lagii
After token ==> ['ketemu', 'agnes', 'mo', 'lagii']
After filter ==> ['ketemu', 'agnes', 'mo', 'lagii']
After Stem ==> ['ketemu', 'agnes', 'mo', 'lagi']
After translate ==> ['meet', 'agnes', 'mo', 'again']
But I found an issue when the sentence contains some dots: the error happens because, after the stemming step, the resulting list contains an empty string '' (I don't know what to call this).
text = 'News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada 1… '
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result
After cleaning ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After emoji ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After case folding ==> news update meski kurang diaspirasi shoppee yg korea minded dalam waktu indonesa belaja di bulan november lazada …
After token ==> ['news', 'update', 'meski', 'kurang', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'dalam', 'waktu', 'indonesa', 'belaja', 'di', 'bulan', 'november', 'lazada', '…']
After filter ==> ['news', 'update', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'belaja', 'november', 'lazada', '…']
After Stem ==> ['news', 'update', 'aspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'baja', 'november', 'lazada', '']
This is the error message
NotValidPayload Traceback (most recent call last)
<ipython-input-40-cb9390422d3c> in <module>
14 print('After Stem ==> ', stem)
15
---> 16 en = convert_eng(stem)
17 print('After translate ==> ', en)
<ipython-input-28-28bc36c96914> in convert_eng(text)
8 return text
9 def convert_eng(text):
---> 10 text = GoogleTranslator(source='auto', target='en').translate_batch(text)
11 return text
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate_batch(self, batch, **kwargs)
195 for i, text in enumerate(batch):
196
--> 197 translated = self.translate(text, **kwargs)
198 arr.append(translated)
199 return arr
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate(self, text, **kwargs)
108 """
109
--> 110 if self._validate_payload(text):
111 text = text.strip()
112
C:\Python\lib\site-packages\deep_translator\parent.py in _validate_payload(payload, min_chars, max_chars)
44
45 if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
---> 46 raise NotValidPayload(payload)
47
48 # check if payload contains only symbols
NotValidPayload: --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated
My idea is to remove the '', since I think that is the problem, but I have no idea how to do that.
Could anyone please kindly help me?
You need to introduce a bit of error checking into your code and only process the expected data type. Your convert_eng function (the one that uses GoogleTranslator#translate_batch) requires a list of non-blank strings as its argument (see the if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit(): check), and your stem list contains an empty string as its last item.
Besides, it is possible that filteringText(text) returns [] because all words can turn out to be stopwords. Also, do not use filter as a variable name; it is a built-in.
So, change
filter= filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
to
filter1 = filteringText(token)
print('After filter ==> ', filter1)
if filter1:
    stem = stemmingText(filter1)
    print('After Stem ==> ', stem)
    en = convert_eng([x for x in stem if x.strip() and not x.isdigit()])
    print('After translate ==> ', en)
I left out the isinstance(x, str) check because I assume you already know your list only contains strings.
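A related sketch (not from the original answer): the empty strings could also be dropped inside the stemming step itself, so later steps never see them. This assumes the same Sastrawi StemmerFactory used in the question:
def stemmingText(text):  # stem each token and drop anything that stems to an empty string
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemmed = (stemmer.stem(word) for word in text)
    return [w for w in stemmed if w.strip()]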
I have the following example text / tweet:
RT @trader $AAPL 2012 is o´o´o´o´o´pen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO
I want to follow the procedure of Table 1 in Li, T., van Dalen, J., & van Rees, P. J. (2017). More than just noise? Examining the information content of stock microblogs on financial markets. Journal of Information Technology. doi:10.1057/s41265-016-0034-2, in order to clean up the tweet.
They clean the tweet up in such a way that the final result is:
{RT|123456} {USER|56789} {TICKER|AAPL} {NUMBER|2012} notooopen nottalk patent {COMPANY|GOOG} notdefinetli treatment {HASH|samsung} {EMOTICON|POS} haha {URL}
I use the following script to tokenize the tweet based on the regex:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
emoticon_string = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""
regex_strings = (
# URL:
r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"""
,
# Twitter username:
r"""(?:#[\w_]+)"""
,
# Hashtags:
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Cashtags:
r"""(?:\$+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Remaining word types:
r"""
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
)
word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        try:
            s = str(s)
        except UnicodeDecodeError:
            s = str(s).encode('string_escape')
            s = unicode(s)
        # Tokenize:
        words = word_re.findall(s)
        if not self.preserve_case:
            words = map((lambda x: x if emoticon_re.search(x) else x.lower()), words)
        return words
if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
This yields the following output:
rt
@trader
$aapl
2012
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
#samsung
got
:-)
heh
url_that_cannot_be_posted_on_SO
How can I adjust this script to get:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|url_that_cannot_be_posted_on_SO}
Thanks in advance for helping me out big time!
You really need to use named capturing groups (mentioned by thebjorn), and use groupdict() to get name-value pairs upon each match. It requires some post-processing though:
All pairs where the value is None must be discarded.
If self.preserve_case is false, the value can be turned to lower case at once.
If the group name is WORD, ELLIPSIS or ELSE, the value is added to words as is.
If the group name is HASHTAG, CASHTAG, USER or URL, the value is first stripped of leading $, # and @ chars and then added to words as a {<GROUP_NAME>|<VALUE>} item.
All other matches are added to words as a {<GROUP_NAME>|<VALUE>} item.
Note that \w matches underscores by default, so [\w_] = \w. I optimized the patterns a little bit.
Here is a fixed code snippet:
import re
emoticon_string = r"""
(?P<EMOTICON>
[<>]?
[:;=8] # eyes
[-o*']? # optional nose
[][()dDpP/:{}#|\\] # mouth
|
[][()dDpP/:}{#|\\] # mouth
[-o*']? # optional nose
[:;=8] # eyes
[<>]?
)"""
regex_strings = (
# URL:
r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$#.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)"""
,
# Twitter username:
r"""(?P<USER>#\w+)"""
,
# Hashtags:
r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)"""
,
# Cashtags:
r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)"""
,
# Remaining word types:
r"""
(?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?) # Numbers, including fractions, decimals.
|
(?P<WORD>\w+) # Words without apostrophes or dashes.
|
(?P<ELLIPSIS>\.(?:\s*\.)+) # Ellipsis dots.
|
(?P<ELSE>\S) # Everything else that isn't whitespace.
"""
)
word_re = re.compile(r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)), re.VERBOSE | re.I | re.UNICODE)
#print(word_re.pattern)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        try:
            s = str(s)
        except UnicodeDecodeError:
            s = str(s).encode('string_escape')
            s = unicode(s)
        # Tokenize:
        words = []
        for x in word_re.finditer(s):
            for key, val in x.groupdict().items():
                if val:
                    if not self.preserve_case:
                        val = val.lower()
                    if key in ['WORD','ELLIPSIS','ELSE']:
                        words.append(val)
                    elif key in ['HASHTAG','CASHTAG','USER','URL']:  # Add more here if needed
words.append("{{{}|{}}}".format(key, re.sub(r'^[##$]+', '', val)))
else:
words.append("{{{}|{}}}".format(key, val))
return words
if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
With test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com', it outputs:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://some.site.here.com}
See the regex demo online.
I am trying to import Twitter data saved as a text file and use a keyword function for designating columns that would show the details.
I have used this code in ipython3 notebook:
#definition for collecting keyword.
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
The next cell has the following code:
#adding column
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The error I get is as follows:
AttributeError                            Traceback (most recent call last)
<ipython-input-35-b172c4e07d29> in <module>()
      1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))

/usr/lib/python3/dist-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   2292         else:
   2293             values = self.asobject
-> 2294             mapped = lib.map_infer(values, f, convert=convert_dtype)
   2295
   2296         if len(mapped) and isinstance(mapped[0], Series):

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()

<ipython-input-35-b172c4e07d29> in <lambda>(tweet)
      1 #adding column
----> 2 tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))

<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
      2 def word_in_text(word, text):
      3     word = word.lower()
----> 4     text = text.lower()
      5     match = re.search(word, text)
      6     if match:

AttributeError: 'float' object has no attribute 'lower'
Update: I was able to reproduce your error. The field text might be missing in some of your tweets.
from pandas.io.json import json_normalize
tweet_data = [{'text': "let's trade!", 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
I get the error with:
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
Output:
>> AttributeError: 'float' object has no attribute 'lower'
If I feed the tweet_data with the 'text' key I don't get the error. So, that would be an option. Another option would be to ignore nan cases in your lambda.
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet) if type(tweet) == str else False)
This way you get the correct output:
>>> tweets
           text lang place.country created_at coordinates user.location  Trade
0  let's trade!   en            uk        now         x,y          here   True
1           NaN   en            uk        now         z,w         there  False
This is old content, left here for completeness.
Somehow you are passing a float instead of the text to your word_in_text method. I've tried a simple example of what you want to achieve:
import pandas as pd
import re

def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False

tweets = pd.DataFrame(['Hello, I like to trade', 'Trade', 'blah blah', 'Nice tradeoff here!'], columns=['text'])
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
The output is:
>>> tweets
                     text  Trade
0  Hello, I like to trade   True
1                   Trade   True
2               blah blah  False
3     Nice tradeoff here!   True
Also, for this sort of task, you can always use the Pandas' str built-in contains method. This code will give you the same result as the example above:
tweets['Trade'] = tweets['text'].str.contains("Trade", case=False) == True
I guess you want to check for 'exact word' matching, meaning "Nice tradeoff here!" shouldn't be identified as containing the word. You can also solve this problem:
tweets['Trade_[strict]'] = tweets['text'].str.contains(r"Trade\b.*", case=False) == True
The output being:
>>> tweets
                     text  Trade  Trade_[strict]
0  Hello, I like to trade   True            True
1                   Trade   True            True
2               blah blah  False           False
3     Nice tradeoff here!   True           False
Plus, I added your json_normalize method with 'fake' data and it also worked. Make sure in your data you don't have any float in your text column instead of str.
from pandas.io.json import json_normalize
tweet_data = [{'text': '0', 'lang':'en', 'place': {'country':'uk'}, 'created_at':'now', 'coordinates':'x,y', 'user':{'location':'here'}}, {'text': 'Trade', 'lang':'en', 'place': {'country':'uk'}, 'created_at': 'now', 'coordinates':'z,w', 'user':{'location':'there'}}]
tweets = json_normalize(tweet_data)[["text", "lang", "place.country","created_at", "coordinates","user.location"]]
And applying your method worked.
def word_in_text(word, text):
word = word.lower()
text = text.lower()
match = re.search(word, text)
if match:
return True
return False
tweets['Trade'] = tweets['text'].apply(lambda tweet: word_in_text('Trade', tweet))
ERROR:
<ipython-input-34-daa2f94a8fec> in word_in_text(word, text)
2 def word_in_text(word, text):
3 word = word.lower()
----> 4 text = text.lower()
5 match = re.search(word, text)
6 if match:
You need to check whether the text parameter is of type str, so either check it with if/else as shown in the answer by @Guiem Bosch.
Otherwise, simply convert the text parameter to a string first:
text = str(text).lower()
Hope this helps.
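A further option, not in the original answers, is to normalize the column once before applying the keyword check, so every cell is guaranteed to be a string. A minimal sketch with hypothetical data (column name 'text' as in the question):
import pandas as pd

# Hypothetical example: one tweet is missing its text, so pandas stores NaN (a float).
tweets = pd.DataFrame({'text': ["let's trade!", None]})

# Replace missing values with an empty string so every cell is a str;
# then a plain word_in_text apply or str.contains works row by row.
tweets['text'] = tweets['text'].fillna('')
tweets['Trade'] = tweets['text'].str.contains('trade', case=False)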
Let's say I have a string and want to mark some entities such as persons and locations.
string = 'My name is John Doe, and I live in USA'
string_tagged = 'My name is [John Doe], and I live in {USA}'
I want to mark persons with [ ] and locations with { }.
My code:
import spacy

nlp = spacy.load('en')
doc = nlp(string)
sentence = doc.text

for ent in doc.ents:
    if ent.label_ == 'PERSON':
        sentence = sentence[:ent.start_char] + sentence[ent.start_char:].replace(ent.text, '[' + ent.text + ']', 1)
    elif ent.label_ == 'GPE':
        sentence = sentence[:ent.start_char] + sentence[ent.start_char:].replace(ent.text, '{' + ent.text + '}', 1)

print(sentence[:ent.start_char] + sentence[ent.start_char:])
...so with the example string this works fine. But with more complicated sentences I get double brackets around some entities. For the sentence:
string_bug = 'Canada, Canada, Canada, Canada, Canada, Canada'
returns >> {Canada}, {Canada}, {Canada}, {Canada}, {{Canada}}, Canada
The reason why I split the sentence string into two was to only replace new words (at higher character positions). I think the bug might be that I am looping over doc.ents, so I get the old positions in my string while the string grows on each iteration with the new [ ] and { }. But it feels like there must be an easier way of dealing with this in spaCy.
Here's a slight modification that helped me work with your code.
string = 'My name is John Doe, and I live in USA'

import re
import spacy

nlp = spacy.load('en')
doc = nlp(string)
sentence = doc.text

for ent in doc.ents:
    if ent.label_ == 'PERSON':
        sentence = re.sub(ent.text, '[' + ent.text + ']', sentence)
    elif ent.label_ == 'GPE':
        sentence = re.sub(ent.text, '{' + ent.text + '}', sentence)

print(sentence)
Yields:
My name is [John Doe], and I live in {USA}
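For the double-wrapping issue from the question, another possible approach (an assumption-based sketch, not part of the original answer) is to insert the markers by character offset and walk the entities from right to left, so earlier offsets stay valid and each span is wrapped exactly once:
import spacy

nlp = spacy.load('en')  # same model name as in the question
doc = nlp('My name is John Doe, and I live in USA')
sentence = doc.text

# Go through the entities from the end of the string backwards so that
# inserting brackets does not shift the offsets of entities not yet handled.
for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
    if ent.label_ == 'PERSON':
        wrapped = '[' + ent.text + ']'
    elif ent.label_ == 'GPE':
        wrapped = '{' + ent.text + '}'
    else:
        continue
    sentence = sentence[:ent.start_char] + wrapped + sentence[ent.end_char:]

print(sentence)  # My name is [John Doe], and I live in {USA}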