I was trying to translate tweet text using deep-translator, but I ran into some issues.
Before translating the texts, I did some text preprocessing such as cleaning, removing emoji, etc. These are the defined preprocessing functions:
def deEmojify(text):
    regex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", re.UNICODE)
    return regex_pattern.sub(r'', text)
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # remove hashtags
    text = re.sub(r'RT[\s]', '', text)  # remove RT
    text = re.sub(r"http\S+", '', text)  # remove links
    text = re.sub(r"[!@#$]", '', text)  # remove special characters
    text = re.sub(r'[0-9]+', '', text)  # remove numbers
    text = text.replace('\n', ' ')  # replace newlines with spaces
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove all punctuation
    text = text.strip(' ')  # strip spaces from both ends of the text
    return text
def casefoldingText(text):  # convert all characters in the text to lower case
    text = text.lower()
    return text

def tokenizingText(text):  # tokenize, i.e. split a string into a list of tokens
    text = word_tokenize(text)
    return text

def filteringText(text):  # remove stopwords from a text
    listStopwords = set(stopwords.words('indonesian'))
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text):  # reduce each word to its stem by stripping prefixes and suffixes
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = [stemmer.stem(word) for word in text]
    return text

def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate_batch(text)
    return text
And here's the translate function:
def convert_eng(text):
    text = GoogleTranslator(source='auto', target='en').translate(text)
    return text
This is an example of the expected result (text in Indonesian):
text = '@jshuahaee Ketemu agnes mo lagiišš'
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter = filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result:
After cleaning ==> Ketemu agnes mo lagiišš
After emoji ==> Ketemu agnes mo lagii
After case folding ==> ketemu agnes mo lagii
After token ==> ['ketemu', 'agnes', 'mo', 'lagii']
After filter ==> ['ketemu', 'agnes', 'mo', 'lagii']
After Stem ==> ['ketemu', 'agnes', 'mo', 'lagi']
After translate ==> ['meet', 'agnes', 'mo', 'again']
But I found an issue when the sentences contain some dots: the error happens when, after the stemming step, the list contains '' (an empty string).
text = 'News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada 1… '
clean = cleaningText(text)
print('After cleaning ==> ', clean)
emoji = deEmojify(clean)
print('After emoji ==> ', emoji)
cf = casefoldingText(emoji)
print('After case folding ==> ', cf)
token = tokenizingText(cf)
print('After token ==> ', token)
filter = filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
en = convert_eng(stem)
print('After translate ==> ', en)
Result:
After cleaning ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After emoji ==> News update Meski kurang diaspirasi Shoppee yg korea minded dalam waktu indonesa belaja di bulan November Lazada …
After case folding ==> news update meski kurang diaspirasi shoppee yg korea minded dalam waktu indonesa belaja di bulan november lazada …
After token ==> ['news', 'update', 'meski', 'kurang', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'dalam', 'waktu', 'indonesa', 'belaja', 'di', 'bulan', 'november', 'lazada', '…']
After filter ==> ['news', 'update', 'diaspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'belaja', 'november', 'lazada', '…']
After Stem ==> ['news', 'update', 'aspirasi', 'shoppee', 'yg', 'korea', 'minded', 'indonesa', 'baja', 'november', 'lazada', '']
This is the error message:
NotValidPayload Traceback (most recent call last)
<ipython-input-40-cb9390422d3c> in <module>
14 print('After Stem ==> ', stem)
15
---> 16 en = convert_eng(stem)
17 print('After translate ==> ', en)
<ipython-input-28-28bc36c96914> in convert_eng(text)
8 return text
9 def convert_eng(text):
---> 10 text = GoogleTranslator(source='auto', target='en').translate_batch(text)
11 return text
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate_batch(self, batch, **kwargs)
195 for i, text in enumerate(batch):
196
--> 197 translated = self.translate(text, **kwargs)
198 arr.append(translated)
199 return arr
C:\Python\lib\site-packages\deep_translator\google_trans.py in translate(self, text, **kwargs)
108 """
109
--> 110 if self._validate_payload(text):
111 text = text.strip()
112
C:\Python\lib\site-packages\deep_translator\parent.py in _validate_payload(payload, min_chars, max_chars)
44
45 if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit():
---> 46 raise NotValidPayload(payload)
47
48 # check if payload contains only symbols
NotValidPayload: --> text must be a valid text with maximum 5000 character, otherwise it cannot be translated
My idea is to remove the '', since I think that was the problem, but I have no idea how to do that.
Anyone, please kindly help me.
You need to introduce a bit of error checking into your code and only process the expected data type. Your convert_eng function (the one that uses GoogleTranslator#translate_batch) requires a list of non-blank strings as an argument (see the if not payload or not isinstance(payload, str) or not payload.strip() or payload.isdigit(): part), and your stem contains an empty string as the last item in the list.
Besides, it is possible for filteringText(text) to return [] because all the words can turn out to be stopwords. Also, do not use filter as the name of a variable; it is a built-in.
So, change
filter = filteringText(token)
print('After filter ==> ', filter)
stem = stemmingText(filter)
print('After Stem ==> ', stem)
to
filter1 = filteringText(token)
print('After filter ==> ', filter1)
if filter1:
    stem = stemmingText(filter1)
    print('After Stem ==> ', stem)
    en = convert_eng([x for x in stem if x.strip() and not x.isdigit()])
    print('After translate ==> ', en)
I left out the isinstance(x, str) check because I assume you already know your list only contains strings.
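If you prefer to keep that guard inside the translation helper itself, here is a minimal sketch (my own addition, assuming the same deep_translator GoogleTranslator shown in the traceback; the isinstance check is kept for safety):
def convert_eng(text):
    # Drop items that translate_batch would reject: non-strings,
    # empty/blank strings, and purely numeric tokens.
    valid = [x for x in text if isinstance(x, str) and x.strip() and not x.isdigit()]
    if not valid:  # nothing translatable left after filtering/stemming
        return []
    return GoogleTranslator(source='auto', target='en').translate_batch(valid)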
Related
I have a question and a problem with cleaning text for my NLP model. I don't know why I get this error: AttributeError: 'list' object has no attribute 'split'.
Below is my df['Text'].sample(5):
26278    [RT, @davidsirota:, subset, people, website, t...
63243    [RT, @jmartNYT:, The, presses, Team, Biden, As...
61059    [RT, @caitoz:, BREAKING:, Biden, nominate, "Li...
43160    [RT, @K_JeanPierre:, I, profoundly, honored, P...
Name: Text, dtype: object
Below is my code:
def tokenizer(text):
    tokenized = [w for w in text.split() if w not in stopset]
    return tokenized
df['Text'] = df['Text'].apply(tokenizer)

def remove_emoji(string):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def remove_nonwords(Text):
    if re.findall('\d', Text):
        return ''
    else:
        return Text

def clean_text(Text):
    text = ' '.join([i for i in Text.split() if i not in stopset])
    text = ' '.join([stem.stem(word) for word in Text.split()])
    return Text
df['text2'] = df['Text'].apply(clean_text)
Could someone help me?
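For reference, the error comes from applying clean_text to cells that already hold lists: df['Text'] = df['Text'].apply(tokenizer) replaces each string with a list of tokens, and a list has no .split() method. A minimal sketch of a type-aware clean_text (my own variant, reusing the stopset and stem already defined above):
def clean_text(Text):
    # After the tokenizer step each cell is a list of tokens, not a string,
    # so calling Text.split() raises AttributeError.
    words = Text if isinstance(Text, list) else Text.split()
    words = [w for w in words if w not in stopset]
    return ' '.join(stem.stem(word) for word in words)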
I have a script that allows me to extract the info obtained from Excel into a list. This list contains str values holding phrases such as: "I like cooking", "My dog´s name is Doug", etc.
So I tried this code that I found on the Internet, knowing that the int function has a way to transform an actual phrase into numbers.
The code I used is:
lista = ["I like cooking", "My dog´s name is Doug", "Hi, there"]
test_list = [int(i, 36) for i in lista]
Running the code I get the following error:
builtins.ValueError: invalid literal for int() with base 36: "I like
cooking"
But I've tried the code without the spaces or punctuation, and I get an actual value; however, I do need to take those characters into consideration.
To expand on the bytearray approach, you could use int.to_bytes and int.from_bytes to actually get an int back, although the integers will be much longer than you show in your example.
def to_int(s):
    return int.from_bytes(bytearray(s, 'utf-8'), 'big', signed=False)

def to_str(s):
    return s.to_bytes((s.bit_length() + 7) // 8, 'big').decode()

lista = ["I like cooking",
         "My dog´s name is Doug",
         "Hi, there"]

encoded = [to_int(s) for s in lista]
decoded = [to_str(s) for s in encoded]
encoded:
[1483184754092458833204681315544679,
28986146900667755422058678317652141643897566145770855,
1335744041264385192549]
decoded:
['I like cooking',
'My dog´s name is Doug',
'Hi, there']
As noted in the comments, converting phrases to integers with int() won't work if the phrase contains whitespace or most non-alphanumeric characters with a few exceptions.
If your phrases all use a common encoding, then you might get something closer to what you want by converting your strings to bytearrays. For example:
s = 'My dog´s name is Doug'
b = bytearray(s, 'utf-8')
print(list(b))
# [77, 121, 32, 100, 111, 103, 194, 180, 115, 32, 110, 97, 109, 101, 32, 105, 115, 32, 68, 111, 117, 103]
From there you would have to figure out whether or not you want to preserve the list of integers representing each phrase or combine them in some way depending on what you intend to do with these numerical string representations.
Since you want to convert your text for an AI, you should do something like this:
import re

def clean_text(text, vocab):
    '''
    normalizes the string
    '''
    chars = {'\'': [u"\u0060", u"\u00B4", u"\u2018", u"\u2019"],
             'a': [u"\u00C0", u"\u00C1", u"\u00C2", u"\u00C3", u"\u00C4", u"\u00C5", u"\u00E0", u"\u00E1", u"\u00E2", u"\u00E3", u"\u00E4", u"\u00E5"],
             'e': [u"\u00C8", u"\u00C9", u"\u00CA", u"\u00CB", u"\u00E8", u"\u00E9", u"\u00EA", u"\u00EB"],
             'i': [u"\u00CC", u"\u00CD", u"\u00CE", u"\u00CF", u"\u00EC", u"\u00ED", u"\u00EE", u"\u00EF"],
             'o': [u"\u00D2", u"\u00D3", u"\u00D4", u"\u00D5", u"\u00D6", u"\u00F2", u"\u00F3", u"\u00F4", u"\u00F5", u"\u00F6"],
             'u': [u"\u00DA", u"\u00DB", u"\u00DC", u"\u00DD", u"\u00FA", u"\u00FB", u"\u00FC", u"\u00FD"]}
    for gud in chars:
        for bad in chars[gud]:
            text = text.replace(bad, gud)
    if 'http' in text:
        return ''
    text = text.replace('&', ' and ')
    text = re.sub(r'\.( +\.)+', '..', text)
    #text = re.sub(r'\.\.+', ' ^ ', text)
    text = re.sub(r',+', ',', text)
    text = re.sub(r'\-+', '-', text)
    text = re.sub(r'\?+', ' ? ', text)
    text = re.sub(r'\!+', ' ! ', text)
    text = re.sub(r'\'+', "'", text)
    text = re.sub(r';+', ':', text)
    text = re.sub(r'/+', ' / ', text)
    text = re.sub(r'<+', ' < ', text)
    text = re.sub(r'>+', ' > ', text)
    text = text.replace('%', '% ')
    text = text.replace(' - ', ' : ')
    text = text.replace(' -', " - ")
    text = text.replace('- ', " - ")
    text = text.replace(" '", " ")
    text = text.replace("' ", " ")
    #for c in ".,:":
    #    text = text.replace(c + ' ', ' ' + c + ' ')
    text = re.sub(r' +', ' ', text.strip(' '))
    for i in text:
        if i not in vocab:
            text = text.replace(i, '')
    return text
def arr_to_vocab(arr, vocabDict):
    '''
    returns the provided array converted with the provided vocab dict; all array
    elements have to be in the vocab, but not all vocab elements have to be in
    the input array; works with strings too
    '''
    try:
        return [vocabDict[i] for i in arr]
    except Exception as e:
        print(e)
        return []

def str_to_vocab(vocab):
    '''
    generates vocab dicts
    '''
    to_vocab = {}
    from_vocab = {}
    for index, i in enumerate(vocab):
        to_vocab[index] = i
        from_vocab[i] = index
    return to_vocab, from_vocab

vocab = sorted([chr(i) for i in range(32, 127)])  # a basic vocab for your model
vocab.insert(0, None)
toVocab, fromVocab = str_to_vocab(vocab)  # converting vocab into usable form
your_data_str = ["I like cooking", "My dog´s name is Doug", "Hi, there"]  # your data, a list of strings
X = []
for i in your_data_str:
    X.append(arr_to_vocab(clean_text(i, vocab), fromVocab))  # normalizing and converting each string to "ints"
# your data is now almost ready for your model, just pad it to the size of your input with zeros and it's done
print(X)
If you want to know how to convert an "int" string back to a string, tell me.
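For the reverse direction, a minimal sketch (my own addition, reusing the toVocab dict built above) that maps the indices back to characters:
def vocab_to_str(arr, vocabDict):
    # inverse of arr_to_vocab: map each index back to its character and join
    return ''.join(vocabDict[i] for i in arr)

print(vocab_to_str(X[2], toVocab))  # should print the cleaned "Hi, there"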
I have the following example text / tweet:
RT @trader $AAPL 2012 is o´o´o´o´o´pen to “Talk” about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO
I want to follow the procedure of Table 1 in Li, T, van Dalen, J, & van Rees, P.J. (Pieter Jan). (2017). More than just noise? Examining the information content of stock microblogs on financial markets. Journal of Information Technology. doi:10.1057/s41265-016-0034-2 in order to clean up the tweet.
They clean the tweet up in such a way that the final result is:
{RT|123456} {USER|56789} {TICKER|AAPL} {NUMBER|2012} notooopen nottalk patent {COMPANY|GOOG} notdefinetli treatment {HASH|samsung} {EMOTICON|POS} haha {URL}
I use the following script to tokenize the tweet based on the regex:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re

emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                      # eyes
      [\-o\*\']?                  # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
      [\-o\*\']?                  # optional nose
      [:;=8]                      # eyes
      [<>]?
    )"""

regex_strings = (
    # URL:
    r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"""
    ,
    # Twitter username:
    r"""(?:@[\w_]+)"""
    ,
    # Hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Cashtags:
    r"""(?:\$+[\w_]+[\w\'_\-]*[\w_]+)"""
    ,
    # Remaining word types:
    r"""
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """
)

word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        try:
            s = str(s)
        except UnicodeDecodeError:
            s = str(s).encode('string_escape')
            s = unicode(s)
        # Tokenize:
        words = word_re.findall(s)
        if not self.preserve_case:
            words = map((lambda x: x if emoticon_re.search(x) else x.lower()), words)
        return words

if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to “Talk” about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
This yields the following output:
rt
@trader
$aapl
2012
is
oooopen
to
“
talk
”
about
patents
with
goog
definitely
not
the
treatment
#samsung
got
:-)
heh
url_that_cannot_be_posted_on_SO
How can I adjust this script to get:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
“
talk
”
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|url_that_cannot_be_posted_on_SO}
Thanks in advance for helping me out big time!
You really need to use named capturing groups (mentioned by thebjorn), and use groupdict() to get name-value pairs upon each match. It requires some post-processing though:
All pairs where the value is None must be discarded
If the self.preserve_case is false the value can be turned to lower case at once
If the group name is WORD, ELLIPSIS or ELSE the values are added to words as is
If the group name is HASHTAG, CASHTAG, USER or URL the values are first stripped of $, # and @ chars at the start and then added to words as a {<GROUP_NAME>|<VALUE>} item
All other matches are added to words as {<GROUP_NAME>|<VALUE>} item.
Note that \w matches underscores by default, so [\w_] = \w. I optimized the patterns a little bit.
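A quick interpreter check of that claim (my own addition, not part of the original snippet):
>>> import re
>>> re.findall(r'[\w_]+', 'got_milk')
['got_milk']
>>> re.findall(r'\w+', 'got_milk')
['got_milk']
Both patterns treat the underscore as a word character, so dropping the explicit _ changes nothing.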
Here is a fixed code snippet:
import re

emoticon_string = r"""
    (?P<EMOTICON>
      [<>]?
      [:;=8]               # eyes
      [-o*']?              # optional nose
      [][()dDpP/:{}@|\\]   # mouth
      |
      [][()dDpP/:}{@|\\]   # mouth
      [-o*']?              # optional nose
      [:;=8]               # eyes
      [<>]?
    )"""

regex_strings = (
    # URL:
    r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$@.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)"""
    ,
    # Twitter username:
    r"""(?P<USER>@\w+)"""
    ,
    # Hashtags:
    r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)"""
    ,
    # Cashtags:
    r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)"""
    ,
    # Remaining word types:
    r"""
    (?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?)  # Numbers, including fractions, decimals.
    |
    (?P<WORD>\w+)                             # Words without apostrophes or dashes.
    |
    (?P<ELLIPSIS>\.(?:\s*\.)+)                # Ellipsis dots.
    |
    (?P<ELSE>\S)                              # Everything else that isn't whitespace.
    """
)

word_re = re.compile(r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)), re.VERBOSE | re.I | re.UNICODE)
#print(word_re.pattern)
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        try:
            s = str(s)
        except UnicodeDecodeError:
            s = str(s).encode('string_escape')
            s = unicode(s)
        # Tokenize:
        words = []
        for x in word_re.finditer(s):
            for key, val in x.groupdict().items():
                if val:
                    if not self.preserve_case:
                        val = val.lower()
                    if key in ['WORD', 'ELLIPSIS', 'ELSE']:
                        words.append(val)
                    elif key in ['HASHTAG', 'CASHTAG', 'USER', 'URL']:  # Add more here if needed
                        words.append("{{{}|{}}}".format(key, re.sub(r'^[@#$]+', '', val)))
                    else:
                        words.append("{{{}|{}}}".format(key, val))
        return words
if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to “Talk” about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
With test = ' RT @trader $AAPL 2012 is oooopen to “Talk” about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com', it outputs:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
“
talk
”
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://some.site.here.com}
See the regex demo online.
The documentation of sense2vec mentions 3 primary files, the first of them being merge_text.py. I have tried several types of input (txt, csv, a bzipped file) since merge_text.py tries to open files compressed by bzip2.
The file can be found at:
https://github.com/spacy-io/sense2vec/blob/master/bin/merge_text.py
What type of input format does this script require?
Further, could anyone please suggest how to train the model?
I extended and adjusted the code samples from sense2vec.
You go from this input text:
"As far as Saudi Arabia and its motives, that is very simple also. The Saudis are
good at money and arithmetic. Faced with the painful choice of losing money
maintaining current production at US$60 per barrel or taking two million barrels
per day off the market and losing much more money - it's an easy choice: take
the path that is less painful. If there are secondary reasons like hurting US
tight oil producers or hurting Iran and Russia, that's great, but it's really
just about the money."
To this:
as|ADV far|ADV as|ADP saudi_arabia|ENT and|CCONJ its|ADJ motif|NOUN that|ADJ is|VERB very|ADV simple|ADJ also|ADV saudis|ENT are|VERB good|ADJ at|ADP money|NOUN and|CCONJ arithmetic|NOUN faced|VERB with|ADP painful_choice|NOUN of|ADP losing|VERB money|NOUN maintaining|VERB current_production|NOUN at|ADP us$|SYM 60|MONEY per|ADP barrel|NOUN or|CCONJ taking|VERB two_million|CARDINAL barrel|NOUN per|ADP day|NOUN off|ADP market|NOUN and|CCONJ losing|VERB much_more_money|NOUN it|PRON 's|VERB easy_choice|NOUN take|VERB path|NOUN that|ADJ is|VERB less|ADV painful|ADJ if|ADP there|ADV are|VERB secondary_reason|NOUN like|ADP hurting|VERB us|ENT tight_oil_producer|NOUN or|CCONJ hurting|VERB iran|ENT and|CCONJ russia|ENT 's|VERB great|ADJ but|CCONJ it|PRON 's|VERB really|ADV just|ADV about|ADP money|NOUN
Double line breaks are interpreted as separate documents.
URLs are recognized as such and stripped down to domain.tld, marked as |URL.
Nouns (including nouns that are part of noun phrases) are lemmatized (so motives becomes motifs).
Words with POS tags like DET (determiner) and PUNCT (punctuation) are dropped.
Here's the code. Let me know if you have questions.
I'll probably publish it on github.com/woltob soon.
import spacy
import re

nlp = spacy.load('en')
nlp.matcher = None

LABELS = {
    'ENT': 'ENT',
    'PERSON': 'PERSON',
    'NORP': 'ENT',
    'FAC': 'ENT',
    'ORG': 'ENT',
    'GPE': 'ENT',
    'LOC': 'ENT',
    'LAW': 'ENT',
    'PRODUCT': 'ENT',
    'EVENT': 'ENT',
    'WORK_OF_ART': 'ENT',
    'LANGUAGE': 'ENT',
    'DATE': 'DATE',
    'TIME': 'TIME',
    'PERCENT': 'PERCENT',
    'MONEY': 'MONEY',
    'QUANTITY': 'QUANTITY',
    'ORDINAL': 'ORDINAL',
    'CARDINAL': 'CARDINAL'
}

pre_format_re = re.compile(r'^[\`\*\~]')
post_format_re = re.compile(r'[\`\*\~]$')
url_re = re.compile(r'(https?:\/\/)?([a-z0-9-]+\.)?([\d\w]+?\.[^\/]{2,63})')
single_linebreak_re = re.compile('\n')
double_linebreak_re = re.compile('\n{2,}')
whitespace_re = re.compile(r'[ \t]+')
quote_re = re.compile(r'"|`|´')

def strip_meta(text):
    text = text.replace('per cent', 'percent')
    text = text.replace('&gt;', '>').replace('&lt;', '<')
    text = pre_format_re.sub('', text)
    text = post_format_re.sub('', text)
    text = double_linebreak_re.sub('{2break}', text)
    text = single_linebreak_re.sub(' ', text)
    text = text.replace('{2break}', '\n')
    text = whitespace_re.sub(' ', text)
    text = quote_re.sub('', text)
    return text
def transform_doc(doc):
    for ent in doc.ents:
        ent.merge(ent.root.tag_, ent.text, LABELS[ent.label_])
    for np in doc.noun_chunks:
        while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
            np = np[1:]
        np.merge(np.root.tag_, np.text, np.root.ent_type_)
    strings = []
    for sent in doc.sents:
        sentence = []
        if sent.text.strip():
            for w in sent:
                if w.is_space:
                    continue
                w_ = represent_word(w)
                if w_:
                    sentence.append(w_)
            strings.append(' '.join(sentence))
    if strings:
        return '\n'.join(strings) + '\n'
    else:
        return ''

def represent_word(word):
    if word.like_url:
        x = url_re.search(word.text.strip().lower())
        if x:
            return x.group(3) + '|URL'
        else:
            return word.text.lower().strip() + '|URL?'
    text = re.sub(r'\s', '_', word.text.strip().lower())
    tag = LABELS.get(word.ent_type_)
    # Dropping PUNCT such as commas and DET like "the"
    if tag is None and word.pos_ not in ['PUNCT', 'DET']:
        tag = word.pos_
    elif tag is None:
        return None
    # if not word.pos_:
    #     tag = '?'
    return text + '|' + tag
corpus = '''
As far as Saudi Arabia and its motives, that is very simple also. The Saudis are
good at money and arithmetic. Faced with the painful choice of losing money
maintaining current production at US$60 per barrel or taking two million barrels
per day off the market and losing much more money - it's an easy choice: take
the path that is less painful. If there are secondary reasons like hurting US
tight oil producers or hurting Iran and Russia, that's great, but it's really
just about the money.
'''

corpus_stripped = strip_meta(corpus)

doc = nlp(corpus_stripped)
corpus_ = []
for word in doc:
    # only lemmatize NOUN and PROPN
    if word.pos_ in ['NOUN', 'PROPN'] and len(word.text) > 3 and len(word.text) != len(word.lemma_):
        # Keep the original word with the length of the lemma, then add the whitespace, if it was there:
        lemma_ = str(word.text[:1] + word.lemma_[1:] + word.text_with_ws[len(word.text):])
        # print(word.text, lemma_)
        corpus_.append(lemma_)
    # All other words are added normally.
    else:
        corpus_.append(word.text_with_ws)

result = transform_doc(nlp(''.join(corpus_)))

sense2vec_filename = 'text.txt'
with open(sense2vec_filename, 'w') as file:
    file.write(result)
print(result)
You could visualise your model with Gensim in TensorBoard using this approach:
https://github.com/ArdalanM/gensim2tensorboard
I'll also adjust this code to work with the sense2vec approach (e.g. the words become lowercase in the preprocessing step; just comment that out in the code).
Happy coding,
woltob
The input file should be a bzipped json. To use a plain text file, just edit merge_text.py as follows:
def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield line.decode('utf-8', errors='ignore')
            # yield ujson.loads(line)['body']
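To produce a bzipped input from your own plain-text corpus, a minimal sketch (the file names are just placeholders):
import bz2

# Compress a plain-text corpus (one document per line, utf-8) so the
# edited iter_comments() above can read it.
with open('corpus.txt', 'rb') as src:
    with bz2.BZ2File('corpus.txt.bz2', 'wb') as dst:
        dst.write(src.read())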
This is somewhat complicated. I have a list that looks like this:
['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
In my list, the '\n' is what separates a story. What I would like to do is to create a dictionary from the above list that would look like this:
dict = {ID1: [19841018, 'Plunging oil... cut in the price'], ID2: [19841018, 'The U.S. dollar... the foreign-exchange markets']}
You can see that the KEY of my dictionary is the ID and the items are the year and the combination of the stories. Is that doable?
My IDs, are in this format J00100394, J00384932. So they all start with J00.
The tricky part is splitting your list by a given value, so I've taken this part from here. Then I've parsed the list parts to build the res dict:
>>> import itertools
>>> def isplit(iterable,splitters):
... return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]
...
>>> l = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> res = {}
>>> for sublist in isplit(l,('\n',)):
... id_parts = sublist[0].split()
... story = ' '.join (sentence.strip() for sentence in sublist[1:])
... res[id_parts[1].strip()] = [id_parts[0].strip(), story]
...
>>> res
{'ID2': ['19841018', 'The U.S. dollar... the foreign-exchange markets late New York trading'], 'ID1': ['19841018', 'Plunging oil... cut in the price']}
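Since your real IDs start with J00, you can also enforce that prefix while building the dict; a small sketch (my own addition, reusing isplit and l from above):
>>> res = {}
>>> for sublist in isplit(l, ('\n',)):
...     id_parts = sublist[0].split()
...     if id_parts[1].startswith('J00'):  # keep only entries whose ID carries the J00 prefix
...         story = ' '.join(sentence.strip() for sentence in sublist[1:])
...         res[id_parts[1]] = [id_parts[0], story]
...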
I coded an answer that uses a generator. The idea is that every time an ID token starts, the generator returns the last key computed. You can customize it by changing check_fun() and how the parts of the description are combined.
def trailing_carriage(s):
    if s.endswith('\n'):
        return s[:-1]
    return s

def check_fun(s):
    """
    :param s: take a string s
    :return: None if s doesn't match the ID rules; otherwise return the
             name, value of the token
    """
    if ' ' in s:
        id_candidate, name = s.split(" ", 1)
        try:
            return trailing_carriage(name), int(id_candidate)
        except ValueError:
            pass

def parser_list(lines, check_id_prefix=check_fun):
    name = None  # dict key
    id_val = None
    desc = ""  # description string
    for token in lines:
        check = check_id_prefix(token)
        if check is not None:
            if name is not None:
                # Return the previously computed entry
                yield name, id_val, desc
            name, id_val = check
            desc = ""  # reset the description for the new entry
        else:
            # Append to the description
            desc += trailing_carriage(token)
    if name is not None:
        # Flush the last entry
        yield name, id_val, desc
>>> lines = ['19841018 ID1\n', ' Plunging oil... \n', 'cut in the price \n', '\n', '19841018 ID2\n', ' The U.S. dollar... \n', 'the foreign-exchange markets \n', 'late New York trading \n', '\n']
>>> print({k: [i, d] for k, i, d in parser_list(lines)})
{'ID2': [19841018, ' The U.S. dollar... the foreign-exchange markets late New York trading '], 'ID1': [19841018, ' Plunging oil... cut in the price ']}