Python NLTK word_tokenize UnicodeDecodeError - python

I get the error below when trying the following code. I am trying to read from a text file and tokenize the words using nltk. Any ideas? The text file can be found here
from nltk.tokenize import word_tokenize
short_pos = open("./positive.txt","r").read()
#short_pos = short_pos.decode('utf-8').lower()
short_pos_words = word_tokenize(short_pos)
Error:
Traceback (most recent call last):
File "sentimentAnalysis.py", line 19, in <module>
short_pos_words = word_tokenize(short_pos)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 106, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 91, in sent_tokenize
return tokenizer.tokenize(text)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1226, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1274, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1265, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1304, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
for el in it:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1280, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1325, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1460, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 6: ordinal not in range(128)
Thanks for your support.

Looks like this text is encoded in Latin-1. So this works for me:
import codecs
with codecs.open("positive.txt", "r", "latin-1") as inputfile:
    text = inputfile.read()
short_pos_words = word_tokenize(text)
print len(short_pos_words)
You can test for different encodings by, e.g., looking at the file in a good editor like TextWrangler. You can
1) open the file in different encodings to see which one looks good, and
2) look at the character that caused the issue. In your case, that is the character in position 4645, which happens to be part of an accented word from a Spanish review. That is not part of ASCII, so that doesn't work; it's also not valid UTF-8.
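To check this programmatically, here is a minimal sketch (assuming the file is positive.txt): read the raw bytes and try a few candidate encodings. Note that latin-1 accepts any byte sequence, so a clean decode only means that encoding is plausible, not certain.
# Read the raw bytes and report which encodings decode them cleanly
with open("positive.txt", "rb") as f:
    raw = f.read()
for enc in ("ascii", "utf-8", "latin-1"):
    try:
        raw.decode(enc)
        print("%s: decodes without error" % enc)
    except UnicodeDecodeError as e:
        print("%s: fails at byte offset %d (%r)" % (enc, e.start, raw[e.start:e.start + 1]))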

Your file is encoded using "latin-1".
from nltk.tokenize import word_tokenize
import codecs
with codecs.open("positive.txt", "r", "latin-1") as inputfile:
    text = inputfile.read()
short_pos_words = word_tokenize(text)
print short_pos_words
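On Python 3 the same idea works without the codecs module; a minimal sketch, assuming the same file:
from nltk.tokenize import word_tokenize

# Python 3: pass the encoding directly to open()
with open("positive.txt", "r", encoding="latin-1") as inputfile:
    text = inputfile.read()
short_pos_words = word_tokenize(text)
print(len(short_pos_words))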

Related

TypeError: expected string or bytes-like object; I have seen similar posts but I have no NA in my data

I am trying to clean my text data in a spreadsheet and it has no NAs. I face this error: TypeError: expected string or bytes-like object.
import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
paragraph=pd.read_excel("..")
paragraph.info()
paragraph['Subject'].dropna(inplace=True)
sentence = paragraph['Subject'].apply(nltk.sent_tokenize)
lemmatizer=WordNetLemmatizer()
# lemmatizer
for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentence[i] = ' '.join(words)
I am getting these errors below.
Traceback (most recent call last):
File "<ipython-input-20-95ed150df96b>", line 11, in <module>
words=nltk.word_tokenize(sentence[i])
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 143, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\__init__.py", line 105, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1269, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1323, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1323, in <listcomp>
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1313, in span_tokenize
for sl in slices:
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1354, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 317, in _pair_iter
prev = next(it)
File "C:\Users\320055025\AppData\Local\Continuum\anaconda3\lib\site-packages\nltk\tokenize\punkt.py", line 1327, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or bytes-like object
This might work:
words=nltk.word_tokenize(str(sentence[i]))
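For context: sentence[i] here is not a plain string, because apply(nltk.sent_tokenize) returns a list of sentences per row (and any non-string cell in the column would trigger the same error). A minimal sketch of a more direct approach, assuming the same DataFrame paragraph with a 'Subject' column as in the question:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Force the cell to a string, tokenize, lemmatize, and drop stopwords
    words = nltk.word_tokenize(str(text))
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# paragraph comes from the question's pd.read_excel(...) call
cleaned = paragraph['Subject'].dropna().apply(clean_text)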

Extracting text from PDF - Tokenize TypeError

When trying to extract text from PDFs using Textract, I get a TypeError: "cannot use a string pattern on a bytes-like object". Can anyone help with what might be a solution?
When I print(text), I get the text from the PDF I want extracted, though in a somewhat strange format. However, text[0] etc. only contains numbers..?
import textract
import os
from nltk.tokenize import word_tokenize
for filename in os.listdir('Harbour PDF'):
    if '.DS_Store' == filename:
        continue
    filename = 'Harbour PDF/' + filename
    print(filename)
    text = textract.process(filename)
    print(text)
    tokens = word_tokenize(text)
    keywords = [word for word in word_tokenize(text, 'english', False)]
Error:
File "scrapePort.py", line 15, in
tokens = word_tokenize(text)
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/init.py", line 143, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/init.py", line 105, in sent_tokenize
return tokenizer.tokenize(text)
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1269, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1323, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1323, in
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1313, in span_tokenize
for sl in slices:
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1354, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 317, in _pair_iter
prev = next(it)
File "/Users/Rasmus/anaconda3/lib/python3.6/site-packages/nltk/tokenize/punkt.py", line 1327, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: cannot use a string pattern on a bytes-like object
You may need to decode your data first.
text.decode("utf-8")
I had the same problem. Try this:
tokens = word_tokenize(text.decode("utf-8"))
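For context, textract.process returns bytes while NLTK's tokenizers expect str, so decoding once right after extraction keeps the rest of the loop unchanged. A minimal sketch (errors="replace" is an assumption for any stray non-UTF-8 bytes):
import os
import textract
from nltk.tokenize import word_tokenize

for filename in os.listdir('Harbour PDF'):
    if filename == '.DS_Store':
        continue
    path = 'Harbour PDF/' + filename
    raw = textract.process(path)                   # bytes
    text = raw.decode('utf-8', errors='replace')   # str
    tokens = word_tokenize(text)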

NLTK Python word_tokenize [duplicate]

This question already has answers here:
How to fix: "UnicodeDecodeError: 'ascii' codec can't decode byte"
(20 answers)
Python (nltk) - UnicodeDecodeError: 'ascii' codec can't decode byte
(4 answers)
Closed 4 years ago.
I have loaded a txt file that contains 6000 lines of sentences. I have tried to split("\n") and word_tokenize the sentences, but I get the following error:
Traceback (most recent call last):
File "final.py", line 52, in <module>
short_pos_words = word_tokenize(short_pos)
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 128, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 95, in sent_tokenize
return tokenizer.tokenize(text)
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 313, in _pair_iter
for el in it:
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1291, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1337, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1472, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 312, in _pair_iter
prev = next(it)
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 581, in _annotate_first_pass
for aug_tok in tokens:
File "/home/tuanct1997/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 546, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 6: ordinal not in range(128)
The issue is related to the encoding of the file's content. Assuming that you want to decode the str to UTF-8 unicode:
Option 1 (Python 2 only; setdefaultencoding was removed in Python 3):
import sys
reload(sys)
sys.setdefaultencoding('utf8')
Option 2:
Pass the encoding parameter to the open function when opening your text file:
f = open('/path/to/txt/file', 'r+', encoding="utf-8")
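Note that the traceback above is from Python 2.7, where the built-in open() has no encoding parameter; io.open() (or codecs.open()) accepts one and returns unicode text. A minimal sketch:
import io
from nltk.tokenize import word_tokenize

# Python 2.7: io.open accepts encoding= and returns unicode strings
with io.open('/path/to/txt/file', 'r', encoding='utf-8') as f:
    short_pos = f.read()
short_pos_words = word_tokenize(short_pos)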

Getting TypeError: expected string or bytes-like object when trying to use sent_tokenize on a dataset

I'm trying to tokenize the MPQA political debates corpus with:
import nltk
from sklearn.datasets import load_files
categories=['abortion', 'creation', 'gayRights', 'god', 'guns', 'healthcare']
dataset= load_files(r'C:\Users\kahnl\svm tutorial\SomasundaranWiebe-politicalDebates', categories=categories)
from nltk.tokenize import sent_tokenize
sentences= sent_tokenize(dataset)
which gives the error:
Traceback (most recent call last):
File "<pyshell#5>", line 1, in <module>
sentences= sent_tokenize(dataset)
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\__init__.py", line 95, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1237, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1285, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in <listcomp>
return [(sl.start, sl.stop) for sl in slices]
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1316, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 312, in _pair_iter
prev = next(it)
File "C:\Users\kahnl\AppData\Local\Programs\Python\Python36\lib\site-packages\nltk\tokenize\punkt.py", line 1289, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or bytes-like object
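For context: the error comes from passing the whole load_files result to sent_tokenize. load_files returns a sklearn Bunch object, not a string, and the Punkt regex needs a string, so each document has to be tokenized individually. A minimal sketch (the encoding argument is an assumption about how the corpus files are encoded):
from sklearn.datasets import load_files
from nltk.tokenize import sent_tokenize

categories = ['abortion', 'creation', 'gayRights', 'god', 'guns', 'healthcare']
# With encoding set, load_files returns the documents as str instead of bytes
dataset = load_files(r'C:\Users\kahnl\svm tutorial\SomasundaranWiebe-politicalDebates',
                     categories=categories, encoding='utf-8', decode_error='replace')

# The documents live in dataset.data as a list of strings
sentences = [sent_tokenize(doc) for doc in dataset.data]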

Unable to decode yml file ... 'utf8' codec can't decode byte #xa0: invalid start byte

I'm trying to read YAML file and convert it into dictionary file. I'm seeing an issue while loading the file into dict variable.
I tried to search for similar issues. One of the replies on Stack Overflow was to replace each character '\\xa0' with ' '. I tried to do that: line = line.replace('\\xa0',' '). This doesn't work on Python 2.7; when I tried it with Python 3 it works fine.
import yaml
import sys
yaml_dir = "/root/tools/test_case/"
#file_name = "TC_CFD_SR.yml"
file_name = "TC_QB.yml"
tc_file_name = yaml_dir + file_name
def write(file, content):
    file = open(file, 'a')
    file.write(content)
    file.close()

def verifyYmlFile(yml_file):
    data = {}
    with open(yml_file, 'r') as fin:
        for line in fin:
            line = line.replace('\\xa0', ' ')
            write('anand-yaml.yml', line)
    with open('anand-yaml.yml', 'r') as fin:
        data = yaml.load(fin)
    return data

if __name__ == '__main__':
    data = {}
    print "verifying yaml"
    data = verifyYmlFile(tc_file_name)
Error:
[root@anand-harness test_case]# python verify_yaml.py
verifying yaml
Traceback (most recent call last):
File "verify_yaml.py", line 29, in <module>
data= verifyYmlFile(tc_file_name)
File "verify_yaml.py", line 23, in verifyYmlFile
data = yaml.load(fin)
File "/usr/lib64/python2.6/site-packages/yaml/__init__.py", line 71, in load
return loader.get_single_data()
File "/usr/lib64/python2.6/site-packages/yaml/constructor.py", line 37, in get_single_data
node = self.get_single_node()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 36, in get_single_node
document = self.compose_document()
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 55, in compose_document
node = self.compose_node(None, None)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 82, in compose_node
node = self.compose_sequence_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 111, in compose_sequence_node
node.value.append(self.compose_node(node, index))
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 84, in compose_node
node = self.compose_mapping_node(anchor)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 133, in compose_mapping_node
item_value = self.compose_node(node, item_key)
File "/usr/lib64/python2.6/site-packages/yaml/composer.py", line 64, in compose_node
if self.check_event(AliasEvent):
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 98, in check_event
self.current_event = self.state()
File "/usr/lib64/python2.6/site-packages/yaml/parser.py", line 449, in parse_block_mapping_value
if not self.check_token(KeyToken, ValueToken, BlockEndToken):
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 116, in check_token
self.fetch_more_tokens()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 244, in fetch_more_tokens
return self.fetch_single()
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 653, in fetch_single
self.fetch_flow_scalar(style='\'')
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 667, in fetch_flow_scalar
self.tokens.append(self.scan_flow_scalar(style))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1156, in scan_flow_scalar
chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
File "/usr/lib64/python2.6/site-packages/yaml/scanner.py", line 1196, in scan_flow_scalar_non_spaces
while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 91, in peek
self.update(index+1)
File "/usr/lib64/python2.6/site-packages/yaml/reader.py", line 165, in update
exc.encoding, exc.reason)
yaml.reader.ReaderError: 'utf8' codec can't decode byte #xa0: invalid start byte
in "anand-yaml.yml", position 3246
What am I missing?
The character sequence "\\xa0" is not the problem that you see in the message; the problem is the sequence "\xa0" (note that the backslash is not escaped).
Your replacement line should be:
line = line.replace('\xa0',' ')
to circumvent the problem.
If you know what the encoding is, you can do the correct conversion yourself, but that should not be necessary, and neither that nor the above patching is a structural solution. It would be best if the YAML file were generated in a correct way (YAML files default to UTF-8, so it should contain valid UTF-8). It could be UTF-16 without the appropriate BOM (which the yaml library interprets IIRC).
s1 = 'abc\\xa0xyz'
print(repr(s1))
u1 = s1.decode('utf-8') # this works fine
s = 'abc\xa0xyz'
print(repr(s))
u = s.decode('utf-8') # this throws an error
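If the file really does contain latin-1 non-breaking spaces, a cleaner route than patching lines by hand is to decode the whole file explicitly and hand yaml a unicode string. A minimal sketch for Python 2, assuming latin-1 is the source encoding:
import io
import yaml

# Assumption: the file is latin-1 encoded (0xa0 is a non-breaking space there)
with io.open(tc_file_name, 'r', encoding='latin-1') as fin:
    text = fin.read()
data = yaml.load(text)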
