UnicodeDecodeError: Unexpected end of data issue - python

I am currently extracting comments from various subreddits on Reddit using praw and trying to calculate their sentiment and add them to a database.
It works by reading from a file that contains subreddit names in order to know which subreddit to pull the comments from.
# Walk every subreddit named in subs.txt (one name per line) and score the
# comments of its ten current "hot" submissions.
# NOTE(review): `reddit`, `MoreComments`, `getSentiment`, `subreddit_sentiment`
# and `num_comments` are defined outside this snippet.
with open('subs.txt') as f:
    for line in f:
        string = line.strip()
        # Resolve the subreddit once per line rather than once per submission.
        subreddit = reddit.subreddit(string)
        name = str(subreddit.display_name)
        for submission in subreddit.hot(limit=10):
            comments = submission.comments.list()
            for c in comments:
                # Skip MoreComments entries; only real comments are scored.
                if isinstance(c, MoreComments):
                    continue
                author = c.author
                score = c.score
                created_at = c.created_utc
                upvotes = c.ups
                comment_sentiment = getSentiment(c.body)
                subreddit_sentiment += comment_sentiment
                num_comments += 1
What I have currently implemented works fine up until it reaches a certain comment where it throws the following error message:
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I have looked at a bunch of different questions on here where people have encountered the same issue, but the solutions given did not seem to help my problem.
The full stack trace is as follows:
Traceback (most recent call last):
File "extract.py", line 48, in <module>
comment_sentiment = getSentiment(c.body)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 93, in getSentiment
tagged_sentences = makeTag(pos_tag_text, max_key_size, dictionary)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 106, in makeTag
return [addTag(sentence, max_key_size, dictionary) for sentence in postagged_sentences]
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 119, in addTag
expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8',errors='ignore')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I've been racking my brain trying to think of various ways to solve this issue and unfortunately I'm lost. Is it something to do with reading from the file containing the subreddits or would it be concerning the limit of pulling data with praw? I have tried to isolate the problem but can't seem to shake this error.
Would anyone be able to help me fix this issue? I would appreciate any insight.
Many thanks.
EDIT:
sentiment_analysis.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import pandas as pd
import nltk
import yaml
import sys
import os
import re
# Splitting the text initially.
def splitString(text):
    """Tokenize *text* into sentences, each returned as a list of word tokens.

    Sentences are split with the Punkt model loaded from
    ``tokenizers/punkt/english.pickle``; each sentence is then split into
    words with NLTK's ``TreebankWordTokenizer``.

    Returns a list with one inner list of tokens per sentence.
    """
    nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
    sentences = nltk_splitter.tokenize(text)
    tokenized_sentences = [nltk_tokenizer.tokenize(sentence) for sentence in sentences]
    return tokenized_sentences
def tagWords(sentence,max_key_size, dictionary, tag_stem=False):
# Tag all possible sentences
tagged_sentence = []
length = len(sentence)
if max_key_size == 0:
max_key_size = length
i = 0
while (i < length):
j = min(i + max_key_size, length)
tagged = False
while (j > i):
expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8',errors='ignore') // here is where it gets caught
expression_stem = ' '.join([word[1] for word in sentence[i:j]]).lower().encode('utf-8',errors='ignore')
if tag_stem == True:
word = expression_word
else:
word = expression_word
....

Try encoding the string explicitly:
c.body.encode('utf-8')

Related

Unicode Error when I try import a txt file tab separated

(I work on Mac)
When I insert my python code to obtain data from txt file (tab separated) I have the error: "'utf-8' codec can't decode byte 0xa3 in position 4186: invalid start byte".
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
def load_transactions(file_name, sep="\t", encoding=None):
    """Read a delimited text file into a list of rows.

    Args:
        file_name: Path of the text file to read.
        sep: Field separator used to split each line (tab by default).
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the function used before this
            parameter existed. Pass e.g. ``'iso-8859-1'`` when the file is
            not valid in the default encoding.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by stripping trailing newlines and splitting on *sep*.

    Raises:
        UnicodeDecodeError: If the file content is not valid in *encoding*.
    """
    # The context manager closes the file even if decoding fails part-way.
    with open(file_name, 'rt', encoding=encoding) as handle:
        return [line.rstrip('\n').split(sep) for line in handle]
groceries=load_transactions("Online_Retail.txt",sep="\t")
len(groceries)
Thank you.
I resolved it this way:
First, detect the encoding of the file:
from chardet.universaldetector import UniversalDetector
# Feed the file to chardet one line at a time and stop as soon as the
# detector reports that it is done.
detector = UniversalDetector()
# Binary mode: the detector works on raw bytes. The context manager closes
# the file even if feeding raises.
with open('/Users/leonorbrites/Desktop/Online_Retail.txt', 'rb') as usock:
    for line in usock:
        detector.feed(line)
        if detector.done:
            break
detector.close()
print(detector.result)
Then open the file with the detected encoding:
def transactions(file_name, sep="\t", encoding='iso-8859-1'):
    """Read a delimited text file into a list of rows.

    Args:
        file_name: Path of the text file to read.
        sep: Field separator used to split each line (tab by default).
        encoding: Text encoding used to decode the file. Defaults to
            ``'iso-8859-1'``, the value that was previously hard-coded.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by stripping trailing newlines and splitting on *sep*.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(file_name, 'rt', encoding=encoding) as handle:
        return [line.rstrip('\n').split(sep) for line in handle]
retail=transactions('/Users/leonorbrites/Desktop/Online_Retail.txt', sep="\t")
len(retail)

How to convert ANSI-encoded files to Unicode

When I use CountVectorizer in sklearn, it needs the input file to be Unicode-encoded, but my data file is ANSI-encoded.
I tried converting the file to Unicode with Notepad++, but then readlines could not read all the lines; it only read the last line. After that, I tried reading the lines from the data file and writing them to a new file as Unicode, but that failed.
def merge_file():
    """Merge every file under the 20_newsgroups tree into one UTF-8 file.

    Any existing ``resule_final.txt`` is deleted first. For each file in each
    sub-directory, the lines are decoded as GBK, newline and carriage-return
    characters are replaced by spaces, and the text is appended to the result
    followed by a single newline, so each source file ends up on one line.

    Written for Python 2 (relies on the ``unicode`` builtin).

    NOTE(review): decoding raises UnicodeDecodeError for any file that is not
    valid GBK; confirm the real encoding of the data.
    NOTE(review): only the first line is stripped before the emptiness test,
    so a file whose first line is blank is skipped entirely.
    """
    root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
    resname = 'resule_final.txt'
    if os.path.exists(resname):
        os.remove(resname)
    num = 1
    # Both handles are context-managed so they are closed even when a
    # decode error aborts the merge.
    with codecs.open(resname, 'w', 'utf-8') as result:
        for back_name in os.listdir(root_dir):
            current_dir = root_dir + str(back_name)
            for filename in os.listdir(current_dir):
                print('%s : %s' % (num, filename))
                num = num + 1
                path = current_dir + "\\" + str(filename)
                with open(path, 'r') as source:
                    line = source.readline()
                    line = line.strip('\n')
                    line = line.strip('\r')
                    # readline() returns '' at end of file, which ends the loop.
                    while line != "":
                        line = unicode(line, "gbk")
                        line = line.replace('\n', ' ')
                        line = line.replace('\r', ' ')
                        result.write(line + ' ')
                        line = source.readline()
                print('End file :' + str(filename))
                result.write('\n')
    print('End All.')
The error message is :UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence
Oh, I found a way.
First, use chardet to detect the encoding of each file's content.
Second, use codecs to read or write the file in a specific encoding.
Here is the code.
import chardet
import codecs
import os
# Re-encode, in place, every file under the 20_newsgroups tree that chardet
# does not already report as UTF-8. Files whose encoding cannot be detected
# are collected in `failed` and left untouched.
# Written for Python 2: read() returns a byte string that is then decoded.
root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir(root_dir):
    current_dir = root_dir + str(back_name)
    for filename in os.listdir(current_dir):
        print('%s : %s' % (num, filename))
        num = num + 1
        path = current_dir + "\\" + str(filename)
        # Close the read handle before the same path is reopened for writing.
        with open(path, 'r') as source:
            content = source.read()
        source_encoding = chardet.detect(content)['encoding']
        if source_encoding is None:
            print('?? %s' % filename)
            failed.append(filename)
        elif source_encoding != 'utf-8':
            # 'ignore' drops bytes that are invalid in the detected encoding.
            content = content.decode(source_encoding, 'ignore')
            with codecs.open(path, 'w', encoding='utf-8') as target:
                target.write(content)
print(failed)
Thanks for all your help.

Writing to CSV - set vs list - UnicodeEncodeError

I'm building a simple scraper in order to learn python.
After writing the csvWriter function below, I'm having issues. It seems that the encoding can't be written to csv file (I assume this is because of price information I'm scraping).
Also, I'm wondering if I am correct in thinking that in this case, it is best to go from set -> list to get the information zipped and presented in the way that I want before writing.
Also - any general advice on how I am approaching this?
from bs4 import BeautifulSoup
import requests
import time
import csv
# Download the search-results page and collect one <div class="hotel-wrap">
# element per hotel. The module imported above is `requests`, not `request`.
response = requests.get('http://website.com/subdomain/logqueryhere')
baseurl = 'http://website.com'
soup = BeautifulSoup(response.text)
hotelInfo = soup.find_all("div", {'class': "hotel-wrap"})
# retrieveLinks: A function to generate a list of hotel URLs to be passed to the price checker.
def retrieveLinks():
    """Collect the link of every hotel in ``hotelInfo`` and scrape them all.

    The ``href`` of the first ``<a>`` matched by ``class=''`` inside each
    hotel element is appended to one list, and ``scraper`` is called once
    with the complete list.
    """
    # Build the list once, outside the loop, so it accumulates every hotel
    # instead of being reset on each iteration.
    urllist = []
    for hotel in hotelInfo:
        hotelLink = hotel.find('a', attrs={'class': ''})
        urllist.append(hotelLink['href'])
    scraper(urllist)
hotelnameset = set()
hotelurlset = set()
hotelpriceset = set()
# Scraper: A function to scrape from the lists generated above with retrieveLinks
def scraper(inputlist):
    """Fetch each hotel page and record its URL, name and price.

    Args:
        inputlist: Iterable of URL paths; each one is appended to ``baseurl``.

    Side effects:
        Adds to the module-level sets ``hotelurlset``, ``hotelnameset`` and
        ``hotelpriceset`` (sets are used to avoid duplicates), sleeps two
        seconds after each request, and finally calls ``csvWriter``.
    """
    # No `global` statements are needed: the sets are only mutated, never
    # rebound.
    for url in inputlist:
        fullurl = baseurl + url
        hotelurlset.add(str(fullurl))
        hotelresponse = requests.get(fullurl)
        hotelsoup = BeautifulSoup(hotelresponse.text)
        hoteltitle = hotelsoup.find('div', attrs={'class': 'vcard'})
        hotelhighprice = hotelsoup.find('div', attrs={'class': 'pricing'}).text
        hotelpriceset.add(hotelhighprice)
        # Keep the name as returned by BeautifulSoup: wrapping it in str()
        # raises UnicodeEncodeError on Python 2 for non-ASCII hotel names.
        hotelnameset.add(hoteltitle.find('h1').text)
        time.sleep(2)
    csvWriter()
#csvWriter: A function to write the above mentioned sets/lists to a CSV file.
def csvWriter():
    """Write the collected hotel names, URLs and prices to ``hoteldata.csv``.

    Targets Python 2, where the ``csv`` module writes byte strings to a file
    opened in ``'wb'`` mode; unicode cells are therefore encoded as UTF-8
    before being written, which avoids the ``'ascii' codec can't encode``
    error on characters such as the euro sign.

    NOTE(review): the three sets are unordered and de-duplicated
    independently, so zipping them does not guarantee that a name, URL and
    price in the same row belong to the same hotel.
    """
    csvname = list(hotelnameset)
    csvurl = list(hotelurlset)
    csvprice = list(hotelpriceset)
    # Let's zip the values we need (until we learn a better way to do it).
    zipped = zip(csvname, csvurl, csvprice)
    # The context manager closes (and flushes) the file when writing ends.
    with open("hoteldata.csv", 'wb') as csvfile:
        c = csv.writer(csvfile)
        for row in zipped:
            # On Python 2 `str` is the byte-string type, so only unicode
            # cells are encoded.
            c.writerow([cell if isinstance(cell, str) else cell.encode('utf-8')
                        for cell in row])
retrieveLinks()
Error is as follows -
± |Add_CSV_Writer U:2 ✗| → python main.py
Traceback (most recent call last):
File "main.py", line 62, in <module>
retrieveLinks()
File "main.py", line 18, in retrieveLinks
scraper(urllist)
File "main.py", line 44, in scraper
csvWriter()
File "main.py", line 60, in csvWriter
c.writerow(row)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u20ac' in position 0: ordinal not in range(128)
Posting your actual error will really help! In any case, in python 2.X the CSV writer does not automatically encode unicode for you. You essentially have to write your own using unicodecsv (https://pypi.python.org/pypi/unicodecsv/0.9.0) or use one of the unicode CSV implementations on the web (1):
import unicodecsv
def csvWriter():
    """Write the collected hotel names, URLs and prices to ``hoteldata.csv``.

    The three module-level sets are turned into lists and zipped into rows;
    each row is written through ``unicodecsv`` so unicode cells are encoded
    as UTF-8.
    """
    global hotelnameset, hotelurlset, hotelpriceset
    names = list(hotelnameset)
    urls = list(hotelurlset)
    prices = list(hotelpriceset)
    # Pair the values up column-wise to form the CSV rows.
    rows = zip(names, urls, prices)
    with open('hoteldata.csv', 'wb') as out_file:
        writer = unicodecsv.writer(out_file, encoding='utf-8')
        for record in rows:
            writer.writerow(record)

UnicodeDecodeError: 'gbk' codec can't decode bytes for Chinese

Environment:
- Mac OS Yosemite
- Python 2.7
- The file that I am reading is saved in txt format
So I have a script to segment Chinese text into sentences and below is the code:
# coding: utf-8
cutlist ="。!?".decode('utf-8')
def FindToken(cutlist, char):
    """Return True if *char* is one of the terminators in *cutlist*.

    Args:
        cutlist: Container of sentence-ending characters.
        char: The character to test.
    """
    return char in cutlist
def Cut(cutlist, lines):
    """Split a sequence of characters into sentences.

    Args:
        cutlist: Container of sentence-ending characters.
        lines: Iterable of characters to segment.

    Returns:
        A list of strings, each ending with one of the terminators in
        *cutlist*. Characters that follow the last terminator are not
        returned.
    """
    l = []
    line = []
    for i in lines:
        line.append(i)
        if FindToken(cutlist, i):
            # The terminator closes the current sentence; start a new one.
            l.append(''.join(line))
            line = []
    return l
# Segment t.txt line by line and print every whitespace-separated piece of
# every sentence found. Written for Python 2, where each line read from the
# file is a byte string that must be decoded.
# NOTE(review): the codec passed to decode() must match the file's real
# encoding; 'gbk' raises UnicodeDecodeError on a file saved in another one.
with open("t.txt") as source:
    for lines in source:
        l = Cut(list(cutlist), list(lines.decode('gbk')))
        for line in l:
            if line.strip() != "":
                li = line.strip().split()
                for sentence in li:
                    print(sentence)
But I am getting the following error: UnicodeDecodeError: 'gbk' codec can't decode bytes.
Can someone give me some guidance as of what is causing this error? Thanks!
So I changed decode to utf-8 as following:
l = Cut(list(cutlist),list(lines.decode('utf-8')))
And it works now.

'ascii' codec can't encode character

I am trying to fetch an HTML page from a link and take its source code as a list of strings. As I have to get some relevant data from it, I am decoding everything as UTF-8.
I am also using beautifulsoup4 which extracts the text in decoded form.
This is my code that I have used.
def do_underline(line, mistakes):
    """Wrap each marked span of *line* in red-underline HTML tags.

    Args:
        line: The text of one line, as text or as UTF-8 bytes.
        mistakes: Sequence of indexable records, ordered by position, where
            item ``[1]`` is the index at which the opening tag is inserted
            and item ``[2]`` the index at which the closing tag is inserted.
            NOTE(review): item ``[0]`` is presumably the line number; it is
            not used here.

    Returns:
        The line as text with the tags inserted.
    """
    last = u'</u></font>'
    first = u"<u><font color='red'>"
    # Decode only real byte strings. Calling .decode() on text that is
    # already decoded makes Python 2 encode it to ASCII first, which raises
    # UnicodeEncodeError on any non-ASCII character.
    if isinstance(line, bytes):
        line = line.decode('UTF-8', 'ignore')
    a = [i.decode('UTF-8', 'ignore') if isinstance(i, bytes) else i
         for i in line]
    # Insert from the last mistake backwards so that earlier indexes are not
    # shifted by tags that have already been inserted.
    for mistake in reversed(mistakes):
        a.insert(mistake[2], last)
        a.insert(mistake[1], first)
    return u''.join(a)
def readURL(u):
    """
    URL -> List
    Open a web page's source code and extract its text, keeping blank and
    new lines, and enumerate all lines (including blank and new lines).

    Returns a list of ``(index, text)`` pairs, where ``text`` is the line
    with its markup removed by BeautifulSoup.

    Side effects: rebinds the globals ``q`` (the decoded source lines) and
    ``line_dict`` (line index -> decoded source line).
    """
    global line_dict, q
    line_dict = {}
    p = opener.open(u)
    try:
        p1 = p.readlines()
    finally:
        # Release the connection even if reading fails.
        p.close()
    # Bytes that are not valid UTF-8 are dropped rather than raising.
    q = [i.decode(encoding='UTF-8', errors='ignore') for i in p1]
    q1 = [BeautifulSoup(i).get_text() for i in q]
    q2 = list(enumerate(q1))
    line_dict = dict(enumerate(q))
    return q2
def process_file(f):
    """
    (.html file) -> List of Spelling Mistakes

    Read the page at *f*, locate the mistakes on each line, underline them
    in the global ``line_dict`` and write the result with
    ``create_html_file``.
    """
    global line_dict
    # Named `numbered_lines` rather than `re` so the regex module name is
    # not shadowed.
    numbered_lines = readURL(f)
    de = del_blankempty(numbered_lines)
    fd = form_dict(de)
    fflist = []
    chklst = []
    for i in fd:
        chklst = chklst + list_braces(i, line_dict)
        fflist = fflist + find_index_mistakes(i, fd)
    # De-duplicate once; the per-line groups below inherit the uniqueness.
    final_list = list(set(is_inside_braces_or_not(chklst, fflist)))
    # Group the mistakes by the key stored in item [0] and order each group
    # by item [1].
    final_dict = {
        i: sorted((m for m in final_list if m[0] == i), key=lambda m: m[1])
        for i in fd
    }
    for i in line_dict:
        # Lines without an entry in fd are left unchanged.
        if i in fd:
            line_dict[i] = do_underline(line_dict[i], final_dict[i])
    create_html_file(line_dict)
    print("Your Task is completed")
def create_html_file(a):
    """Write the values of mapping *a* to ``Spellcheck1.html`` as UTF-8.

    Args:
        a: Mapping whose values are text; they are written in the mapping's
            iteration order.
    """
    import io
    # The context manager closes the file so buffered output is flushed to
    # disk before the function returns.
    with io.open('Spellcheck1.html', 'w', encoding='UTF-8') as fl:
        for i in a:
            fl.write(a[i])
    print("Your HTML text file is created")
I am getting the following error every time i run the script.
Traceback (most recent call last):
File "checker.py", line 258, in <module>
process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
File "checker.py", line 243, in process_file
line_dict[i] = do_underline(line_dict[i],final_dict[i])
File "checker.py", line 89, in do_underline
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
Any suggestions on how I can remove this error?
If there is a way to decode everything coming from the given link into UTF-8, then I think it will solve the problem.

Categories

Resources