'ascii' codec can't encode character - python

I am fetching an HTML page and reading its source code as a list of strings. Since I have to extract some relevant data from it, I am decoding everything to UTF-8.
I am also using beautifulsoup4, which extracts the text already decoded.
This is the code I have used:
def do_underline(line, mistakes):
    last = u'</u></font>'
    first = u"<u><font color='red'>"
    a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
    lenm = len(mistakes)
    for i in range(lenm):
        a.insert(mistakes[lenm-i-1][2], last)
        a.insert(mistakes[lenm-i-1][1], first)
    b = u''
    return b.join(a)
def readURL(u):
    """
    URL -> List
    Opens a webpage's source code and extracts its text,
    along with blank and new lines.
    Enumerates all lines (including blank and new lines).
    """
    global line_dict, q
    line_dict = {}
    p = opener.open(u)
    p1 = p.readlines()
    q = [i.decode(encoding='UTF-8', errors='ignore') for i in p1]
    q1 = [BeautifulSoup(i).get_text() for i in q]
    q2 = list(enumerate(q1))
    line_dict = {i: j for (i, j) in enumerate(q)}
    return q2
def process_file(f):
    """
    (.html file) -> List of Spelling Mistakes
    """
    global line_dict
    re = readURL(f)
    de = del_blankempty(re)
    fd = form_dict(de)
    fflist = []
    chklst = []
    for i in fd:
        chklst = chklst + list_braces(i, line_dict)
        fflist = fflist + find_index_mistakes(i, fd)
    final_list = list(set(is_inside_braces_or_not(chklst, fflist)))
    final_dict = {i: sorted(list(set([final_list[j] for j in range(len(final_list)) if final_list[j][0] == i])), key=lambda student: student[1]) for i in fd}
    for i in line_dict:
        if i in fd:
            line_dict[i] = do_underline(line_dict[i], final_dict[i])
        else:
            line_dict[i] = line_dict[i]
    create_html_file(line_dict)
    print "Your Task is completed"

def create_html_file(a):
    import io
    fl = io.open('Spellcheck1.html', 'w', encoding='UTF-8')
    for i in a:
        fl.write(a[i])
    print "Your HTML text file is created"
I am getting the following error every time I run the script:
Traceback (most recent call last):
File "checker.py", line 258, in <module>
process_file('https://www.fanfiction.net/s/9421614/1/The-Night-Blooming-Flower')
File "checker.py", line 243, in process_file
line_dict[i] = do_underline(line_dict[i],final_dict[i])
File "checker.py", line 89, in do_underline
a = [i.decode(encoding='UTF-8', errors='ignore') for i in line]
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeEncodeError: 'ascii' codec can't encode character u'\xf3' in position 0: ordinal not in range(128)
Any suggestions on how I can remove this error?
If there is a way to decode everything coming from the given link into UTF-8, then I think it will solve the problem.
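From the traceback it looks like the lines passed to do_underline are already unicode (they were decoded once in readURL), and in Python 2 calling .decode on a unicode string first re-encodes it with the ASCII codec, which is exactly the UnicodeEncodeError shown above. A minimal sketch of a guard that only decodes byte strings, assuming the rest of the script stays as posted:
# Sketch only (Python 2): decode byte strings, leave unicode objects alone.
def to_unicode(s, encoding='UTF-8'):
    if isinstance(s, unicode):          # already decoded, e.g. by readURL
        return s
    return s.decode(encoding, 'ignore')

# then, inside do_underline, instead of decoding unconditionally:
# a = [to_unicode(i) for i in line]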

Related

Python Trouble Parsing a .max translated to OLE File => output unreadable in text format

The following script outputs files that are unreadable as .txt. Please advise.
I based my approach on: https://area.autodesk.com/m/drew.avis/tutorials/writing-and-reading-3ds-max-scene-sidecar-data-in-python
The goal is to turn a mako shark model into a mechanical robot.
import olefile

# set this to your file
f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

def cleanString(data, isArray=False):
    # remove first 6 bytes + last byte
    data = data[6:]
    if isArray:
        data = data[:-1]
    return data

with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "wb")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(s)
Please advise. The model file comes from: https://www.turbosquid.com/fr/3d-models/max-shortfin-mako-shark-rigged/991102#
I also tried this:
with olefile.OleFileIO(f) as ole:
    ole.listdir()
    print(ole.listdir())
    i = 0
    for entry in ole.listdir():
        i = i + 1
        print(entry)
        if i > 2:
            fin = ole.openstream(entry)
            # myString = fin.read().decode("utf-16")
            # myString = cleanString(myString, isArray=True)
            fout = open(entry[0], "w")
            print(fout)
            while True:
                s = fin.read(8192)
                if not s:
                    break
                fout.write(cleanString(s, isArray=True).decode("utf-8"))

# stream = ole.openstream('CustomFileStreamDataStorage/MyString')
# myString = stream.read().decode('utf-16')
# myString = cleanString(myString)
# stream = ole.openstream('CustomFileStreamDataStorage/MyGeometry')
# myGeometry = stream.read().decode('utf-16')
# myGeometry = cleanString(myGeometry, isArray=True)
# myGeometry = myGeometry.split('\x00')
# stream = ole.openstream('CustomFileStreamDataStorage/MyLayers')
# myLayers = stream.read().decode('utf-16')
# myLayers = cleanString(myLayers, isArray=True)
# myLayers = myLayers.split('\x00')
# print("My String: {}\nMy Geometry: {}\nMy Layers: {}".format(myString, myGeometry, myLayers))
What is the right encoding to decode from?
Exception has occurred: UnicodeDecodeError
'utf-8' codec can't decode bytes in position 4-5: invalid continuation byte
File "C:\MRP\ALG_LIN.py", line 59, in
fout.write(cleanString(s, isArray = True).decode("utf-8"))
Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode characters in position 2-5: character maps to
File "C:\MRP\ALG_LIN.py", line 59, in
fout.write(cleanString(s, isArray = True).decode("utf-16"))
KR,
Ludo
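One way to check what each stream actually contains before choosing an encoding is to probe it with chardet; this is only a sketch (it assumes chardet is installed and is not based on any documented .max layout):
import olefile
import chardet

f = r'C:\MRP\Shortfin_Mako_Shark_Rigged_scanline.max'

with olefile.OleFileIO(f) as ole:
    for entry in ole.listdir():
        data = ole.openstream(entry).read()
        guess = chardet.detect(data[:4096])  # guess the encoding from the first bytes
        print(entry, len(data), guess)
        # Streams that come back with encoding=None or very low confidence are
        # most likely binary; those are better written with open(..., "wb") and
        # left undecoded. Only the text-like ones are worth a .decode() attempt.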

Unicode Error when I try to import a tab-separated txt file

(I work on a Mac.)
When I run my Python code to load data from a tab-separated txt file, I get the error: "'utf-8' codec can't decode byte 0xa3 in position 4186: invalid start byte".
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

def load_transactions(file_name, sep="\t"):
    lines = open(file_name, 'rt').readlines()
    transactions_matrix = []
    for l in lines:
        l = l.rstrip('\n')
        transaction = l.split(sep)
        transactions_matrix.append(transaction)
    return transactions_matrix

groceries = load_transactions("Online_Retail.txt", sep="\t")
len(groceries)
Thank you.
I resolved it as follows.
First, detect the file's encoding:
from chardet.universaldetector import UniversalDetector

usock = open('/Users/leonorbrites/Desktop/Online_Retail.txt', 'rb')
detector = UniversalDetector()
for line in usock.readlines():
    detector.feed(line)
    if detector.done:
        break
detector.close()
usock.close()
print(detector.result)
Then open the file with the detected encoding:
def transactions(file_name, sep="\t"):
    lines = open(file_name, 'rt', encoding='iso-8859-1').readlines()
    transactions_matrix = []
    for l in lines:
        l = l.rstrip('\n')
        transaction = l.split(sep)
        transactions_matrix.append(transaction)
    return transactions_matrix

retail = transactions('/Users/leonorbrites/Desktop/Online_Retail.txt', sep="\t")
len(retail)
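Since pandas is already imported, the same file can presumably be loaded in one call by passing the detected encoding directly; a sketch, assuming the iso-8859-1 guess from chardet is correct:
import pandas as pd

# Sketch: let pandas do the decoding with the encoding chardet reported.
retail_df = pd.read_csv('/Users/leonorbrites/Desktop/Online_Retail.txt',
                        sep='\t', encoding='iso-8859-1')
print(len(retail_df))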

How to translate files encoded in ANSI into Unicode

When I use the CountVectorizer in sklearn, it needs the file content as Unicode, but my data files are encoded in ANSI.
I tried converting the encoding to Unicode with Notepad++, but then readlines could not read all the lines; it only read the last line. After that, I tried to read the lines from the data files and write them to a new file as Unicode, but I failed.
def merge_file():
    root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
    resname = 'resule_final.txt'
    if os.path.exists(resname):
        os.remove(resname)
    result = codecs.open(resname, 'w', 'utf-8')
    num = 1
    for back_name in os.listdir(r'd:\\workspace\\minibatchk-means\\data\\20_newsgroups'):
        current_dir = root_dir + str(back_name)
        for filename in os.listdir(current_dir):
            print num, ":", str(filename)
            num = num + 1
            path = current_dir + "\\" + str(filename)
            source = open(path, 'r')
            line = source.readline()
            line = line.strip('\n')
            line = line.strip('\r')
            while line != "":
                line = unicode(line, "gbk")
                line = line.replace('\n', ' ')
                line = line.replace('\r', ' ')
                result.write(line + ' ')
                line = source.readline()
            else:
                print 'End file :' + str(filename)
                result.write('\n')
            source.close()
    print 'End All.'
    result.close()
The error message is: UnicodeDecodeError: 'gbk' codec can't decode bytes in position 0-1: illegal multibyte sequence
Oh, I found the way.
First, use chardet to detect the string's encoding.
Second, use codecs to read from or write to the file in that specific encoding.
Here is the code:
import chardet
import codecs
import os

root_dir = "d:\\workspace\\minibatchk-means\\data\\20_newsgroups\\"
num = 1
failed = []
for back_name in os.listdir("d:\\workspace\\minibatchk-means\\data\\20_newsgroups"):
    current_dir = root_dir + str(back_name)
    for filename in os.listdir(current_dir):
        print num, ":", str(filename)
        num = num + 1
        path = current_dir + "\\" + str(filename)
        content = open(path, 'r').read()
        source_encoding = chardet.detect(content)['encoding']
        if source_encoding == None:
            print '??', filename
            failed.append(filename)
        elif source_encoding != 'utf-8':
            content = content.decode(source_encoding, 'ignore')
            codecs.open(path, 'w', encoding='utf-8').write(content)
print failed
Thanks for all your help.

UnicodeDecodeError: Unexpected end of data issue

I am currently extracting comments from various subreddits on Reddit using praw and trying to calculate their sentiment and add them to a database.
It works by reading from a file that contains subreddit names in order to know which subreddit to pull the comments from.
with open('subs.txt') as f:
    for line in f:
        string = line.strip()
        for submission in reddit.subreddit(string).hot(limit=10):
            subreddit = reddit.subreddit(line.strip())
            name = str(subreddit.display_name)
            comments = submission.comments.list()
            for c in comments:
                if isinstance(c, MoreComments):
                    continue
                #print c.body
                author = c.author
                score = c.score
                created_at = c.created_utc
                upvotes = c.ups
                #print c.score
                comment_sentiment = getSentiment(c.body)
                subreddit_sentiment += comment_sentiment
                num_comments += 1
What I have currently implemented works fine up until it reaches a certain comment where it throws the following error message:
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I have looked at a bunch of different questions on here where people have encountered the same issue, but the solutions given did not seem to help my problem.
The full stack trace is as follows:
Traceback (most recent call last):
File "extract.py", line 48, in <module>
comment_sentiment = getSentiment(c.body)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 93, in getSentiment
tagged_sentences = makeTag(pos_tag_text, max_key_size, dictionary)
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 106, in makeTag
return [addTag(sentence, max_key_size, dictionary) for sentence in postagged_sentences]
File "/Users/b38/Desktop/FlaskApp/sentiment_analysis.py", line 119, in addTag
expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8',errors='ignore')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 8-10: unexpected end of data
I've been racking my brain trying to think of various ways to solve this issue and unfortunately I'm lost. Is it something to do with reading from the file containing the subreddits, or could it be related to the limit when pulling data with praw? I have tried to isolate the problem but can't seem to shake this error.
Would anyone be able to help me fix this issue? I would appreciate any insight.
Many thanks.
EDIT:
sentiment_analysis.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import pandas as pd
import nltk
import yaml
import sys
import os
import re

# splitting the text initially
def splitString(text):
    nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
    sentences = nltk_splitter.tokenize(text)
    tokenized_sentences = [nltk_tokenizer.tokenize(sentence) for sentence in sentences]
    return tokenized_sentences

def tagWords(sentence, max_key_size, dictionary, tag_stem=False):
    # Tag all possible sentences
    tagged_sentence = []
    length = len(sentence)
    if max_key_size == 0:
        max_key_size = length
    i = 0
    while (i < length):
        j = min(i + max_key_size, length)
        tagged = False
        while (j > i):
            expression_word = ' '.join([word[0] for word in sentence[i:j]]).lower().encode('utf-8', errors='ignore')  # here is where it gets caught
            expression_stem = ' '.join([word[1] for word in sentence[i:j]]).lower().encode('utf-8', errors='ignore')
            if tag_stem == True:
                word = expression_word
            else:
                word = expression_word
            ....
Try encoding the string explicitly:
c.body.encode('utf-8')
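If that alone does not help: the traceback suggests the tokens being joined are byte strings, and in Python 2 str.encode first decodes them with the default codec, which is where the UnicodeDecodeError comes from. A sketch of a defensive conversion, assuming the rest of tagWords stays as posted:
# Sketch (Python 2): make every token unicode before joining, so the implicit
# decode inside str.encode never runs on raw bytes.
def safe_unicode(s, encoding='utf-8'):
    if isinstance(s, unicode):
        return s
    return s.decode(encoding, 'replace')  # or 'ignore' to silently drop bad bytes

# inside tagWords, for example:
# expression_word = ' '.join([safe_unicode(word[0]) for word in sentence[i:j]]).lower()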

UnicodeDecodeError: 'gbk' codec can't decode bytes for Chinese

Environment:
- Mac OS Yosemite
- Python 2.7
- The file that I am reading is saved in txt format
So I have a script to segment Chinese text into sentences, and below is the code:
# coding: utf-8
cutlist = "。!?".decode('utf-8')

def FindToken(cutlist, char):
    if char in cutlist:
        return True
    else:
        return False

def Cut(cutlist, lines):
    l = []
    line = []
    for i in lines:
        if FindToken(cutlist, i):
            line.append(i)
            l.append(''.join(line))
            line = []
        else:
            line.append(i)
    return l
for lines in file("t.txt"):
    l = Cut(list(cutlist), list(lines.decode('gbk')))
    for line in l:
        if line.strip() != "":
            li = line.strip().split()
            for sentence in li:
                print sentence
But I am getting the UnicodeDecodeError shown in the title.
Can someone give me some guidance as to what is causing this error? Thanks!
So I changed the decode to utf-8 as follows:
l = Cut(list(cutlist),list(lines.decode('utf-8')))
And it works now.
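For reference, a minimal sketch of the same read loop using io.open, so the decoding happens while reading and no manual .decode call is needed (it reuses the Cut/cutlist definitions above and assumes the file really is UTF-8; change the encoding if it is not):
import io

# Sketch: io.open yields unicode lines directly, so no manual .decode is needed.
with io.open("t.txt", encoding='utf-8', errors='ignore') as fh:
    for lines in fh:
        l = Cut(list(cutlist), list(lines))
        for line in l:
            if line.strip() != "":
                for sentence in line.strip().split():
                    print sentence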
