UnicodeDecodeError in bs4 Python 3.4 - python

I have the following piece of code:
from bs4 import BeautifulSoup
# Parse downloads/1.html ... downloads/9.html and print the text of the
# first <div class="content html_format"> found in each page.
for i in range(1, 10):
    print(str(i))
    # Kept exactly as posted: this is the statement the traceback points at.
    soup = BeautifulSoup(open("downloads/" + str(i) + ".html","rt"), 'html.parser')
    content_divs = soup.find_all("div", class_="content html_format")
    print(content_divs[0].get_text())
After executing I get an error:
Traceback (most recent call last):
File "classifier1.py", line 6, in <module>
soup = BeautifulSoup(open("downloads/" + str(i) + ".html","rt"), 'html.parser')
File "C:\Users\K18\Desktop\redictgames\classifier\bs4\__init__.py", line 175, in __init__
markup = markup.read()
File "C:\python34\lib\encodings\cp1251.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x98
in position 11152: character maps to <undefined>
I tried the 'rb' and 'r' modes too, but that doesn't work either...
The files contain articles with Russian words.
A few days ago this was working perfectly.

Related

UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-3: character maps to <undefined>

I'm getting an error, but my output is still saved correctly. How?
This is the output in the terminal:
Traceback (most recent call last):
File "C:\Users\Ankit\Desktop\PYTHON Project\Project App Monitoring\test1.py", line 19, in <module>
file.write(item + "\n")
File "C:\Users\Ankit\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-3: character maps to <undefined>
Note that the output file I expect to be saved looks perfectly fine.
This is the program:
import winreg
# Connect to the registry and collect the name of every subkey under the
# Uninstall key.  The ``with`` blocks close both handles even when a later
# statement raises, so no explicit CloseKey calls are needed.
with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as access_registry:
    with winreg.OpenKey(access_registry, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall") as access_key:
        # QueryInfoKey returns (subkey count, value count, last-modified time);
        # index 0 is the number of subkeys to enumerate.
        subkey_count = winreg.QueryInfoKey(access_key)[0]
        output = []
        for n in range(subkey_count):
            try:
                value = winreg.EnumKey(access_key, n)
            except OSError:
                # EnumKey raises OSError when index n has no subkey; stop there.
                break
            output.append(value)
# Writing the output to a file
with open("output.txt", "w") as file:
    for item in output:
        file.write(item + "\n")

Beginner trying to figure out error and sorting word count

I have to be able to read a text file and count the number of times the words in a line occur. Plus I have to be able to sort the words from most to least occurring. My code so far is below and I keep getting this error:
Traceback (most recent call last):
File "/Users/lritter/Documents/wordcount.py", line 9, in <module>
lines = file.readlines()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 7927: invalid start byte
Code:
import os
# Ask for a line number, then count how often each word of five or more
# characters occurs on that line of the speech.
os.chdir('/Users/lritter/Desktop/Python')
item = int(input('Which line would you like to evaluate? '))
print('You entered: ', item)
with open('Obama_speech.txt') as file:
    lines = file.readlines()
count = {}
for word in lines[item].split():
    if len(word) < 5:
        # Short words are not counted.
        continue
    count[word] = count.get(word, 0) + 1
print(count)

How do I fix "UnicodeEncodeError: 'cp950' codec can't encode character '\u7dab' in position 19: illegal multibyte sequence"

I was trying to get data from a website and write it to a CSV file:
# Rows collected by get_data(); each entry is a list of cell texts.
table_array = []


def get_data(page):
    """Scrape one page of the stock-rating list into ``table_array``.

    Every ``<tr>`` inside the ``<tbody>`` of the first ``<table>`` is echoed
    to stdout (cells separated by a space, one row per line) and appended to
    the module-level ``table_array`` with its last cell dropped.
    """
    url = "https://www.sl886.com/stockrating/list?list=stockrating&page=" + str(page)
    response = requests.get(url, headers=headers)
    body = BeautifulSoup(response.text, "html.parser").find("table").find("tbody")
    for row in body.find_all("tr"):
        cells = [cell.text for cell in row.find_all("td")]
        for text in cells:
            print(text, end = " ")
        print("")
        table_array.append(cells[:-1])
def get_page_number():
    """Return the number of 20-item pages needed for the stock-rating list.

    The item total is read from the second ``<b>`` element inside the
    ``<div class="summary">`` of page 1, with thousands separators removed.
    """
    first_page = requests.get(
        "https://www.sl886.com/stockrating/list?list=stockrating&page=1",
        headers=headers,
    )
    summary = BeautifulSoup(first_page.text, "html.parser").find("div", {"class": "summary"})
    item_total = int(summary.find_all("b")[1].text.replace(",", ""))
    return math.ceil(item_total / 20)
Run:
# Scrape a single page (page 9), then dump every collected row to a CSV file.
# Swap range(1) for range(get_page_number()) to crawl all pages.
for page_offset in range(1):
    get_data(page_offset + 9)

with open("big_bank_rating.csv", "w", newline="") as csvfile:
    write = csv.writer(csvfile)
    header = ["日期", "大行", "股票", "最新評級", "目標價", "變化", "潛在升幅"]
    write.writerow(header)
    # Echo each row before writing, so the console shows what goes to disk.
    for xi in table_array:
        print(xi)
    write.writerows(table_array)
print("!!!Done Append!!!")
but it fails with an encoding error:
Traceback (most recent call last):
File "data_crawl.py", line 55, in <module>
write.writerows(table_array)
UnicodeEncodeError: 'cp950' codec can't encode character '\u7dab' in position 19: illegal multibyte sequence
I tried using the encoding "utf-8", but the result I got was garbled. How can I fix this so that the character '\u7dab' can be encoded?
I have to use
encoding="utf_8_sig"

Collecting data from instagram using json

I am working on a project that requires data from Instagram, and I am collecting it as JSON.
When I use the test link http://vocab.nic.in/rest.php/states/json it works, but when I use https://www.instagram.com/instagram/?__a=1 (also JSON) it doesn't work:
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to <undefined>
import urllib.request
import json
def getResponse(url):
    """Fetch *url* and return its body parsed as JSON.

    Returns the decoded JSON value when the response code is 200.  For any
    other response code an error message is printed and ``None`` is returned.
    """
    # The context manager closes the response on every path.
    with urllib.request.urlopen(url) as operUrl:
        status = operUrl.getcode()
        if status != 200:
            print("Error receiving data", status)
            # Return explicitly: without this the function fell through to a
            # ``return`` of a name that was never assigned on this path.
            return None
        return json.loads(operUrl.read())
def main():
    """Fetch the Instagram profile JSON and print it to stdout."""
    jsonData = getResponse("https://www.instagram.com/instagram/?__a=1")
    # Dump the whole decoded response.
    print(jsonData)


if __name__ == '__main__':
    main()
Full Traceback:
Traceback (most recent call last):
File "C:\Users\andrey_user\Desktop\ig\test.py", line 21, in [module] main()
File "C:\Users\andrey_user\Desktop\ig\test.py", line 18, in main print(jsonData)
File "D:\programs\programing\Python\lib\encodings\cp1251.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to [undefined]

json.loads() gives UnicodeEncodeError when parsing JSON object received from node.js

I am trying to send a JSON object from my node.js server to a Python script. However, when trying to convert the JSON object to a dictionary using json.loads, I get UnicodeEncodeErrors for many inputs. What do I need to do in order to correctly decode the JS object?
Error: UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2062: character maps to <undefined>
at PythonShell.parseError (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:183:17)
at terminateIfNeeded (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:98:28)
at ChildProcess.<anonymous> (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:88:9)
at emitTwo (events.js:106:13)
at ChildProcess.emit (events.js:191:7)
at Process.ChildProcess._handle.onexit (internal/child_process.js:219:12)
at Process.onexit (D:\Users\Temp\Desktop\empman\node_modules\async-listener\glue.js:188:31)
----- Python Traceback -----
File "word.py", line 38, in <module>
json_data=open('data.txt').read()
File "D:\Users\Temp\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
The corresponding Python code:
from docx import Document
from docx.shared import Inches
import sys
import io
import json
document = Document('template.docx')

# Read 21 lines from standard input.
resumearray = [input() for _ in range(21)]

# Earlier attempt, kept for reference (this is the line the Python traceback cites):
#json_data=open('data.txt').read()
# Read the file with an explicit encoding; ``with`` closes the handle
# as soon as its content has been printed.
with io.open('data', 'r', encoding='utf-16-le') as f:
    print(f.read())

# The second input line must parse as JSON; json.loads raises otherwise.
k = resumearray[1]
jsobject = json.loads(k)
document.add_paragraph('_______________________________________________')

# Add the first nine input lines, replacing empty JSON lists with a notice.
for entry in resumearray[:9]:
    if entry == '[]':
        document.add_paragraph('nothing was found')
    else:
        document.add_paragraph(entry)
You are running python on Windows, where the default encoding is cp1252. The json is encoded as utf-8, hence the error.
>>> with open('blob.json', encoding='cp1252') as f:
... j = json.load(f)
...
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/usr/local/lib/python3.6/json/__init__.py", line 296, in load
return loads(fp.read(),
File "/usr/local/lib/python3.6/encodings/cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2795: character maps to <undefined>
Use utf-8 instead:
>>> with open('blob.json', encoding='utf-8') as f:
... j = json.load(f)
...
>>> print(len(j))
29

Categories

Resources