I have a part of code
from bs4 import BeautifulSoup
for i in range(1,10):
print(str(i))
soup = BeautifulSoup(open("downloads/" + str(i) + ".html","rt"), 'html.parser')
text1 = soup.find_all("div", class_="content html_format")
text1 = text1[0].get_text()
print(text1)
After executing I get an error:
Traceback (most recent call last):
File "classifier1.py", line 6, in <module>
soup = BeautifulSoup(open("downloads/" + str(i) + ".html","rt"), 'html.parser')
File "C:\Users\K18\Desktop\redictgames\classifier\bs4\__init__.py", line 175, in __init__
markup = markup.read()
File "C:\python34\lib\encodings\cp1251.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x98
in position 11152: character maps to <undefined>
I was tried 'rb' and 'r' modes too, but this doesn't work...
In the files I have articles with Russian words.
Few days ago that was working perfectly
Related
I'm having error but my output will be save perfectly. How?
this is the output in terminal......
Traceback (most recent call last):
File "C:\Users\Ankit\Desktop\PYTHON Project\Project App Monitoring\test1.py", line 19, in <module>
file.write(item + "\n")
File "C:\Users\Ankit\AppData\Local\Programs\Python\Python310\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-3: character maps to <undefined>
Note that: my output file that i expect to save is properly fine.
this is the program......
import winreg
# Connecting to the registry key
access_registry = winreg.ConnectRegistry(None,winreg.HKEY_LOCAL_MACHINE)
access_key = winreg.OpenKey(access_registry,r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall")
# Enumerating all values under the key
output = []
for n in range(winreg.QueryInfoKey(access_key)[0-3]):
try:
value = winreg.EnumKey(access_key, n)
output.append(value)
except:
break
# Writing the output to a file
with open("output.txt", "w") as file:
for item in output:
file.write(item + "\n")
# Closing the connection
winreg.CloseKey(access_key)
winreg.CloseKey(access_registry)
I have to be able to read a text file and count the number of times the words in a line occur. Plus I have to be able to sort the words from most to least occurring. My code so far is below and I keep getting this error:
Traceback (most recent call last):
File "/Users/lritter/Documents/wordcount.py", line 9, in <module>
lines = file.readlines()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 7927: invalid start byte
Code:
import os
count = {}
os.chdir('/Users/lritter/Desktop/Python')
item = int(input('Which line would you like to evaluate? '))
print('You entered: ', item)
with open('Obama_speech.txt') as file:
lines = file.readlines()
message = (lines[(item)])
message2 = message.split
for word in message2():
if len(word) >= 5:
count[word] = count.get(word,0)+1
print(count)
I was trying to get data from the website and input it into csv
table_array = []
def get_data(page):
url = "https://www.sl886.com/stockrating/list?list=stockrating&page=" + str(page)
get = requests.get(url, headers=headers)
table_bs4 = BeautifulSoup(get.text, "html.parser")
main_table = table_bs4.find("table").find("tbody")
for i in main_table.find_all("tr"):
array = []
for x in i.find_all("td"):
array.append(x.text)
print(x.text, end = " ")
table_array.append(array[:-1])
print("")
def get_page_number():
url = "https://www.sl886.com/stockrating/list?list=stockrating&page=1"
get = requests.get(url, headers=headers)
table_bs4 = BeautifulSoup(get.text, "html.parser")
sum_of_items = table_bs4.find("div", {"class": "summary"})
sum_of_items = int(sum_of_items.find_all("b")[1].text.replace(",", ""))
total_pages = math.ceil(sum_of_items/20)
return total_pages
Run:
for i in range(1):#range(get_page_number()):
get_data(i + 9)
with open("big_bank_rating.csv", "w", newline="") as csvfile:
write = csv.writer(csvfile)
write.writerow(["日期", "大行", "股票", "最新評級", "目標價", "變化", "潛在升幅"])
for xi in table_array:
print(xi)
write.writerows(table_array)
print("!!!Done Append!!!")
but it comes to an error of encoding
Traceback (most recent call last):
File "data_crawl.py", line 55, in <module>
write.writerows(table_array)
UnicodeEncodeError: 'cp950' codec can't encode character '\u7dab' in position 19: illegal multibyte sequence
I tried to use encoding "uft-8" but the result I got was garbled. How can I fix it? to encode the character '\u7dab'
I have to use
encoding="utf_8_sig"
I was working on a project that requires data from instagram, i am collecting it using the json.
When i use the test link: http://vocab.nic.in/rest.php/states/json - it works, but when i use: https://www.instagram.com/instagram/?__a=1 (also json) - it didn't work.
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to
import urllib.request
import json
def getResponse(url):
operUrl = urllib.request.urlopen(url)
if(operUrl.getcode()==200):
data = operUrl.read()
jsonData = json.loads(data)
else:
print("Error receiving data", operUrl.getcode())
return jsonData
def main():
urlData = "https://www.instagram.com/instagram/?__a=1"
jsonData = getResponse(urlData)
# print the state id and state name corresponding
print(jsonData)
if __name__ == '__main__':
main()
Full Traceback:
Traceback (most recent call last):
File "C:\Users\andrey_user\Desktop\ig\test.py", line 21, in [module] main()
File "C:\Users\andrey_user\Desktop\ig\test.py", line 18, in main print(jsonData)
File "D:\programs\programing\Python\lib\encodings\cp1251.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to [undefined]
I am trying to send some json object from my node.js server to a python script. However when trying to convert the json object to dictionary using json.loads, for many inputs there are UnicodeEncodeErrors. WHat do I need to do in order to correctly decode the js object.
Error: UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2062: character maps to <undefined>
at PythonShell.parseError (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:183:17)
at terminateIfNeeded (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:98:28)
at ChildProcess.<anonymous> (D:\Users\Temp\Desktop\empman\node_modules\python-shell\index.js:88:9)
at emitTwo (events.js:106:13)
at ChildProcess.emit (events.js:191:7)
at Process.ChildProcess._handle.onexit (internal/child_process.js:219:12)
at Process.onexit (D:\Users\Temp\Desktop\empman\node_modules\async-listener\glue.js:188:31)
----- Python Traceback -----
File "word.py", line 38, in <module>
json_data=open('data.txt').read()
File "D:\Users\Temp\AppData\Local\Programs\Python\Python36-32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
corresponding python code
from docx import Document
from docx.shared import Inches
import sys
import io
import json
document = Document('template.docx')
# newdocument = Document('resume.docx')
# print(sys.argv) # Note the first argument is always the script filename.
resumearray = [];
for x in range(0, 21):
resumearray.append(input())
#json_data=open('data.txt').read()
f = io.open('data','r', encoding='utf-16-le')
# #datastore = json.loads(f.read)
print(f.read())
# text = f.read()
# json_data = text
# document.add_paragraph('_______________________________________________________________________')
#document.add_paragraph(resumearray[1])
k=resumearray[1]
#document.add_paragraph(k)
jsobject = json.loads(k)
document.add_paragraph('_______________________________________________')
#document.add_paragraph(jsobject.values())
for x in range(0, 9):
if resumearray[x]=='[]':
document.add_paragraph('nothing was found')
else:
document.add_paragraph(resumearray[x])
You are running python on Windows, where the default encoding is cp1252. The json is encoded as utf-8, hence the error.
>>> with open('blob.json', encoding='cp1252') as f:
... j = json.load(f)
...
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "/usr/local/lib/python3.6/json/__init__.py", line 296, in load
return loads(fp.read(),
File "/usr/local/lib/python3.6/encodings/cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 2795: character maps to <undefined>
Use utf-8 instead:
>>> with open('blob.json', encoding='utf-8') as f:
... j = json.load(f)
...
>>> print(len(j))
29