Got stuck while reading files - python

what Code DO's
I am trying to read each file from the folder which i have given ,And extracting some line using bs4 Soup package in python.
I got an error reading the file that some unicode chars not able to read.
error
Traceback (most recent call last): File "C:-----\check.py", line 25, in
soup=BeautifulSoup(text.read(), 'html.parser') File "C:\Python\Python37\lib\encodings\cp1252.py",
line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0] UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position
3565: character maps to
from bs4 import BeautifulSoup
from termcolor import colored
import re, os
import requests
path = "./brian-work/"
freddys_library = os.listdir(path)
def getfiles():
for r, d, f in os.walk(path):
for file in f:
if '.html' in file:
files.append(os.path.join(r, file))
return files
for book in getfiles():
print("file is printed")
print(book)
text = open(book, "r")
soup=BeautifulSoup(text.read(), 'html.parser')
h1 = soup.select('h1')[0].text.strip()
print(h1)
if soup.find('h1'):
h1 = soup.select('h1')[0].text.strip()
else:
print ("no h1")
continue
filename1=book.split("/")[-1]
filename1=filename1.split(".")[0]
print(h1.split(' ', 1)[0])
print(filename1)
if h1.split(' ', 1)[0].lower() == filename1.split('-',1)[0] :
print('+++++++++++++++++++++++++++++++++++++++++++++');
print('same\n');
else:
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX');
print('not')
count=count+1
Please help what should i correct here.
Thanks

The problem is opening a file without knowing its encoding. The default encoding for text = open(book, "r"), per open docs, is the value returned from locale.getpreferredencoding(False), which is cp1252 for your system. The file is some other encoding, so it fails.
Use text = open(book, "rb") (binary mode) and let BeautifulSoup figure it out. HTML files usually indicate their encoding.
You can also use text = open(book,encoding='utf8') or whatever the correct encoding is if you know it already.

Related

the JSON object must be str, bytes or bytearray, not NoneType

This program takes html file from input directory and translate it to hindi using googletrans.
import os
from bs4 import BeautifulSoup
from googletrans import Translator
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(f, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
try:
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except:
print("Translation failed for element: ", element)
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
f.write(str(soup))
print(f"Translated file '{filename}' written to '{output_dir}'")
I am gettig an error:
File "e:\Webscraping\Translate1.py", line 36, in <module>
translation = translator.translate(element.string, dest='hi')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
parsed = json.loads(data[0][2])
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\json\__init__.py", line 339, in loads
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType
During the handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "e:\Webscraping\Translate1.py", line 44, in <module>
print("Translation failed for element: ", element)
File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>
I cannot pin point the reason behind the error. Does someone know the fix? For the 2nd error I have used utf-8 16, and 32 also latin-1 still it is giving the same error.
So I changed the code a bit. So to solve the encoding error I used cardet to detect the encoding of the file and then reopened the file with detected encoding.
Here's the code:
import os
import chardet
from bs4 import BeautifulSoup
from googletrans import Translator
import logging
# Set up logging
logging.basicConfig(filename='translation.log', level=logging.DEBUG)
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\institution"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\institution"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'rb') as f:
# Detect the encoding of the file
encoding = chardet.detect(f.read())['encoding']
# Re-open the file with the detected encoding
f.seek(0)
text = f.read().decode(encoding)
soup = BeautifulSoup(text, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
try:
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except Exception as e:
logging.error(f"Translation failed for element: {element} with error: {e}")
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
f.write(str(soup))
logging.info(f"Translated file '{filename}' written to '{output_dir}'")

How can I encode html file after read file with ZipFile?

I am reading a zip file from a URL. Inside the zip file, there is an HTML file. After I read the file everything works fine. But when I print the text I am facing a Unicode problem. Python version: 3.8
from zipfile import ZipFile
from io import BytesIO
from bs4 import BeautifulSoup
from lxml import html
content = requests.get("www.url.com")
zf = ZipFile(BytesIO(content.content))
file_name = zf.namelist()[0]
file = zf.open(file_name)
soup = BeautifulSoup(file.read(),'html.parser',from_encoding='utf-8',exclude_encodings='utf-8')
for product in soup.find_all('tr'):
product = product.find_all('td')
if len(product) < 2: continue
print(product[1].text)
I already try to open file and print text with .decode('utf-8') I got following error:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe8 in position 0: invalid continuation byte
I add from_encoding and exclude_encodings in BeautifulSoup but nothing change and I didn't get an error.
Expected prints:
ÇEŞİTLİ MADDELER TOPLAMI
Tarçın
Fidanı
What I am getting:
ÇEÞÝTLÝ MADDELER TOPLAMI
Tarçýn
Fidaný
I look at the file and the encoding is not utf-8, but iso-8859-9.
Change the encoding and everything will be fine:
soup = BeautifulSoup(file.read(),'html.parser',from_encoding='iso-8859-9')
This will output: ÇEŞİTLİ MADDELER TOPLAMI

Continuing for loop after exception in Python

So first of all I saw similar questions, but nothing worked/wasn't applicable to my problem.
I'm writing a program that is taking in a Text file with a lot of search queries to be searched on Youtube. The program is iterating through the text file line by line. But these have special UTF-8 characters that cannot be decoded. So at a certain point the program stops with a
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1826: character maps to
As I cannot check every line of my entries, I want it to except the error, print the line it was working on and continue at that point.
As the error is not happening in my for loop, but rather the for loop itself, I don't know how to write an try...except statement.
This is the code:
import urllib.request
import re
from unidecode import unidecode
with open('out.txt', 'r') as infh,\
open("links.txt", "w") as outfh:
for line in infh:
try:
clean = unidecode(line)
search_keyword = clean
html = urllib.request.urlopen("https://www.youtube.com/results?search_query=" + search_keyword)
video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
outfh.write("https://www.youtube.com/watch?v=" + video_ids[0] + "\n")
#print("https://www.youtube.com/watch?v=" + video_ids[0])
except:
print("Error encounted with Line: " + line)
This is the full error message, to see that the for loop itself is causing the problem.
Traceback (most recent call last):
File "ytbysearchtolinks.py", line 6, in
for line in infh:
File "C:\Users\nfeyd\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1826: character maps to
If you need an example of input I'm working with: https://pastebin.com/LEkwdU06
The try-except-block looks correct and should allow you to catch all occurring exceptions.
The usage of unidecode probably won't help you because non-ASCII characters must be encoded in a specific way in URLs, see, e.g., here.
One solution is to use urllib's quote() function. As per documentation:
Replace special characters in string using the %xx escape.
This is what works for me with the input you've provided:
import urllib.request
from urllib.parse import quote
import re
with open('out.txt', 'r', encoding='utf-8') as infh,\
open("links.txt", "w") as outfh:
for line in infh:
search_keyword = quote(line)
html = urllib.request.urlopen("https://www.youtube.com/results?search_query=" + search_keyword)
video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
outfh.write("https://www.youtube.com/watch?v=" + video_ids[0] + "\n")
print("https://www.youtube.com/watch?v=" + video_ids[0])
EDIT:
After thinking about it, I believe you are running into the following problem:
You are running the code on Windows, and apparently, Python will try to open the file with cp1252 encoding when on Windows, while the file that you shared is in UTF-8 encoding:
$ file out.txt
out.txt: UTF-8 Unicode text, with CRLF line terminators
This would explain the exception you are getting and why it's not being caught by your try-except-block (it's occurring when trying to open the file).
Make sure that you are using encoding='utf-8' when opening the file.
i ran your code, but i didnt have some problems. Do you have create virtual environment with virtualenv and install all the packages you use ?

UnicodeDecodeError 'utf-8' codec can't decode byte 0x92 in position 2893: invalid start byte

I'm trying to open a series of HTML files in order to get the text from the body of those files using BeautifulSoup. I have about 435 files that I wanted to run through but I keep getting this error.
I've tried converting the HTML files to text and opening the text files but I get the same error...
path = "./Bitcoin"
for file in os.listdir(path):
with open(os.path.join(path, file), "r") as fname:
txt = fname.read()
I want to get the source code of the HTML file so I can parse it using beautifulsoup but I get this error
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-133-f32d00599677> in <module>
3 for file in os.listdir(path):
4 with open(os.path.join(path, file), "r") as fname:
----> 5 txt = fname.read()
~/anaconda3/lib/python3.7/codecs.py in decode(self, input, final)
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 2893: invalid start byte
There are various approaches to dealing with text data with unknown encodings. However in this case, as you intend pass the data to Beautiful Soup, the solution is simple: don't bother trying to decode the file yourself, let Beautiful Soup do it. Beautiful Soup will automatically decode bytes to unicode.
In your current code, you read the file in text mode, which means that Python will assume that the file is encoded as UTF-8 unless you provide an encoding argument to the open function. This causes an error if the file's contents are not valid UTF-8.
for file in os.listdir(path):
with open(os.path.join(path, file), "r") as fname:
txt = fname.read()
Instead, read the html files in binary mode and pass the resulting bytes instance to Beautiful Soup.
for file in os.listdir(path):
with open(os.path.join(path, file), "rb") as fname:
bytes_ = fname.read()
soup = BeautifulSoup(bytes_)
FWIW, the file currently causing your problem is probably encoded with cp1252 or a similar windows 8-bit encoding.
>>> '’'.encode('cp1252')
b'\x92'

Writing XML to file corrupts files in python

I'm attempting to write contents from xml.dom.minidom object to file. The simple idea is to use 'writexml' method:
import codecs
def write_xml_native():
# Building DOM from XML
xmldoc = minidom.parse('semio2.xml')
f = codecs.open('codified.xml', mode='w', encoding='utf-8')
# Using native writexml() method to write
xmldoc.writexml(f, encoding="utf=8")
f.close()
The problem is that it corrupts the non-latin-encoded text in the file. The other way is to get the text string and write it to file explicitly:
def write_xml():
# Building DOM from XML
xmldoc = minidom.parse('semio2.xml')
# Opening file for writing UTF-8, which is XML's default encoding
f = codecs.open('codified3.xml', mode='w', encoding='utf-8')
# Writing XML in UTF-8 encoding, as recommended in the documentation
f.write(xmldoc.toxml("utf-8"))
f.close()
This results in the following error:
Traceback (most recent call last):
File "D:\Projects\Semio\semioparser.py", line 45, in <module>
write_xml()
File "D:\Projects\Semio\semioparser.py", line 42, in write_xml
f.write(xmldoc.toxml(encoding="utf-8"))
File "C:\Python26\lib\codecs.py", line 686, in write
return self.writer.write(data)
File "C:\Python26\lib\codecs.py", line 351, in write
data, consumed = self.encode(object, self.errors)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 2064: ordinal not in range(128)
How do I write an XML text to file? What is it I'm missing?
EDIT. Error is fixed by adding decode statement:
f.write(xmldoc.toxml("utf-8").decode("utf-8"))
But russian symbols are still corrupted.
The text is not corrupted when viewed in an interpreter, but when it's written in file.
Hmm, though this should work:
xml = minidom.parse("test.xml")
with codecs.open("out.xml", "w", "utf-8") as out:
xml.writexml(out)
you may alternatively try:
with codecs.open("test.xml", "r", "utf-8") as inp:
xml = minidom.parseString(inp.read().encode("utf-8"))
with codecs.open("out.xml", "w", "utf-8") as out:
xml.writexml(out)
Update: In case you construct xml out of string object, you should encode it before passing to minidom parser, like this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import xml.dom.minidom as minidom
xml = minidom.parseString(u"<ru>Тест</ru>".encode("utf-8"))
with codecs.open("out.xml", "w", "utf-8") as out:
xml.writexml(out)
Try this:
with open("codified.xml", "w") as f:
f.write(xmldoc.toxml("utf-8").decode("utf-8"))
This works for me (under Python 3, though).

Categories

Resources