the JSON object must be str, bytes or bytearray, not NoneType - python

This program takes each HTML file from an input directory and translates it to Hindi using googletrans.
import os
from bs4 import BeautifulSoup
from googletrans import Translator
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(f, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
try:
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except:
print("Translation failed for element: ", element)
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
f.write(str(soup))
print(f"Translated file '{filename}' written to '{output_dir}'")
I am getting an error:
File "e:\Webscraping\Translate1.py", line 36, in <module>
translation = translator.translate(element.string, dest='hi')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
parsed = json.loads(data[0][2])
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\json\__init__.py", line 339, in loads
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType
During the handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "e:\Webscraping\Translate1.py", line 44, in <module>
print("Translation failed for element: ", element)
File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>
I cannot pinpoint the reason behind the error. Does someone know the fix? For the second error I have tried utf-8, utf-16, utf-32 and also latin-1, and it still gives the same error.

So I changed the code a bit. To solve the encoding error, I used chardet to detect the encoding of the file and then decoded its contents with the detected encoding.
Here's the code:
import os
import chardet
from bs4 import BeautifulSoup
from googletrans import Translator
import logging
# Set up logging: failures go to a UTF-8-agnostic log file instead of the
# console, which avoids the cp1252 UnicodeEncodeError raised by print().
logging.basicConfig(filename='translation.log', level=logging.DEBUG)

# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\institution"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\institution"

# Create the output directory; exist_ok makes this a no-op when it is
# already there, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

# Create the translator object
translator = Translator(service_urls=['translate.google.com'])

# Text nodes under these tags are JavaScript/CSS, not prose; translating
# them would break the page.
SKIPPED_PARENTS = {'script', 'style'}

# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.html'):
        continue

    # Read the raw bytes once and reuse the same buffer for both encoding
    # detection and decoding.
    with open(os.path.join(input_dir, filename), 'rb') as f:
        raw = f.read()

    # chardet reports None when it cannot guess (e.g. an empty file); fall
    # back to UTF-8 rather than crashing in decode().
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    # The detected encoding is only a guess, so undecodable bytes are
    # replaced instead of aborting the whole run.
    text = raw.decode(encoding, errors='replace')
    soup = BeautifulSoup(text, 'html.parser')

    # Translate the text in the HTML
    for element in soup.find_all(text=True):
        if not element.strip():  # Skip whitespace-only strings
            continue
        if element.parent is not None and element.parent.name in SKIPPED_PARENTS:
            continue
        try:
            translated_text = translator.translate(element.string, dest='hi').text
            element.string.replace_with(translated_text)
        except Exception as e:
            # Best effort: keep the original text for this node and carry on
            # with the rest of the document.
            logging.error(f"Translation failed for element: {element} with error: {e}")

    # Write out the translated HTML to a new file in the output directory.
    # UTF-8 is required here: Hindi text cannot be encoded as latin-1/cp1252.
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
        f.write(str(soup))
    logging.info(f"Translated file '{filename}' written to '{output_dir}'")

Related

Got stuck while reading files

What the code does
I am trying to read each file from the folder I have given and extract some lines using the bs4 (BeautifulSoup) package in Python.
I got an error while reading a file: some Unicode characters could not be decoded.
error
Traceback (most recent call last): File "C:-----\check.py", line 25, in
soup=BeautifulSoup(text.read(), 'html.parser') File "C:\Python\Python37\lib\encodings\cp1252.py",
line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0] UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position
3565: character maps to
from bs4 import BeautifulSoup
from termcolor import colored
import re, os
import requests
path = "./brian-work/"
freddys_library = os.listdir(path)
def getfiles():
for r, d, f in os.walk(path):
for file in f:
if '.html' in file:
files.append(os.path.join(r, file))
return files
for book in getfiles():
print("file is printed")
print(book)
text = open(book, "r")
soup=BeautifulSoup(text.read(), 'html.parser')
h1 = soup.select('h1')[0].text.strip()
print(h1)
if soup.find('h1'):
h1 = soup.select('h1')[0].text.strip()
else:
print ("no h1")
continue
filename1=book.split("/")[-1]
filename1=filename1.split(".")[0]
print(h1.split(' ', 1)[0])
print(filename1)
if h1.split(' ', 1)[0].lower() == filename1.split('-',1)[0] :
print('+++++++++++++++++++++++++++++++++++++++++++++');
print('same\n');
else:
print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX');
print('not')
count=count+1
Please help: what should I correct here?
Thanks
The problem is opening a file without knowing its encoding. The default encoding for text = open(book, "r"), per open docs, is the value returned from locale.getpreferredencoding(False), which is cp1252 for your system. The file is some other encoding, so it fails.
Use text = open(book, "rb") (binary mode) and let BeautifulSoup figure it out. HTML files usually indicate their encoding.
You can also use text = open(book,encoding='utf8') or whatever the correct encoding is if you know it already.

Writing yaml file: attribute error

I'm trying to read a yaml file, replace part of it, and write the result back into the same file, but I get an attribute error.
Code
import yaml
import glob
import re
from yaml import load, dump
from yaml import CLoader as Loader, CDumper as Dumper
import io
list_paths = glob.glob("my_path/*.yaml")
for path in list_paths:
with open(path, 'r') as stream:
try:
text = load(stream, Loader=Loader)
text = str(text)
print text
if "my_string" in text:
start = "'my_string': '"
end = "'"
m = re.compile(r'%s.*?%s' % (start,end),re.S)
m = m.search(text).group(0)
text[m] = "'my_string': 'this is my string'"
except yaml.YAMLError as exc:
print(exc)
with io.open(path, 'w', encoding = 'utf8') as outfile:
yaml.dump(text, path, default_flow_style=False, allow_unicode=True)
Error
I get this error for the yaml_dump line
AttributeError: 'str' object has no attribute 'write'
What I have tried so far
Not converting the text to a string, but then I get an error on the m.search line:
TypeError: expected string or buffer
Converting first to a string and then to a dict again, but I get this error from the code text: dict(text) : ValueError: dictionary update sequence element #0 has length 1; 2 is required
Yaml file
my string: something
string2: something else
Expected result: yaml file
my string: this is my string
string2: something else
To stop getting that error all you need to do is change the
with io.open(path, 'w', encoding = 'utf8') as outfile:
yaml.dump(text, path, default_flow_style=False, allow_unicode=True)
to
with open(path, 'w') as outfile:
yaml.dump(text.encode("UTF-8"), outfile, default_flow_style=False, allow_unicode=True)
As the other answer says, this solution simply replaces the string path with the open file descriptor.
This
yaml.dump(text, path, default_flow_style=False, allow_unicode=True)
is not possible if path is a str. It must be an open file.

Converting Multiple html file into pdf using pdfkit in Python

I am trying to convert multiple HTML files into a PDF using pdfkit. This is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
driver=webdriver.Chrome()
driver.get('https://www.linkedin.com/in/jaypratappandey/')
time.sleep(40)
soup= BeautifulSoup(driver.page_source, 'lxml')
data=[]
f=open('htmlfile.html', 'w')
top=open('tophtmlfile.html', 'w')
for name in soup.select('.pv-top-card-section__body'):
top.write("%s" % name)
for item in soup.select('.pv-oc.ember-view'):
f.write("%s" % item)
pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'jayprofile.pdf')
driver.quit()
This code gives the following error:
Traceback (most recent call last):
File "lkdndata.py", line 23, in <module>
pdfkit.from_file(['tophtmlfile.html', 'htmlfile.html'], 'ankurprofile.pdf')
File "/usr/local/lib/python3.5/dist-packages/pdfkit/api.py", line 49, in from_file
return r.to_pdf(output_path)
File "/usr/local/lib/python3.5/dist-packages/pdfkit/pdfkit.py", line 156, in to_pdf
raise IOError('wkhtmltopdf reported an error:\n' + stderr)
OSError: wkhtmltopdf reported an error:
Error: This version of wkhtmltopdf is build against an unpatched version of QT, and does not support more then one input document.
Exit with code 1, due to unknown error.
The solution I found was to first merge the HTML files into one and then convert that single file using pdfkit. In your case, that means saving the tophtml and html files together in the same directory and replacing the path below with the path to that directory.
import pdfkit
import os
# path to folder containing html files
path = "/home/ec2-user/data-science-processes/src/results/"
def multiple_html_to_pdf(path):
    """Merge every ``.html`` file in a directory and convert the result to PDF.

    The files are concatenated (in sorted filename order) inside a single
    ``<body>``, written to ``merged.html`` in the current working directory,
    and that same file is then converted to ``Report.pdf`` with pdfkit.

    args: path to directory containing html files
    """
    merged_path = os.path.abspath('merged.html')

    # Collect the pieces and join once. Repeatedly replacing
    # '</body></html>' in the accumulated string inserts later files
    # several times as soon as one input contains that closing sequence.
    fragments = []
    # Sorted so the page order in the PDF is deterministic.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".html"):
            continue
        source = os.path.join(path, file)
        # Do not fold a previous run's output back into itself.
        if os.path.abspath(source) == merged_path:
            continue
        print(file)
        # append html files
        with open(source, 'r') as f:
            fragments.append(f.read())

    merged_html = '<html><head></head><body>' + ''.join(fragments) + '</body></html>'

    # save merged html
    with open(merged_path, 'w') as f:
        f.write(merged_html)

    # Convert exactly the file that was just written, not a separately
    # hard-coded location that may hold stale or missing content.
    pdfkit.from_file(merged_path, 'Report.pdf')
multiple_html_to_pdf(path)
I had the same error. The error you are probably getting is due to the inconsistency of your qt installation and non availability of compatible qt version.
Try running
wkhtmltopdf
on your terminal and see whether you can find "Reduced Functionality".
If yes then my assumption is correct and then your safest bet would be to compile it from source.

Python downloading PDF into a .zip

What I am trying to do is loop through a list of URLs to download a series of .pdfs, and save them to a .zip. At the moment I am just trying to test the code using just one URL. The error I am getting is:
Traceback (most recent call last):
File "I:\test_pdf_download_zip.py", line 36, in <module>
zip_file(zipfile_name, url)
File "I:\test_pdf_download_zip.py", line 30, in zip_file
myzip.write(dowload_pdf(url))
TypeError: expected a string or other character buffer object
Would someone know how to pass .pdf request to the .zip correctly (avoiding the error above) in order for me to append it, or know if it is possible to do this?
import os
import zipfile
import requests
output = r"I:"
# File name of the zipfile
zipfile_name = os.path.join(output, "test.zip")
# Random test pdf
url = r"http://www.pdf995.com/samples/pdf.pdf"
def create_zipfile(zipfile_name):
zipfile.ZipFile(zipfile_name, "w")
def dowload_pdf(url):
response = requests.get(url, stream=True)
with open('test.pdf', 'wb') as f:
f.write(response.content)
def zip_file(zip_name, url):
with open(zip_name,'a') as myzip:
myzip.write(dowload_pdf(url))
if __name__ == "__main__":
create_zipfile(zipfile_name)
zip_file(zipfile_name, url)
print("Done")
Your dowload_pdf() function saves a file but doesn't return anything. You need to modify it so it actually returns the file path to myzip.write(). You also don't want to hardcode test.pdf; instead, pass unique paths to your download function so you don't end up with multiple test.pdf entries in your archive.
def dowload_pdf(url, path):
    """Download *url* and save the response body to *path*.

    Args:
        url: Address of the PDF to fetch.
        path: Destination file path; pass a unique path per download.

    Returns:
        The same *path*, so the caller can hand it straight to
        ``ZipFile.write()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, stream=True)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(path, 'wb') as f:
        # stream=True only helps if the body is consumed in chunks;
        # response.content would load the whole file into memory.
        for chunk in response.iter_content(chunk_size=65536):
            f.write(chunk)
    return path

save base64 image python

I am trying to save an image with Python that is Base64 encoded. The string is too large to post here, but here is the image.
When it is received by Python, the last 2 characters are ==, but the string is not formatted correctly, so I do this
import base64
data = "data:image/png;base64," + photo_base64.replace(" ", "+")
And then I do this
imgdata = base64.b64decode(data)
filename = 'some_image.jpg' # I assume you have a way of picking unique filenames
with open(filename, 'wb') as f:
f.write(imgdata)
But this causes this error
Traceback (most recent call last):
File "/var/www/cgi-bin/save_info.py", line 83, in <module>
imgdata = base64.b64decode(data)
File "/usr/lib64/python2.7/base64.py", line 76, in b64decode
raise TypeError(msg)
TypeError: Incorrect padding
I also printed out the length of the string once the data:image/png;base64, has been added and the spaces replace with + and it has a length of 34354, I have tried a bunch of different images but all of them when I try to open the saved file say that the file is damaged.
What is happening and why is the file corrupt?
Thanks
EDIT
Here is some base64 that also failed
iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAADBQTFRFA6b1q Ci5/f2lt/9yu3 Y8v2cMpb1/DSJbz5i9R2NLwfLrWbw m T8I8////////SvMAbAAAABB0Uk5T////////////////////AOAjXRkAAACYSURBVHjaLI8JDgMgCAQ5BVG3//9t0XYTE2Y5BPq0IGpwtxtTP4G5IFNMnmEKuCopPKUN8VTNpEylNgmCxjZa2c1kafpHSvMkX6sWe7PTkwRX1dY7gdyMRHZdZ98CF6NZT2ecMVaL9tmzTtMYcwbP y3XeTgZkF5s1OSHwRzo1fkILgWC5R0X4BHYu7t/136wO71DbvwVYADUkQegpokSjwAAAABJRU5ErkJggg==
This is what I receive in my python script from the POST Request
Note I have not replace the spaces with +'s
There is no need to add data:image/png;base64, before, I tried using the code below, it works fine.
import base64
# Raw payload as received from the POST request. Form decoding turns every
# '+' of the base64 alphabet into a space, so the spaces are restored first.
data = 'iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAADBQTFRFA6b1q Ci5/f2lt/9yu3 Y8v2cMpb1/DSJbz5i9R2NLwfLrWbw m T8I8////////SvMAbAAAABB0Uk5T////////////////////AOAjXRkAAACYSURBVHjaLI8JDgMgCAQ5BVG3//9t0XYTE2Y5BPq0IGpwtxtTP4G5IFNMnmEKuCopPKUN8VTNpEylNgmCxjZa2c1kafpHSvMkX6sWe7PTkwRX1dY7gdyMRHZdZ98CF6NZT2ecMVaL9tmzTtMYcwbP y3XeTgZkF5s1OSHwRzo1fkILgWC5R0X4BHYu7t/136wO71DbvwVYADUkQegpokSjwAAAABJRU5ErkJggg=='.replace(' ', '+')

# Drop an optional data-URI header such as "data:image/png;base64,". The
# header is not base64 and is what triggers "Incorrect padding". A payload
# without the header passes through unchanged (',' never occurs in base64).
_, _, payload = data.rpartition('base64,')
imgdata = base64.b64decode(payload)

# The payload is a PNG, so it is saved with a matching extension.
filename = 'some_image.png' # I assume you have a way of picking unique filenames
with open(filename, 'wb') as f:
    f.write(imgdata)
If you prepend data:image/png;base64, to data, you get the error, because that header is not part of the base64 payload. If your string already has this prefix, you must remove it before decoding:
new_data = initial_data.replace('data:image/png;base64,', '')

Categories

Resources