python getting UnicodeEncodeError when saving file

I'm trying to get text from a webpage, and saving it raises:
Traceback (most recent call last):
File "C:\Users\username\Desktop\Python\parsing.py", line 21, in
textFile.write(str(results))
UnicodeEncodeError: 'cp949' codec can't encode character '\xa9' in position 37971: illegal multibyte sequence
I've searched and tried
textFile.write(str(results).decode('utf-8'))
but that raises an AttributeError instead.
import requests
import os
from bs4 import BeautifulSoup
outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" + outputFolderName
r = requests.get('https://yahoo.com/')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)
try:
    os.mkdir(outputDir)
    print("output directory generated")
except:
    print("using existing directory")
textFile = open(outputDir + '/output.txt', 'w')
textFile.write(str(results))
textFile.close()
Is there any way to convert the encoding of str(results) and save it properly?
Python version is 3.7.3.

Specify the encoding when opening the output file, as in this example:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup
outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" + outputFolderName
r = requests.get('https://yahoo.com')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)
try:
    os.mkdir(outputDir)
    print("output directory generated")
except:
    print("using existing directory")
# Open with an explicit UTF-8 encoding instead of the platform default (cp949).
textFile = open(outputDir + '/output.txt', mode='w', encoding='utf8')
textFile.write(str(results))
textFile.close()
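The write fails because open() without an explicit encoding uses the platform default (cp949 on Korean-locale Windows), which cannot represent characters such as '\xa9'. If you ever need to keep the default codec, a minimal sketch of an alternative uses open()'s errors parameter to substitute unencodable characters instead of raising:

# Minimal sketch (same outputDir and results as above): keep the default
# platform codec but replace any character it cannot encode with '?'.
with open(outputDir + '/output.txt', mode='w', errors='replace') as textFile:
    textFile.write(str(results))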

Related

Processing files with listdir() breaks when directory contains subdirectories

The following code should walk through a directory, grab the XML files, and process them (i.e. prefix HTML classes stored in XML elements; this detail is not important to the question). The code works as long as there are no subdirectories inside "/input-dir", but as soon as there are subdirectories, an error is thrown:
Traceback (most recent call last):
File "/Users/ab/Code/SHCprefixer-2022/shc-prefixer_upwork.py", line 22, in
content = file.readlines()
File "/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 566: invalid start byte
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import os
import lxml
import re
input_path = "./input-dir"
output_path = "./output-dir"
ls = os.listdir(input_path)
print(ls)
with open("classes.txt", "r") as cls:
    clss = cls.readlines()
for i in range(len(clss)):
    clss[i] = clss[i].strip()
print(clss)
for d in range(len(ls)):
    with open(f"{input_path}/{ls[d]}", "r") as file:
        content = file.readlines()
    content = "".join(content)
    bs_content = BeautifulSoup(content, "lxml")
    str_bs_content = str(bs_content)
    str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "")
    str_bs_content = str_bs_content.replace("</body></html>", "")
    for j in range(len(clss)):
        str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
    with open(f"{output_path}/{ls[d]}", "w") as f:
        f.write(str_bs_content)
The error is probably related to the listdir() call, and as indicated in "IsADirectoryError: [Errno 21] Is a directory: It is a file", I should use os.walk(), but I wasn't able to implement it. It would be great if someone could help.
You need to test whether the returned file system name is a file. You also want to search the entire subtree. Instead of listdir you could use os.walk, but I think that the newer pathlib module better suits your needs. Its .glob method, when used with "**", will search the subtree and filter for a known file extension at the same time.
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import lxml
import re
from pathlib import Path
input_path = Path("./input-dir")
output_path = Path("./output-dir")
ls = [p for p in input_path.glob("**/*.xml") if p.is_file()]
print(", ".join(str(p) for p in ls))
with open("classes.txt", "r") as cls:
    clss = cls.readlines()
for i in range(len(clss)):
    clss[i] = clss[i].strip()
print(clss)
for infile in ls:
    with infile.open() as file:
        bs_content = BeautifulSoup(file.read(), "lxml")
    str_bs_content = str(bs_content)
    str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "")
    str_bs_content = str_bs_content.replace("</body></html>", "")
    for j in range(len(clss)):
        str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
    outfile = output_path / infile.relative_to(input_path)
    outfile.parent.mkdir(parents=True, exist_ok=True)
    with outfile.open("w") as f:
        f.write(str_bs_content)
Looks like you will need to filter out directories from the input path dir. You can use os.path.isfile() to check each entry (join it with input_path first, since listdir() returns bare names). Using a list comprehension you can get the filtered list in one line:
ls = [f for f in os.listdir(input_path) if os.path.isfile(os.path.join(input_path, f))]
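For completeness, a minimal sketch of the os.walk() approach mentioned above (assuming the same input_path as in the question); os.walk() yields file names separately from directory names, so no explicit is-file check is needed:

import os

# Minimal sketch: collect every .xml file in the subtree rooted at input_path.
xml_files = []
for dirpath, dirnames, filenames in os.walk(input_path):
    for name in filenames:
        if name.endswith(".xml"):
            xml_files.append(os.path.join(dirpath, name))
print(xml_files)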

KeyError: 'pdf' showing now; code was working previously - why?

Getting the following error:
Traceback (most recent call last):
File "test.gyp", line 37, in <module>
for x in url_list["pdf"]:
KeyError: 'pdf'
Previously the code was working fine. Aside from temporarily moving the .gyp file to a different directory, I did not alter the code. Any clues as to why this has suddenly become an issue?
#!/usr/bin/env python3
import os
import glob
import pdfx
import wget
import urllib.parse
import requests
## Accessing and Creating Six Digit File Code
pdf_dir = "./"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
for file in pdf_files:
    ## Identify File Name and Limit to Digits
    filename = os.path.basename(file)
    newname = filename[0:6]
    ## Run PDFX to identify and download links
    pdf = pdfx.PDFx(filename)
    url_list = pdf.get_references_as_dict()
    attachment_counter = 1
    for x in url_list["url"]:
        if x[0:4] == "http":
            parsed_url = urllib.parse.quote(x)
            extension = os.path.splitext(x)[1]
            r = requests.get(x)
            with open('temporary', 'wb') as f:
                f.write(r.content)
            ## Concatenate File Name Once Downloaded
            os.rename('./temporary', str(newname) + '_attach' + str(attachment_counter) + str(extension))
            ## Increase Attachment Count
            attachment_counter += 1
    for x in url_list["pdf"]:
        if x[0:4] == "http":
            parsed_url = urllib.parse.quote(x)
            extension = os.path.splitext(x)[1]
            r = requests.get(x)
            with open('temporary', 'wb') as f:
                f.write(r.content)
            ## Concatenate File Name Once Downloaded
            os.rename('./temporary', str(newname) + '_attach' + str(attachment_counter) + str(extension))
            ## Increase Attachment Count
            attachment_counter += 1
Here is one little snippet from when I had it print out my overall url_list, and you can see that it IS adding items to the dictionary (edited here for privacy) flagged as 'pdf' - so I'm truly at a loss as to why it eventually gives me the error.
'pdf': ['URLSHOWSHERE.pdf']}
You are getting this error because your dictionary url_list doesn't have any key named 'pdf'. Check your dictionary, at least by explicitly printing it, to get a glimpse of its contents.
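If some PDFs simply contain no PDF-type references, the 'pdf' key may be absent from the dictionary for those files; a defensive lookup with dict.get avoids the KeyError. A minimal sketch, reusing url_list from the loop above:

# Minimal sketch: fall back to an empty list when the 'pdf' key is missing,
# so files without PDF references are skipped instead of raising KeyError.
for x in url_list.get("pdf", []):
    if x.startswith("http"):
        pass  # same download/rename logic as in the question's loop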

Python: URL images download. The URL contains an accented letter. I'm getting an error

The problem is that some URLs in the .csv file contain accented letters (á, é, í, etc.). If a URL has an accented character in it, I get an error.
import pandas as pd
import urllib.request
def url_to_jpg(i, url, file_path):
    filename = 'image-{}.jpg'.format(i)
    full_path = '{}{}'.format(file_path, filename)
    urllib.request.urlretrieve(url, full_path)
    print('{} saved.'.format(filename))
    return None
FILENAME = 'imgs_urls.csv'
FILE_PATH = 'images/'
urls = pd.read_csv(FILENAME, encoding='latin1')
for i, url in enumerate(urls.values):
    url_to_jpg(i, url[0], FILE_PATH)
Picture of the error:
Can somebody help me?
You cannot use URLs with non-ASCII characters; you need to clean/convert them first.
In your case, you would add the clean_url function below into your loop. This worked with your link in Python 3:
urllib.parse.urlsplit splits the URL into its components
urllib.parse.quote properly escapes the Unicode characters
urllib.parse.urlunsplit joins it back together
import urllib.request
import urllib.parse
def url_to_jpg(url, file_path):
    urllib.request.urlretrieve(url, file_path)
    print('{} saved.'.format(file_path))
def clean_url(url):
    url = urllib.parse.urlsplit(url)
    url = list(url)
    url[2] = urllib.parse.quote(url[2])
    url = urllib.parse.urlunsplit(url)
    return url
url = u'<url_with_non_ascii_char>'
url = clean_url(url)
url_to_jpg(url, "test.jpg")
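As a quick illustration of what clean_url does (index 2 of the urlsplit result is the path component, so only the path gets percent-encoded), here is a hypothetical example URL:

# Hypothetical URL: the accented character in the path is percent-encoded.
print(clean_url("https://example.com/kép.jpg"))
# -> https://example.com/k%C3%A9p.jpg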

Iterate through multiple files and append text from HTML using Beautiful Soup

I have a directory of downloaded HTML files (46 of them) and I am attempting to iterate through each of them, read their contents, strip the HTML, and append only the text into a text file. However, I'm unsure where I'm messing up, as nothing gets written to my text file.
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (path)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        f.close()
-----update----
I've updated my code as below; however, the text file still doesn't get created.
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        myfile.close()
-----update 2-----
Ah, I caught that I had my directory incorrect, so now I have:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(markup)
    with open("example.txt", "a") as myfile:
        myfile.write(soup)
        myfile.close()
When this is executed, I get this error:
Traceback (most recent call last):
File "C:\Users\Me\Downloads\bsoup.py, line 11 in <module>
myfile.write(soup)
TypeError: must be str, not BeautifulSoup
I fixed this last error by changing
myfile.write(soup)
to
myfile.write(soup.get_text())
-----update 3 ----
It's working properly now, here's the working code:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    markup = (infile)
    soup = BeautifulSoup(open(markup, "r").read())
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())
        myfile.close()
Actually, you are not reading the HTML file; this should work:
soup = BeautifulSoup(open(webpage, 'r').read(), 'lxml')
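Applied to the loop in the question, the fix would look roughly like this minimal sketch (same path and output file as before; the lxml parser is assumed to be installed):

import glob
import os
from bs4 import BeautifulSoup

path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
    # Pass the file's contents to BeautifulSoup, not the file name.
    with open(infile, "r") as f:
        soup = BeautifulSoup(f.read(), "lxml")
    with open("example.txt", "a") as myfile:
        myfile.write(soup.get_text())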
If you want to use lxml.html directly, here is a modified version of some code I've been using for a project. If you want to grab all the text, just don't filter by tag. There may be a way to do it without iterating, but I don't know. It saves the data as unicode, so you will have to take that into account when opening the file.
import os
import glob
import lxml.html
path = '/'
# Whatever tags you want to pull text from.
visible_text_tags = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4',
                     'h5', 'h6', 'a', 'div', 'span']
for infile in glob.glob(os.path.join(path, "*.html")):
    doc = lxml.html.parse(infile)
    file_text = []
    for element in doc.iter():  # Iterate once through the entire document
        try:  # Grab tag name and text (+ tail text)
            tag = element.tag
            text = element.text
            tail = element.tail
        except:
            continue
        words = None  # text words split to list
        if tail:  # combine text and tail
            text = text + " " + tail if text else tail
        if text:  # lowercase and split to list
            words = text.lower().split()
        if tag in visible_text_tags:
            if words:
                file_text.append(' '.join(words))
    with open('example.txt', 'a') as myfile:
        myfile.write(' '.join(file_text).encode('utf8'))

python beautiful soup ascii error

My script works when I download an English bible, but gives me an ASCII error when I download a foreign bible.
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re
print ("downloading and converting Bibles to Aurora...")
root = html.parse(open('links.html'))
for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    namesave = '%s.html' % '.'.join(name.split('.')[:-1])
    chnum = name.split('.')[-2]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    try:
        f = urllib2.urlopen(url)
    except urllib2.URLError:
        print "Bad URL or timeout"
        continue
    s = f.read()
    if (os.path.isdir(dirname) == 0):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    thearticle = soup.html.body.article
    bookname = thearticle['data-book-human']
    soup.html.replaceWith('<html>'+str(bookname)+'</html>')
    converted = str(soup)
    full_path = os.path.join(dirname, namesave)
    open(full_path, 'wb').write(converted)
    print(name)
print("DOWNLOADS AND CONVERSIONS COMPLETE!")
links.html that works
http://www.youversion.com/bible/john.6.ceb
links.html that gives error
http://www.youversion.com/bible/john.6.nav
the error
File "test.py", line 32, in <module>
soup.html.replaceWith('<html>'+str(bookname)+'</html>')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-4: ordinal not in range(128)
I've seen a similar error before, might even be the same. Can't recall exactly.
Try:
BeautifulSoup(s, convertEntities=BeautifulSoup.HTML_ENTITIES)
Or try to force unicode:
soup.html.replaceWith(u'<html>'+unicode(bookname)+u'</html>')
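If the script then fails again when writing the converted page, a common Python 2 follow-up (a sketch only, using the same variables as in the question) is to keep the soup as unicode and encode it explicitly before the binary-mode write:

# Sketch: convert the soup to unicode and encode it as UTF-8 bytes for the
# binary-mode file write (Python 2 / BeautifulSoup 3, as in the question).
converted = unicode(soup)
open(full_path, 'wb').write(converted.encode('utf-8'))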
