Python BeautifulSoup ASCII error

My script works when I download an English Bible, but it gives me an ASCII error when I download a foreign-language Bible.
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re

print ("downloading and converting Bibles to Aurora...")

root = html.parse(open('links.html'))

for link in root.findall('//a'):
    url = link.get('href')
    name = urlparse.urlparse(url).path.split('/')[-1]
    namesave = '%s.html' % '.'.join(name.split('.')[:-1])
    chnum = name.split('.')[-2]
    dirname = urlparse.urlparse(url).path.split('.')[-1]
    try:
        f = urllib2.urlopen(url)
    except urllib2.URLError:
        print "Bad URL or timeout"
        continue
    s = f.read()
    if (os.path.isdir(dirname) == 0):
        os.mkdir(dirname)
    soup = BeautifulSoup(s)
    thearticle = soup.html.body.article
    bookname = thearticle['data-book-human']
    soup.html.replaceWith('<html>'+str(bookname)+'</html>')
    converted = str(soup)
    full_path = os.path.join(dirname, namesave)
    open(full_path, 'wb').write(converted)
    print(name)

print("DOWNLOADS AND CONVERSIONS COMPLETE!")
A links.html entry that works:
http://www.youversion.com/bible/john.6.ceb
A links.html entry that gives the error:
http://www.youversion.com/bible/john.6.nav
The error:
  File "test.py", line 32, in <module>
    soup.html.replaceWith('<html>'+str(bookname)+'</html>')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-4: ordinal not in range(128)

I've seen a similar error before, might even be the same. Can't recall exactly.
Try:
BeautifulSoup(s, convertEntities=BeautifulSoup.HTML_ENTITIES)
Or try to force unicode:
soup.html.replaceWith(u'<html>'+unicode(bookname)+u'</html>')
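If the write at the end of the loop then fails with the same ASCII error, a minimal sketch of the fix, assuming Python 2 with BeautifulSoup 3 as in the question, is to keep everything as unicode and only encode when writing the file:

# Keep the markup as unicode; str() on a unicode value implies ASCII in Python 2.
soup.html.replaceWith(u'<html>' + unicode(bookname) + u'</html>')
converted = unicode(soup)
full_path = os.path.join(dirname, namesave)
with open(full_path, 'wb') as out:
    out.write(converted.encode('utf-8'))  # encode explicitly instead of the implicit ASCII default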

Related

Processing files with listdir() breaks when directory contains subdirectories

The following code should walk through a directory, grab the XML files, and process them (prefixing HTML classes stored in XML elements, though that part is not important to the question). The code works as long as there are no subdirectories inside "/input-dir", but as soon as there are subdirectories, an error is thrown:
Traceback (most recent call last):
  File "/Users/ab/Code/SHCprefixer-2022/shc-prefixer_upwork.py", line 22, in <module>
    content = file.readlines()
  File "/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 566: invalid start byte
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import os
import lxml
import re

input_path = "./input-dir"
output_path = "./output-dir"

ls = os.listdir(input_path)
print(ls)

with open("classes.txt", "r") as cls:
    clss = cls.readlines()
for i in range(len(clss)):
    clss[i] = clss[i].strip()
print(clss)

for d in range(len(ls)):
    with open(f"{input_path}/{ls[d]}", "r") as file:
        content = file.readlines()
    content = "".join(content)
    bs_content = BeautifulSoup(content, "lxml")
    str_bs_content = str(bs_content)
    str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "")
    str_bs_content = str_bs_content.replace("</body></html>", "")
    for j in range(len(clss)):
        str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
    with open(f"{output_path}/{ls[d]}", "w") as f:
        f.write(str_bs_content)
The error is probably related to the listdir() call, and as indicated in "IsADirectoryError: [Errno 21] Is a directory: It is a file", I should use os.walk(), but I wasn't able to implement it. It would be great if someone could help.
You need to test whether the returned file system name is a file. You also want to search the entire subtree. Instead of listdir you could use os.walk, but I think that the newer pathlib module better suits your needs. Its .glob method, when used with "**", will search the subtree and filter for a known file extension at the same time.
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import lxml
import re
from pathlib import Path

input_path = Path("./input-dir")
output_path = Path("./output-dir")

ls = [p for p in input_path.glob("**/*.xml") if p.is_file()]
print(", ".join(str(p) for p in ls))

with open("classes.txt", "r") as cls:
    clss = cls.readlines()
for i in range(len(clss)):
    clss[i] = clss[i].strip()
print(clss)

for infile in ls:
    with infile.open() as file:
        bs_content = BeautifulSoup(file.read(), "lxml")
    str_bs_content = str(bs_content)
    str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "")
    str_bs_content = str_bs_content.replace("</body></html>", "")
    for j in range(len(clss)):
        str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
    outfile = output_path / infile.relative_to(input_path)
    outfile.parent.mkdir(parents=True, exist_ok=True)
    with outfile.open("w") as f:
        f.write(str_bs_content)
Looks like you will need to filter out directories from the input directory. You can use os.path.isfile() to check; note that os.listdir() returns bare names, so join each name with the input path before testing. Using a list comprehension you can get the filtered list in one line:
ls = [f for f in os.listdir(input_path) if os.path.isfile(os.path.join(input_path, f))]

Python: URL images download. The URL contains an accented letter. I'm getting an error

The problem is that the URLs in the .csv file contain accented letters (á, é, í, etc.). If a URL has an accented character in it, I get an error.
import pandas as pd
import urllib.request

def url_to_jpg(i, url, file_path):
    filename = 'image-{}.jpg'.format(i)
    full_path = '{}{}'.format(file_path, filename)
    urllib.request.urlretrieve(url, full_path)
    print('{} saved.'.format(filename))
    return None

FILENAME = 'imgs_urls.csv'
FILE_PATH = 'images/'

urls = pd.read_csv(FILENAME, encoding='latin1')

for i, url in enumerate(urls.values):
    url_to_jpg(i, url[0], FILE_PATH)
Picture of the error:
Can somebody help me?
You cannot use URLs with non-ASCII characters; you need to clean/convert them first.
In your case, you would add the clean_url function below into your loop. This worked with your link in Python 3:
urllib.parse.urlsplit splits the URL into its components
urllib.parse.quote properly escapes the Unicode characters
urllib.parse.urlunsplit joins it back together
import urllib.request
import urllib.parse

def url_to_jpg(url, file_path):
    urllib.request.urlretrieve(url, file_path)
    print('{} saved.'.format(file_path))

def clean_url(url):
    url = urllib.parse.urlsplit(url)
    url = list(url)
    url[2] = urllib.parse.quote(url[2])
    url = urllib.parse.urlunsplit(url)
    return url

url = u'<url_with_non_ascii_char>'
url = clean_url(url)
url_to_jpg(url, "test.jpg")
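For completeness, here is a sketch of how clean_url might plug back into the CSV loop from the question. It assumes the same imgs_urls.csv layout (URL in the first column), that the images/ folder already exists, and the url_to_jpg and clean_url definitions above:

import pandas as pd

urls = pd.read_csv('imgs_urls.csv', encoding='latin1')
for i, row in enumerate(urls.values):
    cleaned = clean_url(row[0])  # percent-encode any accented characters in the path
    url_to_jpg(cleaned, 'images/image-{}.jpg'.format(i))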

Python: getting UnicodeEncodeError when saving a file

I'm trying to get text from a webpage and it raises:
Traceback (most recent call last):
  File "C:\Users\username\Desktop\Python\parsing.py", line 21, in <module>
    textFile.write(str(results))
UnicodeEncodeError: 'cp949' codec can't encode character '\xa9' in position 37971: illegal multibyte sequence
I've searched and tried
    textFile.write(str(results).decode('utf-8'))
but it raises an AttributeError.
import requests
import os
from bs4 import BeautifulSoup

outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" + outputFolderName

r = requests.get('https://yahoo.com/')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)

try:
    os.mkdir(outputDir)
    print("output directory generated")
except:
    print("using existing directory")

textFile = open(outputDir + '/output.txt', 'w')
textFile.write(str(results))
textFile.close()
Is there any way to convert the encoding of str(results) and save it properly? The Python version is 3.7.3.
Specify the encoding when opening the output file, as in this example. The default encoding on Korean Windows is cp949, which cannot represent characters such as '\xa9'; opening the file with encoding='utf8' avoids the error.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup

outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" + outputFolderName

r = requests.get('https://yahoo.com')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)

try:
    os.mkdir(outputDir)
    print("output directory generated")
except:
    print("using existing directory")

textFile = open(outputDir + '/output.txt', mode='w', encoding='utf8')
textFile.write(str(results))
textFile.close()

When trying to extract metadata from images on webpages, it keeps returning {}. Why?

I've looked at the exifread documentation and it says the result is returned as a dictionary, but the problem is that it returns nothing except {}. I don't know if that means there is no metadata in the image or I made a newbie mistake. I've spent a good chunk of time looking at my code and the documentation but still can't find the solution; any help would be appreciated :)
Code:
import exifread
import colorama
import urllib2
import urllib
import random
import time
import bs4
import sys

def get_images(target):
    colorama.init()
    print(colorama.Fore.LIGHTGREEN_EX + "[*] Retrieving Meta Data from Target's Page...")
    req = urllib2.Request(target)
    resp = urllib2.urlopen(req)
    page = resp.read()
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if "www" in src or "http" in src or "https" in src:
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(src, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print(tags)
        else:
            s = target + src
            rand_num = random.random()
            name = str(rand_num) + ".jpg"
            urllib.urlretrieve(s, name)
            f = open(name, "rb")
            tags = exifread.process_file(f)
            print(tags)
    return

def main():
    target = raw_input("Enter the target: ")
    print("\n")
    get_images(target)
    time.sleep(5)
    sys.exit()

if __name__ == "__main__":
    main()
The problem is that you were not passing a base URL; you need to pass the host and join it to the src unless the src attribute already contains an absolute URL.
The following code demonstrates a working example. I used requests in place of urllib, but the logic is the same:
import bs4
import sys
import os
import requests
import exifread
from urlparse import urljoin

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img", src=True):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            tags = exifread.process_file(f)
            print(tags)

def main():
    target = "http://www.exiv2.org/sample.html"
    # need base to join to relative src
    base = "http://www.exiv2.org/"
    get_images(target, base)

if __name__ == "__main__":
    main()
You will get the exif data for the one image on the page that has some.
A PIL example:
import bs4
import os
import requests
from urlparse import urljoin
import PIL.Image

def get_images(target, base):
    page = requests.get(target).content
    soup = bs4.BeautifulSoup(page, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        name = os.path.basename(src)
        if not src.startswith(("www.", "http:", "https:")):
            src = urljoin(base, src)
        with open(name, "wb+") as f:
            f.write(requests.get(src).content)
            f.seek(0)
            try:
                img = PIL.Image.open(f)
                exif_data = img._getexif()
                print(exif_data)
            except AttributeError as e:
                print("No exif data for {}".format(name))
                os.remove(name)
os.remove(name) deletes files that have no exif data; if you don't want that to happen, remove that line.

Trying to download .pdf, .png and .jpg files from nytimes.com

I wrote a simple Python scraper to grab some documents from a specific page on nytimes.com. It works, in the sense that it grabs and formats all the URLs correctly, attempts to download the files, and formats the names correctly.
But all I get are 1 KB files. I can't figure out why. Here is my code:
import urllib2
import urllib
from cookielib import CookieJar

files = 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'
slashpos = 0

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:])
                else:
                    continue
    for char in result:
        slashpos = char.rfind('/') + 1
        urllib.urlretrieve(char, char[slashpos:-1])

getLinks(files)
Any and all help is appreciated. Thanks!
1) Use result.append(element[6:-1]) instead of result.append(element[6:]). This drops the trailing double quote from the URL, which is the reason the downloads fail.
2) When saving the file, use urllib.urlretrieve(char, char[slashpos:]) instead of urllib.urlretrieve(char, char[slashpos:-1]), so the file name is not truncated.
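Put together, a sketch of getLinks() with just those two changes applied to the code from the question (still Python 2, as in the original):

import urllib
import urllib2
from cookielib import CookieJar

def getLinks(url):
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    p = opener.open(url)
    result = []
    for line in p:
        for element in line.split():
            if element.startswith('href="http://gr'):
                if element.endswith('pdf"') or element.endswith('png"') or element.endswith('jpg"'):
                    result.append(element[6:-1])  # strip href=" and the trailing quote
    for char in result:
        slashpos = char.rfind('/') + 1
        urllib.urlretrieve(char, char[slashpos:])  # keep the full file name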
Solved !! :D
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import urlparse
from sys import argv
from cookielib import CookieJar

if len(argv) != 2:
    print "Usage:\n\tpython %s 'http://www.nytimes.com/interactive/2014/11/25/us/evidence-released-in-michael-brown-case.html?_r=0'" % argv[0]
    exit()

url = argv[1]
urls = []

try:
    cj = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(url)
except:
    print "[-] No such website"
    exit()

soup = BeautifulSoup(html)
for tag in soup.find_all('a'):
    try:
        tag["href"] = urlparse.urljoin(url, tag['href'])
        if tag['href'] not in urls and ('.png' in tag['href'] or '.jpg' in tag['href']):
            newpdf = tag['href'].split("/")
            name = newpdf[-1]
            resp = urllib2.urlopen(tag['href'])
            meta_data = resp.info()
            fsize = int(meta_data.getheaders("Content-Length")[0])
            print "Downloading --> %s \t size: %s " % (name, fsize)
            f = open(name, "wb")
            f.write(resp.read())
            f.close()
            urls.append(tag["href"])
        else:
            print tag['href']
    except KeyboardInterrupt:
        print " User hit CTRL+C"
        exit()
    except:
        pass
Hopefully this is helpful for you.
