I would like to convert an SVG-XML payload retrieved from a website into a PNG file without using cairo, ideally with ImageMagick/wand. As an example, here's what I've got:
import base64
from bs4 import BeautifulSoup as bs
import requests
import wand.image

r = requests.get("http://www.codazen.com")
soup = bs(r.content, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
svg_xml_url = urls[1]  # points to logo image

# strip the data-URI prefix, then decode the base64 payload
encoded = svg_xml_url.replace("data:image/svg+xml;base64,", "")
decoded = base64.b64decode(encoded)

with wand.image.Image(blob=decoded, format="svg") as image:
    png_image = image.make_blob("png32")

with open("logo.png", "wb") as f:
    f.write(png_image)
However, the resulting PNG image is empty: just white. What am I doing wrong? Thanks.
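A first debugging step that may help: isolate the conversion, confirm the decoded payload really is SVG markup, and raise the rasterization density. The sketch below is a hypothetical helper, assuming wand's Image constructor passes resolution through to ImageMagick's SVG delegate:

import base64
import wand.image

def data_uri_svg_to_png(data_uri, out_path, dpi=300):
    # drop the 'data:image/svg+xml;base64,' prefix and decode
    encoded = data_uri.split("base64,", 1)[1]
    decoded = base64.b64decode(encoded)
    assert decoded.lstrip().startswith(b'<'), "payload does not look like SVG/XML"
    with wand.image.Image(blob=decoded, format="svg", resolution=dpi) as image:
        png_image = image.make_blob("png32")  # png32 keeps the alpha channel
    with open(out_path, "wb") as f:
        f.write(png_image)

If the output is still blank, the next thing to check is which SVG delegate the local ImageMagick build uses (its internal MSVG renderer is far more limited than librsvg).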
I have code that works fine, but I want to collect image URLs over a wider range of pages. How can I do that?
import requests
import random
from bs4 import BeautifulSoup

img = []
word = 'dog'
url = 'https://www.google.com/search?q={0}&tbm=isch'.format(word)
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')
images = soup.findAll('img')
for image in images:
    img.append(image.get('src'))
print(img[random.randint(1, 21)])
If I request http://www.chictopia.com/photo/show/3, I get a proper image file.
However, if I set a range to crawl images over multiple pages with a for loop, building the URL as
f'http://www.chictopia.com/photo/show/+{x}'
I get what seems to be a 0-byte file.
Why do I get 0-byte image files, and could anyone explain how to parse images from multiple pages?
Thank you
import re
import requests
from bs4 import BeautifulSoup

for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.search(r'/([\w_-]+[400]+[.](jpg))$', url)
        if not filename:
            print("fail".format(url))
            continue
        with open(filename.group(1), 'wb') as f:
            if 'http' not in url:
                url = '{}{}'.format(response, url)
            response = requests.get(url)
            f.write(response.content)
Try this. I modified the regex pattern and changed the call to use the proper image URL. Now this code will save every image whose link contains _400.jpg, under the naming scheme shown in the example below.
import re
import requests
from bs4 import BeautifulSoup
import shutil

for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    # print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.findall(r'(.+_400\.jpg)', url)
        if len(filename) != 0:
            image = filename[0]
            image_name = f"image_{image.split('/')[-1]}"
            response = requests.get(image, stream=True)
            with open(image_name, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            print(f'Saved : {image_name}')
For example:
http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg -> saved as image_2162299642_400.jpg
Now, what's wrong with your code: you used the wrong regex and took a matched value that is not the complete URL of the image. That's why you were getting zero-byte files: you were never actually requesting the image URL.
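To make the difference concrete, here is a small sketch (not part of either script) comparing the two patterns against the example URL above:

import re

url = 'http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg'

# original pattern: [400] is a character class matching runs of '4' or '0',
# and the capture starts after the last '/', so only the filename is kept
old = re.search(r'/([\w_-]+[400]+[.](jpg))$', url)
print(old.group(1))  # 2162299642_400.jpg  (no scheme or host)

# corrected pattern: captures the whole URL up to _400.jpg
new = re.findall(r'(.+_400\.jpg)', url)
print(new[0])  # the full, fetchable image URL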
All fixed.
I'm unable to save/download the images at the intended location. I can't figure out the problem, although the code seems right. I'm using the requests library for scraping the images.
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
title = fromstring(r.content).findtext('.//title')
#print(title)
newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title
for link in soup.find_all('img'):
    image = link.get('src')
    if 'http' in image:
        print(image)
        imageName = os.path.split(image)[1]
        print(imageName)
        r2 = requests.get(image)
        if not os.path.exists(newPath):
            os.makedirs(newPath)
        with open(imageName, "wb") as f:
            f.write(r2.content)
Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") call in a try: block or a retry loop to make sure that the website you are scraping is returning a 200 response; it could be that the website is timing out or not serving your request.
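A minimal sketch of that guard (hypothetical snippet; raise_for_status and RequestException are requests' own error-handling hooks):

import requests

url = "https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci"
try:
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # raises requests.HTTPError for non-2xx responses
except requests.RequestException as exc:
    print("request failed:", exc)

Beyond that, here is a version of the script that uses urlparse to keep only absolute image URLs: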
# note: Python 2 (urlparse and urllib.urlretrieve were reorganized in Python 3)
import os
from bs4 import BeautifulSoup
import urllib
import requests
import urlparse
from lxml.html import fromstring

r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
    image = link.get('src')
    if bool(urlparse.urlparse(image).netloc):  # keep only absolute URLs
        print(image)
        imageName = image[image.rfind("/")+1:]
        print(imageName)
        urllib.urlretrieve(image, imageName)
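Separately, a likely reason nothing shows up "at the location": the original script creates newPath but then opens imageName relative to the current working directory. A hypothetical one-line fix, reusing the question's own variables:

# write into the directory that was just created, not the current directory
with open(os.path.join(newPath, imageName), "wb") as f:
    f.write(r2.content)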
I'm trying to write a Python script to download images from any website. It is working, but inconsistently: specifically, find_all("img") is not finding the images on the second URL. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
The images are rendered with JavaScript on the page that is failing.
First render the page with dryscrape
(If you don't want to use dryscrape, see Web-scraping JavaScript page with Python.)
e.g.
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page in dryscrape so the JavaScript runs, then parse with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
But I would also check that you have an absolute URL, not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page in dryscrape so the JavaScript runs, then parse with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGs in our soup and write their "src" attribute to an array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 0
for url in urls:
    # resolve relative srcs against the page URL
    if url.startswith('http'):
        absolute = url
    else:
        absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
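One more hedge that applies to both scripts: they write into NatGeoPix/ and will raise an error if that directory does not exist yet, so it is worth creating it up front:

import os
os.makedirs("NatGeoPix", exist_ok=True)  # Python 3.2+; no error if it already exists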
Using the script below, I am trying to capture an image and save it to disk, and then store the local path in the DB. I have written simple code to capture the image from a webpage:
import urllib2
from os.path import basename
from urlparse import urlsplit
from bs4 import BeautifulSoup

url = "http://www.someweblink.com/path_to_the_target_webpage"
urlContent = urllib2.urlopen(url).read()
soup = BeautifulSoup(''.join(urlContent))
imgTags = soup.findAll('img')
for imgTag in imgTags:
    imgUrl = imgTag['src']
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName, 'wb')
        output.write(imgData)
        output.close()
    except:
        pass
The page code for the image:
<div class="single-post-thumb"> <img width="620" height="330" src="http://ccccc.com/wp-content/uploads/2016/05/weerewr.jpg"/>
If you just want to download the image using the URL of the image, you can try this:
import urllib
img_url = "Image url goes here"
urllib.urlretrieve(img_url,'test.jpg')
It will save the image under the name test.jpg in the current working directory.
Note: use the full URL of the image; sometimes the "src" attribute of the img tag contains a relative URL.
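If the src is relative, it can be resolved against the page URL before downloading. A minimal sketch with Python 3 names (urlretrieve and urljoin live in urllib.request and urllib.parse there), using a hypothetical relative src:

from urllib.parse import urljoin
from urllib.request import urlretrieve

page_url = "http://www.someweblink.com/path_to_the_target_webpage"
src = "/wp-content/uploads/2016/05/weerewr.jpg"  # hypothetical relative src
img_url = urljoin(page_url, src)  # -> absolute URL on the same host
urlretrieve(img_url, 'test.jpg')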