I'm trying to write a Python script to download images from any website. It works, but inconsistently: find_all("img") finds the images on the first URL below but not on the second. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
def url_to_image(url, filename):
# Download the resource at `url` and save it as an image file named `filename`.
# (Code is quoted verbatim from the question; its indentation was lost.)
# get HTTP response, open as bytes, save the image
# http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
req = requests.get(url)
# Wrap the raw response bytes in an in-memory file object so Image.open can read them.
i = Image.open(BytesIO(req.content))
i.save(filename)
# open page, get HTML request and parse with BeautifulSoup
# NOTE: only the HTML returned by this single GET is parsed; nothing here
# executes scripts on the page.
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
# Only sources whose text ends in "jpg" are kept.
# NOTE(review): img["src"] is a subscript lookup — presumably fails for an <img> without src; confirm.
if img["src"].endswith("jpg"):
print("endswith jpg")
urls.append(str(img["src"]))
print(str(img))
# Running counter used to build the output names NatGeoPix/0.jpg, NatGeoPix/1.jpg, ...
jpeg_no = 00
for url in urls:
url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
jpeg_no += 1
The images are rendered with JavaScript on the page that is failing.
First render the page with dryscrape
(If you don't want to use dryscrape see Web-scraping JavaScript page with Python )
e.g.
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
def url_to_image(url, filename):
    """Download the image at *url* and save it to *filename*.

    The response body is decoded with Pillow from an in-memory buffer and
    then written out with ``Image.save``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Without this check an HTML error page would be handed to
            Pillow and fail with a much less helpful decoding error.
    """
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    req.raise_for_status()
    i = Image.open(BytesIO(req.content))
    i.save(filename)
# Open the page in a dryscrape session and parse the resulting body
# with BeautifulSoup.
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGS in our soup and write their "src" attribute to array
# (src=True skips <img> tags without a src attribute, which would
# otherwise raise KeyError on img["src"])
urls = []
for img in soup.find_all("img", src=True):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

# Files are numbered 0.jpg, 1.jpg, ... in the order the images were found.
# NOTE: the NatGeoPix/ directory is not created here and must already exist.
for jpeg_no, url in enumerate(urls):
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
But I would also check that you have an absolute URL not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin
def url_to_image(url, filename):
    """Download the image at *url* and save it to *filename*.

    The response body is decoded with Pillow from an in-memory buffer and
    then written out with ``Image.save``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Without this check an HTML error page would be handed to
            Pillow and fail with a much less helpful decoding error.
    """
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    req.raise_for_status()
    i = Image.open(BytesIO(req.content))
    i.save(filename)
# Open the page in a dryscrape session and parse the resulting body
# with BeautifulSoup.
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGS in our soup and write their "src" attribute to array
# (src=True skips <img> tags without a src attribute, which would
# otherwise raise KeyError on img["src"])
urls = []
for img in soup.find_all("img", src=True):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

# Files are numbered 0.jpg, 1.jpg, ... in the order the images were found.
# NOTE: the NatGeoPix/ directory is not created here and must already exist.
for jpeg_no, url in enumerate(urls):
    # urljoin returns an already-absolute URL unchanged and resolves
    # relative (or protocol-relative) ones against the page URL, so no
    # separate startswith('http') branch is needed.
    absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
Related
How do I make it so that each image I garnered from web scraping is then stored to a folder? I use Google Colab currently since I am just practicing stuff. I want to store them in my Google Drive folder.
This is my code for web scraping:
import requests
from bs4 import BeautifulSoup
def getdata(url):
# Fetch `url` with requests.get and return the response body as text (r.text).
# (Code is quoted verbatim from the question; its indentation was lost.)
r = requests.get(url)
return r.text
# Download the Yahoo front page and parse it.
htmldata = getdata('https://www.yahoo.com/')
soup = BeautifulSoup(htmldata, 'html.parser')
imgdata = []
for i in soup.find_all('img'):
# NOTE: this rebinds imgdata to one src string on each pass instead of
# appending to the list created above, so no list of URLs is built up.
imgdata = i['src']
print(imgdata)
I manually created a pics folder in the directory where the script runs, to store the pictures in. Then I changed the for loop in your code so that it appends each URL to the imgdata list. The try/except block is there because not every URL in the list is valid.
import requests
from bs4 import BeautifulSoup
def getdata(url):
    """Fetch *url* and return the response body decoded as text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so that an error page is not silently parsed as the real page.
    """
    r = requests.get(url)
    r.raise_for_status()
    return r.text
# Download the page and collect the src of every <img> tag.
htmldata = getdata('https://www.yahoo.com/')
soup = BeautifulSoup(htmldata, 'html.parser')
imgdata = []
# src=True skips <img> tags without a src attribute (i['src'] would raise KeyError)
for i in soup.find_all('img', src=True):
    imgdata.append(i['src'])  # append to the list instead of overwriting it

# The pics/ directory is not created here and must already exist.
filename = "pics/picture{}.jpg"
for i, src in enumerate(imgdata):
    print(f"img {i+1} / {len(imgdata)}")
    # Not everything in imgdata is a valid URL (e.g. relative paths or
    # data: URIs), so only request-related failures are skipped; any other
    # error is left to surface.
    try:
        r = requests.get(src)
        r.raise_for_status()
    except requests.RequestException:
        print("Url is not an valid")
        continue
    # Open the output file only after a successful download, so a failed
    # request never leaves an empty file behind.
    with open(filename.format(i), "wb") as f:
        f.write(r.content)
foo.write('whatever')
foo.close()
If I request http://www.chictopia.com/photo/show/3
I get the proper image file.
However, if I use a for loop over a range to crawl images from multiple pages,
I can't get the images; it seems a 0-byte file is downloaded instead:
f'http://www.chictopia.com/photo/show/+{x}'
Why do I get a 0-byte image file, and could anyone explain how to parse the images from multiple pages?
Thank you
import re
import requests
from bs4 import BeautifulSoup
# Crawl pages 3..5 and try to save the images found on each page.
# (Code is quoted verbatim from the question; its indentation was lost.)
for x in range (3,6):
response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
# Prints the Response object itself, not the page content.
print (response)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
# NOTE: [400]+ is a character class (one or more of the characters '4' and '0'),
# not the literal text "400". Group 1 captures only the trailing file name.
filename = re.search(r'/([\w_-]+[400]+[.](jpg))$', url)
if not filename:
# NOTE: "fail" contains no {} placeholder, so the url is not part of the output.
print("fail".format(url))
continue
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# NOTE: `response` is the Response object from requests.get, not a base URL
# string, so this formats the object's string form in front of the path.
url = '{}{}'.format(response, url)
# Rebinds `response`, replacing the page response with the image response.
response = requests.get(url)
f.write(response.content)
Try this.
I modified the regex pattern and changed the request so that it calls the proper image URL.
This code now saves every image whose link contains _400.jpg, using the naming shown in the example after the code.
import re
import requests
from bs4 import BeautifulSoup
import shutil
# Crawl pages 3..5 and save every image whose URL contains "_400.jpg"
# into the current directory as image_<original file name>.
for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    soup = BeautifulSoup(response.text, 'html.parser')
    # src=True skips <img> tags without a src attribute (img['src'] would raise KeyError)
    urls = [img['src'] for img in soup.find_all('img', src=True)]
    for url in urls:
        # Keep the URL up to and including "_400.jpg"; skip everything else.
        match = re.search(r'(.+_400\.jpg)', url)
        if match is None:
            continue
        image = match.group(1)
        image_name = f"image_{image.split('/')[-1]}"
        # Use a separate name so the page response is not overwritten.
        image_response = requests.get(image, stream=True)
        # The raw stream is not decoded by default; without this a
        # gzip/deflate-encoded body would be written to disk still compressed.
        image_response.raw.decode_content = True
        with open(image_name, 'wb') as out_file:
            shutil.copyfileobj(image_response.raw, out_file)
        print(f'Saved : {image_name}')
for example.
http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg -> as image_2162299642_400.jpg
Now, what's wrong with your code:
You used the wrong regex and took the matched value, which is not the complete URL of the image; that's why you got a 0-byte file (you never actually requested the image URL).
All fixed.
Hello Community, I have a problem and I don't know how to solve it. I wrote a script to crawl web pages for images with BeautifulSoup4, but I get the error (AttributeError: 'NoneType' object has no attribute 'group').
import re
import requests
from bs4 import BeautifulSoup
# Fetch the gallery page and try to save every <img> that has a src attribute.
# (Code is quoted verbatim from the question; its indentation was lost.)
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img', {"src": True})
urls = [img["src"] for img in img_tags]
for url in urls:
# The pattern is anchored with $, so it only matches when the URL ends in
# .jpg or .png. re.search returns None otherwise, and the .group(1) call
# below then raises the AttributeError described in the question.
filename = re.search(r'([\w_-]+[.](jpg|png))$', url)
with open(filename.group(1), 'wb') as f:
if 'http' not in url:
# sometimes an image source can be relative
# if it is provide the base url which also happens
# to be the site variable atm.
url = '{}{}'.format(site, url)
response = requests.get(url)
f.write(response.content)
Your regex is wrong. Use Python's built-in urllib to do the heavy lifting instead of writing regexes if you're not familiar with them.
Use something like this (untested):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit # import this additional library
from os.path import basename # import this additional library
from urllib.parse import urljoin  # resolves relative image paths against the page URL

site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

# focus on the div containing the images
images_div = soup.find(id=re.compile(r"fcx-gallery-\w+"))

urls = []
# soup.find returns None when no element matches, so test the div itself
# (testing img_tags here would fail: it is not defined yet)
if images_div is not None:
    # get all the images in that div that carry a data-src attribute
    img_tags = images_div.find_all('img', {"data-src": True})
    urls = [img["data-src"] for img in img_tags]  # grab sources from data-src

for url in urls:
    filename = basename(urlsplit(url).path)  # use this instead of a regex
    if not filename:
        # the URL path ends in "/": there is no file name to save under
        continue
    # sometimes an image source can be relative; urljoin resolves it against
    # the page URL and leaves absolute URLs untouched (plain concatenation
    # would append the path after the "?sort=new" query string)
    image_response = requests.get(urljoin(site, url))
    # open the file only after the download, so a failed request does not
    # leave an empty file behind
    with open(filename, 'wb') as f:
        f.write(image_response.content)
I am trying to extract all the images from the URL below. However, I don't understand the "HTTP Error 403: Forbidden" I get. Can it be taken care of with error handling, or can the URL simply not be scraped due to limitations?
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request
def make_soup(url):
# Open `url` with urlopen, read the whole body and return it parsed by BeautifulSoup.
# No parser name is passed to BeautifulSoup here.
# (Code is quoted verbatim from the question; its indentation was lost.)
html = urlopen(url).read()
return BeautifulSoup(html)
def get_images(url):
# Download every <img> source found on the page at `url` and return the list of sources.
# (Code is quoted verbatim from the question; its indentation was lost.)
soup = make_soup(url)
#this makes a list of bs4 element tags
images = [img for img in soup.findAll('img')]
print (str(len(images)) + "images found.")
print("downloading to current directory ")
#compile our unicode list of image links
# NOTE(review): .get('src') is used, so entries are presumably None for tags without src; confirm.
image_links = [each.get('src') for each in images]
for each in image_links:
# The text after the last "/" of the link is used as the local file name.
filename=each.split('/')[-1]
urllib.request.urlretrieve(each,filename)
return image_links
get_images("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
Some sites require you to specify a User-Agent header:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
def make_soup(url):
    """Fetch *url* with a browser-like User-Agent and return the parsed page.

    The request carries a ``User-Agent`` header, which the default urllib
    request made by ``urlopen(url)`` does not set to a browser-like value.

    Raises:
        urllib.error.HTTPError: if the server still answers with an error status.
    """
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    # Close the connection once parsing is done, and name the parser
    # explicitly so the result does not depend on which parsers happen to
    # be installed.
    with urlopen(req) as page:
        return BeautifulSoup(page, 'html.parser')
You can use this function for image scraping. Relying on the img tag alone is not very useful nowadays, so we can implement something like the function below to fulfil the requirement. It does not rely on any particular tag, so it will grab an image link wherever one is present.
def extract_ImageUrl(soup_chunk):
    """Collect .jpg/.png links from every attribute of every tag in *soup_chunk*.

    Args:
        soup_chunk: an object whose ``find_all()`` returns tags exposing an
            ``attrs`` mapping (for example a BeautifulSoup document or tag).

    Returns:
        A list of strings, in document order, one per whitespace-separated
        attribute token that starts with ``http`` and contains ``.jpg`` or
        ``.png``. Text after the extension, such as a query string, is kept.
    """
    # The alternation is grouped so that "png" only counts as an extension;
    # the ungrouped form 'http.*\.jpg|png' also matched any value that merely
    # started with the letters "png".
    image_url = re.compile(r'http.*\.(?:jpg|png)')
    urls_found = []
    for tag in soup_chunk.find_all():
        for value in tag.attrs.values():
            # Attribute values may be a single string or a list of strings.
            items = value if isinstance(value, (list, tuple)) else [value]
            for item in items:
                # One attribute can hold several URLs (e.g. srcset), so look
                # at each whitespace-separated token on its own.
                for token in str(item).split():
                    token = token.strip(',')  # drop srcset separators
                    if image_url.match(token):
                        urls_found.append(token)
    print("Found {} image links".format(len(urls_found)))
    return urls_found
It's an initial idea and needs refinement to make it better.
I'm unable to save/download the images to the target location. I can't figure out the problem, although the code seems right.
I'm using the requests library to scrape the images.
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring
# Fetch the page, then download every <img> whose src contains "http".
# (Code is quoted verbatim from the question; its indentation was lost.)
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
# The page <title> text is used as the name of the target folder.
title = fromstring(r.content).findtext('.//title')
#print(title)
# NOTE: the raw string ends with a backslash followed by a space, so the
# folder name built here starts with a space character.
newPath = r'C:\Users\Vicky\Desktop\ScrappedImages\ ' + title
for link in soup.find_all('img'):
image = link.get('src')
# NOTE(review): link.get('src') is presumably None for tags without src, which
# would make this membership test fail; confirm.
if 'http' in image:
print(image)
# Last path component of the URL, used as the local file name.
imageName = os.path.split(image)[1]
print(imageName)
r2 = requests.get(image)
if not os.path.exists(newPath):
os.makedirs(newPath)
# NOTE: the file is opened by its bare name, not joined with newPath, so it is
# written relative to the current working directory rather than into newPath.
with open(imageName, "wb") as f:
f.write(r2.content)
Try wrapping your r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci") call in a try: block or a retry loop, and check that the website you are scraping returns a 200 response; it could be that the website is timing out or not serving your request.
import os
from bs4 import BeautifulSoup
import urllib
import requests
import urlparse
from lxml.html import fromstring
# Fetch the page and download every <img> whose src has a network location.
# NOTE(review): the `urlparse` module and `urllib.urlretrieve` used below are
# Python 2 names; in Python 3 they live in urllib.parse and urllib.request.
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
image = link.get('src')
# A non-empty netloc means the src carries a host part, i.e. it is not a
# bare relative path.
if bool(urlparse.urlparse(image).netloc):
print(image)
# Everything after the last "/" of the URL becomes the local file name.
imageName = image[image.rfind("/")+1:]
print(imageName)
# Saves under imageName as given, i.e. relative to the current working directory.
urllib.urlretrieve(image,imageName)