Unable to extract images from webpage in Python using Beautiful Soup

I am trying to extract all the images from the URL below. However, I don't understand the HTTP Error 403: Forbidden. Can it be handled during error handling, or can the URL simply not be scraped due to restrictions?
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request

def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print(str(len(images)) + "images found.")
    print("downloading to current directory ")
    # compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename)
    return image_links

get_images("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")

Some sites require you to specify a User-Agent header:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request

def make_soup(url):
    site = url
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(site, headers=hdr)
    page = urlopen(req)
    return BeautifulSoup(page)
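Note that urllib.request.urlretrieve does not send that header, so the image downloads themselves can still return 403 even after make_soup succeeds. As a rough sketch (the download_image helper below is my own, not part of the original answer), the same header can be reused for each image and the error caught explicitly:

import urllib.error
from urllib.request import Request, urlopen

def download_image(image_url, filename, user_agent='Mozilla/5.0'):
    # Reuse the browser-like User-Agent for the image request itself
    req = Request(image_url, headers={'User-Agent': user_agent})
    try:
        with urlopen(req) as response, open(filename, 'wb') as out_file:
            out_file.write(response.read())
    except urllib.error.HTTPError as err:
        # A 403 here means the server also blocks header-less image requests
        print("Skipping {}: HTTP {}".format(image_url, err.code))

In get_images, the call urllib.request.urlretrieve(each, filename) could then be replaced with download_image(each, filename).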

You can use this function for image scraping. Relying on the img tag alone is not always enough nowadays; something like the function below will fulfill the requirement. It does not rely on any particular tag, so wherever an image link is present it will grab it.
import re

def extract_ImageUrl(soup_chunk):
    urls_found = []
    for tags in soup_chunk.find_all():
        attributes = tags.attrs
        if str(attributes).__contains__('http'):
            for links in attributes.values():
                if re.match(r'http.*\.jpg|png', str(links)):
                    if len(str(links).split()) <= 1:
                        urls_found.append(links)
                    else:
                        link = [i.strip() for i in str(links).split() if re.match(r'http.*\.jpg|png', str(i))]
                        urls_found = urls_found + link
    print("Found {} image links".format(len(urls_found)))
    return urls_found
It's an initial version and needs further refinement.
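For example, combined with a make_soup that sets a User-Agent (as in the answer above), a call might look like this (hypothetical usage, not from the original answer):

soup = make_soup("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
image_urls = extract_ImageUrl(soup)
for link in image_urls:
    print(link)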

Related

How to download an HTML file completely? [duplicate]

Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page including HTML, CSS, JS and image files (same as we get with a ctrl-s of any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have looked at many questions, but they all only download the HTML.
The following implementation lets you get the sub-HTML pages. It can be developed further to fetch the other files you need. I set the depth variable for you to specify the maximum number of sub-pages that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save somebody some time:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print(urls)
You can easily do that with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it.
It saves pagefilename.html in the current folder.
It downloads JavaScript, CSS, and images based on the script, link, and img tags and saves them in a folder named pagefilename_files.
Any exceptions are printed on sys.stderr, and it returns a BeautifulSoup object.
The requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename + '_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename + '.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example saving the Google page and its contents (google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning them, and following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
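A minimal spider along those lines might look like the sketch below (the class name and output format are illustrative, not from an existing project); it collects img src values and follows links recursively:

import scrapy

class ImageSpider(scrapy.Spider):
    # Illustrative spider: yields image URLs and follows in-page links
    name = "images"
    start_urls = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]

    def parse(self, response):
        for src in response.css("img::attr(src)").getall():
            yield {"image_url": response.urljoin(src)}
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)

It could be run with something like scrapy runspider image_spider.py -o images.json; Scrapy handles request scheduling and duplicate filtering for you.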

Find and download images on multiple pages

If I write http://www.chictopia.com/photo/show/3
I can get a proper image file.
However, if I set a range to crawl images across multiple web pages using a for loop and
f'http://www.chictopia.com/photo/show/+{x}'
I can't get the image files; it seems a 0-byte file is downloaded.
Why do I get 0-byte image files, and could anyone explain how to parse images across multiple pages?
Thank you
import re
import requests
from bs4 import BeautifulSoup

for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.search(r'/([\w_-]+[400]+[.](jpg))$', url)
        if not filename:
            print("fail".format(url))
            continue
        with open(filename.group(1), 'wb') as f:
            if 'http' not in url:
                url = '{}{}'.format(response, url)
            response = requests.get(url)
            f.write(response.content)
Try this.
I modified the regex pattern and changed the call to use the proper image URL.
Now this code will save all the images whose links contain _400.jpg, named as in the example below.
import re
import requests
from bs4 import BeautifulSoup
import shutil

for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    # print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.findall(r'(.+_400\.jpg)', url)
        if len(filename) != 0:
            image = filename[0]
            image_name = f"image_{image.split('/')[-1]}"
            response = requests.get(image, stream=True)
            with open(image_name, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            print(f'Saved : {image_name}')
For example:
http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg -> saved as image_2162299642_400.jpg
Now, what's wrong with your code:
You used the wrong regex and took a matching value that is not the complete URL of the image. That's why you are getting zero-byte image files (you are not even requesting the image URL).
All fixed.
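A more general variant (a sketch of my own, not part of the original answer) is to resolve every src against the page URL with urljoin, so relative links work as well and no regex on the URL shape is needed:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page_url = 'http://www.chictopia.com/photo/show/3'
response = requests.get(page_url)
soup = BeautifulSoup(response.text, 'html.parser')
for img in soup.find_all('img'):
    src = img.get('src')
    if src:
        # urljoin leaves absolute URLs untouched and resolves relative ones
        print(urljoin(page_url, src))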

How to obtain all image links from a Reddit page's posts using requests in Python

Here is the code I am working with, using Python requests; the variable page passed in is "http://www.reddit.com/r/okbuddyretard/top/?t=day". I'm not sure what I am doing wrong such that I am not able to read the HTML and pull all of the images from the posts.
def return_image_links(page):
    page = requests.get(page, headers=headers)
    if page.status_code == 200:
        page.html.render(sleep=5, timeout=8)  # Not sure if I am doing this right either
        urls = page.html.xpath("a[ends-with(#src, '.jpg')]")  # This is the part I can't figure out
        print(urls)
        image_urls = []
        for url in urls:
            if is_image(url):
                image_urls.append(url)
            else:
                image_urls.extend(get_imgur_gallery_links(url))
        return image_urls
    else:
        print('Bad response from reddit server.')
        sys.exit()
The line urls = page.html.xpath("a[ends-with(#src, '.jpg')]") is simply wrong; you seem to be trying to use methods that are present in the lxml module, while using names (html.xpath) that do not exist on the requests.Response class.
The following is an example which uses requests and BeautifulSoup to parse the HTML and access the images:
from bs4 import BeautifulSoup as bs
import requests
import sys

def return_image_links(url, headers):
    page = requests.get(url, headers=headers)
    if page.status_code == 200:
        soup = bs(page.text, "html.parser")
        img_tags = soup.select('img[alt="Post image"]')
        image_urls = []
        for img_tag in img_tags:
            image_urls.append(img_tag['src'])
        return image_urls
    else:
        print('Bad response from reddit server.')
        sys.exit()

url = "http://www.reddit.com/r/okbuddyretard/top/?t=day"
headers = {"User-Agent": 'Mozilla/5.0'}
imgs = return_image_links(url, headers)
print(*imgs, sep='\n')

BeautifulSoup find_all("img") not working for all sites

I'm trying to write a Python script to download images from any website. It is working, but inconsistently. Specifically, find_all("img") is not finding the images for the second URL. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")

# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 00
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
The images are rendered with JavaScript on the page that is failing. First render the page with dryscrape.
(If you don't want to use dryscrape, see Web-scraping JavaScript page with Python.)
For example:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 00
for url in urls:
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
But I would also check that you have an absolute URL, not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin

def url_to_image(url, filename):
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    if img["src"].endswith("jpg"):
        print("endswith jpg")
        urls.append(str(img["src"]))
        print(str(img))

jpeg_no = 00
for url in urls:
    if url.startswith('http'):
        absolute = url
    else:
        absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
    jpeg_no += 1
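As the linked question suggests, other headless rendering tools can do the same job if dryscrape is not available; a rough sketch with Selenium (assuming a matching WebDriver such as chromedriver is installed) would be:

from bs4 import BeautifulSoup
from selenium import webdriver

# Render the JavaScript-driven page in a real browser engine, then parse it
driver = webdriver.Chrome()
driver.get("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()
print([img.get("src") for img in soup.find_all("img")])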

How to get image from webpage

I am editing a Python script which gets images from a webpage (which needs a private login, so there is no point in me posting a link to it). It uses the BeautifulSoup library, and the original script is here.
What I would like to do is to customize this script to get a single image, the HTML tag of which has the id attribute id="fimage". It has no class. Here is the code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import urlopen

# use this image scraper from the location that
# you want to save scraped images to
def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    images = [img for img in soup.find(id="fimage")]
    print(images)
    print(str(len(images)) + " images found.")
    # print 'Downloading images to current working directory.'
    # compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename = each.split('/')[-1]
        urlretrieve(each, filename)
    return image_links

get_images('http://myurl');
# a standard call looks like this
# get_images('http://www.wookmark.com')
For some reason, this doesn't seem to work. When run on the command line, it produces the output:
[]
0 images found.
UPDATE:
Okay so I have changed the code and now the script seems to find the image I'm trying to download, but it throws another error when run and can't download it.
Here is the updated code:
from bs4 import BeautifulSoup
from urllib import request
import urllib.parse
import urllib.error
from urllib.request import urlopen

def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    # this makes a list of bs4 element tags
    image = soup.find(id="logo", src=True)
    if image is None:
        print('No images found.')
        return
    image_link = image['src']
    filename = image_link.split('/')[-1]
    request.urlretrieve(filename)
    return image_link

try:
    get_images('https://pypi.python.org/pypi/ClientForm/0.2.10');
except ValueError as e:
    print("File could not be retrieved.", e)
else:
    print("It worked!")

# a standard call looks like this
# get_images('http://www.wookmark.com')
When run on the command line the output is:
File could not be retrieved. unknown url type: 'python-logo.png'
soup.find(id="fimage") returns one result, not a list. You are trying to loop over that one element, which means it will try to list its child nodes, and there are none.
Simply adjust your code to account for the fact that you only have one result; remove all the looping:
image = soup.find(id="fimage", src=True)
if image is None:
    print('No matching image found')
    return
image_link = image['src']
filename = image_link.split('/')[-1]
urlretrieve(image_link, filename)
I refined the search a little; by adding src=True you only match a tag if it has a src attribute.
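The remaining error in the update (unknown url type: 'python-logo.png') comes from passing a relative src straight to urlretrieve; resolving it against the page URL first would look roughly like this (a sketch, not part of the original answer):

from urllib.parse import urljoin
from urllib.request import urlretrieve

page_url = 'https://pypi.python.org/pypi/ClientForm/0.2.10'
image_link = image['src']                      # e.g. 'python-logo.png'
absolute_link = urljoin(page_url, image_link)  # make the URL absolute first
filename = absolute_link.split('/')[-1]
urlretrieve(absolute_link, filename)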
