I am editing a Python script which gets images from a webpage (which needs a private login, so there is no point in me posting a link to it). It uses the BeautifulSoup library, and the original script is here.
What I would like to do is to customize this script to get a single image, the HTML tag of which has the id attribute id="fimage". It has no class. Here is the code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
from urllib.request import urlopen
# use this image scraper from the location that
#you want to save scraped images to
def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    #this makes a list of bs4 element tags
    images = [img for img in soup.find(id="fimage")]
    print (images)
    print (str(len(images)) + " images found.")
    # print 'Downloading images to current working directory.'
    #compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename=each.split('/')[-1]
        urlretrieve(each, filename)
    return image_links

get_images('http://myurl');
#a standard call looks like this
#get_images('http://www.wookmark.com')
For some reason, this doesn't seem to work. When run on the command line, it produces the output:
[]
0 images found.
UPDATE:
Okay, so I have changed the code, and now the script seems to find the image I'm trying to download, but it throws another error when run and cannot download it.
Here is the updated code:
from bs4 import BeautifulSoup
from urllib import request
import urllib.parse
import urllib.error
from urllib.request import urlopen
def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    #this makes a list of bs4 element tags
    image = soup.find(id="logo", src=True)
    if image is None:
        print('No images found.')
        return
    image_link = image['src']
    filename = image_link.split('/')[-1]
    request.urlretrieve(filename)
    return image_link

try:
    get_images('https://pypi.python.org/pypi/ClientForm/0.2.10');
except ValueError as e:
    print("File could not be retrieved.", e)
else:
    print("It worked!")
#a standard call looks like this
#get_images('http://www.wookmark.com')
When run on the command line the output is:
File could not be retrieved. unknown url type: 'python-logo.png'
soup.find(id="fimage") returns one result, not a list. You are trying to loop over that one element, which means it'll try and list the child nodes, and there are none.
Simply adjust your code to take into account you only have one result; remove all the looping:
image = soup.find(id="fimage", src=True)
if image is None:
    print('No matching image found')
    return
image_link = image['src']
filename = image_link.split('/')[-1]
urlretrieve(image_link, filename)
I refined the search a little; by adding src=True you only match a tag if it has a src attribute.
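Putting that together, a minimal sketch of the whole function might look like this (the get_image name, the explicit 'html.parser' argument, and the urljoin call are my own additions, not from the question; urljoin guards against a relative src, which would also explain the "unknown url type: 'python-logo.png'" error in the update):
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup

def get_image(url):
    # parse the page and look up the single tag with id="fimage"
    soup = BeautifulSoup(urlopen(url).read(), 'html.parser')
    image = soup.find(id="fimage", src=True)
    if image is None:
        print('No matching image found')
        return
    # resolve a possibly relative src against the page URL before downloading
    image_link = urljoin(url, image['src'])
    filename = image_link.split('/')[-1]
    urlretrieve(image_link, filename)
    return image_link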
Related
Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page including HTML, CSS, JS and image files (same as we get with a ctrl-s of any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I visited many questions but they are all only downloading the HTML.
The following implementation enables you to get the sub-HTML pages. It can be developed further to get the other files you need. I set the depth variable so you can choose the maximum number of levels of sub-websites that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
def crawl(pages, depth=None):
    indexed_url = [] # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a') #finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save somebody some time:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin
def crawl(pages, depth=None):
    indexed_url = [] # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print( "Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a') #finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print( urls )
You can easily do that with the simple Python library pywebcopy.
For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS, and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests, and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it.
It saves pagefilename.html in the current folder.
It downloads the JavaScript, CSS, and images referenced by the script, link, and img tags into a folder named pagefilename_files.
Any exceptions are printed to sys.stderr, and the function returns a BeautifulSoup object.
The Requests session must be a global variable unless someone writes cleaner code here for us.
You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder): # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):   # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath): # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename+'_files' # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename+'.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (the google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning them, and following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
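As a rough illustration only (the spider name, start URL, and CSS selectors below are my own assumptions, not from the question), a minimal Scrapy spider that collects image URLs and follows links could be sketched like this and run with scrapy runspider imagespider.py -o images.json:
import scrapy

class ImageSpider(scrapy.Spider):
    name = "images"
    start_urls = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]

    def parse(self, response):
        # yield every image URL on the page, resolved to an absolute URL
        for src in response.css("img::attr(src)").getall():
            yield {"image_url": response.urljoin(src)}
        # follow links so the crawl recurses through the site
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)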
I have been trying to download pictures and audio from this web-based game, but it has not worked for me yet. Is there any way I can get help here, even if it is just a tutorial?
import re
import requests
from bs4 import BeautifulSoup
def extract_images(site):
    """ Extract images from the url given"""
    response=requests.get(site)
    soup=BeautifulSoup(response.text, 'lxml.parser')
    image_tags=soup.find_all('image')
    # extract the urls
    urls=[image['src'] for img in img_tags]
    for url in urls:
        pattern=r'/([\w_-]+[.](jpg|gif|png))$' # pattern to extract image files
        filename=re.search(pattern, url)
        if not filename:
            print("No filename {}", url)
            continue
        with open(filename.group(1), 'wb') as f:
            if 'http' not in url: # relative reques?
                url="{}{}".format(site, url)
            print(url)
            response=requests.get(url)
            f.write(response.content)

if __name__=="__main__":
    site="https://heartofvegasslots.productmadness.com/"
    extract_images(site)
Great start; you almost had it working.
There are a couple of lines that you should change to get it working.
First: this line won't work in its current form, as the list comprehension references the wrong variables:
`urls=[image['src'] for img in img_tags]`
Also, if you call image['src'] and 'src' doesn't exist, it will fail; try using the get method instead.
Maybe try something like this:
`urls=[img.get('src') for img in image_tags]`
Second, change the line
image_tags=soup.find_all('image')
to
image_tags=soup.find_all('img')
I have changed 'image' to 'img', as that is the name of the HTML tag for images.
I couldn't get it working for the website provided, but that site doesn't appear to have any img tags.
This is my code, with a website that I have tried it on successfully:
import re
import requests
import lxml
from bs4 import BeautifulSoup
def extract_images(site):
    """ Extract images from the url given"""
    response=requests.get(site)
    soup=BeautifulSoup(response.text, 'lxml')
    image_tags=soup.find_all('img')
    print(image_tags)
    # extract the urls
    urls=[img.get('src') for img in image_tags]
    for url in urls:
        if url:
            pattern=r'/([\w_-]+[.](jpg|gif|png))$' # pattern to extract image files
            filename=re.search(pattern, url)
            if not filename:
                print("No filename {}", url)
                continue
            with open(filename.group(1), 'wb') as f:
                print(f'writing {filename}')
                if 'http' not in url: # relative reques?
                    url="{}{}".format(site, url)
                print(url)
                response=requests.get(url)
                f.write(response.content)

if __name__=="__main__":
    site="https://antennatestlab.com/"
    extract_images(site)
I am trying to extract all the images from the URL below; however, I don't understand the HTTP Error 403: Forbidden. Can it be handled with error handling, or can the URL simply not be scraped due to restrictions?
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request
def make_soup(url):
    html = urlopen(url).read()
    return BeautifulSoup(html)

def get_images(url):
    soup = make_soup(url)
    #this makes a list of bs4 element tags
    images = [img for img in soup.findAll('img')]
    print (str(len(images)) + "images found.")
    print("downloading to current directory ")
    #compile our unicode list of image links
    image_links = [each.get('src') for each in images]
    for each in image_links:
        filename=each.split('/')[-1]
        urllib.request.urlretrieve(each,filename)
    return image_links

get_images("https://opensignal.com/reports/2019/04/uk/mobile-network-experience")
Some sites require you to specify a User-Agent header:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import urllib.request
def make_soup(url):
    site = url
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(site, headers=hdr)
    page = urlopen(req)
    return BeautifulSoup(page)
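Note that urlretrieve does not let you set headers, so the image downloads themselves may still return 403. A minimal sketch of a download helper that reuses the same header (download_image is a hypothetical name, not part of the answer above) could be:
from urllib.request import Request, urlopen

def download_image(url, filename):
    # send the same User-Agent header when fetching the image bytes
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as response, open(filename, 'wb') as out:
        out.write(response.read())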
You can use this function for image scraping. Relying on the img tag alone is not that useful nowadays; we can implement something like the function below, which fulfills the requirement. It does not rely on any particular tag, so wherever an image link is present it will grab it.
import re

def extract_ImageUrl(soup_chunk):
    urls_found = []
    for tags in soup_chunk.find_all():
        attributes = tags.attrs
        if str(attributes).__contains__('http'):
            for links in attributes.values():
                if re.match('http.*\.jpg|png',str(links)):
                    if len(str(links).split()) <=1:
                        urls_found.append(links)
                    else:
                        link = [i.strip() for i in str(links).split() if re.match('http.*\.jpg|png',str(i))]
                        urls_found = urls_found + link
    print("Found {} image links".format(len(urls_found)))
    return urls_found
It's an initial attempt and needs updates to make it better.
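As a rough usage sketch (the URL and the 'html.parser' choice are placeholders of my own, just for illustration):
import requests
from bs4 import BeautifulSoup

response = requests.get("https://en.wikipedia.org/wiki/Python_%28programming_language%29")
soup = BeautifulSoup(response.text, 'html.parser')
image_urls = extract_ImageUrl(soup)
print(image_urls)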
Using the script below, I am trying to capture an image and save it to disk, and then save the local path in the DB.
I have written some simple code to capture the image from a webpage:
import urllib2
from os.path import basename
from urlparse import urlsplit
from bs4 import BeautifulSoup
url = "http://www.someweblink.com/path_to_the_target_webpage"
urlContent = urllib2.urlopen(url).read()
soup = BeautifulSoup(''.join(urlContent))
imgTags = soup.findAll('img')
for imgTag in imgTags:
    imgUrl = imgTag['src']
    try:
        imgData = urllib2.urlopen(imgUrl).read()
        fileName = basename(urlsplit(imgUrl)[2])
        output = open(fileName,'wb')
        output.write(imgData)
        output.close()
    except:
        pass
The page code for the image:
<div class="single-post-thumb"> <img width="620" height="330" src="http://ccccc.com/wp-content/uploads/2016/05/weerewr.jpg"/>
If you just want to download the image using the image's URL, you can try this:
import urllib
img_url = "Image url goes here"
urllib.urlretrieve(img_url,'test.jpg')
It will save your image with the name test.jpg in the current working directory.
Note: use the full URL of the image; sometimes the "src" attribute of the img tag contains a relative URL.
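If you only have a relative src, one way to build the full URL first is to resolve it against the page URL; a small sketch (using Python 3's urllib here, which is an assumption, since the snippet above uses the Python 2 API, and both URLs are placeholders from earlier in the thread):
from urllib.parse import urljoin
from urllib.request import urlretrieve

page_url = "http://www.someweblink.com/path_to_the_target_webpage"
relative_src = "/wp-content/uploads/2016/05/weerewr.jpg"
img_url = urljoin(page_url, relative_src)  # build an absolute image URL
urlretrieve(img_url, 'test.jpg')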
I am trying to make a web crawler that will give me all the links to images at the given URL, but many of the images that I can find by searching the page source with CTRL+F were not printed in the output.
My code is:
import requests
from bs4 import BeautifulSoup
import urllib
import os
print ("Which website would you like to crawl?")
website_url = raw_input("--> ")
i = 0
while i < 1:
    source_code = requests.get(website_url) # The source code will have the page source (<html>.......</html>
    plain_text = source_code.text # Gets only the text from the source code
    soup = BeautifulSoup(plain_text, "html5lib")
    for link in soup.findAll('img'): # A loop which looking for all the images in the website
        src = link.get('src') # I want to get the image URL and its located under 'src' in HTML
        if 'http://' not in src and 'https://' not in src:
            if src[0] != '/':
                src = '/' + src
            src = website_url + src
        print src
    i += 1
How should I make my code print every image that is in an <img> in the HTML page source?
For example: the website has this HTML code:
<img src="http://shippuden.co.il/wp-content/uploads/newkadosh21.jpg" *something* >
But the script didn't print its src.
The script does print the src of other <img .... src="..."> tags, though.
How should I improve my code to find all the images?
Taking a look at the main page of the domain you posted in the example, I see that the image you refer to is not in src, but in the data-lazy-src attribute.
So you should parse both attributes, like:
src = link.get('src')
lazy_load_src = link.get('data-lazy-src')
Actually, when running the example code you showed, the img src for the image newkadosh21 is printed, but it is a base64 data URI, like:
src="data:image/gif;base64,R0lGODdhAQABAPAAAP///wAAACwAAAAAAQABAEACAkQBADs="