I have code that works fine, but I want to find URLs over a wider range. How can I do that?
import requests
import random
from bs4 import BeautifulSoup
# Collect the src of every <img> on a Google Images result page and print one at random.
word = 'dog'
url = 'https://www.google.com/search?q={0}&tbm=isch'.format(word)
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')
# Keep only tags that actually carry a src (image.get returns None otherwise).
img = [image.get('src') for image in soup.findAll('img') if image.get('src')]
# The original indexed with randint(1, 21), which raises IndexError whenever
# fewer than 22 images are found. Pick from whatever was collected instead,
# still skipping index 0 as the original did.
candidates = img[1:]
if candidates:
    print(random.choice(candidates))
else:
    print('No image URLs found for {0!r}'.format(word))
Related
I would like to take an SVG-XML payload retrieved from a website to a png file without using cairo, ideally using ImageMagick/wand. As an example, here's what I've got:
import base64
from bs4 import BeautifulSoup as bs
import requests
import wand.image
# Fetch the page and collect the src attribute of every <img> tag.
r = requests.get("http://www.codazen.com")
soup = bs(r.content, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
svg_xml_url = urls[1]  # points to logo image
# Strip the data-URI prefix so only the base64 payload remains.
encoded = svg_xml_url.replace("data:image/svg+xml;base64,", "")
# The assignment target was missing here; `decoded` is the name read below.
decoded = base64.b64decode(encoded)
# Render the SVG bytes with wand and serialise them as a 32-bit PNG blob.
with wand.image.Image(blob=decoded, format="svg") as image:
    png_image = image.make_blob("png32")
with open("logo.png", "wb") as f:
    f.write(png_image)
However, the resulting png image is empty: just white. What am I doing wrong? Thanks.
I would like to know if it is possible to scrape images in websites with a code that can work for all the types of websites (I mean independently of the HTML format).
I have a list of websites and I would need to get all the images related to each link.
For instance:
list_of links=['https://www.bbc.co.uk/programmes/articles/5nxMx7d1K8S6nhjkPBFhHSM/withering-wit-and-words-of-wisdom-oscar-wildes-best-quotes' , 'https://www.lastampa.it/torino/2020/03/31/news/coronavirus-il-lockdown-ha-gia-salvato-almeno-400-vite-umane-1.38659569' , and so on]
In general, I would use:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Print the src of every <img> whose src contains a literal ".jpg".
link = '...'
# The with-block closes the HTTP response once the document has been parsed.
with urlopen(link) as html:
    bs = BeautifulSoup(html, 'html.parser')
# The dot is escaped so it matches a literal "." only; an unescaped "."
# matches any character (e.g. "xjpg").
images = bs.find_all('img', {'src': re.compile(r'\.jpg')})
for image in images:
    print(image['src'] + '\n')
but I have doubts about the HTML (can this be used for every website?) and about the image format (.jpg; would it be the same for all the websites?).
Thank you for all your comments and suggestions.
Assuming all the image URLs are in the src attribute of img tags and those image elements aren't dynamically added (not virtual DOM), modifying your code a little bit would work:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Print the src of every <img> on the page, whatever the image format.
link = '...'
# The with-block closes the HTTP response once the document has been parsed.
with urlopen(link) as html:
    bs = BeautifulSoup(html, 'html.parser')
# src=True keeps only tags that have a src attribute, so image['src'] below
# cannot raise KeyError on an <img> without one.
images = bs.find_all('img', src=True)
for image in images:
    print(image['src'] + '\n')
I'm trying to scrape the src of the img, but the code I found returns many img src, but not the one I want. I can't figure out what I am doing wrong. I am scraping TripAdvisor on "https://www.tripadvisor.dk/Restaurant_Review-g189541-d15804886-Reviews-The_Pescatarian-Copenhagen_Zealand.html"
So this is the HTML snippet I'm trying to extract from:
<div class="restaurants-detail-overview-cards-LocationOverviewCard__cardColumn--2ALwF"><h6>Placering og kontaktoplysninger</h6><span><div><span data-test-target="staticMapSnapshot" class=""><img class="restaurants-detail-overview-cards-LocationOverviewCard__mapImage--22-Al" src="https://trip-raster.citymaps.io/staticmap?scale=1&zoom=15&size=347x137&language=da&center=55.687988,12.596316&markers=icon:http%3A%2F%2Fc1.tacdn.com%2F%2Fimg2%2Fmaps%2Ficons%2Fcomponent_map_pins_v1%2FR_Pin_Small.png|55.68799,12.596316"></span></div></span>
I want the code to return: (a sub-string from src)
55.68799,12.596316
I have tried:
import pandas as pd
pd.options.display.max_colwidth = 200
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import re
web_url = "https://www.tripadvisor.dk/Restaurant_Review-g189541-d15804886-Reviews-The_Pescatarian-Copenhagen_Zealand.html"
# Fetch the page and read the raw HTML; the with-block closes the response.
with urlopen(web_url) as url:
    url_html = url.read()
soup = bs(url_html, 'lxml')
# Print the src of every <img> tag (link.get gives None for tags without one).
# The original also ran a bare soup.find_all('img') whose result was discarded;
# that call has been dropped.
for link in soup.find_all('img'):
    print(link.get('src'))
the return is along the lines of this BUT NOT the src that I need :
https://static.tacdn.com/img2/branding/rebrand/TA_logo_secondary.svg
https://static.tacdn.com/img2/branding/rebrand/TA_logo_primary.svg
https://static.tacdn.com/img2/branding/rebrand/TA_logo_secondary.svg


You can do this with just requests and re. It is only the co-ordinates part of the src which are the location based variable.
import requests, re
# Pull the "coords" values out of the raw page text and rebuild the static-map URL.
p = re.compile(r'"coords":"(.*?)"')
r = requests.get('https://www.tripadvisor.dk/Restaurant_Review-g189541-d15804886-Reviews-The_Pescatarian-Copenhagen_Zealand.html')
# Uses the second match; this raises IndexError if the page has fewer than two.
coords = p.findall(r.text)[1]
# "&center=" had been mis-decoded to "¢er=" in the URL; restored here.
src = f'https://trip-raster.citymaps.io/staticmap?scale=1&zoom=15&size=347x137&language=da&center={coords}&markers=icon:http://c1.tacdn.com//img2/maps/icons/component_map_pins_v1/R_Pin_Small.png|{coords}'
print(src)
print(coords)
Selenium is a workaround; I tested it and it works like a charm. Here you are:
from selenium import webdriver
# Load the page in a real browser, then read the src of every element that has one.
driver = webdriver.Chrome('chromedriver.exe')
try:
    driver.get("https://www.tripadvisor.dk/Restaurant_Review-g189541-d15804886-Reviews-The_Pescatarian-Copenhagen_Zealand.html")
    # XPath attribute tests are written with "@"; the "#src" in the garbled
    # original is not valid XPath.
    links = driver.find_elements_by_xpath("//*[@src]")
    urls = []
    for link in links:
        url = link.get_attribute('src')
        if '|' in url:
            urls.append(url.split('|')[1])  # saves in a list only the numbers you want i.e. 55.68799,12.596316
            print(url)
    print(urls)
finally:
    # Shut the browser down even if the lookup above fails.
    driver.quit()
Result of above
['55.68799,12.596316']
If you haven't used selenium before here you can find a webdriver https://chromedriver.storage.googleapis.com/index.html?path=2.46/
or here
https://sites.google.com/a/chromium.org/chromedriver/downloads
I am trying to count the images (extensions .jpg, .png, .jpeg) linked in a page using Python. I can use any library, such as BeautifulSoup, but how do I do it?
I am using following code :
from bs4 import BeautifulSoup
# Parse the saved page and report how many results find_all('.jpg') yields.
page_file = open('HTMLS%5C110k_Source.htm')
soup = BeautifulSoup(page_file, "html.parser")
# NOTE(review): a string passed to find_all is presumably matched against tag
# names, not file extensions -- confirm against the bs4 documentation.
matches = soup.find_all('.jpg')
img_links = len(matches)
print("Number of Images : ", img_links)
But all in vain.
You can try to use lxml.html as below:
from lxml import html
# Read the saved page and parse it into an lxml element tree.
with open('HTMLS%5C110k_Source.htm', 'r') as f:
    source = html.fromstring(f.read())
# Count <img> elements whose src contains one of the extensions. XPath refers
# to attributes with "@"; the garbled original had "#src".
print(len(source.xpath('//img[contains(@src, ".jpg") or contains(@src, ".jpeg") or contains(@src, ".png")]')))
This is as easy as writing a loop if you read the docs
import bs4
import requests
url = 'somefoobar.net'
page = requests.get(url).text
soup = bs4.BeautifulSoup(page, 'lxml')
images = soup.findAll('img')
# Extensions to keep; this name was used but never defined in the original.
file_types = ('jpg', 'jpeg', 'png')
# loop through all img elements found and store the urls with matching extensions
# x.get('src', '') tolerates <img> tags without a src attribute, and the src
# string itself is stored so the list holds urls rather than tags.
urls = [
    x['src']
    for x in images
    if x.get('src', '').rsplit('.', 1)[-1].lower() in file_types
]
print(urls)
print(len(urls))
I'm trying to write a Python script to download images from any website. It is working, but inconsistently. Specifically, find_all("img") is not doing so for the second url. The script is:
# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
def url_to_image(url, filename):
    """Download the image at ``url`` and save it to ``filename`` with PIL."""
    # Binary response handling follows
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    payload = BytesIO(response.content)
    Image.open(payload).save(filename)
# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    # img.get avoids a KeyError on <img> tags that have no src attribute.
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(str(src))
    print(str(img))
# Number the saved files 0, 1, 2, ... in the order the URLs were found.
for jpeg_no, url in enumerate(urls):
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
The images are rendered with JavaScript on the page that is failing.
First render the page with dryscrape
(If you don't want to use dryscrape see Web-scraping JavaScript page with Python )
e.g.
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
def url_to_image(url, filename):
    """Download the image at ``url`` and save it to ``filename`` with PIL."""
    # Binary response handling follows
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    payload = BytesIO(response.content)
    Image.open(payload).save(filename)
# open page, get HTML request and parse with BeautifulSoup
session = dryscrape.Session()
session.visit("http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/")
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    # img.get avoids a KeyError on <img> tags that have no src attribute.
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(str(src))
    print(str(img))
# Number the saved files 0, 1, 2, ... in the order the URLs were found.
for jpeg_no, url in enumerate(urls):
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")
But I would also check that you have an absolute URL not a relative one:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin
def url_to_image(url, filename):
    """Download the image at ``url`` and save it to ``filename`` with PIL."""
    # Binary response handling follows
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    response = requests.get(url)
    payload = BytesIO(response.content)
    Image.open(payload).save(filename)
# open page, get HTML request and parse with BeautifulSoup
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")
# find all JPEGS in our soup and write their "src" attribute to array
urls = []
for img in soup.find_all("img"):
    # img.get avoids a KeyError on <img> tags that have no src attribute.
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(str(src))
    print(str(img))
# Number the saved files 0, 1, 2, ... in the order the URLs were found.
for jpeg_no, url in enumerate(urls):
    # urljoin leaves an already-absolute URL unchanged and resolves a relative
    # one against base, so no separate startswith('http') branch is needed.
    absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")