Related
I am trying to make a program that searches for images, but the function returns small images. How can I fix this? (For example, if the original image is 1920x1080, it is reduced to something like 192x108.)
def search_image(key):
    """Return the ``src`` URL of a random image from a Google Images search.

    Downloads the Google Images result page for ``key``, collects the
    ``<img>`` tags whose ``src`` matches gstatic.com, and picks one of the
    first ten uniformly at random.

    key -> search phrase (may contain spaces or other special characters)

    Returns the ``src`` attribute of the chosen tag, or None when the page
    contains no matching image.
    """
    # Function-scope imports keep this block self-contained without touching
    # the file-level import list.
    from random import choice
    from urllib.parse import quote_plus

    # quote_plus escapes every reserved character (not only spaces), so a
    # query containing '&', '#' or '+' no longer corrupts the URL.
    url = ("https://www.google.com/search?tbm=isch&q=" + quote_plus(key)
           + "&aqs=chrome..69i57j69i64j69i60l3.400j0j1&sourceid=chrome&ie=UTF-8")
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = Request(url, headers=headers)
    # Close the HTTP response as soon as the HTML has been parsed.
    with urlopen(req) as page:
        bs = BeautifulSoup(page, 'html.parser')
    images = bs.find_all('img', {'src':re.compile('.*gstatic.com.*')})
    # The old randint/countdown loop could finish without returning anything,
    # or call randint with a negative bound; choose directly instead.
    candidates = images[:10]
    if not candidates:
        return None
    return choice(candidates)['src']
You can resize the image with the help of an image library.
# Import the module under its own name so that Image.ANTIALIAS resolves
# (legacy PIL spelling; with Pillow this is `from PIL import Image`).
import Image
# NOTE(review): `image` is presumably an already-opened image object and
# `size` a (width, height) tuple -- neither is defined in this snippet.
image.thumbnail(size, Image.ANTIALIAS)
I've created a script to get the titles of different shops from some identical webpages. The script is working fine.
I'm now trying to add logic to the script so that it retries a few times if it somehow fails to grab the titles from those pages.
As a test, if I deliberately break the selector, as in name = soup.select_one(".sales-info > h").text, the script keeps retrying indefinitely.
I've tried so far with:
import requests
from bs4 import BeautifulSoup
links = (
'https://www.yellowpages.com/san-francisco-ca/mip/nizarios-pizza-481135933',
'https://www.yellowpages.com/nationwide/mip/credo-452182701'
)
def get_title(s,link):
    """Fetch ``link`` through session ``s`` and return the text of the
    ``.sales-info > h1`` element.

    If extracting the title raises any exception, the function prints
    "trying again" and calls itself with the same arguments; there is no
    limit on the number of retries.
    """
    r = s.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    try:
        name = soup.select_one(".sales-info > h1").text
    except Exception:
        print("trying again")
        return get_title(s,link) #I wish to bring about any change here to let the script try few times other than trying indefinitely
    return name
if __name__ == '__main__':
    # One session is shared by all requests, so the header set below is
    # sent with every one of them.
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        # Print the title extracted from each page in turn.
        for link in links:
            print(get_title(s,link))
How can I make the script retry a few times when it fails to grab the title from a webpage?
PS The webpages that I've used within the script are placeholders.
I added some parameters to specify number of retries, sleep between retries and default value to return if everything fails:
import time
import requests
from bs4 import BeautifulSoup
links = (
'https://www.webscraper.io/test-sites/e-commerce/allinone',
'https://www.webscraper.io/test-sites/e-commerce/static'
)
def get_title(s, link, retries=3, sleep=1, default=''):
    """
    Fetch ``link`` and return the text of its first "h8" element.

    s -> session
    link -> url
    retries -> maximum number of attempts before returning the default value
    sleep -> sleep between tries (in seconds)
    default -> default value to return if every retry fails
    """
    # NOTE(review): "h8" looks like a deliberately non-matching selector used
    # to demonstrate the retry path -- replace it with the real selector.
    for current_retry in range(retries):
        try:
            r = s.get(link)
            soup = BeautifulSoup(r.text, "lxml")
            # select_one() returns None when nothing matches, so .text raises
            # AttributeError and the attempt counts as failed.
            # Returning here ends the loop as soon as a title is found.
            return soup.select_one("h8").text
        except Exception:
            # Broad on purpose: request errors and missing elements are both
            # treated as "try again".
            print("Retry {}/{}".format(current_retry + 1, retries))
            # Sleeping after the final attempt would only delay the caller.
            if current_retry + 1 < retries:
                time.sleep(sleep)
    return default
if __name__ == '__main__':
    # One session is shared by all requests, so the header set below is
    # sent with every one of them.
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        for link in links:
            # Positional arguments: retries=3, sleep=1, then the fallback text.
            print(get_title(s, link, 3, 1, 'Failed to grab {}'.format(link)))
Prints:
Retry 1/3
Retry 2/3
Retry 3/3
Failed to grab https://www.webscraper.io/test-sites/e-commerce/allinone
Retry 1/3
Retry 2/3
Retry 3/3
Failed to grab https://www.webscraper.io/test-sites/e-commerce/static
I think the simplest way would be to switch from recursion to a loop:
def get_title(s,link):
    """Return the ``.sales-info > h1`` text of ``link``.

    Makes up to five attempts; any exception counts as a failed attempt.
    After the fifth failure it prints a message and returns None.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            response = s.get(link)
            document = BeautifulSoup(response.text,"lxml")
            return document.select_one(".sales-info > h1").text
        except Exception: # Best to specify which one, by the way
            continue
    print('Failed too many times')
    return None
You can use any retrying library, such as tenacity or backoff. Note that these libraries usually work as decorators, so you only need to import the library and decorate your function, along these lines:
import requests
from bs4 import BeautifulSoup
from tenacity import retry ###or import backoff
...
# Decorators need the leading '@'; written as a '#' comment the retry
# wrapper would never be applied.
@retry ###or @backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def get_title(s, link, retries=3, sleep=1, default=''):
    ...
You can achieve the same in different ways. Here is another you might wanna consider trying:
import time
import requests
from bs4 import BeautifulSoup
links = [
"https://www.yellowpages.com/san-francisco-ca/mip/nizarios-pizza-481135933",
"https://www.yellowpages.com/nationwide/mip/credo-452182701"
]
def get_title(s,link,counter=0):
    """Return the text matched by ``.sales-info > h`` on ``link``.

    Every failed extraction is followed by a one-second pause and a progress
    message, then another fetch. Once ``counter`` has passed 3 the function
    gives up and returns None.
    """
    while True:
        page = s.get(link)
        parsed = BeautifulSoup(page.text,"lxml")
        try:
            return parsed.select_one(".sales-info > h").text
        except Exception:
            if counter > 3:
                return None
            time.sleep(1)
            print("done trying {} times".format(counter))
            counter += 1
if __name__ == '__main__':
    # One session is shared by all requests, so the header set below is
    # sent with every one of them.
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'
        # `counter` is left at its default of 0 for every link.
        for link in links:
            print(get_title(s,link))
I'm using Stack Overflow for the first time, trying to figure out how to scrape Yelp data, and I'm having a hard time. I have set up lxml, Beautiful Soup, requests, pip and Python and have added these to the path in the system variables, yet I still get the error below when I try to run the code below. Any suggestions?
File "test2.py", line 4, in <module>
from exceptions import ValueError
ModuleNotFoundError: No module named 'exceptions'
from lxml import html
import json
import requests
from exceptions import ValueError
import re, urllib
import urllib3
import argparse
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import time
from concurrent.futures import ThreadPoolExecutor
import sys
from threading import Thread
import os
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
#[#'https://www.yelp.com/biz/kdb-kitchen-den-bar-long-beach',
yelp_urls =['https://www.yelp.com/biz/the-atlas-room-washington','https://www.yelp.com/biz/the-rack-brandon','https://www.yelp.com/biz/payard-p%C3%A2tisserie-and-bistro-new-york-2','https://www.yelp.com/biz/maison-giraud-pacific-palisades','https://www.yelp.com/biz/saltbox-san-diego','https://www.yelp.com/biz/carmichaels-chicago-steak-house-chicago','https://www.yelp.com/biz/black-eyed-pea-restaurant-houston-6','https://www.yelp.com/biz/perfecto-mundo-latin-fusion-bistro-commack','https://www.yelp.com/biz/smittys-bbq-boyd','https://www.yelp.com/biz/reston-kabob-reston','https://www.yelp.com/biz/bookmark-cafe-largo','https://www.yelp.com/biz/the-tin-angel-pittsburgh','https://www.yelp.com/biz/briantos-original-hoagies-orlando','https://www.yelp.com/biz/freeway-diner-woodbury','https://www.yelp.com/biz/river-gods-cambridge','https://www.yelp.com/biz/golan-kosher-restaurant-north-hollywood-2','https://www.yelp.com/biz/city-hall-restaurant-new-york-2','https://www.yelp.com/biz/empire-pizza-and-grill-west-chester','https://www.yelp.com/biz/cityzen-washington-2','https://www.yelp.com/biz/three-degrees-los-gatos','https://www.yelp.com/biz/applebees-grill-bar-quakertown','https://www.yelp.com/biz/johnny-carinos-covina','https://www.yelp.com/biz/buffet-de-la-gare-hastings-hdsn','https://www.yelp.com/biz/continental-food-management-la-mirada','https://www.yelp.com/biz/elephant-bar-restaurant-peoria','https://www.yelp.com/biz/sullivans-steakhouse-denver','https://www.yelp.com/biz/yucatan-liquid-stand-coppell','https://www.yelp.com/biz/tomato-pie-morristown','https://www.yelp.com/biz/willett-house-port-chester','https://www.yelp.com/biz/thai-corner-san-antonio-2','https://www.yelp.com/biz/silkes-american-grill-mesa','https://www.yelp.com/biz/t-mex-cantina-fort-lauderdale-2','https://www.yelp.com/biz/casa-oaxaca-washington','https://www.yelp.com/biz/wings-on-wheels-hebron','https://www.yelp.com/biz/siris-thai-french-cuisine-cherry-hill','https://www.yelp.com/biz/nightwood-chicag
o','https://www.yelp.com/biz/cafe-gallery-burlington','https://www.yelp.com/biz/the-hurricane-caf%C3%A9-seattle-2','https://www.yelp.com/biz/231-ellsworth-san-mateo','https://www.yelp.com/biz/la-marmite-williston-park','https://www.yelp.com/biz/the-river-house-palm-beach-gardens-2','https://www.yelp.com/biz/langermanns-baltimore','https://www.yelp.com/biz/del-friscos-grille-phoenix','https://www.yelp.com/biz/carrows-family-restaurant-antioch','https://www.yelp.com/biz/minerva-fine-indian-herndon-va-herndon-5','https://www.yelp.com/biz/the-mason-bar-dallas','https://www.yelp.com/biz/la-cote-cafe-and-wine-bar-seattle','https://www.yelp.com/biz/vareli-new-york','https://www.yelp.com/biz/wendys-wixom','https://www.yelp.com/biz/lanterna-tuscan-bistro-nyack','https://www.yelp.com/biz/yo-taco-duxbury','https://www.yelp.com/biz/bombay-palace-new-york','https://www.yelp.com/biz/cafe-buonaros-naperville','https://www.yelp.com/biz/ponti-seafood-grill-seattle-3','https://www.yelp.com/biz/bill-johnsons-big-apple-restaurants-phoenix-5','https://www.yelp.com/biz/by-word-of-mouth-oakland-park','https://www.yelp.com/biz/anna-maries-pizza-and-restaurant-wharton','https://www.yelp.com/biz/dierdorf-and-harts-steakhouse-saint-louis','https://www.yelp.com/biz/wine-5-cafe-las-vegas','https://www.yelp.com/biz/ernies-restaurant-plymouth','https://www.yelp.com/biz/next-door-pizza-and-pub-lees-summit','https://www.yelp.com/biz/lannys-alta-cocina-mexicana-fort-worth','https://www.yelp.com/biz/jalisco-mexican-restaurant-eastlake','https://www.yelp.com/biz/clio-boston','https://www.yelp.com/biz/uncommon-grounds-aliquippa','https://www.yelp.com/biz/uozumi-restaurant-palmdale','https://www.yelp.com/biz/enzos-pizza-matawan','https://www.yelp.com/biz/the-pointe-cafe-south-san-francisco','https://www.yelp.com/biz/captains-restaurant-and-seafood-market-florida-city','https://www.yelp.com/biz/le-perigord-new-york-4','https://www.yelp.com/biz/i-love-thai-arlington','https://www.yelp.com/biz/bistro-44-be
dford','https://www.yelp.com/biz/ritters-marietta','https://www.yelp.com/biz/rouge-et-blanc-new-york','https://www.yelp.com/biz/assembly-steak-house-and-seafood-grill-englewood-cliffs-2','https://www.yelp.com/biz/american-turkish-restaurant-fort-lauderdale','https://www.yelp.com/biz/r-and-r-bar-b-que-and-catering-service-missouri-2','https://www.yelp.com/biz/sushi-land-long-beach','https://www.yelp.com/biz/longshots-sports-bar-waretown','https://www.yelp.com/biz/salt-creek-barbeque-glendale-heights','https://www.yelp.com/biz/pizza-market-breese','https://www.yelp.com/biz/john-qs-steakhouse-cleveland','https://www.yelp.com/biz/bistro-n-boca-raton-2','https://www.yelp.com/biz/samanthas-restaurant-silver-spring-2','https://www.yelp.com/biz/baha-brothers-sandbar-grill-taunton-3','https://www.yelp.com/biz/cafe-cortina-farmington-hills-5','https://www.yelp.com/biz/big-beaver-tavern-troy','https://www.yelp.com/biz/hogans-restaurant-bloomfield-hills','https://www.yelp.com/biz/the-copper-monkey-beaverton','https://www.yelp.com/biz/clement-street-bar-and-grill-san-francisco','https://www.yelp.com/biz/pepin-scottsdale','https://www.yelp.com/biz/village-belle-philadelphia','https://www.yelp.com/biz/sweet-woodruff-san-francisco','https://www.yelp.com/biz/siam-marina-tinley-park','https://www.yelp.com/biz/luigis-italian-restaurant-centennial-2','https://www.yelp.com/biz/smokin-wills-barbecue-roselle','https://www.yelp.com/biz/voltaire-restaurant-scottsdale','https://www.yelp.com/biz/jus-cookins-restaurant-lakewood-2','https://www.yelp.com/biz/pegs-countryside-cafe-hamel','https://www.yelp.com/biz/rays-grill-fulshear','https://www.yelp.com/biz/cafe-zalute-rosemont','https://www.yelp.com/biz/guard-house-inn-gladwyne','https://www.yelp.com/biz/road-runner-grand-canyon-las-vegas-2','https://www.yelp.com/biz/garage-restaurant-and-cafe-new-york','https://www.yelp.com/biz/los-tapatios-cedar-hill','https://www.yelp.com/biz/chengdu-46-clifton','https://www.yelp.com/biz/moby-dick-house-of-
kabob-fairfax','https://www.yelp.com/biz/natures-food-patch-clearwater','https://www.yelp.com/biz/taco-del-mar-hillsboro-3','https://www.yelp.com/biz/ms-tootsies-rbl-philadelphia','https://www.yelp.com/biz/the-big-c-athletic-club-concord','https://www.yelp.com/biz/west-hanover-pizzeria-hanover','https://www.yelp.com/biz/georges-pastaria-houston','https://www.yelp.com/biz/encuentro-oakland-3','https://www.yelp.com/biz/smokys-bbq-eldersburg','https://www.yelp.com/biz/ruby-tuesday-san-antonio','https://www.yelp.com/biz/saladworks-philadelphia-4','https://www.yelp.com/biz/captain-pizza-middleton','https://www.yelp.com/biz/bob-evans-fredericksburg-3','https://www.yelp.com/biz/frittata-clawson','https://www.yelp.com/biz/the-sandwich-spot-palm-springs','https://www.yelp.com/biz/freds-mexican-cafe-san-diego-4','https://www.yelp.com/biz/geordies-steak-phoenix-2','https://www.yelp.com/biz/five-guys-wayne-5','https://www.yelp.com/biz/zen-sushi-la-crescenta-2','https://www.yelp.com/biz/the-summit-steakhouse-aurora-2','https://www.yelp.com/biz/miramar-bistro-highwood','https://www.yelp.com/biz/mick-o-sheas-baltimore','https://www.yelp.com/biz/dennys-houston-30','https://www.yelp.com/biz/carls-jr-henderson-5','https://www.yelp.com/biz/mexican-town-restaurant-detroit','https://www.yelp.com/biz/sushi-roku-las-vegas','https://www.yelp.com/biz/giant-pizza-king-san-diego','https://www.yelp.com/biz/quiznos-brooklyn-6','https://www.yelp.com/biz/taco-bell-glen-ellyn','https://www.yelp.com/biz/las-tortas-locas-marietta','https://www.yelp.com/biz/smith-and-wollensky-las-vegas-2','https://www.yelp.com/biz/happy-garden-chinese-brighton','https://www.yelp.com/biz/urban-foodie-feed-store-college-park','https://www.yelp.com/biz/the-wolf-oakland','https://www.yelp.com/biz/scuzzis-italian-restaurant-san-antonio-4','https://www.yelp.com/biz/better-gourmet-health-kitchen-staten-island','https://www.yelp.com/biz/the-restaurant-and-cafe-warren','https://www.yelp.com/biz/mcdonalds-houston-214','https:
//www.yelp.com/biz/pyeong-chang-tofu-house-oakland','https://www.yelp.com/biz/maria-rosa-pizzeria-and-family-restaurant-flemington','https://www.yelp.com/biz/legends-sports-bar-and-grill-roseville-2','https://www.yelp.com/biz/villa-reale-pizzeria-and-restaurant-pittsburgh','https://www.yelp.com/biz/the-terrace-cafe-venice','https://www.yelp.com/biz/the-oval-room-washington-2','https://www.yelp.com/biz/high-point-coal-center','https://www.yelp.com/biz/j-and-s-montebello','https://www.yelp.com/biz/cheers-restaurant-and-bar-fort-lauderdale']
def _rows_to_dicts(rows, key_xpath, value_xpath):
    """Turn each lxml row element into a one-entry {key text: value text} dict."""
    return [
        {''.join(row.xpath(key_xpath)).strip(): ''.join(row.xpath(value_xpath)).strip()}
        for row in rows
    ]


def parse_page(url):
    """Scrape the business details from a Yelp business page.

    url -> address of the business page to download,
           e.g. "https://www.yelp.com/biz/frances-san-francisco"

    Returns a dict with the keys 'working_hours', 'info',
    'ratings_histogram', 'name', 'phone', 'ratings', 'address',
    'health_rating', 'price_range', 'claimed_status', 'reviews', 'category',
    'website', 'latitude', 'longitude' and 'url'. Values that cannot be
    found on the page are empty strings or lists ('ratings' is 0).
    """
    # urllib.unquote exists only on Python 2; prefer the Python 3 location.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(url, headers=headers, verify=False).text
    parser = html.fromstring(response)

    # XPath selects attributes with '@'; with '#' these expressions are invalid.
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")

    # The three tables share one shape: a key cell and a value cell per row.
    working_hours = _rows_to_dicts(hours_table, ".//th//text()", "./td//text()")
    info = _rows_to_dicts(details_table, './/dt//text()', './/dd//text()')
    ratings_histogram = _rows_to_dicts(
        rating_histogram, ".//th//text()", ".//td[@class='histogram_count']//text()")

    name = ''.join(raw_name).strip()
    phone = ''.join(raw_phone).strip()
    # Collapse every run of whitespace in the address into a single space.
    address = ' '.join(' '.join(raw_address).split())
    health_rating = ''.join(raw_health_rating).strip()
    price_range = ''.join(raw_price_range).strip()
    claimed_status = ''.join(raw_claimed).strip()
    reviews = ''.join(raw_reviews).strip()
    category = ','.join(raw_category)
    cleaned_ratings = ''.join(raw_ratings).strip()

    # Each findall() result is checked before indexing so that a link in an
    # unexpected format yields an empty value instead of an IndexError.
    website = ''
    if raw_wbsite_link:
        decoded_raw_website_link = unquote(raw_wbsite_link[0])
        found = re.findall(r"biz_redir\?url=(.*)&website_link", decoded_raw_website_link)
        if found:
            website = found[0]

    latitude = ''
    longitude = ''
    if raw_map_link:
        decoded_map_url = unquote(raw_map_link[0])
        # Both decimal points are escaped so '.' no longer matches any character.
        found = re.findall(r"center=([+-]?\d+\.\d+,[+-]?\d+\.\d+)", decoded_map_url)
        if found:
            latitude, longitude = found[0].split(',')

    # The fractional part is optional so a single-digit rating still matches.
    found = re.findall(r"\d+(?:[.,]\d+)?", cleaned_ratings)
    ratings = found[0] if found else 0

    data={'working_hours':working_hours,
        'info':info,
        'ratings_histogram':ratings_histogram,
        'name':name,
        'phone':phone,
        'ratings':ratings,
        'address':address,
        'health_rating':health_rating,
        'price_range':price_range,
        'claimed_status':claimed_status,
        'reviews':reviews,
        'category':category,
        'website':website,
        'latitude':latitude,
        'longitude':longitude,
        'url':url,
    }
    return data
def parse_reviews(url):
    """Download one page of reviews and return them as a list.

    url -> address of the (paginated) review page

    Returns a list of [date, rating, content] lists, one per review block
    that contains all three pieces. The number of collected reviews is
    printed before returning.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0.'}
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(url, headers=headers, verify=False).text
    parser = html.fromstring(response)
    ratings_zipped = []
    # XPath selects attributes with '@'; with '#' these expressions are invalid.
    reviews = parser.xpath("//div[contains(@class,'main-section')]//div[contains(@class,'review-list')]//div[contains(@class,'review')]//div[contains(@class,'review-content')]")
    for r in reviews:
        dates = r.xpath("./div[contains(@class,'biz-rating')]//span[contains(@class,'rating-qualifier')]/text()")
        stars = r.xpath("./div[contains(@class,'biz-rating')]//div[contains(@class,'rating-large')]/@title")
        paragraphs = r.xpath("./p")
        # Skip blocks with incomplete markup instead of failing the whole page.
        if not (dates and stars and paragraphs):
            continue
        ratings_zipped.append([dates[0].strip(), stars[0], paragraphs[0].text_content()])
    print (len(ratings_zipped))
    return ratings_zipped
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items."""
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)
def parse_pagination(url):
    """Return the upper bound of the review offsets for a business page.

    url -> address of the business page

    The last space-separated token of the 'page-of-pages' text is read as an
    integer and multiplied by 20 (presumably the number of reviews per page
    -- confirm). Falls back to 20 when the element is missing or its text
    does not end in a number. The URL, the response object and the result
    are printed along the way.
    """
    print (url)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate checking.
    response = requests.get(url, headers=headers, verify=False)
    print (response)
    parser = html.fromstring(response.text)
    try:
        # XPath selects attributes with '@'; with '#' the expression is invalid.
        page_text = parser.xpath("//div[contains(@class,'page-of-pages')]//text()")[0]
        results = int(page_text.strip().split(' ').pop()) * 20
    except (IndexError, ValueError):
        # IndexError: element not found; ValueError: last token is not a number.
        results = 20
    print (results)
    return results
def get_businesses_data(data):
    """Scrape the details and all reviews for every business URL in ``data``.

    data -> iterable of business page urls

    Returns a list with one dict per url, holding the keys 'url', 'info'
    (the parse_page result), 'scraped_reviews' and 'scraped_reviews_count'.
    Progress and the total elapsed time are printed.
    """
    businesses = []
    start_time = time.time()
    for i, url in enumerate(data):
        print ('Starting iteration: ', i)
        pagination = parse_pagination(url)
        print ('Pagination: ', pagination)
        info = parse_page(url)
        _reviews = []
        # Drop any existing query string before adding the start offset.
        base_url = url.split('?')[0]
        # range() works on Python 2 and 3 (xrange is Python 2 only).
        for v in range(0, pagination, 20):
            paginated_url = base_url + '?start='+str(v)
            print ('Scraping Reviews: ', paginated_url)
            _reviews += parse_reviews(paginated_url)
            # Short pause between consecutive page requests.
            time.sleep(.5)
        # Build a fresh dict per business; reusing a single dict made every
        # list entry point at the data of the last business scraped.
        businesses.append({
            'url': url,
            'info': info,
            'scraped_reviews': _reviews,
            'scraped_reviews_count': len(_reviews),
        })
        print ('Success iteration: ', i)
        print ('Num of reviews: ', str(len(_reviews)))
        print('')
    print ('Time Elapsed: ', str(time.time() - start_time))
    return businesses
if __name__=="__main__":
    # Each run scrapes one slice of yelp_urls; `index` selects which slice.
    index = 5
    #0
    size = 20
    # Starting offset of the slice (also used in the output file name).
    i = index*20
    chunk = yelp_urls[i:i+size]
    businesses = get_businesses_data(chunk)
    # Write the scraped businesses of this slice as one JSON document.
    with open ('results/run_3/output_{}.json'.format(i), 'w') as f:
        json.dump(businesses,f)
'''
from exceptions import ValueError
You don't need to do that at all: ValueError is one of the built-in exceptions, and you never use it in your code anyway. The exceptions module only existed in Python 2, so on Python 3 simply delete that import line.
I want to fetch more than 100 high-resolution images from Google, using Python 2.7 + Selenium + PhantomJS.
But when I follow the instructions I found, I only get a web page with small images on it, and I can't find any direct link to the high-resolution pictures. How can I fix this?
My code is as below.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
class ImgCrawler:
def __init__(self,searchlink = None):
self.link = searchlink
self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
self.scrolldown = None
self.jsdriver = None
def getPhantomJSDriver(self):
self.jsdriver = webdriver.PhantomJS()
self.jsdriver.get(self.link)
def scrollDownUsePhatomJS(self, scrolltimes = 1, sleeptime = 10):
for i in range(scrolltimes):
self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
time.sleep(sleeptime)
def getSoup(self, parser=None):
print 'a', self.jsdriver.page_source
return BeautifulSoup(self.jsdriver.page_source, parser)
def getActualUrl(self, soup=None, flag=None, classflag=None, jsonflaglink=None, jsonflagtype=None):
actualurl = []
for a in soup.find_all(flag, {"class": classflag}):
link = json.loads(a.text)[jsonflaglink]
filetype = json.loads(a.text)[jsonflagtype]
detailurl = link + u'.' + filetype
actualurl.append(detailurl)
return actualurl
if __name__ == '__main__':
    # Base search URL; the query words are appended after "q=".
    search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
    queryword = raw_input()
    # Join the whitespace-separated words of the input with '+'.
    query = queryword.split()
    query = '+'.join(query)
    weblink = search_url + query
    img = ImgCrawler(weblink)
    img.getPhantomJSDriver()
    # Scroll down twice, waiting 5 seconds after each scroll.
    img.scrollDownUsePhatomJS(2,5)
    soup = img.getSoup('html.parser')
    print weblink
    print soup
    # Look in <div class="rg_meta"> elements; 'ou' and 'ity' are the JSON keys
    # passed as the link and file-type fields.
    actualurllist = img.getActualUrl(soup,'div','rg_meta','ou','ity')
    print len(actualurllist)
I tried for a long time to use PhantomJS but could not get it to work, so I ended up using Chrome. I know that is not what you asked for, but it works.
First get a driver from https://sites.google.com/a/chromium.org/chromedriver/downloads. If you are on Windows, you can use a headless version of Chrome ("Chrome Canary").
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import urlparse
class ImgCrawler:
def __init__(self,searchlink = None):
self.link = searchlink
self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
self.scrolldown = None
self.jsdriver = None
def getPhantomJSDriver(self):
self.jsdriver = webdriver.Chrome()
self.jsdriver.get(self.link)
def scrollDownUsePhatomJS(self, scrolltimes = 1, sleeptime = 10):
for i in range(scrolltimes):
self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
time.sleep(sleeptime)
def getSoup(self, parser=None):
print 'a', self.jsdriver.page_source
return BeautifulSoup(self.jsdriver.page_source, parser)
def getActualUrl(self, soup=None):
actualurl = []
r = re.compile(r"/imgres\?imgurl=")
for a in soup.find_all('a', href=r):
parsed = urlparse.urlparse(a['href'])
url = urlparse.parse_qs(parsed.query)['imgurl']
actualurl.append(url)
print url
return actualurl
if __name__ == '__main__':
search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
queryword = raw_input()
query = queryword.split()
query = '+'.join(query)
weblink = search_url + query
img = ImgCrawler(weblink)
img.getPhantomJSDriver()
img.scrollDownUsePhatomJS(2,5)
soup = img.getSoup('html.parser')
print weblink
print soup
actualurllist = img.getActualUrl(soup)
print len(actualurllist)
I changed getActualUrl() to take the image URL from each "a" element whose "href" attribute starts with "/imgres?imgurl=".
Output (when "hazard" is typed into the terminal):
[u'https://static.independent.co.uk/s3fs-public/styles/article_small/public/thumbnails/image/2016/12/26/16/eden-hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/EdenHazardDecember_2016.jpg/200px-EdenHazardDecember_2016.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2016/1227/r166293_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://platform-static-files.s3.amazonaws.com/premierleague/photos/players/250x250/p42786.png']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Eden_Hazard_-_DK-Chel15_%286%29.jpg/150px-Eden_Hazard_-_DK-Chel15_%286%29.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0117/r172004_1296x729_16-9.jpg&w=738&site=espnfc']
[u'http://images.performgroup.com/di/library/GOAL/98/c0/eden-hazard-chelsea_1eohde060wvft1elcskrgihxq3.jpg?t=-1500575837&w=620&h=430']
[u'http://e1.365dm.com/17/03/16-9/20/skysports-eden-hazard-chelsea_3918835.jpg?20170331154242']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0402/r196036_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000316361045.jpg?strip=all&w=670&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/02/10/14/eden-hazard1.jpg']
[u'http://s.newsweek.com/sites/www.newsweek.com/files/2016/11/07/eden-hazard.jpg']
[u'http://www.newstube24.com/wp-content/uploads/2017/06/haz.jpg']
[u'http://images.performgroup.com/di/library/GOAL/17/b1/eden-hazard_68ypnelg3lfd14oxkffztftt6.png?t=-1802977526&w=620&h=430']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/eb/DK-Chel15_%288%29.jpg/220px-DK-Chel15_%288%29.jpg']
[u'http://images.performgroup.com/di/library/omnisport/50/3f/hazard-cropped_3y08vc3ejpua168e9mgvu4mwc.jpg?t=-930203025&w=620&h=430']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000291621611-e1490777105213.jpg?strip=all&w=745&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/01/14/14/Eden-Hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/5/56/Eden_Hazard%2713-14.JPG/150px-Eden_Hazard%2713-14.JPG']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000296311943-e1490777241155.jpg?strip=all&w=596&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/01/27/11/hazard.jpg']
[u'http://www.newzimbabwe.com/FCKEditor_Images/Eden-Hazard-896286.jpg']
[u'http://images.performgroup.com/di/library/GOAL/9c/93/eden-hazard_d4lbib7wdagw1hp2e5gnyov0k.jpg?t=-1763574189&w=620&h=430']
[u'http://www.guoguiyan.com/data/out/94/69914569-hazard-wallpapers.jpg']
[u'http://static.guim.co.uk/sys-images/Football/Pix/pictures/2015/4/16/1429206099512/Eden-Hazard-009.jpg']
[u'https://metrouk2.files.wordpress.com/2017/04/pri_37621532.jpg?w=620&h=406&crop=1']
[u'http://alummata.com/wp-content/uploads/2016/04/Hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Hazard_vs_Norwich_%28crop%29.jpg/150px-Hazard_vs_Norwich_%28crop%29.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/11/06/20/3A185FB800000578-3910886-image-a-46_1478462522592.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/467822629-465742.jpg']
[u'http://i.dailymail.co.uk/i/pix/2015/10/17/18/2D81CB1D00000578-0-image-a-37_1445102645249.jpg']
[u'http://images.performgroup.com/di/library/GOAL_INTERNATIONAL/27/ce/eden-hazard_1kepw6rvweted1hpfmp5xwd5cs.jpg?t=-228379025&w=620&h=430']
[u'http://img.skysports.com/16/12/768x432/skysports-chelsea-manchester-city-eden-hazard_3845204.jpg?20161203162258']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict0003089026162.jpg?strip=all&w=960&quality=100']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2016/05/hazard--rediscovering-our-form.img.png']
[u'http://images.performgroup.com/di/library/omnisport/b5/98/hazard-cropped_172u0n8wx4j071cvgs1n3yycvw.jpg?t=2030908123&w=620&h=430']
[u'http://images.indianexpress.com/2016/05/eden-hazard-m.jpg']
[u'http://i2.mirror.co.uk/incoming/article9755579.ece/ALTERNATES/s615/PAY-Chelsea-v-Arsenal-Premier-League.jpg']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/06/hazard-injury-update.img.png']
[u'http://futhead.cursecdn.com/static/img/fm/17/players/183277_HAZARDCAM7.png']
[u'http://images.performgroup.com/di/library/GOAL/4d/6/eden-hazard-chelsea-06032017_enh1ll3uadj01ocstyopie9e4.jpg?t=-1521106510&w=620&h=430']
[u'http://images.performgroup.com/di/library/GOAL/34/8/eden-hazard-chelsea-southampton_1oca1rpy37gmn1ldmqvytti3k4.jpg?t=-1501721805&w=620&h=430']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-scoring-his-sides-third-goal-during-picture-id617452212?s=612x612']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-889782.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2015/10/19/16/Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000307050894.jpg?strip=all&w=660&quality=100']
[u'http://e1.365dm.com/16/11/16-9/20/skysports-eden-hazard-chelsea-football_3833122.jpg?20161116153005']
[u'http://thumb.besoccer.com/media/img_news/morata-y-hazard--besoccer.jpg']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2017/03/13/21/10-hazard.jpg']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2016/12/27/13/hazard.jpg']
[u'http://images.performgroup.com/di/library/GOAL/63/2a/eden-hazard-chelsea_15ggj1j39rmky1c5a20oxt3tly.jpg?t=1297762370']
[u'http://i1.mirror.co.uk/incoming/article9755531.ece/ALTERNATES/s615b/Chelsea-v-Arsenal-Premier-League.jpg']
[u'http://cf.c.ooyala.com/l2YmpyYjE6yvLSxGiEebNMr3N1ANS1Xc/O0cEsGv5RdudyPNn4xMDoxOjBnO_4SLA']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-scoring-his-sides-third-goal-during-picture-id617452006?s=612x612']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0413/r199412_2_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000318703991-e1493109803795.jpg?strip=all&w=960&quality=100']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2016/11/18/14/hazard-award.jpg']
[u'http://static.goal.com/2477200/2477282_heroa.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/12/16/14/eden-hazard.jpg']
[u'http://www.guoguiyan.com/data/out/94/69979129-hazard-wallpapers.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/11/nintchdbpict0002769424741.jpg?w=960&strip=all']
[u'http://v.uecdn.es/p/110/thumbnail/entry_id/0_ofavjqr8/width/660/cache_st/20170327164629/type/2/bgcolor/000000/0_ofavjqr8.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/02/07/10/edenhazard.jpg']
[u'http://theworldgame.sbs.com.au/sites/sbs.com.au.theworldgame/files/styles/full/public/images/e/d/eden-hazard-cropped_g6m28ldoc0b41p3f5sp2vlflt.jpg?itok=XW5M7QEA']
[u'http://e0.365dm.com/17/03/16-9/20/skysports-eden-hazard-chelsea_3909051.jpg?20170314181126']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Chelsea-Hazard-goals-886084.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000319343894.jpg?strip=all&w=960&quality=100']
[u'https://www.footyrenders.com/render/Eden-Hazard-PL.png']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-as-he-scores-their-first-goal-the-picture-id672946758?s=612x612']
[u'https://pbs.twimg.com/profile_images/791664465729163264/XbCVl6BF.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/Eden_Hazard_2011.jpg/170px-Eden_Hazard_2011.jpg']
[u'http://s.newsweek.com/sites/www.newsweek.com/files/2016/02/01/guus-hiddink-says-eden-hazard-could-leave-chelsea..jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/06/chelsea_hazard_mobile_top.jpg?strip=all&w=750&h=352&crop=1']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/735000/Eden-Hazard-887735.jpg']
[u'http://i.telegraph.co.uk/multimedia/archive/03580/Hazard_Real_copy_3580583b.jpg']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2017/05/21/47a1f452-43e4-4215-a5b8-5043c1e12a07/686302908.jpg']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/06/hazard-s-highlights.img.png']
[u'http://i.dailymail.co.uk/i/pix/2016/12/14/15/3B45B4D300000578-4032306-Hazard_PFA_Player_of_the_Year_in_2015_has_rediscovered_his_form_-a-6_1481728291902.jpg']
[u'https://img.rasset.ie/000d5137-800.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-665260.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-659164.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Hazard-scored-Chelsea-s-third-goal-against-Tottenham-909804.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/237000/Eden-Hazard-887237.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/media/motion/ESPNi/2017/0405/int_170405_Hazard_the_successor_to_Ronaldo_at_Real/int_170405_Hazard_the_successor_to_Ronaldo_at_Real.jpg&w=738&site=espnfc']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-721522.jpg']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-with-teammates-after-scoring-his-picture-id633776492?s=612x612']
[u'http://betinmalta.com/wp-content/uploads/2017/05/hazard.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/708000/Eden-Hazard-Chelsea-712708.jpg']
[u'http://images.performgroup.com/di/library/omnisport/c9/4a/eden-hazard-cropped_12u5irb6bkze1cue2wpjzxa44.jpg?t=-2084914038&w=620&h=430']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/06/nintchdbpict0003291256741.jpg?strip=all&w=714&quality=100']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2017/03/10/f97d36aa-1eef-4a78-996f-63d543c79efc/700017169TS004_Eden_Hazard_.JPG']
[u'https://s-media-cache-ak0.pinimg.com/736x/f0/01/17/f001178defb2b3be3cffb5e9b792748b--eden-hazard-liverpool-england.jpg']
[u'http://i4.mirror.co.uk/incoming/article9898829.ece/ALTERNATES/s615b/hazard-2.jpg']
[u'http://images.performgroup.com/di/library/GOAL/24/76/eden-hazard-and-lionel-messi_kamv8simc20x1p2i2fcf7lllw.png?t=421166242&w=620&h=430']
[u'https://metrouk2.files.wordpress.com/2017/03/gettyimages-618471206.jpg?w=748&h=498&crop=1']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Hazard-Chelsea-658138.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/04/23/17/hazard.jpg']
[u'http://e0.365dm.com/15/10/16-9/20/eden-hazard-comparison-chelsea_3365521.jpg?20151018152317']
[u'http://cdn.images.express.co.uk/img/dynamic/galleries/x701/231048.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/galleries/x701/102742.jpg']
[u'https://i.ytimg.com/vi/GWhVkFTe_BY/maxresdefault.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000319343748-e1496260888520.jpg?strip=all&w=960&quality=100']
[u'https://metrouk2.files.wordpress.com/2017/06/689818982.jpg?w=748&h=498&crop=1']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000308902470.jpg?strip=all&w=960&quality=100']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/12/nintchdbpict000289569836.jpg?w=960&strip=all']
[u'https://i.ytimg.com/vi/zZ9stt70_vU/maxresdefault.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/2/2c/Kylian_Hazard_%28cropped%29.jpg']
[u'http://e00-marca.uecdn.es/assets/multimedia/imagenes/2017/03/26/14905092504845.jpg']
[u'http://images.performgroup.com/di/library/omnisport/ba/47/eden-hazard-cropped_rccnpv1me3v51kqpnj5ak4nko.jpg?t=1222186324&w=620&h=430']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-922505.jpg']
[u'https://s-media-cache-ak0.pinimg.com/736x/48/ce/4c/48ce4c478d8b06dccacce352d9d4bdc2--eden-hazard-pogba.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2016/10/23/111897755_Editorial_use_only_No_merchandising_For_Football_images_FA_and_Premier_League_restrict-large_trans_NvBQzQNjv4BqqVzuuqpFlyLIwiB6NTmJwfSVWeZ_vEN7c6bHu2jJnT8.jpg']
[u'https://metrouk2.files.wordpress.com/2017/06/686902184.jpg?w=748&h=652&crop=1']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict0003086723271-e1489501904590.jpg?strip=all&w=960&quality=100']
[u'http://e2.365dm.com/17/01/16-9/20/skysports-chelsea-manchester-city-eden-hazard_3862340.jpg?20170406190414']
[u'http://www.telegraph.co.uk/content/dam/football/2017/05/26/TELEMMGLPICT000129483487-large_trans_NvBQzQNjv4BqajCpFXsei0OXjDFGPZkcdJOkVdu-K0ystYH4SV7DHn8.jpeg']
[u'https://i.ytimg.com/vi/FFE4Ea437ks/maxresdefault.jpg']
[u'https://i1.wp.com/www.vanguardngr.com/wp-content/uploads/2017/03/Hazard-madrid.png?resize=350%2C200']
[u'http://china.chelseafc.com/content/cfc/zh/homepage/teams/first-team/eden-hazard/summary/_jcr_content/tabparmain/box/box/textimage/image.img.jpg/1496846329140.jpg']
[u'http://static.goal.com/198700/198707_news.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000319357531.jpg?strip=all&w=960&quality=100']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-poses-with-the-premier-league-trophy-after-the-picture-id686826640?s=612x612']
[u'http://cf.c.ooyala.com/t3dXdzYjE6VJktcnKdi7F2205I_mSSKQ/eWNh-8akTAF2kj8X4xMDoxOjBnO_4SLA']
[u'http://c.smimg.net/16/39/300x225/eden-hazard.jpg']
[u'http://www.whatfootballersearn.com/wp-content/uploads/Eden-Hazard.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/328000/Eden-Hazard-437328.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-Chelsea-882846.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Chelsea-star-Eden-Hazard-741161.jpg']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201703/hazard_0.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/08/28/21/37A101A700000578-3762573-image-a-19_1472417354943.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2016/07/27/87650659-edenhazard-sport-large_trans_NvBQzQNjv4BqqVzuuqpFlyLIwiB6NTmJwfSVWeZ_vEN7c6bHu2jJnT8.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/165000/620x/Eden-Hazard-598896.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/05/04/21/33C8A26600000578-0-image-a-19_1462392130112.jpg']
[u'https://ichef-1.bbci.co.uk/news/660/cpsprodpb/13AA1/production/_96354508_595836d4-f21a-419b-95cb-37a65204f6eb.jpg']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2016/11/30/1eb421ae-b210-4a01-95bb-36f509826cc1/Debruyne_v_Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000309639487-e1490040388851.jpg?strip=all&w=739&quality=100']
[u'http://static.goal.com/3311200/3311292_heroa.jpg']
[u'http://i3.mirror.co.uk/incoming/article7986781.ece/ALTERNATES/s615b/Hazard-and-son.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2016/0916/r126535_1296x729_16-9.jpg&w=738&site=espnfc']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/03/hazard-score-is-number-one-.img.png']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2015/03/17/13/eden-hazard.jpg']
[u'https://metrouk2.files.wordpress.com/2017/05/680506564.jpg?w=748&h=457&crop=1']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-with-diego-costa-of-chelsea-after-picture-id671559962?s=612x612']
[u'http://e0.365dm.com/17/05/16-9/20/skysports-eden-hazard-chelsea_3965489.jpg?20170529101357']
[u'https://s-media-cache-ak0.pinimg.com/736x/e0/80/0e/e0800e380ef363594fb292969b7c5b64--eden-hazard-chelsea-soccer.jpg']
[u'http://cdn-football365.365.co.za/content/uploads/2016/12/GettyImages.630542828.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/07/16/19/340E4A9A00000578-3693637-image-a-84_1468694248523.jpg']
[u'http://www.squawka.com/news/wp-content/uploads/2017/01/hazard-chelsea-e1485528066609.jpg']
[u'http://www.guoguiyan.com/data/out/94/68880901-hazard-wallpapers.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2017/03/12/JS122962983_EHazDavid-Rose-large_trans_NvBQzQNjv4BqtA9hvt4yaDuJhaJG2frTIUNrh1MdssoHpGF6OIxC49c.jpg']
[u'http://images.performgroup.com/di/library/GOAL/17/25/eden-hazard-chelsea_1dsnlf2z113cx10nxvp9ydudcz.jpg?t=2008335075']
[u'http://www.telegraph.co.uk/content/dam/football/2016/12/05/115182685-eden-hazard-sport-large_trans_NvBQzQNjv4BqA7a2BP2KFPtZUOepzpZgXISdNn8DgVUcalGVREaviFE.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/sport-preview-morata-hazard.jpg?strip=all&quality=100&w=750&h=500&crop=1']
[u'http://thumb.besoccer.com/media/img_news/eden-hazard--futbolista-del-chelsea--chelseafc.jpg']
[u'http://i4.mirror.co.uk/incoming/article7374471.ece/ALTERNATES/s615/Chelsea-Training-Session.jpg']
[u'https://metrouk2.files.wordpress.com/2017/04/671549404.jpg?w=748&h=532&crop=1']
[u'https://metrouk2.files.wordpress.com/2016/02/462363538.jpg?w=748&h=563&crop=1']
[u'https://metrouk2.files.wordpress.com/2017/05/6834221661.jpg?w=748&h=507&crop=1']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Chelsea-star-Eden-Hazard-739447.jpg']
[u'http://cdn.quotesgram.com/img/21/41/114220036-24CA6E4700000578-2916442-Eden_Hazard_has_been_instrumental_for_Chelsea_this_season_as_the-a-7_1421682779132.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/09/29/09/38E4401500000578-3813294-image-a-1_1475138637248.jpg']
[u'http://healthyceleb.com/wp-content/uploads/2016/04/Eden-Hazard-match-between-Chelsea-Milton-Keynes-Dons-January-2016.jpg']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201704/gettyimages-663029916.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/2/29/Thorgan_Hazard_2014.jpg/220px-Thorgan_Hazard_2014.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/91000/620x/Eden-Hazard-632850.jpg']
[u'http://i4.mirror.co.uk/incoming/article7531141.ece/ALTERNATES/s615/A-dejected-looking-Eden-Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000322464448-e1494602676644.jpg?strip=all&w=960&quality=100']
[u'http://images.performgroup.com/di/library/GOAL_INTERNATIONAL/76/92/chelsea-bournemouth-eden-hazard_148j4p900kzba159diso6ewwvo.jpg?t=1004329665&w=620&h=430']
[u'https://images.cdn.fourfourtwo.com/sites/fourfourtwo.com/files/styles/inline-image/public/hazard3.jpg?itok=ap0DtuZx']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201707/hazard.jpg']
...
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0330/r195127_1296x729_16-9.jpg&w=738&site=espnfc']
299
I want to download all images from a Google image search using Python. The code I am using sometimes runs into a problem. My code is:
import os
import sys
import time
from urllib import FancyURLopener
import urllib2
import simplejson
# Define search term
searchTerm = "parrot"
# Replace spaces ' ' in search term for '%20' in order to comply with request
searchTerm = searchTerm.replace(' ','%20')
# Start FancyURLopener with defined version
class MyOpener(FancyURLopener):
version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()
# Set count to 0
count= 0
for i in range(0,10):
# Notice that the start changes for each iteration in order to request a new set of images for each loop
url = ('https://ajax.googleapis.com/ajax/services/search/images?' + 'v=1.0& q='+searchTerm+'&start='+str(i*10)+'&userip=MyIP')
print url
request = urllib2.Request(url, None, {'Referer': 'testing'})
response = urllib2.urlopen(request)
# Get results using JSON
results = simplejson.load(response)
data = results['responseData']
dataInfo = data['results']
# Iterate for each result and get unescaped url
for myUrl in dataInfo:
count = count + 1
my_url = myUrl['unescapedUrl']
myopener.retrieve(myUrl['unescapedUrl'],str(count)+'.jpg')
After downloading a few pages, I get the following error:
Traceback (most recent call last):
File "C:\Python27\img_google3.py", line 37, in <module>
dataInfo = data['results']
TypeError: 'NoneType' object has no attribute '__getitem__'
What should I do?
I have modified my code. It can now download 100 images for a given query, and the images are in full resolution; that is, the original images are downloaded.
I am downloading the images using urllib2 and Beautiful Soup.
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import cookielib
import json
def get_soup(url,header):
    """Fetch `url` with the request headers in `header` and return the
    response parsed by BeautifulSoup's 'html.parser'."""
    request = urllib2.Request(url,headers=header)
    response = urllib2.urlopen(request)
    return BeautifulSoup(response,'html.parser')
query = raw_input("query image")# you can change the query for the image here
image_type="ActiOn"
query= query.split()
query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
print url
#add the directory for your image here
DIR="Pictures"
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
}
soup = get_soup(url,header)
ActualImages=[]# contains the link for Large original images, type of image
for a in soup.find_all("div",{"class":"rg_meta"}):
link , Type =json.loads(a.text)["ou"] ,json.loads(a.text)["ity"]
ActualImages.append((link,Type))
print "there are total" , len(ActualImages),"images"
if not os.path.exists(DIR):
os.mkdir(DIR)
DIR = os.path.join(DIR, query.split()[0])
if not os.path.exists(DIR):
os.mkdir(DIR)
###print images
for i , (img , Type) in enumerate( ActualImages):
try:
req = urllib2.Request(img, headers={'User-Agent' : header})
raw_img = urllib2.urlopen(req).read()
cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
print cntr
if len(Type)==0:
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+".jpg"), 'wb')
else :
f = open(os.path.join(DIR , image_type + "_"+ str(cntr)+"."+Type), 'wb')
f.write(raw_img)
f.close()
except Exception as e:
print "could not load : "+img
print e
I hope this helps you.
The Google Image Search API is deprecated; you need to use Google Custom Search for what you want to achieve. To fetch the images, you need to do this:
import urllib2
import simplejson
import cStringIO
# Opener used to issue the search request.
fetcher = urllib2.build_opener()
searchTerm = 'parrot'
# Offset of the first result to return; increase it to page through results.
startIndex = 0
# startIndex is an int, so it must be converted before string concatenation.
searchUrl = "http://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" + searchTerm + "&start=" + str(startIndex)
f = fetcher.open(searchUrl)
# Decode the JSON response body.
deserialized_output = simplejson.load(f)
This will give you 4 results as JSON. To get more, repeat the API request while incrementing startIndex.
To load the images, you need an in-memory file object, which a library like cStringIO provides.
For example, to access the first image, you need to do this:
# Address of the first result's image.
imageUrl = deserialized_output['responseData']['results'][0]['unescapedUrl']
# Download the bytes and wrap them in an in-memory file object.
# urllib2 is the module this snippet imports; plain `urllib` is not in scope.
file = cStringIO.StringIO(urllib2.urlopen(imageUrl).read())
# NOTE(review): `Image` is not imported in this snippet -- presumably PIL's
# Image module; confirm and import it before running.
img = Image.open(file)
Google deprecated their API, scraping Google is complicated, so I would suggest using Bing API instead to automatically download images. The pip package bing-image-downloader allows you to easily download an arbitrary number of images to a directory with a single line of code.
from bing_image_downloader import downloader

# query_string must hold the search phrase; it is not defined in this snippet.
downloader.download(
    query_string,
    limit=100,
    output_dir='dataset',
    adult_filter_off=True,
    force_replace=False,
    timeout=60,
    verbose=True,
)
Google is not so good, and Microsoft is not so evil
Here's my latest google image snarfer, written in Python, using Selenium and headless Chrome.
It requires python-selenium, the chromium-driver, and a module called retry from pip.
Link: http://sam.aiki.info/b/google-images.py
Example Usage:
google-images.py tiger 10 --opts isz:lt,islt:svga,itp:photo > urls.txt
# Number of wget downloads started before the loop waits for them to finish.
parallel=5
# User-Agent string handed to wget through -U.
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
# Read urls.txt line by line and start one background wget per URL, saving to
# a zero-padded NNNN.jpg file. After every $parallel URLs the loop waits for
# the running downloads; a final wait catches the last partial batch.
(i=0; while read url; do wget -e robots=off -T10 --tries 10 -U"$user_agent" "$url" -O`printf %04d $i`.jpg & i=$(($i+1)) ; [ $(($i % $parallel)) = 0 ] && wait; done < urls.txt; wait)
Help Usage:
$ google-images.py --help
usage: google-images.py [-h] [--safe SAFE] [--opts OPTS] query n
Fetch image URLs from Google Image Search.
positional arguments:
query image search query
n number of images (approx)
optional arguments:
-h, --help show this help message and exit
--safe SAFE safe search [off|active|images]
--opts OPTS search options, e.g.
isz:lt,islt:svga,itp:photo,ic:color,ift:jpg
Code:
#!/usr/bin/env python3
# requires: selenium, chromium-driver, retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import selenium.common.exceptions as sel_ex
import sys
import time
import urllib.parse
from retry import retry
import argparse
import logging
# Log records go to stderr at INFO level and above.
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger()
# Logger argument intended for retry(); left as None.
retry_logger = None

# CSS selectors used on the Google Images result page.
css_thumbnail = "img.Q4LuWd"   # thumbnails in the result grid
css_large = "img.n3VNCb"       # image elements queried after a thumbnail click
css_load_more = ".mye4qd"      # element clicked to load further results

# Selenium errors that a click attempt is expected to run into.
selenium_exceptions = (
    sel_ex.ElementClickInterceptedException,
    sel_ex.ElementNotInteractableException,
    sel_ex.StaleElementReferenceException,
)
def scroll_to_end(wd):
    """Scroll the page shown by web driver `wd` down to its current bottom."""
    script = "window.scrollTo(0, document.body.scrollHeight);"
    wd.execute_script(script)
# The decorator re-runs the function while it raises KeyError, so the page
# gets time to add thumbnails after the "load more" click.
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_thumbnails(wd, want_more_than=0):
    """Click the "load more" element and return the thumbnail elements.

    wd             -- Selenium web driver showing the result page.
    want_more_than -- number of thumbnails already known.

    Raises KeyError("no new thumbnails") when the page does not hold more
    than `want_more_than` thumbnails.
    """
    wd.execute_script("document.querySelector('{}').click();".format(css_load_more))
    thumbnails = wd.find_elements_by_css_selector(css_thumbnail)
    if len(thumbnails) <= want_more_than:
        raise KeyError("no new thumbnails")
    return thumbnails
# The decorator re-runs the function while it raises KeyError, giving the
# large image time to appear after a thumbnail click.
@retry(exceptions=KeyError, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def get_image_src(wd):
    """Return the usable `src` URLs of the large-image elements on the page.

    A src is usable when it starts with "http" and is not served from
    https://encrypted-tbn0.gstatic.com/.

    Raises KeyError("no large image") when no usable src is found.
    """
    sources = []
    for img in wd.find_elements_by_css_selector(css_large):
        src = img.get_attribute("src")
        # get_attribute() yields None for an element without a src attribute.
        if not src:
            continue
        if src.startswith("http") and not src.startswith("https://encrypted-tbn0.gstatic.com/"):
            sources.append(src)
    if not sources:
        raise KeyError("no large image")
    return sources
# The decorator repeats the click while it raises one of the listed Selenium
# errors; the error propagates once the tries are used up.
@retry(exceptions=selenium_exceptions, tries=6, delay=0.1, backoff=2, logger=retry_logger)
def retry_click(el):
    """Click the web element `el`."""
    el.click()
def get_images(wd, start=0, n=20, out=None):
    """Collect image source URLs from the result page shown by `wd`.

    wd    -- Selenium web driver already showing the search results.
    start -- accepted but not used by this function.
    n     -- stop once at least this many distinct sources are collected.
    out   -- optional writable stream; each new source is printed to it and
             the stream is flushed.

    Returns the list of distinct sources in the order they were found.
    """
    # Phase 1: scroll until at least n thumbnails are present or no more load.
    thumbnails = []
    count = 0
    while count < n:
        scroll_to_end(wd)
        try:
            thumbnails = get_thumbnails(wd, want_more_than=count)
        except KeyError:
            logger.warning("cannot load enough thumbnails")
            break
        count = len(thumbnails)

    # Phase 2: click each thumbnail and record the image sources that show up.
    sources = []
    for tn in thumbnails:
        try:
            retry_click(tn)
        except selenium_exceptions:
            logger.warning("main image click failed")
            continue

        try:
            found = get_image_src(wd)
        except KeyError:
            # logger.warning("main image not found")
            found = []

        if not found:
            # Fall back to the thumbnail's own src unless it is a data URL.
            tn_src = tn.get_attribute("src")
            if tn_src.startswith("data"):
                logger.warning("no src found for main image, thumbnail is a data URL")
            else:
                logger.warning("no src found for main image, using thumbnail")
                found = [tn_src]

        for src in found:
            if src in sources:
                continue
            sources.append(src)
            if out:
                print(src, file=out)
                out.flush()

        if len(sources) >= n:
            break
    return sources
def google_image_search(wd, query, safe="off", n=20, opts='', out=None):
    """Open a Google image search for `query` in `wd` and return the image
    sources gathered by get_images().

    query -- search phrase; URL-quoted before use.
    safe  -- value of the "safe" URL parameter.
    n     -- number of sources to aim for, passed on to get_images().
    opts  -- value of the "tbs" URL parameter; URL-quoted before use.
    out   -- optional stream passed on to get_images().
    """
    quoted_query = urllib.parse.quote(query)
    quoted_opts = urllib.parse.quote(opts)
    search_url = "https://www.google.com/search?safe={safe}&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img&tbs={opts}".format(
        q=quoted_query,
        opts=quoted_opts,
        safe=safe,
    )
    wd.get(search_url)
    return get_images(wd, n=n, out=out)
def main():
    """Parse the command line, start headless Chrome and run the image
    search, writing every found URL to stdout."""
    cli = argparse.ArgumentParser(description='Fetch image URLs from Google Image Search.')
    cli.add_argument('--safe', type=str, default="off", help='safe search [off|active|images]')
    cli.add_argument('--opts', type=str, default="", help='search options, e.g. isz:lt,islt:svga,itp:photo,ic:color,ift:jpg')
    cli.add_argument('query', type=str, help='image search query')
    cli.add_argument('n', type=int, default=20, help='number of images (approx)')
    args = cli.parse_args()

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # chrome_options.add_argument("--blink-settings=imagesEnabled=false")

    # The driver is used as a context manager for the duration of the search.
    with webdriver.Chrome(options=chrome_options) as wd:
        google_image_search(
            wd,
            args.query,
            safe=args.safe,
            n=args.n,
            opts=args.opts,
            out=sys.stdout,
        )


main()
Haven't looked into your code but this is an example solution made with selenium to try to get 400 pictures from the search term
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import json
import os
import urllib2
searchterm = 'vannmelon' # will also be the name of the folder
url = "https://www.google.co.in/search?q="+searchterm+"&source=lnms&tbm=isch"
browser = webdriver.Firefox()
browser.get(url)
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
counter = 0
succounter = 0
if not os.path.exists(searchterm):
os.mkdir(searchterm)
for _ in range(500):
browser.execute_script("window.scrollBy(0,10000)")
for x in browser.find_elements_by_xpath("//div[#class='rg_meta']"):
counter = counter + 1
print "Total Count:", counter
print "Succsessful Count:", succounter
print "URL:",json.loads(x.get_attribute('innerHTML'))["ou"]
img = json.loads(x.get_attribute('innerHTML'))["ou"]
imgtype = json.loads(x.get_attribute('innerHTML'))["ity"]
try:
req = urllib2.Request(img, headers={'User-Agent': header})
raw_img = urllib2.urlopen(req).read()
File = open(os.path.join(searchterm , searchterm + "_" + str(counter) + "." + imgtype), "wb")
File.write(raw_img)
File.close()
succounter = succounter + 1
except:
print "can't get img"
print succounter, "pictures succesfully downloaded"
browser.close()
Adding to Piees's answer, for downloading any number of images from the search results, we need to simulate a click on 'Show more results' button after first 400 results are loaded.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib2
import sys
import time
# Append the current working directory to PATH so that a geckodriver binary
# located there can be found (this equals the script's directory only when
# the script is started from that directory).
os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], os.getcwd()])
# Directory prefix under which a folder per search text is created.
download_path = "dataset/"
def main():
searchtext = sys.argv[1] # the search query
num_requested = int(sys.argv[2]) # number of images to download
number_of_scrolls = num_requested / 400 + 1
# number_of_scrolls * 400 images will be opened in the browser
if not os.path.exists(download_path + searchtext.replace(" ", "_")):
os.makedirs(download_path + searchtext.replace(" ", "_"))
url = "https://www.google.co.in/search?q="+searchtext+"&source=lnms&tbm=isch"
driver = webdriver.Firefox()
driver.get(url)
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
extensions = {"jpg", "jpeg", "png", "gif"}
img_count = 0
downloaded_img_count = 0
for _ in xrange(number_of_scrolls):
for __ in xrange(10):
# multiple scrolls needed to show all 400 images
driver.execute_script("window.scrollBy(0, 1000000)")
time.sleep(0.2)
# to load next 400 images
time.sleep(0.5)
try:
driver.find_element_by_xpath("//input[#value='Show more results']").click()
except Exception as e:
print "Less images found:", e
break
# imges = driver.find_elements_by_xpath('//div[#class="rg_meta"]') # not working anymore
imges = driver.find_elements_by_xpath('//div[contains(#class,"rg_meta")]')
print "Total images:", len(imges), "\n"
for img in imges:
img_count += 1
img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
img_type = json.loads(img.get_attribute('innerHTML'))["ity"]
print "Downloading image", img_count, ": ", img_url
try:
if img_type not in extensions:
img_type = "jpg"
req = urllib2.Request(img_url, headers=headers)
raw_img = urllib2.urlopen(req).read()
f = open(download_path+searchtext.replace(" ", "_")+"/"+str(downloaded_img_count)+"."+img_type, "wb")
f.write(raw_img)
f.close
downloaded_img_count += 1
except Exception as e:
print "Download failed:", e
finally:
print
if downloaded_img_count >= num_requested:
break
print "Total downloaded: ", downloaded_img_count, "/", img_count
driver.quit()
if __name__ == "__main__":
main()
Full code is here.
This worked for me in Windows 10, Python 3.9.7:
pip install bing-image-downloader
Below code downloads 10 images of India from Bing search Engine to desired output folder:
from bing_image_downloader import downloader

# Request at most 10 results for the query 'India' and store them under the
# 'dataset' output directory; one option per line for easier tweaking.
downloader.download(
    'India',
    limit=10,
    output_dir='dataset',
    adult_filter_off=True,
    force_replace=False,
    timeout=60,
    verbose=True,
)
Documentation: https://pypi.org/project/bing-image-downloader/
You can also use Selenium with Python. Here is how:
from selenium import webdriver
import urllib
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import urllib.parse
import urllib.request

# Script: open a Google Images result page in Firefox and save the image
# found at one fixed XPath location as "<word>.jpg".
word = "apple"
# Encode the search word so that spaces and special characters yield a valid
# query string.
url = "http://images.google.com/search?q=" + urllib.parse.quote_plus(word) + "&tbm=isch&sout=1"
# Absolute XPath of the <img> element to download; it depends on the page
# layout and may need to be re-copied from the browser's inspector.
imageXpathSelector = '/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div/div[1]/div[1]/span/div[1]/div[1]/div[1]/a[1]/div[1]/img'

driver = webdriver.Firefox()
try:
    driver.get(url)
    img = driver.find_element(By.XPATH, imageXpathSelector)
    src = img.get_attribute('src')
    urllib.request.urlretrieve(src, word + ".jpg")
finally:
    # quit() ends the whole WebDriver session; running it in `finally` makes
    # sure the browser does not stay open when the lookup or download fails.
    driver.quit()
(This code works on Python 3.8)
Please be informed that you should install the Selenium package with 'pip install selenium'
Contrary to the other web scraping techniques, Selenium opens the browser and downloads the items because Selenium's mission is testing rather than scraping.
N.B. If imageXpathSelector does not work, press F12 while your browser is open, right-click the image's element in the inspector, choose 'Copy' from the menu that opens and select 'Copy XPath'. That gives the correct XPath location of the element you need.
I am posting this one because the other code snippets have grown old and no longer work for me. It downloads 100 images for each keyword and was inspired by one of the solutions above.
from bs4 import BeautifulSoup
import urllib2
import os
class GoogleeImageDownloader(object):
_URL = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch"
_BASE_DIR = 'GoogleImages'
_HEADERS = {
'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"
}
def __init__(self):
query = raw_input("Enter keyword to search images\n")
self.dir_name = os.path.join(self._BASE_DIR, query.split()[0])
self.url = self._URL.format(urllib2.quote(query))
self.make_dir_for_downloads()
self.initiate_downloads()
def make_dir_for_downloads(self):
print "Creating necessary directories"
if not os.path.exists(self._BASE_DIR):
os.mkdir(self._BASE_DIR)
if not os.path.exists(self.dir_name):
os.mkdir(self.dir_name)
def initiate_downloads(self):
src_list = []
soup = BeautifulSoup(urllib2.urlopen(urllib2.Request(self.url,headers=self._HEADERS)),'html.parser')
for img in soup.find_all('img'):
if img.has_attr("data-src"):
src_list.append(img['data-src'])
print "{} of images collected for downloads".format(len(src_list))
self.save_images(src_list)
def save_images(self, src_list):
print "Saving Images..."
for i , src in enumerate(src_list):
try:
req = urllib2.Request(src, headers=self._HEADERS)
raw_img = urllib2.urlopen(req).read()
with open(os.path.join(self.dir_name , str(i)+".jpg"), 'wb') as f:
f.write(raw_img)
except Exception as e:
print ("could not save image")
raise e
if __name__ == "__main__":
GoogleeImageDownloader()
I know this question is old, but I ran across it recently and none of the previous answers work anymore. So I wrote this script to gather images from google. As of right now it can download as many images as are available.
here is a github link to it as well https://github.com/CumminUp07/imengine/blob/master/get_google_images.py
DISCLAIMER: DUE TO COPYRIGHT ISSUES, IMAGES GATHERED SHOULD BE USED FOR RESEARCH AND EDUCATION PURPOSES ONLY
from bs4 import BeautifulSoup as Soup
import urllib2
import json
import urllib
def get_links(query_string, num_images):
    """Collect image ``src`` links from Google's paged image-search responses.

    Requests one response per 100 requested images (offsets 0, 100, 200, ...
    below ``num_images``) and gathers the ``src`` of every ``<img>`` found in
    the HTML fragment each response carries. ``num_images`` is therefore more
    of a suggestion: results come in batches of up to 100.

    Args:
        query_string: search terms, inserted into the URL as given (callers
            are expected to pass an already URL-safe string, e.g. words
            joined with ``+``).
        num_images: approximate number of links wanted.

    Returns:
        list of image source URLs.
    """
    # initialize place for links
    links = []
    # step by 100 because each return gives up to 100 links
    for start in range(0, num_images, 100):
        # Built from adjacent literals instead of backslash continuations
        # inside a string, so indentation can never leak into the URL.
        url = (
            'https://www.google.com/search?ei=1m7NWePfFYaGmQG51q7IBg&hl=en&q=' + query_string
            + '&tbm=isch&ved=0ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ&start=' + str(start)
            + '&yv=2&vet=10ahUKEwjjovnD7sjWAhUGQyYKHTmrC2kQuT0I7gEoAQ.1m7NWePfFYaGmQG51q7IBg.i'
              '&ijn=1&asearch=ichunk&async=_id:rg_s,_pms:s'
        )
        # set user agent to avoid 403 error
        request = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
        # returns json formatted string of the html
        json_string = urllib2.urlopen(request).read()
        # parse as json
        page = json.loads(json_string)
        # html found here
        html = page[1][1]
        # use BeautifulSoup to parse as html
        new_soup = Soup(html, 'lxml')
        # Keep the src of every <img>; tags without a src attribute are
        # skipped instead of raising KeyError.
        links.extend(
            img["src"] for img in new_soup.find_all('img') if img.has_attr("src")
        )
    return links
def get_images(links,directory,pre):
    """Download every link into ``directory`` as ``<pre><index>.jpg``.

    Args:
        links: list of image URLs.
        directory: folder to save into; created if it does not exist yet.
        pre: prefix for the file names (converted with ``str``).

    Files are numbered in list order and always get a ``.jpg`` extension.
    """
    import os  # local import: this snippet's own import list has no `os`

    # Without the folder every urlretrieve call fails, so create it up front.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    for i, link in enumerate(links):
        urllib.urlretrieve(link, os.path.join(directory, str(pre)+str(i)+".jpg"))
def search_images(base,terms,num_images):
    """Search and download images for every base/secondary term combination.

    Each query is ``<base term>+<secondary term>``, base term first. The
    results of every combination are saved into the ``images`` folder.

    Args:
        base: list of base search terms.
        terms: list of secondary search terms.
        num_images: number of images to request per combination.
    """
    for y, base_term in enumerate(base):
        for x, term in enumerate(terms):
            all_links = get_links(base_term+'+'+term,num_images)
            # The prefix carries both indices and ends with a separator.
            # Using only `x` made different base terms overwrite each other,
            # and e.g. prefix 1 + image 23 clashed with prefix 12 + image 3.
            get_images(all_links,"images","{}_{}_".format(y, x))


if __name__ == '__main__':
    terms = ["cars","numbers","scenery","people","dogs","cats","animals"]
    base = ["animated"]
    search_images(base,terms,1000)
Instead of google image search, try other image searches like ecosia or bing.
Here is a sample code for retrieving images from ecosia search engine.
from bs4 import BeautifulSoup
import requests
import urllib
import urllib.request  # `import urllib` alone does not load the request submodule

# Script: fetch each result page in `urls`, and save every <img> found on it
# as 0.jpg, 1.jpg, ... in the current directory.
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent':user_agent}
urls = ["https://www.ecosia.org/images?q=india%20pan%20card%20example"]
#The url's from which the image is to be extracted.
index = 0  # number of images saved so far; also the next file name
for url in urls:
    request = urllib.request.Request(url,None,headers) #The assembled request
    with urllib.request.urlopen(request) as response:
        data = response.read() # Read the html result page
    soup = BeautifulSoup(data, 'html.parser')
    for link in soup.find_all('img'):
        #The images are enclosed in 'img' tag and the 'src' contains the url of the image.
        img_url = link.get('src')
        if not img_url:
            continue
        dest = str(index) + ".jpg" #Destination to store the image.
        try:
            # Pass `dest`; without it the image only lands in a temporary file.
            urllib.request.urlretrieve(img_url, dest)
        except Exception:
            # Best effort: skip images that cannot be fetched.
            continue
        index += 1
The code works with google image search but it fails to retrieve images because google stores the images in encrypted format which is difficult to retrieve from the image url.
This solution works as of 1-Feb-2021.
Okay, so instead of coding this for you I am going to tell you what you're doing wrong, and it might lead you in the right direction. Most modern websites render their HTML dynamically via JavaScript, so if you simply send a GET request (with urllib/cURL/fetch/axios) you won't get what you usually see in the browser when going to the same URL. What you need is something that runs the JavaScript code to create the same HTML/webpage you see in your browser. You can use something like the Selenium gecko driver for Firefox to do this, and there are Python modules out there that let you do this.
I hope this helps, if you still feel lost here's a simple script i wrote a while back to extract something similar from your google photos
from selenium import webdriver
import re
# Script: load a shared album page in Firefox and print every URL in the
# rendered HTML that contains "video-downloads".
url = "https://photos.app.goo.gl/xxxxxxx"
driver = webdriver.Firefox()
try:
    driver.get(url)
    # page_source is read from the live browser, i.e. after the page's
    # JavaScript had a chance to run.
    html = driver.page_source
finally:
    # The HTML is all that is needed; always end the browser session.
    driver.quit()
# Raw string keeps `\s` a regex escape rather than an invalid string escape.
urls = re.findall(r'(?P<url>https?://[^\s"$]+)', html)
fin = [link for link in urls if "video-downloads" in link]
print("The Following ZIP contains all your pictures")
for link in fin:
    print("-------------------")
    print(link)
You can achieve this using Selenium, as others mentioned above.
Alternatively, you can try using Google Images API from SerpApi. Check out the playground.
Code and example. The function to download images was taken from this answer:
import os, time, shutil, httpx, asyncio
from urllib.parse import urlparse
from serpapi import GoogleSearch
# https://stackoverflow.com/a/39217788/1291371
async def download_file(url):
    """Stream ``url`` into ``images/<basename of the URL path>``.

    Args:
        url: address of the file to download.

    Returns:
        The local file name (without the ``images/`` folder).
    """
    print(f'Downloading {url}')

    # https://stackoverflow.com/a/18727481/1291371
    parsed_url = urlparse(url)
    local_filename = os.path.basename(parsed_url.path)

    os.makedirs('images', exist_ok=True)

    async with httpx.AsyncClient() as client:
        async with client.stream('GET', url) as response:
            # Built-in file objects are not async context managers, and httpx
            # responses have no `.raw`; iterate the body chunks instead.
            with open(f'images/{local_filename}', 'wb') as f:
                async for chunk in response.aiter_bytes():
                    f.write(chunk)

    return local_filename
async def main():
    """Search Google Images through SerpApi and download all results concurrently.

    Reads the API key from the ``API_KEY`` environment variable and prints
    how many images were downloaded and how long the run took.
    """
    start = time.perf_counter()

    params = {
        "engine": "google",
        "ijn": "0",
        "q": "lasagna",
        "tbm": "isch",
        "api_key": os.getenv("API_KEY"),
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    download_files_tasks = [
        download_file(image['original']) for image in results['images_results']
    ]

    # With return_exceptions=True failures come back as exception objects
    # instead of being raised, so count only the real successes.
    outcomes = await asyncio.gather(*download_files_tasks, return_exceptions=True)
    downloaded = sum(1 for outcome in outcomes if not isinstance(outcome, BaseException))

    print(
        f"Downloaded {downloaded} images in {time.perf_counter() - start:0.4f} seconds")


asyncio.run(main())
Disclaimer, I work for SerpApi.
The one I used is :
https://github.com/hellock/icrawler
This package is a mini framework of web crawlers. With modularization design, it is easy to use and extend. It supports media data like images and videos very well, and can also be applied to texts and another type of files. Scrapy is heavy and powerful, while icrawler is tiny and flexible.
def main():
    """Run the test function of every crawler named on the command line.

    ``--crawler`` takes one or more names; for each name ``X`` the
    module-level function ``test_X`` is called, followed by a blank-line
    separator.

    Raises:
        NameError: if no ``test_<name>`` function exists for a given name.
    """
    parser = ArgumentParser(description='Test built-in crawlers')
    parser.add_argument(
        '--crawler',
        nargs='+',
        default=['google', 'bing', 'baidu', 'flickr', 'greedy', 'urllist'],
        help='which crawlers to test')
    args = parser.parse_args()
    for crawler in args.crawler:
        # Resolve the function by name rather than eval()-ing a string built
        # from command-line input, which would execute arbitrary expressions.
        test_func = globals().get('test_{}'.format(crawler))
        if not callable(test_func):
            raise NameError(
                "no test function 'test_{}' for crawler".format(crawler))
        test_func()
        print('\n')