I use a while loop in Python to download several PDF documents whose links are listed in a CSV file.
The code runs without raising any error, but the loop stops making progress after a varying number of iterations (sometimes 40, other times 100 or 140).
Below is the code I use:
import pandas as pd
import os
import urllib
from urllib import request
import requests
import csv
import numpy as np
# Load the link list. The file is semicolon-separated and is expected to
# provide the columns 'URL', 'Name' and 'Year'.
df = pd.read_csv('Linklist.csv', sep=';')
url_list = df['URL'].tolist()    # download links
name_list = df['Name'].tolist()  # first part of each target file name
# Second identifier (here a year), converted to str so it can be concatenated
# into file names and log lines.
Year_date = [str(element) for element in df['Year'].tolist()]
max_length = len(url_list)

# urlretrieve() uses the globally installed opener, so the browser-like
# User-Agent only has to be installed once instead of on every iteration.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15')]
urllib.request.install_opener(opener)

i = 0
# newline='' is what the csv module asks for; without it extra blank rows can
# appear on platforms that translate line endings.
f = open('results.csv', 'w', newline='')
writer = csv.writer(f)
try:
    while i < max_length:
        # NOTE: no timeout is passed and requests applies none by default, so
        # a server that stops answering blocks this call indefinitely.
        response = requests.get(url_list[i])
        status = str(response.status_code)
        if response.status_code == 200:
            # .pdf if it is a pdf doc you want to download
            request.urlretrieve(url_list[i], '/targetpath/' + name_list[i] + Year_date[i] + '.pdf')
        # Log the row that was just processed. The index is advanced only
        # afterwards, so a failed URL is reported under its own name/year and
        # the last row can no longer run past the end of the lists.
        writer.writerow((name_list[i], Year_date[i], status))
        print(name_list[i] + ' ' + Year_date[i] + ' ' + status)
        i += 1
finally:
    # Close (and thereby flush) the result log even if a download raises.
    f.close()
The information from @barmar and @ogdenkev is correct: I needed to add a timeout!
The working code now looks like this (only the part I changed is shown):
# requests applies no timeout unless one is passed explicitly, so
# Session.send is wrapped to supply a fallback for every request.
DEFAULT_TIMEOUT = 180  # passed to requests as the `timeout` value (seconds)
old_send = requests.Session.send


def new_send(*args, **kwargs):
    """Forward to the original ``Session.send`` with a fallback timeout.

    A ``timeout`` keyword that is missing or ``None`` is replaced by
    ``DEFAULT_TIMEOUT``; any other value is passed through unchanged.
    """
    caller_timeout = kwargs.get("timeout")
    if caller_timeout is None:
        kwargs.update(timeout=DEFAULT_TIMEOUT)
    return old_send(*args, **kwargs)


requests.Session.send = new_send
# Replacement for the download loop. It relies on i, max_length, url_list,
# name_list, Year_date, writer and f being set up exactly as before the
# original loop.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'}
try:
    while i < max_length:
        try:
            response = requests.get(url_list[i], headers=headers)
        except requests.exceptions.RequestException as e:
            # Keep the 'Timeout' label for real timeouts, but name every other
            # request failure (connection error, invalid URL, ...) by its
            # exception class so the log does not mislabel it.
            outcome = 'Timeout' if isinstance(e, requests.exceptions.Timeout) else type(e).__name__
        else:
            outcome = str(response.status_code)
            if response.status_code == 200:
                # Save the body that was already fetched instead of downloading
                # the document a second time with urlretrieve(): that second
                # request was covered neither by the timeout nor by the
                # except clause above.
                # .pdf if it is a pdf doc you want to download
                with open('/path/' + name_list[i] + Year_date[i] + '.pdf', 'wb') as pdf_file:
                    pdf_file.write(response.content)
        writer.writerow((name_list[i], Year_date[i], outcome))
        print(name_list[i] + ' ' + Year_date[i] + ' ' + outcome)
        print(i)
        i += 1
finally:
    # Close (and thereby flush) the result log even if writing a file fails.
    f.close()
I am trying to download and save images using a scraper, but it only downloads the first 20 images, whereas I want it to download as many images as possible.
import requests
from bs4 import BeautifulSoup
import os
# Fetch a Google image-search result page and save every <img> it contains
# into ./images as <index>.jpg.
url = "https://www.google.com/search?q=cats&sxsrf=ALeKk01diaA8AhwZsRpiMkZxaTUY6MuN4Q:1624119375856&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj62uGTjKTxAhWMIsAKHV12B74Q_AUoAXoECAEQAw&biw=1848&bih=949"
folder = "images"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
images = soup.select("img")

# exist_ok=True tolerates an already existing folder without hiding real
# errors (e.g. missing permissions) the way a bare `except: pass` did.
target_dir = os.path.join(os.getcwd(), folder)
os.makedirs(target_dir, exist_ok=True)
os.chdir(target_dir)

for i, image in enumerate(images):
    if i == 0:
        # The first <img> is skipped, as in the original loop -- presumably it
        # is not a search result (e.g. the page logo); confirm.
        continue
    link = image.get("src")
    if not link:
        # <img> without a src attribute: nothing to download.
        continue
    # Download first, then create the file, so a failed request does not
    # leave an empty .jpg behind.
    im = requests.get(link)
    with open(str(i) + ".jpg", "wb") as f:
        f.write(im.content)
    print("Writing: ", i)
With this code I get 109 JPEG files:
import requests
from bs4 import BeautifulSoup
import os
# Same scraper, but sending a desktop-browser User-Agent and also reading the
# "data-src" attribute when "src" is absent.
my_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36 OPR/58.0.3135.107"
headers = {'User-Agent': my_UA}
url = "https://www.google.com/search?q=cats&sxsrf=ALeKk01diaA8AhwZsRpiMkZxaTUY6MuN4Q:1624119375856&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj62uGTjKTxAhWMIsAKHV12B74Q_AUoAXoECAEQAw&biw=1848&bih=949"
folder = "images"
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")
images = soup.select("img")

# exist_ok=True tolerates an already existing folder without hiding real
# errors (e.g. missing permissions) the way a bare `except: pass` did.
target_dir = os.path.join(os.getcwd(), folder)
os.makedirs(target_dir, exist_ok=True)
os.chdir(target_dir)

print("total images found=", len(images))
for i, image in enumerate(images):
    # Prefer "src" and fall back to "data-src".
    link = image.get("src") or image.get("data-src") or ""
    if not link or "image/gif;" in link:
        # No usable link, or a link containing "image/gif;" (skipped GIF).
        continue
    # Download first, then create the file, so a failed request does not
    # leave an empty .jpg behind.
    im = requests.get(link, headers=headers)
    with open(str(i) + ".jpg", "wb") as f:
        f.write(im.content)
    print("Writing: ", i)
- There are two attributes that can hold the image URL: "src" and "data-src".
- It skips GIFs.
- To get more files you can use Selenium, for example.
This is my first time using Stack Overflow. I am trying to figure out how to scrape Yelp data and am having a hard time. I have set up lxml, Beautiful Soup, requests, pip, and Python, and have added them to the PATH in the system variables, yet I still get the error below when I try to run the code below. Any suggestions?
File "test2.py", line 4, in <module>
from exceptions import ValueError
ModuleNotFoundError: No module named 'exceptions'
from lxml import html
import json
import requests
from exceptions import ValueError
import re, urllib
import urllib3
import argparse
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import time
from concurrent.futures import ThreadPoolExecutor
import sys
from threading import Thread
import os
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
#[#'https://www.yelp.com/biz/kdb-kitchen-den-bar-long-beach',
yelp_urls =['https://www.yelp.com/biz/the-atlas-room-washington','https://www.yelp.com/biz/the-rack-brandon','https://www.yelp.com/biz/payard-p%C3%A2tisserie-and-bistro-new-york-2','https://www.yelp.com/biz/maison-giraud-pacific-palisades','https://www.yelp.com/biz/saltbox-san-diego','https://www.yelp.com/biz/carmichaels-chicago-steak-house-chicago','https://www.yelp.com/biz/black-eyed-pea-restaurant-houston-6','https://www.yelp.com/biz/perfecto-mundo-latin-fusion-bistro-commack','https://www.yelp.com/biz/smittys-bbq-boyd','https://www.yelp.com/biz/reston-kabob-reston','https://www.yelp.com/biz/bookmark-cafe-largo','https://www.yelp.com/biz/the-tin-angel-pittsburgh','https://www.yelp.com/biz/briantos-original-hoagies-orlando','https://www.yelp.com/biz/freeway-diner-woodbury','https://www.yelp.com/biz/river-gods-cambridge','https://www.yelp.com/biz/golan-kosher-restaurant-north-hollywood-2','https://www.yelp.com/biz/city-hall-restaurant-new-york-2','https://www.yelp.com/biz/empire-pizza-and-grill-west-chester','https://www.yelp.com/biz/cityzen-washington-2','https://www.yelp.com/biz/three-degrees-los-gatos','https://www.yelp.com/biz/applebees-grill-bar-quakertown','https://www.yelp.com/biz/johnny-carinos-covina','https://www.yelp.com/biz/buffet-de-la-gare-hastings-hdsn','https://www.yelp.com/biz/continental-food-management-la-mirada','https://www.yelp.com/biz/elephant-bar-restaurant-peoria','https://www.yelp.com/biz/sullivans-steakhouse-denver','https://www.yelp.com/biz/yucatan-liquid-stand-coppell','https://www.yelp.com/biz/tomato-pie-morristown','https://www.yelp.com/biz/willett-house-port-chester','https://www.yelp.com/biz/thai-corner-san-antonio-2','https://www.yelp.com/biz/silkes-american-grill-mesa','https://www.yelp.com/biz/t-mex-cantina-fort-lauderdale-2','https://www.yelp.com/biz/casa-oaxaca-washington','https://www.yelp.com/biz/wings-on-wheels-hebron','https://www.yelp.com/biz/siris-thai-french-cuisine-cherry-hill','https://www.yelp.com/biz/nightwood-chicag
o','https://www.yelp.com/biz/cafe-gallery-burlington','https://www.yelp.com/biz/the-hurricane-caf%C3%A9-seattle-2','https://www.yelp.com/biz/231-ellsworth-san-mateo','https://www.yelp.com/biz/la-marmite-williston-park','https://www.yelp.com/biz/the-river-house-palm-beach-gardens-2','https://www.yelp.com/biz/langermanns-baltimore','https://www.yelp.com/biz/del-friscos-grille-phoenix','https://www.yelp.com/biz/carrows-family-restaurant-antioch','https://www.yelp.com/biz/minerva-fine-indian-herndon-va-herndon-5','https://www.yelp.com/biz/the-mason-bar-dallas','https://www.yelp.com/biz/la-cote-cafe-and-wine-bar-seattle','https://www.yelp.com/biz/vareli-new-york','https://www.yelp.com/biz/wendys-wixom','https://www.yelp.com/biz/lanterna-tuscan-bistro-nyack','https://www.yelp.com/biz/yo-taco-duxbury','https://www.yelp.com/biz/bombay-palace-new-york','https://www.yelp.com/biz/cafe-buonaros-naperville','https://www.yelp.com/biz/ponti-seafood-grill-seattle-3','https://www.yelp.com/biz/bill-johnsons-big-apple-restaurants-phoenix-5','https://www.yelp.com/biz/by-word-of-mouth-oakland-park','https://www.yelp.com/biz/anna-maries-pizza-and-restaurant-wharton','https://www.yelp.com/biz/dierdorf-and-harts-steakhouse-saint-louis','https://www.yelp.com/biz/wine-5-cafe-las-vegas','https://www.yelp.com/biz/ernies-restaurant-plymouth','https://www.yelp.com/biz/next-door-pizza-and-pub-lees-summit','https://www.yelp.com/biz/lannys-alta-cocina-mexicana-fort-worth','https://www.yelp.com/biz/jalisco-mexican-restaurant-eastlake','https://www.yelp.com/biz/clio-boston','https://www.yelp.com/biz/uncommon-grounds-aliquippa','https://www.yelp.com/biz/uozumi-restaurant-palmdale','https://www.yelp.com/biz/enzos-pizza-matawan','https://www.yelp.com/biz/the-pointe-cafe-south-san-francisco','https://www.yelp.com/biz/captains-restaurant-and-seafood-market-florida-city','https://www.yelp.com/biz/le-perigord-new-york-4','https://www.yelp.com/biz/i-love-thai-arlington','https://www.yelp.com/biz/bistro-44-be
dford','https://www.yelp.com/biz/ritters-marietta','https://www.yelp.com/biz/rouge-et-blanc-new-york','https://www.yelp.com/biz/assembly-steak-house-and-seafood-grill-englewood-cliffs-2','https://www.yelp.com/biz/american-turkish-restaurant-fort-lauderdale','https://www.yelp.com/biz/r-and-r-bar-b-que-and-catering-service-missouri-2','https://www.yelp.com/biz/sushi-land-long-beach','https://www.yelp.com/biz/longshots-sports-bar-waretown','https://www.yelp.com/biz/salt-creek-barbeque-glendale-heights','https://www.yelp.com/biz/pizza-market-breese','https://www.yelp.com/biz/john-qs-steakhouse-cleveland','https://www.yelp.com/biz/bistro-n-boca-raton-2','https://www.yelp.com/biz/samanthas-restaurant-silver-spring-2','https://www.yelp.com/biz/baha-brothers-sandbar-grill-taunton-3','https://www.yelp.com/biz/cafe-cortina-farmington-hills-5','https://www.yelp.com/biz/big-beaver-tavern-troy','https://www.yelp.com/biz/hogans-restaurant-bloomfield-hills','https://www.yelp.com/biz/the-copper-monkey-beaverton','https://www.yelp.com/biz/clement-street-bar-and-grill-san-francisco','https://www.yelp.com/biz/pepin-scottsdale','https://www.yelp.com/biz/village-belle-philadelphia','https://www.yelp.com/biz/sweet-woodruff-san-francisco','https://www.yelp.com/biz/siam-marina-tinley-park','https://www.yelp.com/biz/luigis-italian-restaurant-centennial-2','https://www.yelp.com/biz/smokin-wills-barbecue-roselle','https://www.yelp.com/biz/voltaire-restaurant-scottsdale','https://www.yelp.com/biz/jus-cookins-restaurant-lakewood-2','https://www.yelp.com/biz/pegs-countryside-cafe-hamel','https://www.yelp.com/biz/rays-grill-fulshear','https://www.yelp.com/biz/cafe-zalute-rosemont','https://www.yelp.com/biz/guard-house-inn-gladwyne','https://www.yelp.com/biz/road-runner-grand-canyon-las-vegas-2','https://www.yelp.com/biz/garage-restaurant-and-cafe-new-york','https://www.yelp.com/biz/los-tapatios-cedar-hill','https://www.yelp.com/biz/chengdu-46-clifton','https://www.yelp.com/biz/moby-dick-house-of-
kabob-fairfax','https://www.yelp.com/biz/natures-food-patch-clearwater','https://www.yelp.com/biz/taco-del-mar-hillsboro-3','https://www.yelp.com/biz/ms-tootsies-rbl-philadelphia','https://www.yelp.com/biz/the-big-c-athletic-club-concord','https://www.yelp.com/biz/west-hanover-pizzeria-hanover','https://www.yelp.com/biz/georges-pastaria-houston','https://www.yelp.com/biz/encuentro-oakland-3','https://www.yelp.com/biz/smokys-bbq-eldersburg','https://www.yelp.com/biz/ruby-tuesday-san-antonio','https://www.yelp.com/biz/saladworks-philadelphia-4','https://www.yelp.com/biz/captain-pizza-middleton','https://www.yelp.com/biz/bob-evans-fredericksburg-3','https://www.yelp.com/biz/frittata-clawson','https://www.yelp.com/biz/the-sandwich-spot-palm-springs','https://www.yelp.com/biz/freds-mexican-cafe-san-diego-4','https://www.yelp.com/biz/geordies-steak-phoenix-2','https://www.yelp.com/biz/five-guys-wayne-5','https://www.yelp.com/biz/zen-sushi-la-crescenta-2','https://www.yelp.com/biz/the-summit-steakhouse-aurora-2','https://www.yelp.com/biz/miramar-bistro-highwood','https://www.yelp.com/biz/mick-o-sheas-baltimore','https://www.yelp.com/biz/dennys-houston-30','https://www.yelp.com/biz/carls-jr-henderson-5','https://www.yelp.com/biz/mexican-town-restaurant-detroit','https://www.yelp.com/biz/sushi-roku-las-vegas','https://www.yelp.com/biz/giant-pizza-king-san-diego','https://www.yelp.com/biz/quiznos-brooklyn-6','https://www.yelp.com/biz/taco-bell-glen-ellyn','https://www.yelp.com/biz/las-tortas-locas-marietta','https://www.yelp.com/biz/smith-and-wollensky-las-vegas-2','https://www.yelp.com/biz/happy-garden-chinese-brighton','https://www.yelp.com/biz/urban-foodie-feed-store-college-park','https://www.yelp.com/biz/the-wolf-oakland','https://www.yelp.com/biz/scuzzis-italian-restaurant-san-antonio-4','https://www.yelp.com/biz/better-gourmet-health-kitchen-staten-island','https://www.yelp.com/biz/the-restaurant-and-cafe-warren','https://www.yelp.com/biz/mcdonalds-houston-214','https:
//www.yelp.com/biz/pyeong-chang-tofu-house-oakland','https://www.yelp.com/biz/maria-rosa-pizzeria-and-family-restaurant-flemington','https://www.yelp.com/biz/legends-sports-bar-and-grill-roseville-2','https://www.yelp.com/biz/villa-reale-pizzeria-and-restaurant-pittsburgh','https://www.yelp.com/biz/the-terrace-cafe-venice','https://www.yelp.com/biz/the-oval-room-washington-2','https://www.yelp.com/biz/high-point-coal-center','https://www.yelp.com/biz/j-and-s-montebello','https://www.yelp.com/biz/cheers-restaurant-and-bar-fort-lauderdale']
def parse_page(url):
    """Download a Yelp business page and extract its details.

    Args:
        url: address of the business page,
            e.g. "https://www.yelp.com/biz/frances-san-francisco".

    Returns:
        A dict with the keys 'working_hours', 'info', 'ratings_histogram'
        (each a list of single-entry dicts), 'name', 'phone', 'ratings',
        'address', 'health_rating', 'price_range', 'claimed_status',
        'reviews', 'category', 'website', 'latitude', 'longitude' and 'url'.
        Fields that cannot be found on the page are empty strings
        ('ratings' is 0).
    """
    # Python 3 location of unquote; imported locally because the file only
    # has a plain `import urllib`, which does not provide it.
    from urllib.parse import unquote

    def joined(nodes):
        # Concatenate the text nodes of one XPath result and trim the ends.
        return ''.join(nodes).strip()

    def key_value_rows(rows, key_xpath, value_xpath):
        # One {key: value} dict per table row / definition list.
        return [{joined(row.xpath(key_xpath)): joined(row.xpath(value_xpath))}
                for row in rows]

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(url, headers=headers, verify=False).text
    parser = html.fromstring(response)

    # XPath selects attributes with '@'; '#class' is not valid XPath.
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")

    working_hours = key_value_rows(hours_table, ".//th//text()", "./td//text()")
    info = key_value_rows(details_table, './/dt//text()', './/dd//text()')
    ratings_histogram = key_value_rows(
        rating_histogram, ".//th//text()", ".//td[@class='histogram_count']//text()")

    name = joined(raw_name)
    phone = joined(raw_phone)
    # Collapse every run of whitespace in the address to a single space.
    address = ' '.join(' '.join(raw_address).split())
    health_rating = joined(raw_health_rating)
    price_range = joined(raw_price_range)
    claimed_status = joined(raw_claimed)
    reviews = joined(raw_reviews)
    category = ','.join(raw_category)
    cleaned_ratings = joined(raw_ratings)

    # The website link is a redirect URL; the target sits in its `url`
    # parameter. A link that does not have that shape yields ''.
    website = ''
    if raw_wbsite_link:
        decoded_raw_website_link = unquote(raw_wbsite_link[0])
        redirect_targets = re.findall(r"biz_redir\?url=(.*)&website_link", decoded_raw_website_link)
        if redirect_targets:
            website = redirect_targets[0]

    # Coordinates are read from the `center=<lat>,<lon>` part of the map
    # image URL. Both decimal points are escaped so '.' cannot match an
    # arbitrary character.
    latitude = ''
    longitude = ''
    if raw_map_link:
        decoded_map_url = unquote(raw_map_link[0])
        centers = re.findall(r"center=([+-]?\d+\.\d+,[+-]?\d+\.\d+)", decoded_map_url)
        if centers:
            latitude, longitude = centers[0].split(',')

    # First number in the rating title. The fractional part is optional so a
    # single-digit rating such as "4" is accepted too.
    ratings = 0
    if raw_ratings:
        numbers = re.findall(r"\d+(?:[.,]\d+)?", cleaned_ratings)
        if numbers:
            ratings = numbers[0]

    data = {
        'working_hours': working_hours,
        'info': info,
        'ratings_histogram': ratings_histogram,
        'name': name,
        'phone': phone,
        'ratings': ratings,
        'address': address,
        'health_rating': health_rating,
        'price_range': price_range,
        'claimed_status': claimed_status,
        'reviews': reviews,
        'category': category,
        'website': website,
        'latitude': latitude,
        'longitude': longitude,
        'url': url,
    }
    return data
def parse_reviews(url):
    """Download one page of reviews and extract date, rating and text.

    Args:
        url: address of a (paginated) Yelp business page.

    Returns:
        A list of ``[date, rating, content]`` lists, one per review block
        from which all three parts could be read.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0.'}
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(url, headers=headers, verify=False).text
    parser = html.fromstring(response)
    ratings_zipped = []
    # XPath selects attributes with '@'; '#class' is not valid XPath.
    reviews = parser.xpath("//div[contains(@class,'main-section')]//div[contains(@class,'review-list')]//div[contains(@class,'review')]//div[contains(@class,'review-content')]")
    for r in reviews:
        dates = r.xpath("./div[contains(@class,'biz-rating')]//span[contains(@class,'rating-qualifier')]/text()")
        stars = r.xpath("./div[contains(@class,'biz-rating')]//div[contains(@class,'rating-large')]/@title")
        paragraphs = r.xpath("./p")
        if not (dates and stars and paragraphs):
            # Skip a block that lacks a date, rating or text instead of
            # aborting the whole page with an IndexError.
            continue
        ratings_zipped.append([dates[0].strip(), stars[0], paragraphs[0].text_content()])
    print(len(ratings_zipped))
    return ratings_zipped
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    total = len(l)
    yield from (l[start:start + n] for start in range(0, total, n))
def parse_pagination(url):
    """Return the upper bound for the review offsets of a business page.

    The last word of the page's "page-of-pages" text is read as the number
    of pages and multiplied by 20 (the step used when paginating reviews).
    If that text is missing or does not end in a number, 20 is returned.

    Args:
        url: address of the Yelp business page.

    Returns:
        int: number of pages * 20, or 20 as the fallback.
    """
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate verification.
    response = requests.get(url, headers=headers, verify=False)
    print(response)
    parser = html.fromstring(response.text)
    try:
        # XPath selects attributes with '@'; '#class' is not valid XPath.
        page_text = parser.xpath("//div[contains(@class,'page-of-pages')]//text()")[0]
        results = int(page_text.strip().split(' ').pop()) * 20
    except (IndexError, ValueError):
        # IndexError: no pagination element; ValueError: last word is not a
        # number. Either way assume a single page.
        results = 20
    print(results)
    return results
def get_businesses_data(data):
    """Scrape page details and all reviews for every business URL.

    Args:
        data: iterable of Yelp business page URLs.

    Returns:
        A list with one dict per URL, holding the keys 'url', 'info'
        (result of ``parse_page``), 'scraped_reviews' (concatenated results
        of ``parse_reviews``) and 'scraped_reviews_count'.
    """
    businesses = []
    start_time = time.time()
    for i, url in enumerate(data):
        print('Starting iteration: ', i)
        pagination = parse_pagination(url)
        print('Pagination: ', pagination)
        info = parse_page(url)
        # Drop any existing query string before adding the paging offset.
        base_url = url.split('?')[0]
        _reviews = []
        for v in range(0, pagination, 20):
            paginated_url = base_url + '?start=' + str(v)
            print('Scraping Reviews: ', paginated_url)
            _reviews += parse_reviews(paginated_url)
            time.sleep(.5)  # pause between review-page requests
        # Build a fresh dict per business. Re-using one dict for all
        # iterations made every list entry refer to the same object, so all
        # entries ended up showing the last business.
        businesses.append({
            'url': url,
            'info': info,
            'scraped_reviews': _reviews,
            'scraped_reviews_count': len(_reviews),
        })
        print('Success iteration: ', i)
        print('Num of reviews: ', str(len(_reviews)))
        print('')
    print('Time Elapsed: ', str(time.time() - start_time))
    return businesses
if __name__=="__main__":
    # Scrape one fixed-size slice of yelp_urls per run and store it in its
    # own JSON file, named after the slice's start offset.
    index = 5   # which slice to process (0 is the first one)
    size = 20   # number of URLs per slice
    # Derive the offset from `size` so both stay in sync if size changes.
    i = index * size
    chunk = yelp_urls[i:i + size]
    businesses = get_businesses_data(chunk)
    output_path = 'results/run_3/output_{}.json'.format(i)
    # Create the output folder on first use; open() does not create it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(businesses, f)
'''
from exceptions import ValueError
You don't need to do that at all: ValueError is one of the built-in exceptions, and besides, you never use it in your code.
I want to fetch more than 100 high-resolution images from Google using Python 2.7 + Selenium + PhantomJS.
But when I follow the approach that is usually suggested, I only get a web page with small images on it, and I cannot find any direct link to the high-resolution pictures. How can I fix this?
My code is below.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
class ImgCrawler:
def __init__(self,searchlink = None):
self.link = searchlink
self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
self.scrolldown = None
self.jsdriver = None
def getPhantomJSDriver(self):
self.jsdriver = webdriver.PhantomJS()
self.jsdriver.get(self.link)
def scrollDownUsePhatomJS(self, scrolltimes = 1, sleeptime = 10):
for i in range(scrolltimes):
self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
time.sleep(sleeptime)
def getSoup(self, parser=None):
print 'a', self.jsdriver.page_source
return BeautifulSoup(self.jsdriver.page_source, parser)
def getActualUrl(self, soup=None, flag=None, classflag=None, jsonflaglink=None, jsonflagtype=None):
actualurl = []
for a in soup.find_all(flag, {"class": classflag}):
link = json.loads(a.text)[jsonflaglink]
filetype = json.loads(a.text)[jsonflagtype]
detailurl = link + u'.' + filetype
actualurl.append(detailurl)
return actualurl
if __name__ == '__main__':
search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
queryword = raw_input()
query = queryword.split()
query = '+'.join(query)
weblink = search_url + query
img = ImgCrawler(weblink)
img.getPhantomJSDriver()
img.scrollDownUsePhatomJS(2,5)
soup = img.getSoup('html.parser')
print weblink
print soup
actualurllist = img.getActualUrl(soup,'div','rg_meta','ou','ity')
print len(actualurllist)
I tried for a long time to use PhantomJS but could not get it to work, so I ended up using Chrome. I know that is not what you asked for, but it works.
First get a driver from https://sites.google.com/a/chromium.org/chromedriver/downloads. If you are on Windows, you can use a headless version of Chrome ("Chrome Canary").
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import urlparse
class ImgCrawler:
def __init__(self,searchlink = None):
self.link = searchlink
self.soupheader = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
self.scrolldown = None
self.jsdriver = None
def getPhantomJSDriver(self):
self.jsdriver = webdriver.Chrome()
self.jsdriver.get(self.link)
def scrollDownUsePhatomJS(self, scrolltimes = 1, sleeptime = 10):
for i in range(scrolltimes):
self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
time.sleep(sleeptime)
def getSoup(self, parser=None):
print 'a', self.jsdriver.page_source
return BeautifulSoup(self.jsdriver.page_source, parser)
def getActualUrl(self, soup=None):
actualurl = []
r = re.compile(r"/imgres\?imgurl=")
for a in soup.find_all('a', href=r):
parsed = urlparse.urlparse(a['href'])
url = urlparse.parse_qs(parsed.query)['imgurl']
actualurl.append(url)
print url
return actualurl
if __name__ == '__main__':
search_url = "https://www.google.com.hk/search?safe=strict&hl=zh-CN&site=imghp&tbm=isch&source=hp&biw=&bih=&btnG=Google+%E6%90%9C%E7%B4%A2&q="
queryword = raw_input()
query = queryword.split()
query = '+'.join(query)
weblink = search_url + query
img = ImgCrawler(weblink)
img.getPhantomJSDriver()
img.scrollDownUsePhatomJS(2,5)
soup = img.getSoup('html.parser')
print weblink
print soup
actualurllist = img.getActualUrl(soup)
print len(actualurllist)
I changed getActualUrl() to get the image URL from each "a" element whose "href" attribute starts with "/imgres?imgurl=".
Output (when "hazard" is typed into the terminal):
[u'https://static.independent.co.uk/s3fs-public/styles/article_small/public/thumbnails/image/2016/12/26/16/eden-hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/EdenHazardDecember_2016.jpg/200px-EdenHazardDecember_2016.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2016/1227/r166293_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://platform-static-files.s3.amazonaws.com/premierleague/photos/players/250x250/p42786.png']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Eden_Hazard_-_DK-Chel15_%286%29.jpg/150px-Eden_Hazard_-_DK-Chel15_%286%29.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0117/r172004_1296x729_16-9.jpg&w=738&site=espnfc']
[u'http://images.performgroup.com/di/library/GOAL/98/c0/eden-hazard-chelsea_1eohde060wvft1elcskrgihxq3.jpg?t=-1500575837&w=620&h=430']
[u'http://e1.365dm.com/17/03/16-9/20/skysports-eden-hazard-chelsea_3918835.jpg?20170331154242']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0402/r196036_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000316361045.jpg?strip=all&w=670&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/02/10/14/eden-hazard1.jpg']
[u'http://s.newsweek.com/sites/www.newsweek.com/files/2016/11/07/eden-hazard.jpg']
[u'http://www.newstube24.com/wp-content/uploads/2017/06/haz.jpg']
[u'http://images.performgroup.com/di/library/GOAL/17/b1/eden-hazard_68ypnelg3lfd14oxkffztftt6.png?t=-1802977526&w=620&h=430']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/eb/DK-Chel15_%288%29.jpg/220px-DK-Chel15_%288%29.jpg']
[u'http://images.performgroup.com/di/library/omnisport/50/3f/hazard-cropped_3y08vc3ejpua168e9mgvu4mwc.jpg?t=-930203025&w=620&h=430']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000291621611-e1490777105213.jpg?strip=all&w=745&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/01/14/14/Eden-Hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/5/56/Eden_Hazard%2713-14.JPG/150px-Eden_Hazard%2713-14.JPG']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000296311943-e1490777241155.jpg?strip=all&w=596&quality=100']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/01/27/11/hazard.jpg']
[u'http://www.newzimbabwe.com/FCKEditor_Images/Eden-Hazard-896286.jpg']
[u'http://images.performgroup.com/di/library/GOAL/9c/93/eden-hazard_d4lbib7wdagw1hp2e5gnyov0k.jpg?t=-1763574189&w=620&h=430']
[u'http://www.guoguiyan.com/data/out/94/69914569-hazard-wallpapers.jpg']
[u'http://static.guim.co.uk/sys-images/Football/Pix/pictures/2015/4/16/1429206099512/Eden-Hazard-009.jpg']
[u'https://metrouk2.files.wordpress.com/2017/04/pri_37621532.jpg?w=620&h=406&crop=1']
[u'http://alummata.com/wp-content/uploads/2016/04/Hazard.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/Hazard_vs_Norwich_%28crop%29.jpg/150px-Hazard_vs_Norwich_%28crop%29.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/11/06/20/3A185FB800000578-3910886-image-a-46_1478462522592.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/467822629-465742.jpg']
[u'http://i.dailymail.co.uk/i/pix/2015/10/17/18/2D81CB1D00000578-0-image-a-37_1445102645249.jpg']
[u'http://images.performgroup.com/di/library/GOAL_INTERNATIONAL/27/ce/eden-hazard_1kepw6rvweted1hpfmp5xwd5cs.jpg?t=-228379025&w=620&h=430']
[u'http://img.skysports.com/16/12/768x432/skysports-chelsea-manchester-city-eden-hazard_3845204.jpg?20161203162258']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict0003089026162.jpg?strip=all&w=960&quality=100']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2016/05/hazard--rediscovering-our-form.img.png']
[u'http://images.performgroup.com/di/library/omnisport/b5/98/hazard-cropped_172u0n8wx4j071cvgs1n3yycvw.jpg?t=2030908123&w=620&h=430']
[u'http://images.indianexpress.com/2016/05/eden-hazard-m.jpg']
[u'http://i2.mirror.co.uk/incoming/article9755579.ece/ALTERNATES/s615/PAY-Chelsea-v-Arsenal-Premier-League.jpg']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/06/hazard-injury-update.img.png']
[u'http://futhead.cursecdn.com/static/img/fm/17/players/183277_HAZARDCAM7.png']
[u'http://images.performgroup.com/di/library/GOAL/4d/6/eden-hazard-chelsea-06032017_enh1ll3uadj01ocstyopie9e4.jpg?t=-1521106510&w=620&h=430']
[u'http://images.performgroup.com/di/library/GOAL/34/8/eden-hazard-chelsea-southampton_1oca1rpy37gmn1ldmqvytti3k4.jpg?t=-1501721805&w=620&h=430']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-scoring-his-sides-third-goal-during-picture-id617452212?s=612x612']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-889782.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2015/10/19/16/Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000307050894.jpg?strip=all&w=660&quality=100']
[u'http://e1.365dm.com/16/11/16-9/20/skysports-eden-hazard-chelsea-football_3833122.jpg?20161116153005']
[u'http://thumb.besoccer.com/media/img_news/morata-y-hazard--besoccer.jpg']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2017/03/13/21/10-hazard.jpg']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2016/12/27/13/hazard.jpg']
[u'http://images.performgroup.com/di/library/GOAL/63/2a/eden-hazard-chelsea_15ggj1j39rmky1c5a20oxt3tly.jpg?t=1297762370']
[u'http://i1.mirror.co.uk/incoming/article9755531.ece/ALTERNATES/s615b/Chelsea-v-Arsenal-Premier-League.jpg']
[u'http://cf.c.ooyala.com/l2YmpyYjE6yvLSxGiEebNMr3N1ANS1Xc/O0cEsGv5RdudyPNn4xMDoxOjBnO_4SLA']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-scoring-his-sides-third-goal-during-picture-id617452006?s=612x612']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0413/r199412_2_1296x729_16-9.jpg&w=738&site=espnfc']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000318703991-e1493109803795.jpg?strip=all&w=960&quality=100']
[u'https://static.independent.co.uk/s3fs-public/styles/story_medium/public/thumbnails/image/2016/11/18/14/hazard-award.jpg']
[u'http://static.goal.com/2477200/2477282_heroa.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2016/12/16/14/eden-hazard.jpg']
[u'http://www.guoguiyan.com/data/out/94/69979129-hazard-wallpapers.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/11/nintchdbpict0002769424741.jpg?w=960&strip=all']
[u'http://v.uecdn.es/p/110/thumbnail/entry_id/0_ofavjqr8/width/660/cache_st/20170327164629/type/2/bgcolor/000000/0_ofavjqr8.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/02/07/10/edenhazard.jpg']
[u'http://theworldgame.sbs.com.au/sites/sbs.com.au.theworldgame/files/styles/full/public/images/e/d/eden-hazard-cropped_g6m28ldoc0b41p3f5sp2vlflt.jpg?itok=XW5M7QEA']
[u'http://e0.365dm.com/17/03/16-9/20/skysports-eden-hazard-chelsea_3909051.jpg?20170314181126']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Chelsea-Hazard-goals-886084.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/nintchdbpict000319343894.jpg?strip=all&w=960&quality=100']
[u'https://www.footyrenders.com/render/Eden-Hazard-PL.png']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-as-he-scores-their-first-goal-the-picture-id672946758?s=612x612']
[u'https://pbs.twimg.com/profile_images/791664465729163264/XbCVl6BF.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/9/95/Eden_Hazard_2011.jpg/170px-Eden_Hazard_2011.jpg']
[u'http://s.newsweek.com/sites/www.newsweek.com/files/2016/02/01/guus-hiddink-says-eden-hazard-could-leave-chelsea..jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/06/chelsea_hazard_mobile_top.jpg?strip=all&w=750&h=352&crop=1']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/735000/Eden-Hazard-887735.jpg']
[u'http://i.telegraph.co.uk/multimedia/archive/03580/Hazard_Real_copy_3580583b.jpg']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2017/05/21/47a1f452-43e4-4215-a5b8-5043c1e12a07/686302908.jpg']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/06/hazard-s-highlights.img.png']
[u'http://i.dailymail.co.uk/i/pix/2016/12/14/15/3B45B4D300000578-4032306-Hazard_PFA_Player_of_the_Year_in_2015_has_rediscovered_his_form_-a-6_1481728291902.jpg']
[u'https://img.rasset.ie/000d5137-800.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-665260.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-659164.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Hazard-scored-Chelsea-s-third-goal-against-Tottenham-909804.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/237000/Eden-Hazard-887237.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/media/motion/ESPNi/2017/0405/int_170405_Hazard_the_successor_to_Ronaldo_at_Real/int_170405_Hazard_the_successor_to_Ronaldo_at_Real.jpg&w=738&site=espnfc']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Eden-Hazard-721522.jpg']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-with-teammates-after-scoring-his-picture-id633776492?s=612x612']
[u'http://betinmalta.com/wp-content/uploads/2017/05/hazard.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/708000/Eden-Hazard-Chelsea-712708.jpg']
[u'http://images.performgroup.com/di/library/omnisport/c9/4a/eden-hazard-cropped_12u5irb6bkze1cue2wpjzxa44.jpg?t=-2084914038&w=620&h=430']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/06/nintchdbpict0003291256741.jpg?strip=all&w=714&quality=100']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2017/03/10/f97d36aa-1eef-4a78-996f-63d543c79efc/700017169TS004_Eden_Hazard_.JPG']
[u'https://s-media-cache-ak0.pinimg.com/736x/f0/01/17/f001178defb2b3be3cffb5e9b792748b--eden-hazard-liverpool-england.jpg']
[u'http://i4.mirror.co.uk/incoming/article9898829.ece/ALTERNATES/s615b/hazard-2.jpg']
[u'http://images.performgroup.com/di/library/GOAL/24/76/eden-hazard-and-lionel-messi_kamv8simc20x1p2i2fcf7lllw.png?t=421166242&w=620&h=430']
[u'https://metrouk2.files.wordpress.com/2017/03/gettyimages-618471206.jpg?w=748&h=498&crop=1']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Hazard-Chelsea-658138.jpg']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2017/04/23/17/hazard.jpg']
[u'http://e0.365dm.com/15/10/16-9/20/eden-hazard-comparison-chelsea_3365521.jpg?20151018152317']
[u'http://cdn.images.express.co.uk/img/dynamic/galleries/x701/231048.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/galleries/x701/102742.jpg']
[u'https://i.ytimg.com/vi/GWhVkFTe_BY/maxresdefault.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000319343748-e1496260888520.jpg?strip=all&w=960&quality=100']
[u'https://metrouk2.files.wordpress.com/2017/06/689818982.jpg?w=748&h=498&crop=1']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000308902470.jpg?strip=all&w=960&quality=100']
[u'https://www.thesun.co.uk/wp-content/uploads/2016/12/nintchdbpict000289569836.jpg?w=960&strip=all']
[u'https://i.ytimg.com/vi/zZ9stt70_vU/maxresdefault.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/2/2c/Kylian_Hazard_%28cropped%29.jpg']
[u'http://e00-marca.uecdn.es/assets/multimedia/imagenes/2017/03/26/14905092504845.jpg']
[u'http://images.performgroup.com/di/library/omnisport/ba/47/eden-hazard-cropped_rccnpv1me3v51kqpnj5ak4nko.jpg?t=1222186324&w=620&h=430']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-922505.jpg']
[u'https://s-media-cache-ak0.pinimg.com/736x/48/ce/4c/48ce4c478d8b06dccacce352d9d4bdc2--eden-hazard-pogba.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2016/10/23/111897755_Editorial_use_only_No_merchandising_For_Football_images_FA_and_Premier_League_restrict-large_trans_NvBQzQNjv4BqqVzuuqpFlyLIwiB6NTmJwfSVWeZ_vEN7c6bHu2jJnT8.jpg']
[u'https://metrouk2.files.wordpress.com/2017/06/686902184.jpg?w=748&h=652&crop=1']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict0003086723271-e1489501904590.jpg?strip=all&w=960&quality=100']
[u'http://e2.365dm.com/17/01/16-9/20/skysports-chelsea-manchester-city-eden-hazard_3862340.jpg?20170406190414']
[u'http://www.telegraph.co.uk/content/dam/football/2017/05/26/TELEMMGLPICT000129483487-large_trans_NvBQzQNjv4BqajCpFXsei0OXjDFGPZkcdJOkVdu-K0ystYH4SV7DHn8.jpeg']
[u'https://i.ytimg.com/vi/FFE4Ea437ks/maxresdefault.jpg']
[u'https://i1.wp.com/www.vanguardngr.com/wp-content/uploads/2017/03/Hazard-madrid.png?resize=350%2C200']
[u'http://china.chelseafc.com/content/cfc/zh/homepage/teams/first-team/eden-hazard/summary/_jcr_content/tabparmain/box/box/textimage/image.img.jpg/1496846329140.jpg']
[u'http://static.goal.com/198700/198707_news.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000319357531.jpg?strip=all&w=960&quality=100']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-poses-with-the-premier-league-trophy-after-the-picture-id686826640?s=612x612']
[u'http://cf.c.ooyala.com/t3dXdzYjE6VJktcnKdi7F2205I_mSSKQ/eWNh-8akTAF2kj8X4xMDoxOjBnO_4SLA']
[u'http://c.smimg.net/16/39/300x225/eden-hazard.jpg']
[u'http://www.whatfootballersearn.com/wp-content/uploads/Eden-Hazard.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/328000/Eden-Hazard-437328.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/secondary/Eden-Hazard-Chelsea-882846.jpg']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Chelsea-star-Eden-Hazard-741161.jpg']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201703/hazard_0.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/08/28/21/37A101A700000578-3762573-image-a-19_1472417354943.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2016/07/27/87650659-edenhazard-sport-large_trans_NvBQzQNjv4BqqVzuuqpFlyLIwiB6NTmJwfSVWeZ_vEN7c6bHu2jJnT8.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/165000/620x/Eden-Hazard-598896.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/05/04/21/33C8A26600000578-0-image-a-19_1462392130112.jpg']
[u'https://ichef-1.bbci.co.uk/news/660/cpsprodpb/13AA1/production/_96354508_595836d4-f21a-419b-95cb-37a65204f6eb.jpg']
[u'https://premierleague-static-files.s3.amazonaws.com/premierleague/photo/2016/11/30/1eb421ae-b210-4a01-95bb-36f509826cc1/Debruyne_v_Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/03/nintchdbpict000309639487-e1490040388851.jpg?strip=all&w=739&quality=100']
[u'http://static.goal.com/3311200/3311292_heroa.jpg']
[u'http://i3.mirror.co.uk/incoming/article7986781.ece/ALTERNATES/s615b/Hazard-and-son.jpg']
[u'http://a.espncdn.com/combiner/i/?img=/photo/2016/0916/r126535_1296x729_16-9.jpg&w=738&site=espnfc']
[u'http://www.chelseafc.com/content/cfc/en/homepage/news/boilerplate-config/latest-news/2017/03/hazard-score-is-number-one-.img.png']
[u'https://static.independent.co.uk/s3fs-public/thumbnails/image/2015/03/17/13/eden-hazard.jpg']
[u'https://metrouk2.files.wordpress.com/2017/05/680506564.jpg?w=748&h=457&crop=1']
[u'http://media.gettyimages.com/photos/eden-hazard-of-chelsea-celebrates-with-diego-costa-of-chelsea-after-picture-id671559962?s=612x612']
[u'http://e0.365dm.com/17/05/16-9/20/skysports-eden-hazard-chelsea_3965489.jpg?20170529101357']
[u'https://s-media-cache-ak0.pinimg.com/736x/e0/80/0e/e0800e380ef363594fb292969b7c5b64--eden-hazard-chelsea-soccer.jpg']
[u'http://cdn-football365.365.co.za/content/uploads/2016/12/GettyImages.630542828.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/07/16/19/340E4A9A00000578-3693637-image-a-84_1468694248523.jpg']
[u'http://www.squawka.com/news/wp-content/uploads/2017/01/hazard-chelsea-e1485528066609.jpg']
[u'http://www.guoguiyan.com/data/out/94/68880901-hazard-wallpapers.jpg']
[u'http://www.telegraph.co.uk/content/dam/football/2017/03/12/JS122962983_EHazDavid-Rose-large_trans_NvBQzQNjv4BqtA9hvt4yaDuJhaJG2frTIUNrh1MdssoHpGF6OIxC49c.jpg']
[u'http://images.performgroup.com/di/library/GOAL/17/25/eden-hazard-chelsea_1dsnlf2z113cx10nxvp9ydudcz.jpg?t=2008335075']
[u'http://www.telegraph.co.uk/content/dam/football/2016/12/05/115182685-eden-hazard-sport-large_trans_NvBQzQNjv4BqA7a2BP2KFPtZUOepzpZgXISdNn8DgVUcalGVREaviFE.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/04/sport-preview-morata-hazard.jpg?strip=all&quality=100&w=750&h=500&crop=1']
[u'http://thumb.besoccer.com/media/img_news/eden-hazard--futbolista-del-chelsea--chelseafc.jpg']
[u'http://i4.mirror.co.uk/incoming/article7374471.ece/ALTERNATES/s615/Chelsea-Training-Session.jpg']
[u'https://metrouk2.files.wordpress.com/2017/04/671549404.jpg?w=748&h=532&crop=1']
[u'https://metrouk2.files.wordpress.com/2016/02/462363538.jpg?w=748&h=563&crop=1']
[u'https://metrouk2.files.wordpress.com/2017/05/6834221661.jpg?w=748&h=507&crop=1']
[u'http://cdn.images.express.co.uk/img/dynamic/67/590x/Chelsea-star-Eden-Hazard-739447.jpg']
[u'http://cdn.quotesgram.com/img/21/41/114220036-24CA6E4700000578-2916442-Eden_Hazard_has_been_instrumental_for_Chelsea_this_season_as_the-a-7_1421682779132.jpg']
[u'http://i.dailymail.co.uk/i/pix/2016/09/29/09/38E4401500000578-3813294-image-a-1_1475138637248.jpg']
[u'http://healthyceleb.com/wp-content/uploads/2016/04/Eden-Hazard-match-between-Chelsea-Milton-Keynes-Dons-January-2016.jpg']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201704/gettyimages-663029916.jpg']
[u'https://upload.wikimedia.org/wikipedia/commons/thumb/2/29/Thorgan_Hazard_2014.jpg/220px-Thorgan_Hazard_2014.jpg']
[u'http://cdn.images.dailystar.co.uk/dynamic/58/photos/91000/620x/Eden-Hazard-632850.jpg']
[u'http://i4.mirror.co.uk/incoming/article7531141.ece/ALTERNATES/s615/A-dejected-looking-Eden-Hazard.jpg']
[u'https://www.thesun.co.uk/wp-content/uploads/2017/05/nintchdbpict000322464448-e1494602676644.jpg?strip=all&w=960&quality=100']
[u'http://images.performgroup.com/di/library/GOAL_INTERNATIONAL/76/92/chelsea-bournemouth-eden-hazard_148j4p900kzba159diso6ewwvo.jpg?t=1004329665&w=620&h=430']
[u'https://images.cdn.fourfourtwo.com/sites/fourfourtwo.com/files/styles/inline-image/public/hazard3.jpg?itok=ap0DtuZx']
[u'https://talksport.com/sites/default/files/styles/taxonomy-img/public/field/image/201707/hazard.jpg']
...
[u'http://a.espncdn.com/combiner/i/?img=/photo/2017/0330/r195127_1296x729_16-9.jpg&w=738&site=espnfc']
299