I want to scrape the player image from a website such as the example below and store it in a temp file so that I can display it in a Tkinter window.
https://www.futbin.com/22/player/573/trent-alexander-arnold
I have seen multiple users recommending BeautifulSoup, so any help regarding that would be greatly appreciated.
This is some sample code that I found and have slightly adapted:
from bs4 import *
import requests
import os

# CREATE FOLDER
def folder_create(images):
    try:
        folder_name = input("Enter Folder Name:- ")
        # folder creation
        os.mkdir(folder_name)
    # if folder exists with that name, ask another name
    except:
        print("Folder Exist with that name!")
        folder_create()
    # image downloading start
    download_images(images, folder_name)

# DOWNLOAD ALL IMAGES FROM THAT URL
def download_images(images, folder_name):
    # initial count is zero
    count = 0
    # print total images found in URL
    print(f"Total {len(images)} Image Found!")
    # checking if images is not zero
    if len(images) != 0:
        for i, image in enumerate(images):
            # From image tag, fetch image source URL
            # 1. data-srcset
            # 2. data-src
            # 3. data-fallback-src
            # 4. src
            # Here we will use exception handling
            # first we will search for "data-srcset" in img tag
            try:
                # In image tag, searching for "data-srcset"
                image_link = image["data-srcset"]
            # then we will search for "data-src" in img
            # tag and so on..
            except:
                try:
                    # In image tag, searching for "data-src"
                    image_link = image["data-src"]
                except:
                    try:
                        # In image tag, searching for "data-fallback-src"
                        image_link = image["data-fallback-src"]
                    except:
                        try:
                            # In image tag, searching for "src"
                            image_link = image["src"]
                        # if no source URL found
                        except:
                            pass
            # After getting the image source URL
            # we will try to get the content of the image
            try:
                r = requests.get(image_link).content
                try:
                    # possibility of decode
                    r = str(r, 'utf-8')
                except UnicodeDecodeError:
                    # After checking the above condition, image download starts
                    with open(f"{folder_name}/images{i+1}.jpg", "wb+") as f:
                        f.write(r)
                    # counting number of images downloaded
                    count += 1
            except:
                pass
        # It is possible that not all
        # images were downloaded
        # if all images downloaded
        if count == len(images):
            print("All Images Downloaded!")
        # if not all images downloaded
        else:
            print(f"Total {count} Images Downloaded Out of {len(images)}")

# MAIN FUNCTION START
def main(url):
    # content of URL
    r = requests.get(url)
    # Parse HTML Code
    soup = BeautifulSoup(r.text, 'html.parser')
    # find all images in URL
    images = soup.findAll('img')
    folder_name = input("Enter Folder Name:- ")
    download_images(images, folder_name)
    # Call folder create function
    try:
        folder_create(images)
    except:
        x = 0

# take url
url = input("Enter URL:- ")
# CALL MAIN FUNCTION
main(url)
This code runs; however, it does not seem to find the player image at all. I am also aware that it scrapes all images on the page. The code does work on other websites I have tested, such as https://www.premierleague.com/players/4852/Adri%C3%A1n/overview
Why not use
urls = [img.get('src', None) for img in soup.findAll('img')]
to get a list of image links, then do your download/save commands like
for url in urls:
    if url:
        save_image(url, folder_name)
Note: findAll() is the older BeautifulSoup 3 spelling; it still works in BeautifulSoup 4 as an alias, but the preferred v4 method is find_all().
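For completeness, save_image is not a built-in here; a minimal sketch of such a helper (an assumption, not tested code: it uses requests, expects the folder to already exist, and resolves relative src values against the question's site with urljoin) might look like this:

import os
from urllib.parse import urljoin

import requests

def save_image(src, folder_name, base_url="https://www.futbin.com"):
    # Hypothetical helper: download one <img> src value into folder_name.
    image_url = urljoin(base_url, src)  # handle relative src values
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    # Derive a file name from the URL, falling back to a generic name
    file_name = os.path.basename(image_url.split("?")[0]) or "image.jpg"
    with open(os.path.join(folder_name, file_name), "wb") as f:
        f.write(response.content)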
Related
I have some website links as samples for extracting any email addresses available on their internal pages.
However, even though I render JS-driven websites via r.html.render() within the scrape_email(url) method, some of the websites, such as arken.trygge.dk, gronnebakken.dk, dagtilbud.ballerup.dk/boernehuset-bispevangen, etc., do not return any email, which might be due to a rendering issue.
I have attached the sample file for convenience of running the script.
I don't want to use Selenium, as there can be thousands or millions of webpages I want to extract emails from.
So far this is my code:
import os
import time
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
import pandas as pd
from gtts import gTTS
import winsound

# For convenience of seeing console output in the script
pd.options.display.max_colwidth = 180

# Get the start time of script execution
startTime = time.time()

# Paste file name inside ''
input_file_name = 'sample'

input_df = pd.read_excel(input_file_name + '.xlsx', engine='openpyxl')
input_df = input_df.dropna(how='all')

internal_urls = set()
emails = set()
total_urls_visited = 0

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_internal_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    print("Domain name -- ", domain_name)
    try:
        soup = BeautifulSoup(requests.get(url, timeout=5).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if parsed_href.netloc != domain_name:
                # if the link is not of the same domain, skip it
                continue
            if parsed_href.path.endswith((".csv", ".xlsx", ".txt", ".pdf", ".mp3", ".png", ".jpg", ".jpeg", ".svg", ".mov", ".js", ".gif", ".mp4", ".avi", ".flv", ".wav")):
                # Skip site images, PDFs and other files rather than webpages
                continue
            print(f"Internal link: {href}")
            urls.add(href)
            internal_urls.add(href)
        return urls
    except requests.exceptions.Timeout as err:
        print("The website is not loading within 5 seconds... Continuing crawling the next one")
        pass
    except:
        print("The website is unavailable. Continuing crawling the next one")
        pass

def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"Crawling: {url}")
    links = get_internal_links(url)
    # for link in links:
    #     if total_urls_visited > max_urls:
    #         break
    #     crawl(link, max_urls=max_urls)

def scrape_email(url):
    EMAIL_REGEX = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    # EMAIL_REGEX = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
    try:
        # initiate an HTTP session
        session = HTMLSession()
        # get the HTTP Response
        r = session.get(url, timeout=10)
        # for JavaScript-driven websites
        r.html.render()
        single_url_email = []
        for re_match in re.finditer(EMAIL_REGEX, r.html.raw_html.decode()):
            single_url_email.append(re_match.group().lower())
        r.session.close()
        return set(single_url_email)
    except:
        pass

def crawl_website_scrape_email(url, max_internal_url_no=20):
    crawl(url, max_urls=max_internal_url_no)
    each_url_emails = []
    global internal_urls
    global emails
    for each_url in internal_urls:
        each_url_emails.append(scrape_email(each_url))
    URL_WITH_EMAILS = {'main_url': url, 'emails': each_url_emails}
    emails = {}
    internal_urls = set()
    return URL_WITH_EMAILS

def list_check(emails_list, email_match):
    match_indexes = [i for i, s in enumerate(emails_list) if email_match in s]
    return [emails_list[index] for index in match_indexes]

URL_WITH_EMAILS_LIST = [crawl_website_scrape_email(x) for x in input_df['Website'].values]
URL_WITH_EMAILS_DF = pd.DataFrame(data=URL_WITH_EMAILS_LIST)
URL_WITH_EMAILS_DF.to_excel(f"{input_file_name}_email-output.xlsx", index=False)
How can I solve the issue of not being able to scrape emails from the above-mentioned and similar types of websites?
Is there also any way to detect and print a message if my GET request is refused by a bot detector or related protection?
Also, how can I make this code more robust?
Thank you in advance
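On the bot-detection question, one rough heuristic (only a sketch under the assumption that the server answers with typical refusal codes, not a reliable detector) is to check the HTTP status code and body of each response and print a warning for likely blocks:

def report_if_blocked(url):
    # Hypothetical helper: flag responses that look like a bot block or CAPTCHA page.
    resp = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
    if resp.status_code in (403, 429, 503):
        print(f"Possible bot protection on {url}: HTTP {resp.status_code}")
    elif 'captcha' in resp.text.lower():
        print(f"Possible CAPTCHA challenge on {url}")
    return resp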
In Python (via the Selenium library), I am attempting to scrape images from Google Images. All seems well, but if I request a high number of images (e.g. 1500), the program just keeps looking for images and the webdriver keeps going (see output below), even though it has reached the end of the page.
Output:
Found: 32 search results. Extracting links from 20:32
Found: 19 image links, looking for more ...
Found: 32 search results. Extracting links from 32:32
Found: 19 image links, looking for more ...
Here is my Python code for fetching image URLs:
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue
            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))
            image_count = len(image_urls)
            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # return
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
        results_start = len(thumbnail_results)
    return image_urls
I've tried using wd.close() after the if load_more_button statement towards the end of the method, and I've tried adding another while loop after while image_count < max_links_to_fetch, using number_results instead of image_count.
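One possible workaround, sketched below purely as an illustration against the code above (an assumption about how Google's result page behaves, not tested code), is to break out of the while loop when a scroll/load-more round produces no new thumbnails, i.e. number_results has not grown since the previous iteration:

        # Inside the while loop, immediately after number_results is computed:
        if results_start > 0 and number_results == results_start:
            # No new thumbnails appeared after the last scroll / "load more" click,
            # so assume we have reached the end of the results page and stop.
            print(f"Reached the end of the page with {len(image_urls)} image links.")
            break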
So I have this piece of code:
from bs4 import *
import requests
import os
import pandas

df = pandas.read_csv(r'C:\Users\fani\Desktop\History.csv')
folder_name = "downloadedpics"
os.mkdir(folder_name)

z = 1

for j in df['url']:
    # DOWNLOAD ALL IMAGES FROM THAT URL
    def download_images(images, folder_name):
        # initial count is zero
        count = 0
        # print total images found in URL
        print(f"Total {len(images)} Image Found!")
        # checking if images is not zero
        if len(images) != 0:
            for i, image in enumerate(images):
                # From image tag, fetch image source URL
                # 1. data-srcset
                # 2. data-src
                # 3. data-fallback-src
                # 4. src
                # Here we will use exception handling
                # first we will search for "data-srcset" in img tag
                try:
                    # In image tag, searching for "data-srcset"
                    image_link = image["data-srcset"]
                # then we will search for "data-src" in img
                # tag and so on..
                except:
                    try:
                        # In image tag, searching for "data-src"
                        image_link = image["data-src"]
                    except:
                        try:
                            # In image tag, searching for "data-fallback-src"
                            image_link = image["data-fallback-src"]
                        except:
                            try:
                                # In image tag, searching for "src"
                                image_link = image["src"]
                            # if no source URL found
                            except:
                                pass
                # After getting the image source URL
                # we will try to get the content of the image
                try:
                    r = requests.get(image_link).content
                    with open(f"{folder_name}/{z}images{i + 1}.jpg", "wb+") as f:
                        f.write(r)
                    # counting number of images downloaded
                    count += 1
                except:
                    pass
            # It is possible that not all
            # images were downloaded
            # if all images downloaded
            if count == len(images):
                print("All Images Downloaded!")
            # if not all images downloaded
            else:
                print(f"Total {count} Images Downloaded Out of {len(images)}")

    # MAIN FUNCTION START
    def main(url):
        # content of URL
        r = requests.get(url)
        # Parse HTML Code
        soup = BeautifulSoup(r.text, 'html.parser')
        # find all images in URL
        images = soup.findAll('img', class_='pannable-image')
        # Call folder create function
        download_images(images, folder_name)

    # take url
    url = j
    # CALL MAIN FUNCTION
    main(url)
    print(z)
    z = z + 1
It scrapes a bunch of URLs (listed in History.csv) and downloads some images from them.
The only problem is that it's really slow for such a simple task.
What is the correct way to implement multiprocessing to speed it up?
I'm a newbie and I don't know how multiprocessing works.
Edit:
Here is the csv file:
mega link
The code is supposed to download about 12,000 images, which amounts to about 1 GB of data, from 1648 webpages (the gallery portion of the pages on this e-commerce site).
Since you are already using the requests package, the obvious way to proceed is multithreading rather than asyncio, which would require you to abandon requests and learn aiohttp.
I have done quite a bit of restructuring of the code, and as I have been unable to test it without access to your CSV file, I strongly suggest you review what I have done and try to understand it as best you can by reading the Python documentation for the various classes and methods that are new to you. What I did not understand is why, when you retrieve an image file, you attempt to decode it. I suppose you expect that to generate an error, but it just seems like a waste of time.
I have arbitrarily set the multithreading pool size to 100 (multithreading can easily handle a pool size several times larger, although asyncio can handle thousands of concurrent tasks). Set N_THREADS to the number of URLs multiplied by the average number of images per URL that you need to download, but not more than 500.
from bs4 import *
import requests
import os
import pandas
from multiprocessing.pool import ThreadPool
from functools import partial
from threading import Lock

class FileIndex:
    """
    Increment and return the next index to use for creating a file
    in a threadsafe way.
    """
    def __init__(self):
        self._lock = Lock()
        self._file_index = 0
    @property
    def next_file_index(self):
        with self._lock:
            self._file_index += 1
            return self._file_index
# DOWNLOAD AN IMAGE FROM THAT URL
def download_image(image, session, file_index, folder_number, folder_name):
    # From image tag, fetch image source URL
    # 1. data-srcset
    # 2. data-src
    # 3. data-fallback-src
    # 4. src
    # Here we will use exception handling
    # first we will search for "data-srcset" in img tag
    try:
        # In image tag, searching for "data-srcset"
        image_link = image["data-srcset"]
    # then we will search for "data-src" in img
    # tag and so on..
    except:
        try:
            # In image tag, searching for "data-src"
            image_link = image["data-src"]
        except:
            try:
                # In image tag, searching for "data-fallback-src"
                image_link = image["data-fallback-src"]
            except:
                try:
                    # In image tag, searching for "src"
                    image_link = image["src"]
                # if no source URL found
                except:
                    return 0  # no image loaded
    # After getting the image source URL
    # we will try to get the content of the image
    try:
        r = session.get(image_link).content
        # Why are you trying to decode an image?
        try:
            # possibility of decode
            r = str(r, 'utf-8')
            return 0  # no error, return 0 ?????
        except UnicodeDecodeError:
            # After checking the above condition, image download starts
            with open(f"{folder_name}/{folder_number}images{file_index.next_file_index}.jpg", "wb+") as f:
                f.write(r)
            # counting number of images downloaded
            return 1  # 1 downloaded
    except:
        return 0  # 0 downloaded
# download_url FUNCTION START
def download_url(folder_number, url, session, folder_name, thread_pool):
    # content of URL
    r = session.get(url)
    # Parse HTML Code
    soup = BeautifulSoup(r.text, 'html.parser')
    # find all images in URL
    images = soup.findAll('img', class_='pannable-image')
    # Download this page's images using the image thread pool
    worker = partial(download_image,
                     session=session,
                     file_index=FileIndex(),
                     folder_number=folder_number,
                     folder_name=folder_name)
    counts = thread_pool.map(worker, images)
    total_counts = sum(counts)
    if total_counts == len(images):
        print(f"All Images Downloaded for URL {url}!")
    else:
        print(f"Total {total_counts} Images Downloaded Out of {len(images)} for URL {url}")

# The real main function:
def main():
    df = pandas.read_csv(r'C:\Users\fani\Desktop\History.csv')
    folder_name = "downloadedpics"
    os.mkdir(folder_name)

    N_THREADS_URLS = 50  # or some suitable size for retrieving URLs
    N_THREADS_IMAGES = 500  # or some suitable size for retrieving images

    # use a session for efficiency:
    with requests.Session() as session, \
            ThreadPool(N_THREADS_URLS) as thread_pool_urls, \
            ThreadPool(N_THREADS_IMAGES) as thread_pool_images:
        worker = partial(download_url,
                         session=session,
                         folder_name=folder_name,
                         thread_pool=thread_pool_images)
        results = thread_pool_urls.starmap(worker, enumerate(df['url']))  # (folder_number, url) pairs

if __name__ == '__main__':
    main()
import requests
import urllib.request
from bs4 import BeautifulSoup

def get_photos(nick, how_many):
    url = f"https://www.picuki.com/profile/{nick}"
    content = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content
    soup = BeautifulSoup(content, "html.parser")
    images = [f["src"] for f in soup.findAll('img', class_="post-image")]
    for index, image in enumerate(images, start=1):
        urllib.request.urlretrieve(image, f"/Users/user/PycharmProjects/untitled1/Instagram_images/image{index}.png")
        if index == how_many:
            break

if __name__ == "__main__":
    get_photos("Username", 20)
So I have this code which downloads images in PNG format from Instagram. But the problem is that this page only loads 18 images without scrolling. So if I want images 18-36 I need to scroll down the page one more time, and if I want 36-54 I need to scroll down twice and get its HTML. How do I do that with requests, and is it even possible with this module?
The images are loaded with Ajax, but you can emulate the Ajax requests with the requests module.
This script will print all image URLs found on user profile:
import requests
from bs4 import BeautifulSoup

username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'

def get_image_urls(username):
    url = base_url.format(username=username)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    while True:
        for f in soup.findAll('img', class_="post-image"):
            yield f['src']

        load_more_url = soup.select_one('.load-more-wrapper[data-next]')

        if not load_more_url:
            load_more_url = soup.select_one('.pagination-next-page-input[value]')
            if load_more_url:
                load_more_url = load_more_url['value']
        else:
            load_more_url = load_more_url['data-next']

        if not load_more_url:
            break

        soup = BeautifulSoup(requests.get('https://www.picuki.com' + load_more_url).content, 'html.parser')

for img in get_image_urls(username):
    print(img)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.
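To connect this back to the original goal of saving a fixed number of images, one possible follow-up (a sketch, reusing the same get_image_urls generator and the urlretrieve call from the question, and assuming the target directory is writable) is:

import itertools
import urllib.request

how_many = 20
urls = itertools.islice(get_image_urls(username), how_many)
for index, image_url in enumerate(urls, start=1):
    # Save each image next to the script as image1.png, image2.png, ...
    urllib.request.urlretrieve(image_url, f"image{index}.png")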
How can we get all the images from this site: http://www.theft-alerts.com
We need the images from all 19 pages. So far we have this code, but it doesn't work yet. We want the images saved in a new folder.
#!/usr/bin/python

import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin

url = "http://www.theft-alerts.com/index-%d.html"

page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, "html.parser")
base = "http://www.theft-alerts.com"

images = [urljoin(base, a["href"]) for a in soup.select("td a[href^=images/]")]

for url in images:
    img = BeautifulSoup(urllib2.urlopen(url).read(), "lxml").find("img")["src"]
    with open("myimages/{}".format(img), "w") as f:
        f.write(urllib2.urlopen("{}/{}".format(url.rsplit("/", 1)[0], img)).read())
IMAGE SCRAPING WITH PYTHON
This code should work for scraping Google Images.
import os
import time
import requests
from selenium import webdriver

def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: int = 1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            return
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

def persist_image(folder_path: str, url: str, counter):
    try:
        image_content = requests.get(url).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        f = open(os.path.join(folder_path, 'jpg' + "_" + str(counter) + ".jpg"), 'wb')
        f.write(image_content)
        f.close()
        print(f"SUCCESS - saved {url} - as {folder_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

def search_and_download(search_term: str, driver_path: str, target_path='./images',
                        number_images=10):
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd,
                               sleep_between_interactions=0.5)

    counter = 0
    for elem in res:
        persist_image(target_folder, elem, counter)
        counter += 1
# How to execute this code
# Step 1 : pip install selenium, pillow, requests
# Step 2 : make sure you have Chrome installed on your machine
# Step 3 : check your Chrome version (three-dot menu > Help > About Google Chrome)
# Step 4 : download the matching chromedriver from "https://chromedriver.storage.googleapis.com/index.html"
# Step 5 : put it inside the same folder as this code

DRIVER_PATH = './chromedriver'
search_term = 'iphone'
# number of images; you can pass it from here, by default it's 10 if you are not passing it
# number_images = 10
search_and_download(search_term=search_term, driver_path=DRIVER_PATH)
You need to loop over every page and extract the images; you can keep looping as long as the anchor with the text "Next" is inside the code tag with the class resultnav:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

def get_pages(start):
    soup = BeautifulSoup(requests.get(start).content)
    images = [img["src"] for img in soup.select("div.itemspacingmodified a img")]
    yield images
    nxt = soup.select("code.resultnav a")[-1]
    while True:
        soup = BeautifulSoup(requests.get(urljoin(url, nxt["href"])).content)
        nxt = soup.select("code.resultnav a")[-1]
        if nxt.text != "Next":
            break
        yield [img["src"] for img in soup.select("div.itemspacingmodified a img")]

url = "http://www.theft-alerts.com/"

for images in get_pages(url):
    print(images)
Which gives you the images from all 19 pages.
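If you also want the files saved into a folder (the question asks for the images in a new folder), a small download step on top of get_pages could look like this sketch, assuming the src values resolve against the site root with urljoin and reusing the requests import above:

import os

if not os.path.isdir("myimages"):
    os.mkdir("myimages")

counter = 0
for images in get_pages(url):
    for src in images:
        counter += 1
        # Resolve the (possibly relative) src and write the image bytes to disk
        img_data = requests.get(urljoin(url, src)).content
        with open(os.path.join("myimages", "image{}.jpg".format(counter)), "wb") as f:
            f.write(img_data)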