Taking screenshots of part of a web page with Python and Selenium

I have been able to capture screenshots as PNGs of some elements, such as this one, with the following code:
from selenium import webdriver
from PIL import Image
from io import BytesIO
from os.path import expanduser
from time import sleep
# Define url and driver
url = 'https://www.formula1.com/'
driver = webdriver.Chrome('chromedriver')
# Go to url, scroll down to right point on page and find correct element
driver.get(url)
driver.execute_script('window.scrollTo(0, 4100)')
sleep(4) # Wait a little for page to load
element = driver.find_element_by_class_name('race-list')
location = element.location
size = element.size
png = driver.get_screenshot_as_png()
driver.quit()
# Store image as bytes, crop it and save to desktop
im = Image.open(BytesIO(png))
im = im.crop((200, 150, 700, 725))
path = expanduser('~/Desktop/')
im.save(path + 'F1-info.png')
This outputs the image I want (screenshot omitted), but not exactly how I want it: I had to manually hard-code the scroll position, and since I couldn't select the element I actually wanted (class='race step-1 step-2 step-3'), I had to crop the image manually as well.
Any better solutions?

In case someone is wondering, this is how I managed it in the end. First I found and scrolled to the right part of the page like this:
element = browser.find_element_by_css_selector('.race.step-1.step-2.step-3')
browser.execute_script('arguments[0].scrollIntoView()', element)
browser.execute_script('window.scrollBy(0, -80)')
and then cropped the image:
im = im.crop((200, 80, 700, 560))
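For reference, a hedged sketch of how those hard-coded crop coordinates could instead be derived from the element itself (assuming the page has been scrolled as above; element.location is document-relative, so it must be converted to viewport coordinates before cropping the viewport screenshot, and a high-DPI display may scale the screenshot and need extra adjustment):
scroll_x = browser.execute_script('return window.pageXOffset')
scroll_y = browser.execute_script('return window.pageYOffset')
loc, size = element.location, element.size
left = loc['x'] - scroll_x # viewport-relative left edge
top = loc['y'] - scroll_y # viewport-relative top edge
im = im.crop((left, top, left + size['width'], top + size['height']))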

Related

python web scraping of a dynamic bus map

The link contains a map showing the current location of the bus, and I want to scrape the map every few minutes with Python and save it as an image. I tried the following code, but the output shows only the route, not the map. Moreover, if I run it multiple times with Selenium, it opens a lot of browsers in the background. Is there any other way to do this? Thanks
Code I tried:
from PIL import Image
from selenium import webdriver
driver = webdriver.Chrome('./chromedriver')
driver.maximize_window() # maximize window
driver.get("https://mobi.mit.edu/default/transit/route?feed=nextbus&direction=loop&agency=mit&route=tech&_tab=map")
element = driver.find_element("xpath", "/html/body/div/div/main/div[2]/div/div[2]/div/div[3]/div/div/div/div/div/div") # this is the map xpath
location = element.location
size = element.size
driver.save_screenshot("canvas.png")
x = location['x']
y = location['y']
width = location['x'] + size['width']
height = location['y'] + size['height']
im = Image.open('canvas.png')
im = im.crop((int(x), int(y), int(width), int(height)))
im.save('canvas_el.png') # your file
Output: (screenshot omitted)
Expected: (screenshot omitted)
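On the multiple-browsers point, a minimal sketch, assuming one driver instance can simply be reused and the page re-loaded for each capture:
import time
from selenium import webdriver

driver = webdriver.Chrome('./chromedriver') # one browser for all captures
try:
    for i in range(10): # capture ten frames
        driver.get("https://mobi.mit.edu/default/transit/route?feed=nextbus&direction=loop&agency=mit&route=tech&_tab=map")
        time.sleep(10) # give the map tiles time to render
        driver.save_screenshot(f"canvas_{i}.png")
        time.sleep(3 * 60) # wait a few minutes before the next capture
finally:
    driver.quit()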

Python how to take screenshot of div

I'm trying to take a screenshot of the product detail section of an Amazon item. I found that the div with id="aplus" is the product detail description I'm looking for.
So I wrote code using Python and Selenium to take a full screenshot of that div.
However, the result is cropped and only shows the top part of the div.
import time
from selenium import webdriver

options = webdriver.ChromeOptions()
options.headless = True
driver = webdriver.Chrome() # note: options is created but never passed in here
URL = "https://www.amazon.co.jp/-/en/Figuarts-Dragon-Saiyan-Approx-Painted/dp/B08S7KVHMP/ref=sr_1_1?crid=3O3TF6V9FJHS5&currency=JPY&keywords=b08s7kvhmp&qid=1668143838&qu=eyJxc2MiOiIwLjAwIiwicXNhIjoiMC4wMCIsInFzcCI6IjAuMDAifQ%3D%3D&sprefix=%2Caps%2C140&sr=8-1"
driver.get(URL)
time.sleep(5)
S = lambda X: driver.execute_script('return document.body.parentNode.scroll' + X)
time.sleep(1)
driver.set_window_size(S('Width'), S('Height'))
image = driver.find_element('id', 'aplus')
image.screenshot('yes.png')
and if I put
options=options
inside webdriver.Chrome(), then depending on the product it takes a full screenshot of the div, but the screenshot does not contain any images.
I have no idea how to take a full screenshot of the div :S
For this example you need to install the PIL (Pillow) library:
pip install Pillow
from selenium import webdriver
from PIL import Image
from io import BytesIO
options = webdriver.ChromeOptions()
options.headless = True
driver = webdriver.Chrome(options=options)
URL = "https://www.amazon.co.jp/-/en/Figuarts-Dragon-Saiyan-Approx-Painted/dp/B08S7KVHMP/ref=sr_1_1?crid=3O3TF6V9FJHS5&currency=JPY&keywords=b08s7kvhmp&qid=1668143838&qu=eyJxc2MiOiIwLjAwIiwicXNhIjoiMC4wMCIsInFzcCI6IjAuMDAifQ%3D%3D&sprefix=%2Caps%2C140&sr=8-1"
driver.get(URL)
# now that we have the preliminary stuff out of the way, time to get that image :D
element = driver.find_element_by_id('aplus') # find part of the page you want image of
location = element.location
size = element.size
png = driver.get_screenshot_as_png() # saves screenshot of entire page
driver.quit()
im = Image.open(BytesIO(png)) # uses PIL library to open image in memory
left = location['x']
top = location['y']
right = location['x'] + size['width']
bottom = location['y'] + size['height']
im = im.crop((left, top, right, bottom)) # defines crop points
im.save('screenshot.png') # saves new cropped image
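Alternatively, a sketch of the window-resize approach from the question, under the assumption that in headless mode the window can be grown to the full document height so that element.screenshot() captures the whole div (as the question notes, lazy-loaded images may still need a scroll and a short wait before the shot):
import time
from selenium import webdriver

options = webdriver.ChromeOptions()
options.headless = True # resizing beyond the physical screen works headless
driver = webdriver.Chrome(options=options)
driver.get(URL) # the product URL from above
time.sleep(5)
# grow the window to the full document size so nothing is clipped
width = driver.execute_script('return document.body.parentNode.scrollWidth')
height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(width, height)
element = driver.find_element_by_id('aplus')
driver.execute_script('arguments[0].scrollIntoView()', element) # trigger lazy images
time.sleep(2)
element.screenshot('aplus.png') # Selenium crops the element for us
driver.quit()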

Downloading images with selenium and requests: why does the .get_attribute() method of a WebElement return a URL in base64?

I have written a web-scraping program that goes to an online marketplace like www.tutti.ch, searches for a category keyword, and then downloads all the resulting photos of the search result to a folder.
#! python3
# imageSiteDownloader_stack.py - A program that goes to an online marketplace like
# tutti.ch, searches for a category of photos, and then downloads all the
# resulting images.
import requests, os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox() # Opens Firefox webbrowser
browser.get('https://www.tutti.ch/') # Go to tutti.ch website
wait = WebDriverWait(browser, 10)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click() # accepts cookies terms
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "._1CFCt > input:nth-child(1)"))).send_keys('Gartenstuhl') # enters search key word
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[id*='1-val-searchLabel']"))).click() # clicks submit button
os.makedirs('tuttiBilder', exist_ok=True) # creates new folder
images = browser.find_elements(By.TAG_NAME, 'img') # stores every img element in a list
for im in images:
    imageURL = im.get_attribute('src') # get the URL of the image
    print('Downloading image %s...' % (imageURL))
    res = requests.get(imageURL) # downloads the image
    res.raise_for_status()
    imageFile = open(os.path.join('tuttiBilder', os.path.basename(imageURL)), 'wb') # creates an image file
    for chunk in res.iter_content(100000): # writes to the image file
        imageFile.write(chunk)
    imageFile.close()
print('Done.')
browser.quit()
My program crashes at line 26 (screenshot of the exception omitted).
The program downloads the first couple of photos correctly, but then suddenly crashes.
Looking for solutions on stackoverflow, I have found this post: Requests : No connection adapters were found for, error in Python3
The answer to the post above suggests that the problem arises because of a newline character in the URL.
I checked the source URLs of the photos in the HTML code that couldn't be downloaded. They seem to be OK.
The problem seems to be either the browser.find_elements() method, which parses the 'src' attribute values incorrectly, or the .get_attribute() method, which cannot fetch some of the URLs correctly. Instead of getting something like
https://c.tutti.ch/images/23452346536.jpg
the method gives back strings like
data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
Of course, this is not a valid URL that the requests.get() method can use to download the image. I did some research and found out that this might be a base64 string...
Why does the .get_attribute() method return a base64 string in some cases? Can I prevent it from doing so? Or do I have to convert it to a normal string?
Update: Another approach using BeautifulSoup for parsing instead of WebDriver. (This code is not working yet; the data URLs are still a problem.)
import requests, sys, os, bs4
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox() # Opens Firefox webbrowser
browser.get('https://www.tutti.ch/') # Go to tutti.ch website
wait = WebDriverWait(browser, 10)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click()
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "._1CFCt > input:nth-child(1)"))).send_keys(sys.argv[1:])
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[id*='1-val-searchLabel']"))).click() # https://www.tutorialspoint.com/how-to-locate-element-by-partial-id-match-in-selenium
os.makedirs('tuttiBilder', exist_ok=True)
url = browser.current_url
print('Downloading page %s...' % url)
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
#Check for errors from here
images = soup.select('div[style] > img')
for im in images:
    imageURL = im.get('src') # get the URL of the image
    print('Downloading image %s...' % (imageURL))
    res = requests.get(imageURL) # downloads the image
    res.raise_for_status()
    imageFile = open(os.path.join('tuttiBilder', os.path.basename(imageURL)), 'wb') # creates an image file
    for chunk in res.iter_content(100000): # writes to the image file
        imageFile.write(chunk)
    imageFile.close()
print('Done.')
print('Done.')
browser.quit()
The program crashes because you attempt to download a file (image) using a base64-encoded string, which is not a valid image URL. The reason these base64 strings show up in your images list is that each image (in <img> tags) initially appears as a base64 string, and once it is loaded, the src value changes to a valid image URL. (You can check that by opening the DevTools in your browser while accessing your website at https://...ganze-schweiz?q=Gartenstuhl and searching for "base64" in the Elements section of the DevTools. By moving to the next image in the search findings, using the arrow buttons, you'll notice the behaviour described above.) This is also the reason (as shown in your cmd window snippet, and as I tested myself) that only 3 to 5 images are found and downloaded: those are the images appearing at the top of the page, which are successfully loaded and given a valid image URL when the page is accessed, whereas the remaining <img> tags still include a base64 string.
So the first step is, once the "search results" operation is completed, to slowly scroll down the page so that every image on the page is loaded and given a valid URL. You can achieve that by using the method described here. You can adjust the speed as you wish, as long as it allows items/images to load properly.
The second step is to ensure that only valid URLs are passed to the requests.get() method. Although every base64 string will be replaced by a valid URL due to the above fix, there might still be invalid image URLs in the list; in fact, there seems to be one (not related to the items) starting with https://bat.bing.com/action/0?t..... Thus, it is prudent to check that the requested URLs are valid image URLs before attempting to download them. You can do that by using the str.endswith() method, looking for strings ending with specific suffixes (extensions), such as ".png" and ".jpg". If a string in the images list does end with one of those extensions, you can proceed to download the image. A working example is given below (please note, it downloads the images appearing on the first page of search results; if you need further image results, you can extend the program to navigate to the next page and repeat the same steps).
Update 1
The code below has been updated, so that one can obtain further results by navigating to the following pages and downloading the images. You can set the number of "next pages" from which you would like to get results by adjusting the next_pages_no variable.
import requests, os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
suffixes = (".png", ".jpg")
next_pages_no = 3
browser = webdriver.Firefox() # Opens Firefox webbrowser
#browser = webdriver.Chrome() # Opens Chrome webbrowser
wait = WebDriverWait(browser, 10)
os.makedirs('tuttiBilder', exist_ok=True)

def scroll_down_page(speed=40):
    current_scroll_position, new_height = 0, 1
    while current_scroll_position <= new_height:
        current_scroll_position += speed
        browser.execute_script("window.scrollTo(0, {});".format(current_scroll_position))
        new_height = browser.execute_script("return document.body.scrollHeight")

def save_images(images):
    for im in images:
        imageURL = im.get_attribute('src') # gets the URL of the image
        if imageURL.endswith(suffixes):
            print('Downloading image %s...' % (imageURL))
            res = requests.get(imageURL, stream=True) # downloads the image
            res.raise_for_status()
            imageFile = open(os.path.join('tuttiBilder', os.path.basename(imageURL)), 'wb') # creates an image file
            for chunk in res.iter_content(1024): # writes to the image file
                imageFile.write(chunk)
            imageFile.close()

def get_first_page_results():
    browser.get('https://www.tutti.ch/')
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#onetrust-accept-btn-handler"))).click() # accepts cookies terms
    wait.until(EC.presence_of_element_located((By.XPATH, '//form//*[name()="input"][@data-automation="li-text-input-search"]'))).send_keys('Gartenstuhl') # enters search keyword
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[id*='1-val-searchLabel']"))).click() # clicks submit button
    scroll_down_page() # scroll down the page slowly for the images to load
    images = browser.find_elements(By.TAG_NAME, 'img') # stores every img element in a list
    save_images(images)

def get_next_page_results():
    wait.until(EC.visibility_of_element_located((By.XPATH, '//button//*[name()="svg"][@data-testid="NavigateNextIcon"]'))).click()
    scroll_down_page() # scroll down the page slowly for the images to load
    images = browser.find_elements(By.TAG_NAME, 'img') # stores every img element in a list
    save_images(images)

get_first_page_results()
for _ in range(next_pages_no):
    get_next_page_results()
print('Done.')
browser.quit()
Update 2
As per your request, here is an alternative approach to the problem, using Python requests to download the HTML content of a given URL, and the BeautifulSoup library to parse that content in order to get the image URLs. As it appears in the HTML content, both base64 strings and actual image URLs are included (base64 strings occur in exactly the same number as image URLs). Thus, you can use the same approach as above to check their suffixes before downloading them. A complete working example is below (adjust the page range in the for loop as you wish).
import requests
from bs4 import BeautifulSoup as bs
import os
suffixes = (".png", ".jpg")
os.makedirs('tuttiBilder', exist_ok=True)

def save_images(imageURLS):
    for imageURL in imageURLS:
        if imageURL.endswith(suffixes):
            print('Downloading image %s...' % (imageURL))
            res = requests.get(imageURL, stream=True) # downloads the image
            res.raise_for_status()
            imageFile = open(os.path.join('tuttiBilder', os.path.basename(imageURL)), 'wb') # creates an image file
            for chunk in res.iter_content(1024): # writes to the image file
                imageFile.write(chunk)
            imageFile.close()

def get_results(page_no, search_term):
    response = requests.get('https://www.tutti.ch/de/li/ganze-schweiz?o=' + str(page_no) + '&q=' + search_term)
    soup = bs(response.content, 'html.parser')
    images = soup.findAll("img")
    imageURLS = [image['src'] for image in images]
    save_images(imageURLS)

for i in range(1, 4): # get results from page 1 to page 3
    get_results(i, "Gartenstuhl")
Update 3
To clear things up: the base64 strings are all the same, i.e., R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7. You can check this by saving the received HTML content to a file (to do this, add the code below to the get_results() method of the second solution), opening it with a text editor, and searching for "base64".
with open("page.html", 'wb') as f:
    f.write(response.content)
If you enter the above base64 string into a "base64-to-image" converter online, then download and open the image with a graphics editor (such as Paint), you will see that it is a 1px image (usually called a "tracking pixel"). This "tracking pixel" is used in the Web beacon technique to check that a user has accessed some content - in your case, a product in the list.
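You can also verify this locally instead of using an online converter; a quick sketch with PIL, decoding the string quoted above:
import base64
from io import BytesIO
from PIL import Image

b64 = "R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"
img = Image.open(BytesIO(base64.b64decode(b64)))
print(img.format, img.size) # GIF (1, 1) - a 1x1 tracking pixel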
The base64 string is not an invalid URL that somehow turns into a valid one. It is an encoded image string, which can be decoded to recover the image. Thus, in the first solution using Selenium, when scrolling down the page, those base64 strings are not converted into valid image URLs; rather, they tell the website that you have accessed some content, and the website then removes/hides them, which is why they do not show up in the results. The images (and hence, the image URLs) appear as soon as you scroll down to a product, because a common technique called "image lazy loading" is used (to improve performance, user experience, etc.). Lazy loading instructs the browser to defer loading of images that are off-screen until the user scrolls near them.

In the second solution, since requests.get() is used to retrieve the HTML content, the base64 strings are still in the HTML document, one per product. Again, those base64 strings are all the same, and are 1px images used for the purpose mentioned earlier. So you don't need them in your results, and they should be ignored.

Both solutions above download all the product images present in the webpage; you can check that by looking into the tuttiBilder folder after running the programs. If you still want to save those base64 images (which is pointless, as they are all the same and not useful), replace the save_images() method in the second solution (i.e., the one using BeautifulSoup) with the one below, and make sure to import the extra libraries (as shown). It will save all the base64 images, along with the product images, in the same tuttiBilder folder, assigning them unique identifiers as filenames (as they don't carry a filename).
import re
import base64
import uuid

def save_images(imageURLS):
    for imageURL in imageURLS:
        if imageURL.endswith(suffixes):
            print('Downloading image %s...' % (imageURL))
            res = requests.get(imageURL, stream=True) # downloads the image
            res.raise_for_status()
            imageFile = open(os.path.join('tuttiBilder', os.path.basename(imageURL)), 'wb') # creates an image file
            for chunk in res.iter_content(1024): # writes to the image file
                imageFile.write(chunk)
            imageFile.close()
        elif imageURL.startswith("data:image/"):
            base64string = re.sub(r"^.*?/.*?,", "", imageURL)
            image_as_bytes = str.encode(base64string) # convert string to bytes
            recovered_img = base64.b64decode(image_as_bytes) # decode base64string
            filename = os.path.join('tuttiBilder', str(uuid.uuid4()) + ".png")
            with open(filename, "wb") as f:
                f.write(recovered_img)
May I suggest not using Selenium: there is a backend API that serves the data for each page. The only tricky thing is that requests to the API need to carry a certain UUID hash, which is in the HTML of the landing page. So you can grab it when you visit the landing page, then use it to sign your subsequent API calls. Here is an example that loops through the pages and the images for each post:
import requests
import re
import os
search = 'Gartenstuhl'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = f'https://www.tutti.ch/de/li/ganze-schweiz?q={search}'
step = requests.get(url, headers=headers)
print(step)
uuids = re.findall(r'[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}', step.text)
print(f'tutti hash code: {uuids[0]}') # used to sign requests to api
os.makedirs('tuttiBilder', exist_ok=True)
for page in range(1, 10):
    api = f'https://www.tutti.ch/api/v10/list.json?aggregated={page}&limit=30&o=1&q={search}&with_all_regions=true'
    new_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
        'x-tutti-hash': uuids[0],
        'x-tutti-source': 'web latest-staging'
    }
    resp = requests.get(api, headers=new_headers).json()
    for item in resp['items']:
        for image in item['image_names']:
            image_url = 'https://c.tutti.ch/images/' + image
            pic = requests.get(image_url)
            with open(os.path.join('tuttiBilder', os.path.basename(image)), 'wb') as f:
                f.write(pic.content)
            print(f'Saved: {image}')
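Going through the JSON API like this also sidesteps the lazy-loading and tracking-pixel issues described above entirely, since the real image filenames come straight from the API response rather than from <img> tags.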
That is not a URL of any kind. The actual image data is stored in there, base64-encoded. Try copying it into your browser's address bar (starting with the data: part), and you will see the image.
What happened is that the image is not hosted at a separate URL but is embedded into the website; your browser simply decoded that data to render the image. If you want to get the raw image data, base64-decode everything after the ;base64, part.

How to download an image from a website every six minutes?

I am working on a machine learning project and need a LOT of pictures for the data set that will train my program. The website https://its.txdot.gov/ITS_WEB/FrontEnd/default.html?r=SAT&p=San%20Antonio&t=cctv has pictures that are updated every six minutes. I need to save the image at LP 1604 at Kyle Seale Pkwy, but can't figure out how. I'm trying to right-click on the image using action chains to save it. Here's what I have so far:
import time
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome()
driver.get('https://its.txdot.gov/ITS_WEB/FrontEnd/default.html?r=SAT&p=San%20Antonio&t=cctv')
time.sleep(5) # to let the site load
driver.find_element_by_id('LP-1604').click() # to get to the 1604 tab
time.sleep(5) # to let the site load
pic = driver.find_element_by_id('LP 1604 at Kyle Seale Pkwy__SAT')
action = ActionChains(driver)
action.context_click(pic) # note: queued actions only run once .perform() is called
The drop-down menu that usually pops up when you right-click is not showing up. And I feel like there has to be a better way to do this than right-clicking anyway. I know how to wrap this in a loop that will execute every six minutes, so I don't need help there; it's just the downloading-the-image part. One of the problems I run into is that all the images are served under the same URL, and most examples out there rely on per-image URLs. Any suggestions would be helpful.
I think this could help you save the images on your PC:
from PIL import Image

def save_image_on_disk(driver, element, path):
    location = element.location
    size = element.size
    # saves screenshot of entire page
    driver.save_screenshot(path)
    # uses PIL library to open image in memory
    image = Image.open(path)
    left = location['x']
    top = location['y'] + 0
    right = location['x'] + size['width']
    bottom = location['y'] + size['height'] + 0
    image = image.crop((left, top, right, bottom)) # defines crop points
    image = image.convert('RGB')
    image.save(path, 'png') # saves new cropped image

def your_main_method():
    some_element_img = driver.find_element_by_xpath('//*[@id="id-of-image"]')
    save_image_on_disk(driver, some_element_img, 'my-image.png')
As for the timing, you can use time.sleep(6*60).
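For example, a minimal sketch of the capture loop (the XPath id is the same hypothetical placeholder as above, and a timestamped filename keeps every capture instead of overwriting; whether the page needs re-loading between captures depends on how the site updates the image):
import time
from datetime import datetime

while True:
    some_element_img = driver.find_element_by_xpath('//*[@id="id-of-image"]')
    filename = datetime.now().strftime('capture_%Y%m%d_%H%M%S.png')
    save_image_on_disk(driver, some_element_img, filename)
    time.sleep(6 * 60) # wait six minutes before the next capture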
The image data is located in the src property of the currentSnap element. It's encoded in base64, so you need to capture it and decode it. Then, using PIL, you can do anything you like with the image.
You can also use Selenium's built-in wait functions instead of hard-coded sleeps. In this case the image sometimes loads even after the image element itself loads, so there's still an additional short sleep in the code to allow it to load.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from io import BytesIO
import base64
import re
import time
# Max time to wait for page to load
timeout=10
driver = webdriver.Chrome()
driver.get('https://its.txdot.gov/ITS_WEB/FrontEnd/default.html?r=SAT&p=San%20Antonio&t=cctv')
# Wait for element to load before clicking
element_present = EC.presence_of_element_located((By.ID, 'LP-1604'))
WebDriverWait(driver, timeout).until(element_present)
driver.find_element_by_id('LP-1604').click() #to get to the 1604 tab
# Wait for image to load before capturing data
element_present = EC.presence_of_element_located((By.ID, 'currentSnap'))
WebDriverWait(driver, timeout).until(element_present)
# Sometimes the image still loads after the element is present, give it a few more seconds
time.sleep(4)
# Get base64 encoded image data from src
pic = driver.find_element_by_id('currentSnap').get_attribute('src')
# Strip prefix
pic = re.sub('^data:image/.+;base64,', '', pic)
# Load image file to memory
im = Image.open(BytesIO(base64.b64decode(pic)))
# Write to disk
im.save('image.jpg')
# Display image in Jupyter
im
# Open in your default image viewer
im.show()

Automatically substituting logos in web images

Consider the following task:
1. Open a given URL
2. Find the first image tag on the page
3. Substitute it with an image on your local drive
4. Save the resulting webpage as a PNG
I want to automate this task with a Python script, and I am unsure of the best approach.
I have been using Selenium to convert URLs into screenshots, but I am unsure how to introduce the part about modifying the first image tag to load a local file.
You can use execute_script to replace the image; it should look something like:
from selenium import webdriver
browser = webdriver.Firefox()
url = 'https://www.aircanada.com/en/'
browser.get(url)
my_image = browser.find_element_by_xpath('//*[@id="pagePromoBanner-wrapper"]/div/a/img')
# or
# my_image = browser.find_element_by_xpath('any XPath')
link_to_new_image = "https://images.pexels.com/photos/67636/rose-blue-flower-rose-blooms-67636.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=750&w=1260"
# if you are using python 3.6 and up:
browser.execute_script(f"arguments[0].src = '{link_to_new_image}'", my_image)
# else:
# browser.execute_script("arguments[0].src = '" + link_to_new_image + "'", my_image)
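To also cover the last step of the task (saving the resulting page as a PNG), a screenshot call can follow, e.g.:
browser.save_screenshot('result.png')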
Hope this helps you!
