I have written this simple script to retrieve the thumbnail URLs after performing a YouTube search for "programming".
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
url = "https://www.youtube.com/results?search_query=programming"
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
browser.get(url)
time.sleep(10)
image_thumbnails = set()
thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope ytd-thumbnail no-transition")
while len(image_thumbnails) < 10:
    thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope yt-img-shadow")
    for img in thumbnails:
        image_thumbnails.add(img.get_attribute('src'))
        print(img)
    time.sleep(10)
browser.close()
However, the output I get is the following, not the URLs:
[<selenium.webdriver.remote.webelement.WebElement (session="b9a60c0fe036ab4d592094d611ed7da0", element="cfdf2ad0-41b5-47a6-af7d-8bb00a80175f")>, ...]
You are printing the WebElement used in the for loop. To solve this, just print the value you are adding instead, e.g.:
print(img.get_attribute('src'))
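For completeness, a minimal sketch of the corrected loop, reusing the locator from the question (whether that locator still matches the thumbnail elements on the current YouTube layout is an assumption carried over from the original code):

image_thumbnails = set()
while len(image_thumbnails) < 10:
    thumbnails = browser.find_elements(By.CLASS_NAME, "style-scope yt-img-shadow")
    for img in thumbnails:
        src = img.get_attribute('src')
        if src:  # some <img> elements may not have a src attribute yet
            image_thumbnails.add(src)
            print(src)  # print the URL string, not the WebElement
    time.sleep(10)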
I have a question. I'm making myself a portfolio app (scraping + Python) and I can't figure out two things.
First: how do I download the photo thumbnails from this page?
https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944
I had code that downloaded the image found with this XPath:
//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[{counter}]/a/div/div/div[1]/div[1]/div
It was working fine, but for the last two days I have been getting:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element...
and I'm pulling my hair out over it. As for the code, I'm just learning scraping, so maybe it's not the best solution (maybe JavaScript changes something dynamically?). How can I get these thumbnails?
Second question: which DIV should I click to enter a specific car listing? I'm lost here as well, because you can click both the price and the photo and either one opens the listing (example code: driver.find_element(By.CLASS_NAME, 'css-8wsg1m').click()).
Thank you very much for your help.
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)
option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click() # Cookies
img=driver.find_element(By.CLASS_NAME, 'css-gl6djm')
imgURL=img.get_attribute('src')
savedImageName='Image.jpg'
downloadImage(imgURL,savedImageName)
name = r'/home/.../Pulpit/GitHub_Public/Selenium_Porsche/work_dir/Image.jpg'
url = r'https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944'
downloadImage(url, name)
The easiest way to achieve this is to create a downloadImage() function that takes the image URL as input and downloads the image to your machine. You can easily get the image URL with the help of Selenium.
import requests
def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)
img = driver.find_element(By.CLASS_NAME, 'CLASS_NAME_HERE')
imgURL = img.get_attribute('src')
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
Your XPath is wrong. Here's the final code:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
def downloadImage(url, name):
    r = requests.get(url)
    with open(name, "wb") as f:
        f.write(r.content)
option = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=option)
driver.get('https://www.olx.pl/d/motoryzacja/samochody/porsche/?search%5Bfilter_enum_model%5D%5B0%5D=944')
driver.find_element(By.ID, 'onetrust-accept-btn-handler').click() # Cookies
img = driver.find_element(
    By.XPATH, '//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[2]/a/div/div/div[1]/div[1]/div/img')
imgURL = img.get_attribute('src')
print(imgURL)
savedImageName = 'Image.jpg'
downloadImage(imgURL, savedImageName)
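If the NoSuchElementException comes back, it may simply be a timing issue (the grid is rendered by JavaScript after the initial page load). A hedged sketch that replaces the plain find_element call with an explicit wait on the same XPath:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for the thumbnail to be present before reading its src.
wait = WebDriverWait(driver, 15)
img = wait.until(EC.presence_of_element_located((
    By.XPATH,
    '//*[@id="root"]/div[1]/div[2]/form/div[5]/div/div[2]/div[2]/a/div/div/div[1]/div[1]/div/img')))
print(img.get_attribute('src'))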
I am currently working on a scraper for aniworld.to.
My goal is to enter the anime name and have all of the episodes downloaded.
I have everything working except one thing...
The website has a Watch button. That button redirects you to https://aniworld.to/redirect/SOMETHING, and that site has a captcha, which means the link is not in the HTML...
Is there a way to bypass this or get the link in Python? Or a way to display the captcha so I can solve it?
The captcha only appears once in a blue moon anyway.
The only thing I need from that page is the redirect link. It looks like this:
https://vidoza.net/embed-something.html
My very very wip code is here if it helps: https://github.com/wolfswolke/aniworld_scraper
Mitchdu showed me how to do it.
If anyone else needs help here is my code: https://github.com/wolfswolke/aniworld_scraper/blob/main/src/logic/captcha.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from threading import Thread
import os
def open_captcha_window(full_url):
    working_dir = os.getcwd()
    path_to_ublock = r'{}\extensions\ublock'.format(working_dir)
    options = webdriver.ChromeOptions()
    options.add_argument("app=" + full_url)
    options.add_argument("window-size=423,705")
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    if os.path.exists(path_to_ublock):
        options.add_argument('load-extension=' + path_to_ublock)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(full_url)
    wait = WebDriverWait(driver, 100, 0.3)
    wait.until(lambda redirect: redirect.current_url != full_url)
    new_page = driver.current_url
    Thread(target=threaded_driver_close, args=(driver,)).start()
    return new_page

def threaded_driver_close(driver):
    driver.close()
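A usage sketch, assuming the helper above; the redirect URL is the placeholder from the question:

# Solve the captcha in the window that opens; the function returns the URL the page
# redirects to, e.g. a vidoza.net embed link.
final_link = open_captcha_window("https://aniworld.to/redirect/SOMETHING")
print(final_link)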
I am working on an office project to get data to check the active status on different websites, but whenever I try to get the data it sometimes shows None and sometimes raises this AttributeError. I followed the steps from YouTube videos but still get this error. Help, please.
Python code:
from bs4 import BeautifulSoup
import requests
html_text = requests.get(
"https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw").text
soup = BeautifulSoup(html_text, 'lxml')
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
The page is dynamic, meaning the data is populated by JavaScript, so you need an automation tool such as Selenium.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(8)
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
#driver.close()
status = soup.find('div', {'class': "ValidatorInfo_statusBadge__PBIGr"})
para = status.find('p').text
print(para)
Output:
Active
You have the most common problem: modern pages use JavaScript to add elements, but requests/BeautifulSoup can't run JavaScript.
So soup.find('div', ...) gives None instead of the expected element, and later None.find('p') raises the error.
You can use Selenium to control a real web browser, which can run JavaScript.
from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
url = "https://www.mintscan.io/cosmos/validators/cosmosvaloper1we6knm8qartmmh2r0qfpsz6pq0s7emv3e0meuw"
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
#status = driver.find_element(By.XPATH, '//div[#class="ValidatorInfo_statusBadge__PBIGr"]')
wait = WebDriverWait(driver, 10)
status = wait.until(EC.visibility_of_element_located((By.XPATH, '//div[#class="ValidatorInfo_statusBadge__PBIGr"]')))
print(status.text)
You should also check whether the page provides an API to get the data.
You can use DevTools (Network tab) to check whether JavaScript reads the data from some URL, and then try to use that URL with requests. It can be faster than Selenium, but the server may detect a script/bot and block it.
JavaScript usually gets the data as JSON, so you may not even need to scrape HTML with BeautifulSoup.
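A minimal sketch of that approach, assuming DevTools reveals a JSON endpoint; the URL below is a placeholder, not a real Mintscan API path:

import requests

# Placeholder endpoint - replace with the URL found in DevTools -> Network (XHR/Fetch).
json_url = "https://example.com/api/validator-status"

response = requests.get(json_url, timeout=10)
response.raise_for_status()
data = response.json()  # already structured data, no HTML parsing needed
print(data)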
I am learning web crawling and scraping in Python. I want to scrape data from a site that has links, and inside those links there are more links, and so on. So I want to scrape data down to a predefined level n.
This is my basic code
import requests
from selenium import webdriver
from requests_ntlm import HttpNtlmAuth
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from webdrivermanager import GeckoDriverManager
import pickle
from selenium.webdriver.common.keys import Keys
from urllib.parse import urljoin
from seleniumrequests import Chrome
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
web_url = 'https://spaceflightnow.com/'
driver.get("https://spaceflightnow.com/")
time.sleep(5)
soup = BeautifulSoup(driver.page_source,"lxml")
#section = soup.section
links=[]
for url in soup.find_all('a', href=True):
    links.append(urljoin(web_url, url.get('href')))
    #print(urljoin(web_url, url.get('href')))
links = list(filter(lambda x: x != web_url,links))
print(links)
This prints multiple links from the first page. Now I want to visit all of those links, scrape them again and collect more links inside, down to subsequent levels. The same links may show up again (for example from the news feed). So what should my approach be? I understand I need a tree, but I cannot figure out exactly how to build it.
Should I create a list inside a list? But how do I do that dynamically down to level n, and how do I map it to the data saved in a file? Can anyone help me with this, maybe with a sample solution?
Thank you :)
I made an example that works without recursion; I would say it is similar to a breadth-first search.
It keeps the URLs in a list [(url, level), ...] to control the level, and in a set() to filter out already-visited pages. It also filters out links to external pages.
Tested with Firefox.
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# ---
def get_links(driver, url):
    driver.get(url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "lxml")
    links = []
    for new_url in soup.find_all('a', href=True):
        new_url = new_url.get('href')
        new_url = urljoin(url, new_url)
        links.append(new_url)
    return links
# ---
options = Options()
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--profile-directory=Default")
driver = webdriver.Chrome("./chromedriver",options=options)
#driver = webdriver.Firefox()
# ---
domain = 'https://spaceflightnow.com/' # to filter external links
start_url = 'https://spaceflightnow.com/'
max_level = 2
links_visited = set([start_url]) # to test visited links
links_with_levels = [(start_url, 0)] # to control levels
# ---
for link, level in links_with_levels:
    if level >= max_level:
        print('skip:', level, link)
        continue
    print('visit:', level, link)
    links = get_links(driver, link)
    print('found:', len(links))
    links = list(set(links) - links_visited)
    print('after filtering:', len(links))
    level += 1
    for new_link in links:
        if new_link.startswith(domain):  # filter external links
            links_visited.add(new_link)
            links_with_levels.append((new_link, level))
# ---
for link, level in links_with_levels:
    print('skip:', level, link)
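To address the "map it with the data saved in a file" part of the question, a minimal sketch that persists the collected (url, level) pairs as CSV (the filename is arbitrary):

import csv

# Save the collected (url, level) pairs so they can be mapped to scraped data later.
with open("links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["url", "level"])
    writer.writerows(links_with_levels)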
I am using BeautifulSoup for extracting pictures, which works well for normal pages.
Now I want to extract the picture of the Chromebook from a web page like this:
https://twitter.com/banprada/statuses/829102430017187841
The page apparently contains a link to another page with the image. Here is my code for downloading an image from the mentioned link, but I am only getting the image of the person who posted the link.
import urllib.request
import os
from bs4 import BeautifulSoup
URL = "http://twitter.com/banprada/statuses/829102430017187841"
list_dir="D:\\"
default_dir = os.path.join(list_dir,"Pictures_neu")
opener = urllib.request.build_opener()
urllib.request.install_opener(opener)
soup = BeautifulSoup(urllib.request.urlopen(URL).read())
imgs = soup.findAll("img",{"alt":True, "src":True})
for img in imgs:
    img_url = img["src"]
    filename = os.path.join(default_dir, img_url.split("/")[-1])
    img_data = opener.open(img_url)
    f = open(filename, "wb")
    f.write(img_data.read())
    f.close()
Is there a way to download that image somehow?
Many thanks and regards,
Andi
This is how you can get only the mentioned image using Selenium + requests:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.PhantomJS()
driver.get(link)
wait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[starts-with(@id, 'xdm_default')]")))
image_src = driver.find_element_by_tag_name('img').get_attribute('src')
response = requests.get(image_src).content
with open('C:\\Users\\You\\Desktop\\Image.jpeg', 'wb') as f:
    f.write(response)
If you want to get all the images from all iframes on the page (excluding images in the initial page source, which you can get with your code):
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests
import time
link = 'https://twitter.com/banprada/statuses/829102430017187841'
driver = webdriver.Chrome()
driver.get(link)
time.sleep(5) # To wait until all iframes completely rendered. Might be increased
iframe_counter = 0
while True:
    try:
        driver.switch_to_frame(iframe_counter)
        pictures = driver.find_elements_by_xpath('//img[@src and @alt]')
        if len(pictures) > 0:
            for pic in pictures:
                response = requests.get(pic.get_attribute('src')).content
                with open('C:\\Users\\You\\Desktop\\Images\\%s.jpeg' % (str(iframe_counter) + str(pictures.index(pic))), 'wb') as f:
                    f.write(response)
        driver.switch_to_default_content()
        iframe_counter += 1
    except WebDriverException:
        break
Note that you can use any webdriver.
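Keep in mind that webdriver.PhantomJS, switch_to_frame and the find_element(s)_by_* helpers have been removed from current Selenium releases. A hedged sketch of the same iframe loop with the Selenium 4 API (untested against the current Twitter page, which may have changed since this was written):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
import requests
import time

driver = webdriver.Chrome()
driver.get('https://twitter.com/banprada/statuses/829102430017187841')
time.sleep(5)  # wait until the iframes are rendered

iframe_counter = 0
while True:
    try:
        driver.switch_to.frame(iframe_counter)
        pictures = driver.find_elements(By.XPATH, '//img[@src and @alt]')
        for pic in pictures:
            content = requests.get(pic.get_attribute('src')).content
            with open('image_%d_%d.jpeg' % (iframe_counter, pictures.index(pic)), 'wb') as f:
                f.write(content)
        driver.switch_to.default_content()
        iframe_counter += 1
    except WebDriverException:
        break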