Scraping with Selenium: for page in range() issue - python

I am trying to scrape ethplorer.io across several pages. My code is below, but with range(11,14) it scrapes page 11 three times, and I can't understand why.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
base_url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders="
results=[]
for page_number in range(11,14):
    url = base_url + str(page_number)
    driver.get(url)
    data = driver.find_elements(By.CLASS_NAME, "local-link")
    for x in data:
        results.append(x.text)
driver.quit()
with open("all_data.txt", "w") as file:
    for x in results:
        file.write(x + "\n")

I have applied some modifications to your code to get it working through the several pages you are calling and to capture the text within the hyperlinks I assume you are targeting. The root cause of your issue is that the page number only appears in the URL fragment (after the #), so driver.get() with a new fragment never triggers a fresh page load and you keep reading page 11; the code below clicks the pagination links instead. Please check below:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = "https://ethplorer.io/tr/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d#pageSize=100&tab=tab-holders&holders=11" # Here we are calling the first page we need, which is page #11 in this case.
xpath = "//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]"
driver.get(url)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
results=[]
for page_number in range(12,15): # Range should start from the next page (page # 12 in this case). Range end with last page you need + 2, in this case you need to scrape from 11 untill 13 ,so rage end should be 15.
for x in data: # On first round it will get page # 11 data.
results.append(x.text)
nextPage = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH,"//div[#id='token-holders-tab']//tr[contains(#class,'paginationFooter')]//A[#class='page-link'][text()='"+str(page_number)+"']")))
driver.execute_script("arguments[0].click();", nextPage)
time.sleep(3)
data =driver.find_elements_by_xpath("//div[#id='token-holders-tab']//div[#id='address-token-holders']//div[#class='block']//table//tr//td//a[contains(#class,'local-link')]")
driver.quit
with open("all_data.txt" , "w") as file:
for x in results:
file.write(x + "\n")
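Note that if you would rather keep your original URL-based approach than click the pagination links, you can force a reload on each iteration, since a fragment-only change does not navigate. A sketch (untested against the site, reusing base_url from your code and the imports above):
results = []
for page_number in range(11, 14):
    driver.get(base_url + str(page_number))
    driver.refresh()  # force a reload; changing only the #fragment does not trigger navigation
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "local-link")))
    for element in driver.find_elements(By.CLASS_NAME, "local-link"):
        results.append(element.text)
driver.quit()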


Why are elements missing in HTML while using selenium?

I am trying to scrape the text of the labels in the Product Search section of
url='https://www.hydac.com/shop/en/GP_1000188028'
I've tried all the solutions I know but got nowhere.
Here is my code:
items = soup.find_all('div', attrs={'class': 'filter-options-item'})
for item in items:
    p = item.find('label', attrs={'data-bind': 'attr: {for: id}'}).find_all('span')
    for q in p:
        print(q.text)
BeautifulSoup only parses the HTML; it does not handle requesting or rendering, which seems to be your issue.
Check the behaviour of the website in your browser: it needs some time to render the labels, so you simply have to wait.
Option#1
Simply use time.sleep() to wait:
...
driver.get(url)
time.sleep(5)
...
Option#2
Use selenium waits(recommended) to solve the issue:
...
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
...
Example
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.hydac.com/shop/en/GP_1000188028'
driver.get(url)
WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-bind="text: label"]')))
soup = BeautifulSoup(driver.page_source, 'html.parser')
[x.get_text(strip=True) for x in soup.select('#narrow-by-list label')]
Output
['3.5 m/s (piston 2)58',
'0.8 m/s (piston 3)8',
'Aluminium31',
'Carbon steel35',
'NBR / PTFE compound58',
'PUR8',
'10 l6',
'100 l5',
'120 l3',...]

Web scraping text returns 0

Whenever I try to scrape a number from a website and print it, it always returns 0, even if I delay it to let the window load first.
Here's my code:
from selenium import webdriver
import time
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
browser.get(url)
text = browser.find_element_by_xpath('//*[@id="stat_totalPlayers"]').text
time.sleep(10)
print(text)
All I need it to do is print some text that it takes from a website.
Have I done something wrong or am I just completely missing something?
You should put the delay before getting the element!
from selenium import webdriver
import time
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
browser.get(url)
time.sleep(10)
text = browser.find_element_by_xpath('//*[@id="stat_totalPlayers"]').text
print(text)
While it's better to use an explicit wait, like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'https://hytrack.me/'
browser = webdriver.Chrome(r'C:\Users\kinet\OneDrive\Documents\webscraper\chromedriver.exe')
wait = WebDriverWait(browser, 20)
browser.get(url)
text = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="stat_totalPlayers"]'))).text
print(text)

Scrape href/URL

My code goes to a webpage which contains multiple entries, gets their URLs, and puts them into a list.
Then it navigates through each URL in the list one by one and does a scrape per presentation.
Right now I scrape the title of each presentation (you can see if you run the code), but within the title there is another URL/href that I also want.
Is there a way to scrape this?
Thanks
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
val = []
driver = webdriver.Chrome()
for x in range(1, 3):
    driver.get(f'https://www.abstractsonline.com/pp8/#!/9325/sessions/#sessiontype=Advances%20in%20Diagnostics%20and%20Therapeutics/{x}')
    time.sleep(9)
    page_source = driver.page_source
    eachrow = ["https://www.abstractsonline.com/pp8/#!/9325/session/" + x.get_attribute('data-id') for x in driver.find_elements_by_xpath('//*[@id="results"]/li//h1[@class="name"]')]
    for row in eachrow:
        val.append(row)
        print(row)
for b in val:
    driver.get(b)
    time.sleep(3)
    page_source1 = driver.page_source
    soup = BeautifulSoup(page_source1, 'html.parser')
    productlist = soup.find_all('a', class_='title color-primary')
    for item in productlist:
        presentationTitle = item.text.strip()
        print(presentationTitle)
I think you want some wait conditions in there, and then to extract the href attribute for each presentation within a page:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
base = 'https://www.abstractsonline.com/pp8/#!/9325/session/'
for x in range(1, 3):
    driver.get(f'https://www.abstractsonline.com/pp8/#!/9325/sessions/#sessiontype=Advances%20in%20Diagnostics%20and%20Therapeutics/{x}')
    links = [base + i.get_attribute('data-id') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li .name")))]
    for link in links:
        driver.get(link)
        print(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "spnSessionTitle"))).text)
        for presentation in driver.find_elements(By.CSS_SELECTOR, '.title'):
            print(presentation.text.strip())
            print('https://www.abstractsonline.com/pp8' + presentation.get_attribute('href'))
Alternatively, if the visible text of the links you want shares a common pattern, you can collect them directly and read their href attributes:
links = driver.find_elements(By.PARTIAL_LINK_TEXT, 'https://yourlinks.com/?action=')
for link in links:
    print(link.get_attribute("href"))

Web page loads in Selenium and reaches the end but does not contain all the elements inside the div

This is the site:
https://www.talabat.com/uae/top-selling
There are roughly 100 products, but only 30 get loaded. I was trying to fetch all the links; the page scrolls to the end but only displays 30 products, and only when I click somewhere in the webdriver does it load the rest. How can I print the links of all the products?
Thanks in advance!!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
HOME_PAGE_URL = "https://www.talabat.com/uae/top-selling"
PATIENCE_TIME = 60
LOAD_MORE_XPATH = '//*[@id="comment-ajx"]/div'
driver = webdriver.Chrome(executable_path='C:\\Users\\Mansi Dhingra\\Downloads\\chromedriver.exe')
driver.get(HOME_PAGE_URL)
soup = BeautifulSoup(driver.page_source)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# sleep for 30s
res = []
results = driver.find_elements_by_xpath("/html/body/div[3]/ui-view/section/div/div[2]/div/div[2]/div/div[2]")
html_code = driver.find_element_by_tag_name("section").text
print(html_code)
for res in results:
    link = res.find_elements_by_tag_name('a')
    for x in link:
        product_link = x.get_attribute("href")
        print(product_link)
print(results)
The main point is that Selenium reads the page before the page has loaded all the items; you need a wait.
Just read the docs:
https://selenium-python.readthedocs.io/waits.html
Choose the best condition for your case and go for it.
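For a lazy-loaded listing like this one, a common pattern is to scroll in steps and keep going until the number of loaded items stops growing. A sketch only, not verified against talabat.com; the ".product-list a" selector is a hypothetical placeholder you would need to adapt to the real markup:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://www.talabat.com/uae/top-selling")

seen = 0
while True:
    # scroll to the bottom to trigger loading of the next batch of products
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the site time to append the new items
    products = driver.find_elements(By.CSS_SELECTOR, ".product-list a")  # hypothetical selector
    if len(products) == seen:
        break  # nothing new appeared, so we have reached the real end
    seen = len(products)

for product in products:
    print(product.get_attribute("href"))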

Element not found on page while scrolling down (StaleElementReferenceException)

I am currently scrolling through a TripAdvisor page to scrape images and need to scroll to the bottom of the page, but I keep getting the error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
I am assuming it is because it is trying to go through the page very fast, but even when I increase the implicit wait time, it does not solve the issue. I also tried making sure the new location is visible first before parsing it to get the URL, but that did not do any good either. Any help would be greatly appreciated!
# import dependencies
import re
import io
import time
import urllib.request
import urllib.parse
import requests
import pandas as pd
from _datetime import datetime
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.headless = False
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome("/Users/rishi/Downloads/chromedriver 3", options=options)
driver.maximize_window()
# open up website
driver.get(
    "https://www.tripadvisor.com/Hotel_Review-g28970-d84078-Reviews-Hyatt_Regency_Washington_on_Capitol_Hill-Washington_DC_District_of_Columbia.html#/media/84078/?albumid=101&type=2&category=101")
image_url = []
end = False
while not end:
    # wait until elements are found, then store all web elements in a list
    images = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@class="media-viewer-dt-root-GalleryImageWithOverlay__galleryImage--1Drp0"]')))
    # iterate through visible images and acquire their URLs from the background-image style
    for index, image in enumerate(images):
        image_url.append(images[index].value_of_css_property("background-image"))
    # if you are at the end of the page then leave the loop
    # if length == end_length:
    #     end = True
    # move to the next visible images in the array
    driver.execute_script("arguments[0].scrollIntoView();", images[-1])
    # time.sleep(1)
    # wait until the new web element is visible
    driver.implicitly_wait(10)
    # WebDriverWait(driver, 20).until(EC.visibility_of_element_located(images[-1]))
# clean the list to provide clear links
for i in range(len(image_url)):
    start = image_url[i].find("url(\"") + len("url(\"")
    end = image_url[i].find("\")")
    print(image_url[i][start:end])
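A common way to work around the StaleElementReferenceException, sketched below with the imports from the question plus one extra exception import (not verified against this TripAdvisor page), is to re-locate the elements after every scroll and read each property in one pass, retrying from scratch whenever the DOM changes mid-read:
from selenium.common.exceptions import StaleElementReferenceException

def collect_background_images(driver, xpath, max_retries=3):
    # Collect background-image CSS values, re-locating the elements if the DOM changes mid-read.
    for attempt in range(max_retries):
        try:
            elements = driver.find_elements(By.XPATH, xpath)
            # read every property in one pass; if the page re-renders, this raises and we retry
            return [el.value_of_css_property("background-image") for el in elements]
        except StaleElementReferenceException:
            time.sleep(1)  # let the page settle, then re-locate from scratch
    return []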
