How do I access an internal link with Selenium that seems restricted - python

I am trying to fetch data from nj.58.com using selenium. I can access the homepage and some internal links. While navigating through the links, I noticed that the website identifies me as a web crawler when I visit a specific URL, even though I interact with the links like a human.
I have built my selenium script to a point, but I'm stuck because the site throws an anti-bot response back at me.
Here is what I've done:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select  # was imported twice; once is enough
import undetected_chromedriver as uc
import time
import pandas as pd

# Use undetected-chromedriver to reduce basic automation fingerprinting.
driver = uc.Chrome()
website = 'https://nj.58.com/'
driver.get(website)
driver.implicitly_wait(4)
wait = WebDriverWait(driver, 10)
driver.maximize_window()

# Open the city-switcher. Note: .click() returns None, so its result
# should not be assigned to a variable (the original kept a useless binding).
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#commonTopbar_ipconfig > a"))).click()

# Type the city name (Nanjing) into the city search box and confirm.
city_location = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#selector-search-input')))
city_location.clear()
city_location.send_keys('南京' + Keys.RETURN)

# Fill in the search keyword (quoted phrase: waste-paper recycling).
keyword = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#keyword')))
keyword.clear()
keyword.send_keys('"废纸回收"')
time.sleep(2)

# Submit the search; the site's anti-bot check triggers at this step.
search_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#searchbtn')))
search_btn.click()
When I click on search_btn, I expect to see a list of the items I'm interested in. Instead, the site flags me as a web crawler at that point (the search_btn click) — even when I perform the same click manually, without Selenium.
How can I bypass this antibot/antihuman detection at point when I click on search_btn?

Related

Python, Selenium. Google Chrome. Web Scraping. How to navigate between 'tabs' in website

I'm quite a noob in python and right now building up a web scraper in Selenium that would take all URLs for products in the clicked 'tab' on the web page. But my code takes the URLs from the first 'tab'. Code below. Thank you guys. I'm starting to get kind of frustrated lol.
Screenshot
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import html

# Raw string so the backslashes in the Windows path are not treated as escapes.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)
url = 'https://www.alza.sk/vypredaj-akcia-zlava/e0.htm'
driver.get(url)

# XPath attribute tests use '@', not '#' — the original '#id' (a markdown
# mangling artifact) would raise an InvalidSelectorException.
driver.find_element_by_xpath('//*[@id="tabs"]/ul/li[2]').click()

links = []
try:
    # Wait for the product grid to load before collecting the anchors.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'blockFilter')))
    # Same fix here: '@class', not '#class'.
    link = driver.find_elements_by_xpath(
        "//a[@class='name browsinglink impression-binded']")
    for i in link:
        links.append(i.get_attribute('href'))
finally:
    # Always release the browser, even if the wait times out.
    driver.quit()
print(links)
To select current tab:
# Handle (opaque id) of the currently focused browser tab/window.
current_tab = driver.current_window_handle
To switch between tabs:
# NOTE(review): switch_to_window() is the legacy spelling (removed in
# Selenium 4); the switch_to.window() form on the next line is the current
# equivalent — prefer it.
driver.switch_to_window(driver.window_handles[1])
driver.switch_to.window(driver.window_handles[-1])
Assuming you have the new tab url as TAB_URL, you should try:
from selenium.webdriver.common.action_chains import ActionChains
action = ActionChains(driver)
# NOTE(review): ActionChains.click() expects a WebElement, not a URL string —
# TAB_URL here should be the <a> element to be ctrl-clicked; confirm before use.
action.key_down(Keys.CONTROL).click(TAB_URL).key_up(Keys.CONTROL).perform()
Also, apparently the li doesn't have a click event, are you sure this element you are getting '//*[#id="tabs"]/ul/li[2]' has the aria-selected property set to true or any of these classes: ui-tabs-active ui-state-active?
If not, you should call click on the a tag inside this li.
Then you should increase the timeout parameter of your WebDriverWait to guarantee that the div is loaded.

selenium wait expected conditions not working

I am trying to click the google store once the google webpage loads. I do not want to use time.sleep() for a few seconds for the google page to load in. I want the browser to click "store" once the page loads. Below is my code, what am I doing wrong?
from selenium import webdriver
import requests
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pause

driver = webdriver.Chrome('/applications/chromedriver')
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get("https://www.google.com")

wait = WebDriverWait(driver, 10)
# The locator constant is By.XPATH (upper case) — the original By.xpath
# raises AttributeError, which is why the wait "did not work".
element = wait.until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div[2]/div[1]/a[2]")))
element.click()
The html for xpath is correct too since it works with driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[1]/a[2]").click()

How to select php filter and downloading file with Selenium

I've been trying to select the filter for "pitchers" and download to Excel from here: https://www.rotowire.com/baseball/stats.php
I've tried the following, but I am getting an error and am not sure how to select the necessary items
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get("https://www.rotowire.com/baseball/stats.php")
# XPath uses '@' to test an attribute; the original '#class' (markdown
# mangling) would raise an InvalidSelectorException.
elem = driver.find_elements_by_xpath("//div[contains(@class,'filter-tab is-selected')]")
Ideally (for now), the script runs and downloads the file locally.
This downloads the pitcher data. It seems that there is some type of hidden html in the website. That's why the code finds the entire table first, then the excel button.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("https://www.rotowire.com/baseball/stats.php")

# Select the "Pitchers" filter tab. All three XPaths below use '@' for
# attribute tests — the original '#data-name' etc. were markdown-mangled
# and would raise InvalidSelectorException.
pitchers = driver.find_element_by_xpath("//div[@data-name='P']")
pitchers.click()

# Wait for the pitcher stats table to render, then click the Excel-export
# button (the parent anchor of the Excel icon) inside that table.
player_stats_elem = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.XPATH, "//div[@data-pos='p']")))
excel_download = WebDriverWait(player_stats_elem, 5).until(
    EC.presence_of_element_located((By.XPATH, ".//img[@alt='Excel']/..")))
excel_download.click()

Python, selenium find_element_by_link_text not working

I am trying to scrape a website. Where in I have to press a link. for this purpose, I am using selenium library with chrome drive.
from selenium import webdriver
import time  # was missing: time.sleep() below raised NameError

url = 'https://sjobs.brassring.com/TGnewUI/Search/Home/Home?partnerid=25222&siteid=5011&noback=1&fromSM=true#Applications'
browser = webdriver.Chrome()
browser.get(url)
# Crude fixed delay for the Angular page to render; an explicit wait
# (WebDriverWait + expected_conditions) would be more reliable.
time.sleep(3)
link = browser.find_element_by_link_text("Don't have an account yet?")
link.click()
But it is not working. Any ideas why it is not working? Is there a workaround?
You can get it done in several ways; here is one of them. I've used the driver.execute_script() command to force the click. You should not rely on hardcoded delays, as they are very inconsistent.
Modified script:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC

# The registration link is rendered by Angular, so wait for it explicitly
# instead of sleeping.
url = 'https://sjobs.brassring.com/TGnewUI/Search/Home/Home?partnerid=25222&siteid=5011&noback=1&fromSM=true#Applications'
driver = webdriver.Chrome()
driver.get(url)

# Locate the "Don't have an account yet?" anchor via its ng-click handler,
# then trigger the click through JavaScript to sidestep interception issues.
signup_link = wait(driver, 10).until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, "a[ng-click='newAccntScreen()']")
    )
)
driver.execute_script("arguments[0].click();", signup_link)

python dynamic webscraping javascript content

I am using Python and Selenium to scrape a website. What I do is go to the homepage and type in a keyword, such as 1300746-79-5. On the resulting page, I am trying to scrape the data in the "pricing" section. Specifically, I need to get the "SKU-Pack Size" and "Price(USD)" information. But this information is rendered by JavaScript, so I cannot see it in the source code. I am wondering how I can achieve this.
I have written some code that gets me to the page of interest, but I still cannot see the javascript information. Here is what I have so far.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint

# Create a new instance of the Chrome driver. The path is a raw string:
# in Python 3, '\U' inside a normal string literal is a SyntaxError.
driver = webdriver.Chrome(r'C:\Users\Rei\Desktop\chromedriver.exe')
driver.get("http://www.sigmaaldrich.com/united-states.html")
# Python 3 print call (the original used the Python 2 print statement).
print(driver.title)
inputElement = driver.find_element_by_name("Query")
# type in the search
inputElement.send_keys("1300746-79-5")
inputElement.submit()
Everything you have done looks correct to me.
"SKU-Pack Size" and "Price(USD)" information are not "encrypted", but retrieved after JavaScript clicking action. All you need to do is to click product name or pricing link.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint

driver = webdriver.Chrome()
driver.get("http://www.sigmaaldrich.com/united-states.html")
# Python 3 print calls throughout (the original used Python 2 print statements).
print(driver.title)
inputElement = driver.find_element_by_name("Query")
# type in the search
inputElement.send_keys("1300746-79-5")
inputElement.submit()

# The pricing data is loaded after a JavaScript click, so click the
# pricing link first, then wait for the price table to appear.
pricing_link = driver.find_element_by_css_selector("li.priceValue a")
print(pricing_link.text)
pricing_link.click()
# then deal with the data you want
price_table = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".priceAvailContainer tbody"))
)
print('price_table.text: ' + price_table.text)
driver.quit()

Categories

Resources