Python Selenium: taking an instance of webdriver

I defined two separate functions: one for opening a URL with Selenium, and one for fetching data with Selenium.
In my second function the driver variable is unassigned because it stays local to the first function.
I do not know if it is logical to separate the Selenium activity into two separate functions; this is the first time I have used this approach.
Any suggestions on how to take the instance of the webdriver and use it inside the second function?
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# reading URLs from csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

# going to urls 1-by-1
def goToUrl_Se():
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to url (amazon) via Selenium WebDriver
        chrome_options = Options()
        chrome_options.headless = False
        chrome_options.add_argument("start-maximized")
        # options.add_experimental_option("detach", True)
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
        driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
        driver.get(xUrl)
        driver.quit()

# fetch-parse the data from the url page
def parse_data():
    x_title = driver.find_element(By.XPATH, '//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[2]/div/div/div/div/div/div[2]/div/div/div[1]/h2/a/span')

goToUrl_Se()

As I see it, you are trying to parse data from each URL you open in goToUrl_Se(). If so, the better way is to put the data-parsing code inside the loop used in the goToUrl_Se() method.
Also, there is no need to define and create the driver each time.
And you definitely have to improve your locators: very long absolute XPaths are extremely fragile and easily broken.
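For example, instead of the deep absolute path above, a relative locator anchored on a stable attribute is far less likely to break. A hedged sketch (the data-asin attribute appears in the related question below; the h2/a/span tail is an assumption about Amazon's result markup):

# shorter relative XPath anchored on a stable attribute (structure is an assumption)
x_title = driver.find_element(By.XPATH, '//div[@data-asin]//h2/a/span')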
The following flow seems better to me:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = None

# reading URLs from csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

def create_driver():
    chrome_options = Options()
    chrome_options.headless = False
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
    global driver
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# going to urls 1-by-1
def goToUrl_Se():
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to url (amazon) via Selenium WebDriver
        driver.get(xUrl)
        x_title = driver.find_element(By.XPATH, '//*[@id="search"]/div[1]/div[1]/div/span[3]/div[2]/div[2]/div/div/div/div/div/div[2]/div/div/div[1]/h2/a/span')
    driver.quit()

create_driver()
goToUrl_Se()

You should return the driver from your create_driver() function:
def create_driver():
    # ...
    return driver
and change your function to accept a parameter:
def parse_data(driver):
    # ...
Now you can get the driver with an assignment and pass it to your function:
driver = create_driver()
parse_data(driver)
I suggest you read more about return values and function parameters to understand this better.
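Put together, a minimal sketch of that pattern might look like this (it reuses the chromedriver path from the question; the short locator and the example URL are assumptions):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

def create_driver():
    webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
    driver = webdriver.Chrome(service=webdriver_service)
    return driver  # hand the instance back to the caller

def parse_data(driver):
    # the driver arrives as a parameter, so no global state is needed
    return driver.find_element(By.XPATH, '//div[@data-asin]//h2/a/span')  # assumed locator

driver = create_driver()
driver.get('https://www.amazon.com/s?k=python')  # example URL, an assumption
title = parse_data(driver)
driver.quit()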

In this structure you can only call your second function parse_data from within your first function goToUrl_Se(), like:
driver.get(xUrl)
something = parse_data()
and change parse_data so that it returns something.
If you want to call them both outside of each other, then you need to do two things:
parse_data should get the driver as an argument: def parse_data(driver)
you should not quit Selenium within goToUrl_Se()
And if you want to do it as it really should be done, then just use OOP (a minimal sketch follows below). If you still don't want to, then you'd better initiate the driver name outside any functions and use a function to change it. For instance, you can have a function that changes the driver's options only. But it is bad practice when one function does multiple things, like your goToUrl_Se() does.
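A minimal OOP sketch of that idea, assuming the same chromedriver path as above; the locator and the example URL are assumptions:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

class AmazonScraper:
    # the driver lives on the instance, so every method can use it
    def __init__(self, driver_path):
        self.driver = webdriver.Chrome(service=Service(driver_path))

    def go_to_url(self, url):
        self.driver.get(url)

    def parse_data(self):
        # locator is an assumption, as in the sketch above
        return self.driver.find_element(By.XPATH, '//div[@data-asin]//h2/a/span')

    def close(self):
        self.driver.quit()

scraper = AmazonScraper(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
scraper.go_to_url('https://www.amazon.com/s?k=python')  # example URL, an assumption
title = scraper.parse_data()
scraper.close()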

Related

Locating elements via Selenium, with and without explicit waiting, gives 0

Using this URL I want to locate the div tags which have the attribute data-asin. When I use //div[@data-asin] in Chrome Inspect mode it gives 21 elements. But trying to get these elements via Selenium both ways, with an explicit wait and with a direct length check, gives 0. My guess is that the Selenium remote browser is unable to get any of these elements from the DOM tree. The code is below:
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# reading URLs from csv file
def readCSV(path_csv):
    df = pd.read_csv(path_csv)
    return df

fileCSV = readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls = fileCSV['linkamazon'].last_valid_index()

def create_driver():
    chrome_options = Options()
    chrome_options.headless = True
    chrome_options.add_argument("start-maximized")
    # options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    webdriver_service = Service(r'C:\Users\Admin\Downloads\chromedriver107v\chromedriver.exe')
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    return driver

# going to urls 1-by-1
def goToUrl_Se(driver):
    global counter
    counter = 0
    for i in range(0, length_of_column_urls + 1):
        xUrl = fileCSV.iloc[i, 1]
        print(xUrl, i)
        # going to url (amazon) via Selenium WebDriver
        driver.get(xUrl)
        parse_data()
        counter += 1
    driver.quit()

# fetch-parse the data from the url page
def parse_data():
    global asin, title, bookform, priceNewProd, author
    wait = WebDriverWait(driver, timeout=77)
    try:
        x_index = wait.until(EC.visibility_of_all_elements_located((By.TAG_NAME, '//div[@data-asin]')))  ### Attention here
        print(len(x_index))
    except:
        y_index = driver.find_elements(By.TAG_NAME, '//div[@data-asin]')  ### And attention here
        print(len(y_index))

driver = create_driver()
goToUrl_Se(driver)
You have to mention XPATH, not TAG_NAME:
try:
    x_index = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@data-asin]')))  ### Attention here
    print(len(x_index))
except:
    y_index = driver.find_elements(By.XPATH, '//div[@data-asin]')  ### And attention here
    print(len(y_index))
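The distinction matters because By.TAG_NAME expects a bare tag name, not an XPath expression, so '//div[@data-asin]' can never match when passed as a tag name. A quick illustration:

# By.TAG_NAME takes only a bare tag name; it cannot filter on attributes
all_divs = driver.find_elements(By.TAG_NAME, 'div')
# attribute filtering needs an XPath (or CSS) locator
asin_divs = driver.find_elements(By.XPATH, '//div[@data-asin]')
# CSS equivalent of the XPath above
asin_divs_css = driver.find_elements(By.CSS_SELECTOR, 'div[data-asin]')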

Storing a file on a specific path after scraping

I want to store my file in a specific folder after scraping the data, and I don't know exactly how to add that to my script:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("C:/chrome/chromedriver.exe")
driver.execute("get", {'url': 'http://www.ins.tn/statistiques/90#'})
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='export']//a[@class='btnexport ' and starts-with(@id, 'btnExporttoExcel')]"))).click()
This is the path of the folder: C:\Users\ASUS\Documents\data
Also, how can I change the name of the file every time I scrape data? For example, by adding the time at which the code was last executed.
Selenium has an options object in the webdriver module that you can modify and pass to the driver when instantiating it.
For Chrome, I set up the options object in a method like this, so I can just pass in my destination path and get back the modified object:
import os
import selenium.webdriver

def change_download_folder(dest):
    options = selenium.webdriver.ChromeOptions()
    prefs = {}
    os.makedirs(dest, exist_ok=True)  # make sure the target folder exists
    prefs["profile.default_content_settings.popups"] = 0
    prefs["download.default_directory"] = dest
    options.add_experimental_option("prefs", prefs)
    return options
Then you can call this and pass the return value to the driver instantiation:
options = change_download_folder("YOUR_FILE_PATH")
driver = selenium.webdriver.Chrome(
    options=options,
    executable_path="PATH_TO_DRIVER_FILE"
)
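As for giving the file a new name on every run: Chrome does not let you template the download filename, but one workaround is to rename the newest file in the download folder after the download finishes. A hedged sketch (the folder comes from the question; the .xlsx extension and the fixed sleep are assumptions):

import os
import glob
import time
from datetime import datetime

download_dir = r"C:\Users\ASUS\Documents\data"  # folder from the question
time.sleep(10)  # crude wait for the download to finish; an assumption

# pick the most recently created file and stamp it with the run time
latest = max(glob.glob(os.path.join(download_dir, "*.xlsx")), key=os.path.getctime)
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
root, ext = os.path.splitext(latest)
os.rename(latest, f"{root}_{stamp}{ext}")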

Element not interactable with selenium python

I know that this question has been asked many times on Stack Overflow. I tried different solutions but did not get any of them to work. Here is a simple MWE to automate a search on YouTube. Can anybody familiar with this help explain the reason?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = Options()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9515')
options.add_argument('--disable-setuid-sandbox')
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
url = "https://www.youtube.com/"
driver.get(url)
search_area = driver.find_element(By.XPATH, '//*[@id="search"]')
driver.implicitly_wait(10)
search_area.send_keys('Lionel Messi', Keys.ENTER)
print(search_area.text)
There are 2 problems with your code:
1. You should use a unique locator. The locator you are using matches 5 elements on that page, so Selenium returns the first element on the page matching the locator, while you need the second match. This locator will work better: //input[@id="search"]
2. You need to add a delay. Selenium returns the search_area element at the moment that element is found on the page, while it may still not be ready to be interacted with. The best way to overcome this issue is to use Explicit Waits.
So, this code should work better:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
options = Options()
options.add_argument('--disable-extensions')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9515')
options.add_argument('--disable-setuid-sandbox')
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"), options=options)
wait = WebDriverWait(driver, 20)
url = "https://www.youtube.com/"
driver.get(url)
search_area = wait.until(EC.visibility_of_element_located((By.XPATH, "//input[@id='search']")))
search_area.send_keys('Lionel Messi', Keys.ENTER)
print(search_area.text)
P.S.
driver.implicitly_wait(10) is NOT a delay command; it will not put a delay of 10 seconds at the line where you put it. It only defines the timeout for Selenium to wait for element presence. Also, you put it AFTER locating the element, so it could not affect the previously located element.
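In other words, an implicit wait only helps if it is configured before the lookups it should cover, for example:

# configure the implicit wait BEFORE any find_element call it should affect
driver.implicitly_wait(10)
search_area = driver.find_element(By.XPATH, "//input[@id='search']")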

ChromeDriver not opening new page with chrome_options parameter

I'm trying to use the following code to open a new page using ChromeDriver
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=r"path of chromedriver.exe",chrome_options=options)
I still get the "DevTools listening on...." print, but no new page opens. If however I run:
driver = webdriver.Chrome(executable_path = r"path")
without the chrome_options parameter, the page opens. I'm not sure why this is.
chrome_options was deprecated a long time ago:
DeprecationWarning: use options instead of chrome_options
You have to use an instance of options instead, as well as pass the absolute path of the ChromeDriver along with the extension, as follows:
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument("--headless")
driver = webdriver.Chrome(executable_path=r"path of chromedriver.exe", options=options)
Use the following code:
from selenium import webdriver

# object of ChromeOptions
op = webdriver.ChromeOptions()
# add option
op.add_argument('--enable-extensions')
# pass options to the webdriver object
driver = webdriver.Chrome(options=op)
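Note that in Selenium 4 executable_path is deprecated as well; a sketch of the current style, using a Service object as in the other answers on this page:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--headless')
# path placeholder kept from the question
driver = webdriver.Chrome(service=Service(r"path of chromedriver.exe"), options=options)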

Selenium - Google Travel Scraping Price History missing

I am getting HTML back with this Python script, but it does not include the price history (see screenshot). A non-Selenium browser does return the HTML with the prices (they can be found by a simple regex even without expanding this section); Chrome/Safari/Firefox all do, in incognito mode as well.
from selenium import webdriver
import time
url = 'https://www.google.com/flights?hl=en#flt=SFO.JFK.2021-06-01*JFK.SFO.2021-06-07'
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(10)
html = driver.page_source
print(html)
driver.quit()
I can't quite pinpoint whether it's some setting in chromedriver. It should be possible, because there is a 3rd-party scraper that currently returns this data.
I tried this to no avail: Can a website detect when you are using Selenium with chromedriver?
Any thoughts appreciated.
After I added chrome_options.add_argument("--disable-blink-features=AutomationControlled") I started to see this block. I am not sure why it is not always loaded.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
import time  # needed for time.sleep() below
url = 'https://www.google.com/flights?hl=en#flt=SFO.JFK.2021-06-01*JFK.SFO.2021-06-07'
chrome_options = Options()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(executable_path='/snap/bin/chromium.chromedriver', chrome_options=chrome_options)
driver.get(url)
# wait = WebDriverWait(driver, 20)
# wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".EA71Tc.q7Eewe")))
time.sleep(10)
history = driver.find_element_by_css_selector(".EA71Tc.q7Eewe").get_attribute("innerHTML")
print(history)
Here the full block is returned, including all tag names. As you can see, I tried explicit waits, but this block was never reported visible. Experiment with adding another explicit wait.
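Since the block appears to be present in the DOM without ever becoming visible, a hedged alternative is to wait for presence rather than visibility (the CSS class names come from the answer above and may change at any time):

# wait for the element to exist in the DOM, not for it to become visible
wait = WebDriverWait(driver, 20)
history_el = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".EA71Tc.q7Eewe")))
print(history_el.get_attribute("innerHTML"))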
