Loop through URLs with Selenium WebDriver - Python

The request below finds the contest IDs for the day. I am trying to pass each ID into the driver.get URL so it goes to each individual contest URL and downloads that contest's CSV. I imagine this needs a loop, but I'm not sure what that would look like with a WebDriver.
import time
from selenium import webdriver
import requests
import datetime

req = requests.get('https://www.draftkings.com/lobby/getlivecontests?sport=NBA')
data = req.json()

for ids in data:
    contest = ids['id']

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get('https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby')
time.sleep(2)  # Let DK Load!
search_box = driver.find_element_by_name('username')
search_box.send_keys('username')
search_box2 = driver.find_element_by_name('password')
search_box2.send_keys('password')
submit_button = driver.find_element_by_xpath('//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span')
submit_button.click()
time.sleep(2)  # Let Page Load, If not it will go to Account!
driver.get('https://www.draftkings.com/contest/exportfullstandingscsv/' + str(contest))

Try reordering it as follows: log in once, then loop over the contest IDs.
import time
from selenium import webdriver
import requests
import datetime

req = requests.get('https://www.draftkings.com/lobby/getlivecontests?sport=NBA')
data = req.json()

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get('https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby')
time.sleep(2)  # Let DK Load!
search_box = driver.find_element_by_name('username')
search_box.send_keys('Pr0c3ss')
search_box2 = driver.find_element_by_name('password')
search_box2.send_keys('generic1!')
submit_button = driver.find_element_by_xpath('//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span')
submit_button.click()
time.sleep(2)  # Let Page Load, If not it will go to Account!

for ids in data:
    contest = ids['id']
    driver.get('https://www.draftkings.com/contest/exportfullstandingscsv/' + str(contest))
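Note that the find_element_by_* helpers used above were removed in Selenium 4. If you are on a current Selenium release, roughly the same login-then-loop flow looks like this (a sketch; the field names and XPath are copied from the answer above and may no longer match the site):
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

data = requests.get('https://www.draftkings.com/lobby/getlivecontests?sport=NBA').json()

driver = webdriver.Chrome()
driver.get('https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby')
wait = WebDriverWait(driver, 10)

# Selenium 4 style: find_element(By.<strategy>, value) instead of find_element_by_*
wait.until(EC.presence_of_element_located((By.NAME, 'username'))).send_keys('username')
driver.find_element(By.NAME, 'password').send_keys('password')
driver.find_element(By.XPATH, '//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span').click()
wait.until(EC.url_contains('lobby'))  # assumption: landing back in the lobby signals a successful login

for item in data:
    driver.get('https://www.draftkings.com/contest/exportfullstandingscsv/' + str(item['id']))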

You do not need to launch Selenium x times to download x files. Requests and Selenium can share cookies: you can log in to the site with Selenium, retrieve the session cookies, and hand them to requests or any other HTTP client. Take a moment to check out httpie (https://httpie.org/doc#sessions); it lets you manage sessions manually, much like requests does.
For requests, see: http://docs.python-requests.org/en/master/user/advanced/?highlight=sessions
For Selenium, see: http://selenium-python.readthedocs.io/navigating.html#cookies
In the webdriver block you can add proxies and run the browser headless or live: just comment out the headless line and the browser will open visibly, which makes debugging easier and makes it easier to follow movements and changes to the site's API/HTML.
import shutil
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests

LOGIN = 'https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby'
BASE_URL = 'https://www.draftkings.com/contest/exportfullstandingscsv/'
USER = ''
PASS = ''

try:
    data = requests.get('https://www.draftkings.com/lobby/getlivecontests?sport=NBA').json()
except BaseException as e:
    print(e)
    exit()

ids = [str(item['id']) for item in data]

# Webdriver block
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=800x600')
# options.add_argument('--proxy-server= IP:PORT')
# options.add_argument('--user-agent=' + USER_AGENT)
driver = webdriver.Chrome(options=options)

try:
    driver.get(LOGIN)
    driver.implicitly_wait(2)
except WebDriverException:
    exit()

def login(USER, PASS):
    '''
    Log in to DraftKings and return the session cookies.
    http://selenium-python.readthedocs.io/waits.html#implicit-waits
    http://selenium-python.readthedocs.io/api.html#module-selenium.common.exceptions
    '''
    search_box = driver.find_element_by_name('username')
    search_box.send_keys(USER)
    search_box2 = driver.find_element_by_name('password')
    search_box2.send_keys(PASS)
    submit_button = driver.find_element_by_xpath('//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span')
    submit_button.click()
    driver.implicitly_wait(2)
    cookies = driver.get_cookies()
    return cookies

site_cookies = login(USER, PASS)

def get_csv_files(contest_id):
    '''
    Download the standings CSV for one contest id, reusing the Selenium cookies.
    '''
    session = requests.Session()
    for cookie in site_cookies:
        session.cookies.set(cookie['name'], cookie['value'])
    try:
        _data = session.get(BASE_URL + contest_id, stream=True)
        with open(contest_id + '.csv', 'wb') as f:
            shutil.copyfileobj(_data.raw, f)
    except BaseException:
        return

for contest_id in ids:
    get_csv_files(contest_id)

Will this help?
for ids in data:
    contest = ids['id']
    driver.get('https://www.draftkings.com/contest/exportfullstandingscsv/' + str(contest))

Maybe it's time to decompose it a bit.
Create a few isolated functions:
0. (optional) Authorise against the target URL.
1. Collect all the needed IDs (the first part of your code).
2. Export the CSV for a specific ID (the second part of your code).
3. Loop through the list of IDs, calling function #2 for each.
Pass the chromedriver instance into each of them as an input argument, so the driver state and auth cookies are preserved. This works fine and keeps the code clear and readable; a rough sketch of this structure follows.
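A minimal sketch of that decomposition, reusing the URLs, field names and XPath from the question (they may be out of date; treat this as structure, not a drop-in script):
import requests
from selenium import webdriver

LOBBY = 'https://www.draftkings.com/lobby/getlivecontests?sport=NBA'
LOGIN = 'https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby'
EXPORT = 'https://www.draftkings.com/contest/exportfullstandingscsv/'

def authorise(driver, user, password):
    # 0. log in once; the driver keeps the auth cookies afterwards
    driver.get(LOGIN)
    driver.find_element_by_name('username').send_keys(user)
    driver.find_element_by_name('password').send_keys(password)
    driver.find_element_by_xpath('//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span').click()

def collect_ids():
    # 1. gather the contest ids for the day
    return [item['id'] for item in requests.get(LOBBY).json()]

def export_csv(driver, contest_id):
    # 2. let the logged-in browser fetch one standings CSV
    driver.get(EXPORT + str(contest_id))

def run(user, password):
    # 3. loop over the ids, sharing the same driver (and its cookies)
    driver = webdriver.Chrome()
    authorise(driver, user, password)
    for contest_id in collect_ids():
        export_csv(driver, contest_id)
    driver.quit()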

I think you can set the URL of a contest on an <a> element in the landing page and then click it, then repeat the step for each other ID.
See my code below.
req = requests.get('https://www.draftkings.com/lobby/getlivecontests?sport=NBA')
data = req.json()
contests = []
for ids in data:
    contests.append(ids['id'])

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get('https://www.draftkings.com/account/sitelogin/false?returnurl=%2Flobby')
time.sleep(2)  # Let DK Load!
search_box = driver.find_element_by_name('username')
search_box.send_keys('username')
search_box2 = driver.find_element_by_name('password')
search_box2.send_keys('password')
submit_button = driver.find_element_by_xpath('//*[@id="react-mobile-home"]/section/section[2]/div[3]/button/span')
submit_button.click()
time.sleep(2)  # Let Page Load, If not it will go to Account!

for id in contests:
    element = driver.find_element_by_css_selector('a')
    script1 = "arguments[0].setAttribute('download',arguments[1]);"
    driver.execute_script(script1, element, str(id) + '.csv')
    script2 = "arguments[0].setAttribute('href',arguments[1]);"
    driver.execute_script(script2, element, 'https://www.draftkings.com/contest/exportfullstandingscsv/' + str(id))
    time.sleep(1)
    element.click()
    time.sleep(3)
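If you let Chrome itself download the files, you may also want to control where they land; a small sketch using standard Chrome download preferences (the target directory is a placeholder):
from selenium import webdriver

options = webdriver.ChromeOptions()
# standard Chrome preferences: save downloads to a known folder without prompting
options.add_experimental_option('prefs', {
    'download.default_directory': '/tmp/dk_exports',  # placeholder path, adjust to your machine
    'download.prompt_for_download': False,
})
driver = webdriver.Chrome(options=options)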

Related

Data scraping, but the main content return value is None

I am able to log in properly, keep the session and browse other pages, but print(driver.get(full_path)) shows None as the return value. I don't understand why it is None; when I run the script I can see the pages being browsed.
I've tried other ways to get the data, but the result is always None.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
import pathlib, pickle
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

# Login credentials
email = ""
password = ""

# initialize the Chrome driver
driver = webdriver.Chrome("")
login_path = "https://finbox.com/login/email"

def save_cookies_file():
    driver.get(login_path)
    # login info
    driver.find_element(By.NAME, 'email').send_keys(email)
    driver.find_element(By.NAME, 'password').send_keys(password)
    # submit button
    driver.find_element(By.CLASS_NAME, '_105e3d41').click()
    pickle.dump(driver.get_cookies(), open("cookies.pkl", "wb"))
    time.sleep(10)

save_cookies_file()

"""Cookie details"""
def search(text):
    cookies = pickle.load(open("cookies.pkl", "rb"))
    for cookie in cookies:
        driver.add_cookie(cookie)
    bank_path = f"https://finbox.com/DSE:{text}"
    full_path = bank_path + "/financials/income_statement"
    print(driver.get(full_path))  # <-- problem is here
    time.sleep(10)
    # print("Main content:", driver_data)
    # table = driver_data.find_element(By.CLASS_NAME, "rt-tbody")
    # table = driver_data.find_elements(By.CLASS_NAME, "rt-tbody")
    # table = driver_data.find(By.CLASS_NAME, "rt-tbody")
    # table = driver.find_elements_by_xpath('//*[@id="root"]/div/div[4]/div[3]/div[2]/div/div[1]/div[2]')
    # print("table len : ", len(table))
    # print("table type: ", type(table))
    # print("table content: ", table)

search("ABBANK")
driver.get() always returns None. It navigates to the specified URL, but it does not return anything, so printing its return value will always show None.
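Call get() for its side effect, then read the page state from the driver itself. A minimal sketch (rt-tbody is the class name from the commented-out attempts in the question; the element may also need an explicit wait before it exists):
driver.get(full_path)           # navigates; the return value is always None
print(driver.current_url)       # the URL you actually landed on
print(len(driver.page_source))  # the rendered HTML is available here

# then locate what you need, e.g. the table body the question was after
table = driver.find_element(By.CLASS_NAME, "rt-tbody")
print(table.text)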

Adding an open/close Google Chrome browser step to Selenium linkedin_scraper code

I am trying to scrape some LinkedIn profiles of well-known people. The code takes a list of LinkedIn profile URLs and then uses Selenium and scrape_linkedin to collect the information and save it into a folder as a .json file.
The problem I am running into is that LinkedIn naturally blocks the scraper from collecting some profiles. I am always able to get the first profile in the list of URLs, which I put down to the fact that it opens a new Google Chrome window and then goes to the LinkedIn page (I could be wrong on this, however).
What I would like to do is add to the for loop a step that opens a new Google Chrome session and, once the scraper has collected the data, closes that session so that the next iteration of the loop opens a fresh one.
From the package website here it states:
driver {selenium.webdriver}: driver type to use
default: selenium.webdriver.Chrome
Looking at the Selenium package website here I see:
driver = webdriver.Firefox()
...
driver.close()
So Selenium does have a close() option.
How can I add an open and close Google Chrome browser to the for loop?
I have tried alternative methods to try and collect the data such as changing the time.sleep() to 10 minutes, to changing the scroll_increment and scroll_pause but it still does not download the whole profile after the first one has been collected.
Code:
from datetime import datetime
from scrape_linkedin import ProfileScraper
import pandas as pd
import json
import os
import re
import time
my_profile_list = ['https://www.linkedin.com/in/williamhgates/', 'https://www.linkedin.com/in/christinelagarde/', 'https://www.linkedin.com/in/ursula-von-der-leyen/']
# To get LI_AT key
# Navigate to www.linkedin.com and log in
# Open browser developer tools (Ctrl-Shift-I or right click -> inspect element)
# Select the appropriate tab for your browser (Application on Chrome, Storage on Firefox)
# Click the Cookies dropdown on the left-hand menu, and select the www.linkedin.com option
# Find and copy the li_at value
myLI_AT_Key = 'INSERT LI_AT Key'
with ProfileScraper(cookie=myLI_AT_Key, scroll_increment=50, scroll_pause=0.8) as scraper:
    for link in my_profile_list:
        print('Currently scraping: ', link, 'Time: ', datetime.now())
        profile = scraper.scrape(url=link)
        dataJSON = profile.to_dict()
        profileName = re.sub('https://www.linkedin.com/in/', '', link)
        profileName = profileName.replace("?originalSubdomain=es", "")
        profileName = profileName.replace("?originalSubdomain=pe", "")
        profileName = profileName.replace("?locale=en_US", "")
        profileName = profileName.replace("?locale=es_ES", "")
        profileName = profileName.replace("?originalSubdomain=uk", "")
        profileName = profileName.replace("/", "")
        with open(os.path.join(os.getcwd(), 'ScrapedLinkedInprofiles', profileName + '.json'), 'w') as json_file:
            json.dump(dataJSON, json_file)
        time.sleep(10)

print('The first observation scraped was:', my_profile_list[0:])
print('The last observation scraped was:', my_profile_list[-1:])
print('END')
Here is a way to open and close the browser on each iteration.
from datetime import datetime
from scrape_linkedin import ProfileScraper
import random #new import made
from selenium import webdriver #new import made
import pandas as pd
import json
import os
import re
import time
my_profile_list = ['https://www.linkedin.com/in/williamhgates/', 'https://www.linkedin.com/in/christinelagarde/',
'https://www.linkedin.com/in/ursula-von-der-leyen/']
myLI_AT_Key = 'INSERT LI_AT Key'
for link in my_profile_list:
    my_driver = webdriver.Chrome()  # if you don't have chromedriver on the PATH, use the next line instead of this
    # my_driver = webdriver.Chrome(executable_path=r"C:\path\to\chromedriver.exe")
    # sending our driver as the driver to be used by scrape_linkedin
    # you can also create driver options and pass it as an argument
    ps = ProfileScraper(cookie=myLI_AT_Key, scroll_increment=random.randint(10, 50), scroll_pause=0.8 + random.uniform(0.8, 1), driver=my_driver)  # changed name and default driver, made scroll_pause and scroll_increment a little random
    print('Currently scraping: ', link, 'Time: ', datetime.now())
    profile = ps.scrape(url=link)  # changed name
    dataJSON = profile.to_dict()
    profileName = re.sub('https://www.linkedin.com/in/', '', link)
    profileName = profileName.replace("?originalSubdomain=es", "")
    profileName = profileName.replace("?originalSubdomain=pe", "")
    profileName = profileName.replace("?locale=en_US", "")
    profileName = profileName.replace("?locale=es_ES", "")
    profileName = profileName.replace("?originalSubdomain=uk", "")
    profileName = profileName.replace("/", "")
    with open(os.path.join(os.getcwd(), 'ScrapedLinkedInprofiles', profileName + '.json'), 'w') as json_file:
        json.dump(dataJSON, json_file)
    time.sleep(10 + random.randint(0, 5))  # added randomness to the sleep time
    # this will close your browser at the end of every iteration
    my_driver.quit()

print('The first observation scraped was:', my_profile_list[0:])
print('The last observation scraped was:', my_profile_list[-1:])
print('END')
This scraper uses Chrome as the browser by default, but it lets you choose which browser to use everywhere it spawns one, e.g. in CompanyScraper, ProfileScraper, etc.
I have changed the default arguments passed when initializing the ProfileScraper() class so that your own driver opens and closes the browser instead of the default one, and added some randomness to the sleep interval, the scroll_increment and the scroll_pause, as you requested (you can tweak the random noise to your comfort).
There is no need to use scrape_in_parallel() as I had suggested in the comments, but if you want to, you can define the number of browser instances (num_instances) you want to run along with your own dictionary of drivers, each of which can have its own options (in another dictionary):
from scrape_linkedin import scrape_in_parallel, CompanyScraper
from selenium import webdriver
driver1 = webdriver.Chrome()
driver2 = webdriver.Chrome()
driver3 = webdriver.Chrome()
driver4 = webdriver.Chrome()
my_drivers = [driver1,driver2,driver3,driver4]
companies = ['facebook', 'google', 'amazon', 'microsoft', ...]
driver_dict = {}
for i in range(1, len(my_drivers) + 1):
    driver_dict[i] = my_drivers[i-1]

# Scrape all companies, output to 'companies.json' file, use 4 browser instances
scrape_in_parallel(
    scraper_type=CompanyScraper,
    items=companies,
    output_file="companies.json",
    num_instances=4,
    driver=driver_dict
)
The module is open source and written entirely in Python, so the source code is easy to follow. It's quite an interesting scraper; thank you for letting me know about it!
NOTE:
There are some concerning unresolved issues in this module, as noted in its GitHub Issues tab. If this doesn't work properly, I would wait for a few more updates before relying on it.

Why does BeautifulSoup give me the wrong text?

I've been trying to get the availability status of a product on IKEA's website. The product page says, in Dutch: 'not available for delivery', 'only available in the shop', 'not in stock' and 'you've got 365 days of warranty'.
But my code gives me: 'not available for delivery', 'only available for order and pickup', 'checking inventory' and 'you've got 365 days of warranty'.
What am I doing wrong that causes the text not to match?
This is my code:
import requests
from bs4 import BeautifulSoup
# Get the url of the IKEA page and set up the bs4 stuff
url = 'https://www.ikea.com/nl/nl/p/flintan-bureaustoel-vissle-zwart-20336841/'
thepage = requests.get(url)
soup = BeautifulSoup(thepage.text, 'lxml')
# Locate the part where the availability stuff is
availabilitypanel = soup.find('div', {'class' : 'range-revamp-product-availability'})
# Get the text of the things inside of that panel
availabilitysectiontext = [part.getText() for part in availabilitypanel]
print(availabilitysectiontext)
With the help of Rajesh, I created a script that does exactly what I want. It selects a specific shop (the one in Heerlen), keeps checking an out-of-stock item, and sends you an email as soon as it is back in stock.
The script used for this is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import smtplib, ssl
# Fill in the url of the product
url = 'https://www.ikea.com/nl/nl/p/vittsjo-stellingkast-zwartbruin-glas-20213312/'
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(options=op, executable_path='/Users/Jem/Downloads/chromedriver')
# Stuff for sending the email
port = 465
password = 'password'
sender_email = 'email'
receiver_email = 'email'
message = """\
Subject: Product is back in stock!
Sent with Python. """
# Keep looping until back in stock
while True:
driver.get(url)
# Go to the location of the shop
btn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="onetrust-accept-btn-handler"]')))
btn.click()
location = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="content"]/div/div/div/div[2]/div[3]/div/div[5]/div[3]/div/span[1]/div/span/a')))
location.click()
differentlocation = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="range-modal-mount-node"]/div/div[3]/div/div[2]/div/div[1]/div[2]/a')))
differentlocation.click()
searchbar = driver.find_element_by_xpath('//*[#id="change-store-input"]')
# In this part you can choose the location you want to check
searchbar.send_keys('heerlen')
heerlen = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="range-modal-mount-node"]/div/div[3]/div/div[2]/div/div[3]/div')))
heerlen.click()
selecteer = driver.find_element_by_xpath('//*[#id="range-modal-mount-node"]/div/div[3]/div/div[3]/button')
selecteer.click()
close = driver.find_element_by_xpath('//*[#id="range-modal-mount-node"]/div/div[3]/div/div[1]/button')
close.click()
# After you went to the right page, beautifulsoup it
source = driver.page_source
soup = BeautifulSoup(source, 'lxml')
# Locate the part where the availability stuff is
availabilitypanel = soup.find('div', {"class" : "range-revamp-product-availability"})
# Get the text of the things inside of that panel
availabilitysectiontext = [part.getText() for part in availabilitypanel]
# Check whether it is still out of stock, if so wait half an hour and continue
if 'Niet op voorraad in Heerlen' in availabilitysectiontext:
time.sleep(1800)
continue
# If not, send me an email that it is back in stock
else:
print('Email is being sent...')
context = ssl.create_default_context()
with smtplib.SMTP_SSL('smtp.gmail.com', port, context=context) as server:
server.login(sender_email, password)
server.sendmail(sender_email, receiver_email, message)
break
The page markup is added by JavaScript after the initial server response. BeautifulSoup only sees that initial response; it does not execute JavaScript, so it never gets the complete page. If you want the JavaScript-rendered content, you'll need to drive a (headless) browser; otherwise you'd have to reverse-engineer the JavaScript and see what it does.
You could get this to work with Selenium. I modified your code a bit and got it to work.
Get Selenium:
pip3 install selenium
Download Firefox + geckodriver or Chrome + chromedriver:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
# Get the url of the IKEA page and set up the bs4 stuff
url = 'https://www.ikea.com/nl/nl/p/flintan-bureaustoel-vissle-zwart-20336841/'
#uncomment the following line if using firefox + geckodriver
#driver = webdriver.Firefox(executable_path='/Users/ralwar/Downloads/geckodriver') # Downloaded from https://github.com/mozilla/geckodriver/releases
# using chrome + chromedriver
op = webdriver.ChromeOptions()
op.add_argument('headless')
driver = webdriver.Chrome(options=op, executable_path='/Users/ralwar/Downloads/chromedriver') # Downloaded from https://chromedriver.chromium.org/downloads
driver.get(url)
time.sleep(5) #adding delay to finish loading the page + javascript completely, you can adjust this
source = driver.page_source
soup = BeautifulSoup(source, 'lxml')
# Locate the part where the availability stuff is
availabilitypanel = soup.find('div', {"class" : "range-revamp-product-availability"})
# Get the text of the things inside of that panel
availabilitysectiontext = [part.getText() for part in availabilitypanel]
print(availabilitysectiontext)
The above code prints:
['Niet beschikbaar voor levering', 'Alleen beschikbaar in de winkel', 'Niet op voorraad in Amersfoort', 'Je hebt 365 dagen om van gedachten te veranderen. ']
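A possible refinement, not part of the original answer: instead of a fixed time.sleep(5), you can wait explicitly for the availability panel to appear (assuming the same range-revamp-product-availability class name):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# wait up to 15 seconds for the availability panel instead of sleeping a fixed time
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'range-revamp-product-availability'))
)
soup = BeautifulSoup(driver.page_source, 'lxml')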

How to web-scrape a password-protected website

I have a website from which I need to scrape some data (the website is https://www.merriam-webster.com/ and I want to scrape the saved words).
The site is password protected, and I also think some JavaScript is involved that I don't understand (certain elements seem to be loaded by the browser, since they don't show up when I wget the HTML).
I currently have a working solution using Selenium, but it requires Firefox to be opened, and I would really like a solution that runs as a console-only program in the background.
How would I achieve this, ideally using Python's requests library and as few additional third-party libraries as possible?
Here is the code for my selenium solution:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import json
# Create new driver
browser = webdriver.Firefox()
browser.get('https://www.merriam-webster.com/login')
# Find fields for email and password
username = browser.find_element_by_id("ul-email")
password = browser.find_element_by_id('ul-password')
# Find button to login
send = browser.find_element_by_id('ul-login')
# Send username and password
username.send_keys("username")
password.send_keys("password")
# Wait for accept cookies button to appear and click it
WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "accept-cookies-button"))).click()
# Click the login button
send.click()
# Find button to go to saved words
WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "ul-favorites"))).click()
words = {}
# Now logged in
# Loop over pages of saved words
for i in range(2):
    print("Now on page " + str(i + 1))
    # Find next page button
    nextpage = browser.find_element_by_class_name("ul-page-next")
    # Wait for the next page button to be clickable
    WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, "ul-page-next")))
    # Find all the words on the page
    for word in browser.find_elements_by_class_name('item-headword'):
        # Add the href to the dictionary
        words[word.get_attribute("innerHTML")] = word.get_attribute("href")
    # Navigate to the next page
    nextpage.click()
browser.close()
# Write the words dictionary to a JSON file
with open("output.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(words, indent=4))
If you want to use the requests module you need to use a session.
To initialise a session you do:
session_requests = requests.session()
Then you need a payload with the username and password
payload = {
    "username": <USERNAME>,
    "password": <PASSWORD>
}
Then to log in you do:
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
Now your session should be logged in, so to go to any other password-protected page you use the same session:
result = session_requests.get(
    url,
    headers=dict(referer=url)
)
Then you can use result.content to view the content of that page.
EDIT: if your site uses a CSRF token, you will need to include it in the payload. To get the CSRF token, replace the payload section with:
from lxml import html

tree = html.fromstring(result.text)
# you may need to manually inspect the tree to find how your CSRF token is specified.
authenticity_token = list(set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]
payload = {
    "username": <USERNAME>,
    "password": <PASSWORD>,
    "csrfmiddlewaretoken": authenticity_token
}
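Putting the pieces together, a rough sketch of the requests-only flow for this case. The login URL, form field names and CSRF token name below are assumptions; inspect the site's actual login form (browser dev tools, Network tab) and adjust them before relying on this:
import requests
from lxml import html

LOGIN_URL = 'https://www.merriam-webster.com/login'  # assumption: the form may post to a different endpoint
SAVED_WORDS_URL = 'https://www.merriam-webster.com/saved-words'  # placeholder for the protected page

session_requests = requests.session()

# load the login page first so the session picks up cookies and any CSRF token
login_page = session_requests.get(LOGIN_URL)
tree = html.fromstring(login_page.text)
token = tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")  # token name is an assumption

payload = {
    "username": "YOUR_USERNAME",
    "password": "YOUR_PASSWORD",
}
if token:
    payload["csrfmiddlewaretoken"] = token[0]

session_requests.post(LOGIN_URL, data=payload, headers=dict(referer=LOGIN_URL))

# the same session now carries the auth cookies for protected pages
result = session_requests.get(SAVED_WORDS_URL, headers=dict(referer=SAVED_WORDS_URL))
print(result.status_code, len(result.content))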

Why do Chrome.exe and chromedriver.exe still appear in the computer's memory, despite using driver.close() and driver.quit()?

The code I've written successfully does its part in crawling through a website with the appropriate date and time formatted in the URL, grabbing the table from the underlying HTML source code, and appending the results to a cache.
This python file gets run several dozen times (there are many agent IDs whose info I need to grab); after the script runs, however, dozens of chrome.exe and chromedriver.exe instances still appear in the computer's memory (this is visible in the computer's "Resource Monitor.")
Below is my code. I've used driver.quit() as well as driver.close() and even both together (with driver.close() coming first).
Isn't the driver.quit() supposed to close the instances in the computer's system? Why are they appearing in the memory? Is there a solution to this issue?
Please let me know if I can provide any further information. Thank you in advance.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta
import credentials_page
def get_updated_url(agent_id_num):
    now = datetime.utcnow()
    today = datetime(now.year, now.month, now.day)
    yesterday = today - timedelta(days=1)
    previous_date_string = str(yesterday)[:10]
    return 'https://examplewebsite.com/agentId/'+agent_id_num+'/orderBy/ASC/startDate/'+previous_date_string+'%204:00%20AM/endDate/'+previous_date_string+'%2010:30%20PM'

def login_entry(username, password, browser):  # logs into website
    login_email = browser.find_element_by_id('UserName')
    login_email.send_keys(username)
    login_password = browser.find_element_by_id('Password')
    login_password.send_keys(password)
    submit_elem = browser.find_element_by_xpath("//button[contains(text(), 'Log in')]")
    submit_elem.click()

def get_element(xpath, browser):  # grabs element, turns it into needed table in raw HTML format
    table_of_interest = browser.find_element_by_xpath(xpath)
    # this has a type of <class 'selenium.webdriver.remote.webelement.WebElement'>
    return str('<table>'+table_of_interest.get_attribute('innerHTML')+'</table>')

def record_source_code(destination_cache, get_element_html):  # takes element HTML and writes it to cache
    code_destination = open(destination_cache, 'w')
    code_destination.write(repr(get_element_html))
    code_destination.close()

def main_function(agent_id):
    driver = webdriver.Chrome()
    # figure out strings for start_date, end_date
    url = get_updated_url(agent_id)
    driver.get(url)
    # login
    login_entry(credentials_page.website_username, credentials_page.website_password, driver)
    # first test to see if "not found"
    if len(driver.find_elements_by_xpath("//*[text()='Not Found']")) > 0:
        logoff_elem = driver.find_element_by_xpath("//*[contains(text(), 'Log off')]")
        logoff_elem.click()
        driver.quit()
        return False
    else:
        # grab table needed
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div[2]/div[2]/table/tbody')))
        table_html = get_element('/html/body/div/div/div[2]/div[2]/table/tbody', driver)
        driver.quit()
        record_source_code('results_cache.html', table_html)
        return True
I think the root cause is that your code doesn't handle exceptions: when an exception occurs, driver.quit() is never reached. A try/except should help.
def main_function(agent_id):
    driver = webdriver.Chrome()
    # figure out strings for start_date, end_date
    url = get_updated_url(agent_id)
    try:
        driver.get(url)
        # login
        login_entry(credentials_page.website_username, credentials_page.website_password, driver)
        # first test to see if "not found"
        if len(driver.find_elements_by_xpath("//*[text()='Not Found']")) > 0:
            logoff_elem = driver.find_element_by_xpath("//*[contains(text(), 'Log off')]")
            logoff_elem.click()
            driver.quit()
            return False
        else:
            # grab table needed
            WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div[2]/div[2]/table/tbody')))
            table_html = get_element('/html/body/div/div/div[2]/div[2]/table/tbody', driver)
            driver.quit()
            record_source_code('results_cache.html', table_html)
            return True
    except Exception:
        driver.quit()  # <-- catch the exception and still quit the browser
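An alternative to catching the exception is to put the quit() in a finally block, so the browser is closed on success, on early return, and on any exception; a sketch using the question's own helpers:
def main_function(agent_id):
    driver = webdriver.Chrome()
    try:
        driver.get(get_updated_url(agent_id))
        login_entry(credentials_page.website_username, credentials_page.website_password, driver)
        if driver.find_elements_by_xpath("//*[text()='Not Found']"):
            return False
        WebDriverWait(driver, 60).until(EC.presence_of_element_located(
            (By.XPATH, '/html/body/div/div/div[2]/div[2]/table/tbody')))
        table_html = get_element('/html/body/div/div/div[2]/div[2]/table/tbody', driver)
        record_source_code('results_cache.html', table_html)
        return True
    finally:
        # runs on success, on the early return and on any exception
        driver.quit()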
