I am running Selenium code on the DNCA website to scrape some of the document links. I am trying to get the link for each value in the drop-down of each section shown on this page. My code works fine, but when I run the same code with headless = True, I get the following error:
ElementClickInterceptedException: element click intercepted: Element <li data-original-index="0">...</li> is not clickable at point (226, 250). Other element would receive the click: <div class="col-md-12">...</div>
(Session info: headless chrome=104.0.5112.81)
Code:
def get_active_row(active_tab, fund_id):
    active_row = active_tab.find_elements(By.XPATH, ".//tr[@style='' or @style='display: table-row;'][@fund-id = '{}']".format(fund_id))
    try:
        assert len(active_row) == 1
        active_row = active_row[0]
        return active_row
    except AssertionError as asserr:
        print(asserr, ' -- expected exactly one active row for the fund id: ', fund_id)
        sys.exit(1)
    except Exception as err:
        print(err, ' -- fund id:', fund_id)
        sys.exit(1)
def scrap(driver):
    tab_list = driver.find_element(By.XPATH, "//ul[contains(@role, 'tablist')]")
    tab_list_names = tab_list.find_elements(By.XPATH, './/li')
    data_list = []
    for loc, tab_name in enumerate(tab_list_names):
        if loc < 20:
            tab_name.click()
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            bs_active_tab = soup.find('div', {'class': 'tab-pane table-datas active'})
            bs_headers = bs_active_tab.find('thead')
            headers = [i.text for i in bs_headers.find_all('td')]
            active_tab = driver.find_element(By.XPATH, "//div[contains(@class, 'tab-pane table-datas active')]")
            unique_fund_ids = [i_fund.get_attribute('fund-id') for i_fund in active_tab.find_elements(By.XPATH, ".//tr[@style]") if i_fund.get_attribute('fund-id') != '-']
            lookup = set()
            unique_fund_ids = [x for x in unique_fund_ids if x not in lookup and lookup.add(x) is None]
            for fund_id in unique_fund_ids:  # Iterate over each fund
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_list = [i.text for i in active_row.find_elements(By.XPATH, './/li')]
                for pos, isin_val in enumerate(isin_list):
                    isin_selected = active_row.find_elements(By.XPATH, './/li')[pos]
                    isin_selected.click()
                    active_row = get_active_row(active_tab, fund_id)
                    fund_name = ''
                    for pos_inner, td in enumerate(active_row.find_elements(By.XPATH, ".//td")):
                        a_tag = td.find_elements(By.XPATH, ".//a")
                        if len(a_tag) == 1:
                            a_tag = a_tag[0]
                            if pos_inner == 0:
                                fund_name = a_tag.text
                            link = a_tag.get_attribute('href')
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], link])
                        else:
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], ''])
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_selected_to_close = active_row.find_elements(By.XPATH, './/li')[0]
                isin_selected_to_close.click()
            tlg_tr_tab = active_tab.find_element(By.XPATH, ".//tr[@fund-id='-']")
            for tlg_pos_inner, tlg_td in enumerate(tlg_tr_tab.find_elements(By.XPATH, ".//td")):
                tlg_a_tag = tlg_td.find_elements(By.XPATH, ".//a")
                if len(tlg_a_tag) == 1:
                    tlg_a_tag = tlg_a_tag[0]
                    tlg_link = tlg_a_tag.get_attribute('href')  # Get document link
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], tlg_link])
                else:
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], ''])
    dataset_links = pd.DataFrame(data_list, columns=['Tab', 'Fund Name', 'ISIN', 'Type', 'Link'])
    driver.quit()
    return dataset_links
Can someone please explain why this works fine with headless = False but not with headless = True?
In headless mode the default window size is very small, significantly smaller than the window size in regular mode.
So, to overcome this problem you need to set the window size.
It can be done in one of the following ways:
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
Or just
driver.set_window_size(1920, 1080)
Both approaches should work.
I prefer the first way :)
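If the resize alone does not make the ElementClickInterceptedException go away, a complementary option (just a sketch, not part of the answer above) is to scroll the drop-down option into view and wait until it is clickable before clicking it. The locator below is only an example modelled on the element shown in the error message:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# example locator based on the <li data-original-index="0"> element from the error message
option_locator = (By.XPATH, "//li[@data-original-index='0']")
option = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(option_locator))
driver.execute_script("arguments[0].scrollIntoView(true);", option)
option.click()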
I wrote a Selenium script to check the case statuses of USCIS cases, and I want to speed it up because I am checking more than 500 cases every time.
How can I use it with multithreading via the concurrent.futures library in Python?
import re
import json
from datetime import date
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from multiprocessing import Pool
import concurrent.futures

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# browser = webdriver.Chrome('PATH', options=options)

today = date.today()  # used in getStatus below
cases = []
def getStatus(CN):
    browser = webdriver.Chrome('PATH', options=options)
    browser.get("https://egov.uscis.gov/casestatus/landing.do")
    serachField = browser.find_element_by_xpath('/html/body/div[2]/form/div/div[1]/div/div[1]/fieldset/div[1]/div[4]/input')
    serachField.click()
    serachField.send_keys(CN)
    searchButton = browser.find_element_by_xpath('/html/body/div[2]/form/div/div[1]/div/div[1]/fieldset/div[2]/div[2]/input')
    searchButton.click()
    try:
        outputFieldHeading = browser.find_element_by_xpath('/html/body/div[2]/form/div/div[1]/div/div/div[2]/div[3]/h1')
        outputField = browser.find_element_by_xpath('/html/body/div[2]/form/div/div[1]/div/div/div[2]/div[3]/p')
        dateMatch = re.search(r'\w+\s\d+,\s\d+', outputField.text)
        try:
            formMatch = re.search(r'([I][-]\d+,^)|([I][-]\d+\w)', outputField.text)
            formNumber = formMatch.group()
        except:
            formNumber = "Form Unknown"
        cases.append({'caseNumber': CN, 'currentDate': today, 'Date': dateMatch.group(), 'FormNumber': formNumber, 'Status': outputFieldHeading.text, 'Description': outputField.text})
        print(f"{CN} : {outputFieldHeading.text} : {dateMatch.group()} : {formNumber}")
        return f"{CN} : {outputFieldHeading.text} : {dateMatch.group()} : {formNumber}"
    except NoSuchElementException:
        cases.append({'caseNumber': CN, 'currentDate': today, 'Date': "Unknown", 'FormNumber': "Unknown Form", 'Status': "Not Found", 'Description': ""})
        print(f"{CN} : Not Found")
        return f"{CN} : Not Found"
if __name__ == '__main__':
casenumbers = ["EAC2134250100", "EAC2134250101", "EAC2134250102", "EAC2134250103", "EAC2134250104", "EAC2134250105", "EAC2134250106", "EAC2134250107", "EAC2134250108", "EAC2134250109", "EAC2134250110", "EAC2134250111", "EAC2134250112", "EAC2134250113", "EAC2134250114", "EAC2134250115", "EAC2134250116", "EAC2134250117", "EAC2134250118", "EAC2134250119", "EAC2134250120", "EAC2134250121", "EAC2134250122", "EAC2134250123", "EAC2134250124", "EAC2134250125", "EAC2134250126", "EAC2134250127", "EAC2134250128", "EAC2134250129", "EAC2134250130", "EAC2134250131", "EAC2134250132", "EAC2134250133", "EAC2134250134", "EAC2134250135", "EAC2134250136", "EAC2134250137", "EAC2134250138", "EAC2134250139", "EAC2134250140", "EAC2134250141", "EAC2134250142", "EAC2134250143", "EAC2134250144", "EAC2134250145", "EAC2134250146", "EAC2134250147", "EAC2134250148", "EAC2134250149", "EAC2134250150", "EAC2134250151", "EAC2134250152", "EAC2134250153", "EAC2134250154", "EAC2134250155", "EAC2134250156", "EAC2134250157", "EAC2134250158", "EAC2134250159", "EAC2134250160", "EAC2134250161", "EAC2134250162", "EAC2134250163", "EAC2134250164", "EAC2134250165", "EAC2134250166", "EAC2134250167", "EAC2134250168", "EAC2134250169", "EAC2134250170", "EAC2134250171", "EAC2134250172", "EAC2134250173", "EAC2134250174", "EAC2134250175", "EAC2134250176", "EAC2134250177", "EAC2134250178", "EAC2134250179", "EAC2134250180", "EAC2134250181", "EAC2134250182", "EAC2134250183", "EAC2134250184", "EAC2134250185", "EAC2134250186", "EAC2134250187", "EAC2134250188", "EAC2134250189", "EAC2134250190", "EAC2134250191", "EAC2134250192", "EAC2134250193", "EAC2134250194", "EAC2134250195", "EAC2134250196", "EAC2134250197", "EAC2134250198", "EAC2134250199", "EAC2134250200", "EAC2134250201", "EAC2134250202", "EAC2134250203", "EAC2134250204", "EAC2134250205", "EAC2134250206", "EAC2134250207", "EAC2134250208", "EAC2134250209", "EAC2134250210", "EAC2134250211", "EAC2134250212", "EAC2134250213", "EAC2134250214", "EAC2134250215", "EAC2134250216", "EAC2134250217", "EAC2134250218", "EAC2134250219", "EAC2134250220", "EAC2134250221", "EAC2134250222", "EAC2134250223", "EAC2134250224", "EAC2134250225", "EAC2134250226", "EAC2134250227", "EAC2134250228", "EAC2134250229", "EAC2134250230", "EAC2134250231", "EAC2134250232", "EAC2134250233", "EAC2134250234", "EAC2134250235", "EAC2134250236", "EAC2134250237", "EAC2134250238", "EAC2134250239", "EAC2134250240", "EAC2134250241", "EAC2134250242", "EAC2134250243", "EAC2134250244", "EAC2134250245", "EAC2134250246", "EAC2134250247", "EAC2134250248", "EAC2134250249", "EAC2134250250", "EAC2134250251", "EAC2134250252", "EAC2134250253", "EAC2134250254", "EAC2134250255", "EAC2134250256", "EAC2134250257", "EAC2134250258", "EAC2134250259", "EAC2134250260", "EAC2134250261", "EAC2134250262", "EAC2134250263", "EAC2134250264", "EAC2134250265", "EAC2134250266", "EAC2134250267", "EAC2134250268", "EAC2134250269", "EAC2134250270", "EAC2134250271", "EAC2134250272", "EAC2134250273", "EAC2134250274", "EAC2134250275", "EAC2134250276", "EAC2134250277", "EAC2134250278", "EAC2134250279", "EAC2134250280", "EAC2134250281", "EAC2134250282", "EAC2134250283", "EAC2134250284", "EAC2134250285", "EAC2134250286", "EAC2134250287", "EAC2134250288", "EAC2134250289", "EAC2134250290", "EAC2134250291", "EAC2134250292", "EAC2134250293", "EAC2134250294", "EAC2134250295", "EAC2134250296", "EAC2134250297", "EAC2134250298", "EAC2134250299", "EAC2134250300", "EAC2134250301", "EAC2134250302", "EAC2134250303", "EAC2134250304", "EAC2134250305", "EAC2134250306", "EAC2134250307", 
"EAC2134250308", "EAC2134250309", "EAC2134250310", "EAC2134250311", "EAC2134250312", "EAC2134250313", "EAC2134250314", "EAC2134250315", "EAC2134250316", "EAC2134250317", "EAC2134250318", "EAC2134250319", "EAC2134250320", "EAC2134250321", "EAC2134250322", "EAC2134250323", "EAC2134250324", "EAC2134250325", "EAC2134250326", "EAC2134250327", "EAC2134250328", "EAC2134250329", "EAC2134250330", "EAC2134250331", "EAC2134250332", "EAC2134250333", "EAC2134250334", "EAC2134250335", "EAC2134250336", "EAC2134250337", "EAC2134250338", "EAC2134250339", "EAC2134250340", "EAC2134250341", "EAC2134250342", "EAC2134250343", "EAC2134250344", "EAC2134250345", "EAC2134250346", "EAC2134250347", "EAC2134250348", "EAC2134250349", "EAC2134250350", "EAC2134250351", "EAC2134250352", "EAC2134250353", "EAC2134250354", "EAC2134250355", "EAC2134250356", "EAC2134250357", "EAC2134250358", "EAC2134250359", "EAC2134250360", "EAC2134250361", "EAC2134250362", "EAC2134250363", "EAC2134250364", "EAC2134250365", "EAC2134250366", "EAC2134250367", "EAC2134250368", "EAC2134250369", "EAC2134250370", "EAC2134250371", "EAC2134250372", "EAC2134250373", "EAC2134250374", "EAC2134250375", "EAC2134250376", "EAC2134250377", "EAC2134250378", "EAC2134250379", "EAC2134250380", "EAC2134250381", "EAC2134250382", "EAC2134250383", "EAC2134250384", "EAC2134250385", "EAC2134250386", "EAC2134250387", "EAC2134250388", "EAC2134250389", "EAC2134250390", "EAC2134250391", "EAC2134250392", "EAC2134250393", "EAC2134250394", "EAC2134250395", "EAC2134250396", "EAC2134250397", "EAC2134250398", "EAC2134250399", "EAC2134250400", "EAC2134250401", "EAC2134250402", "EAC2134250403", "EAC2134250404", "EAC2134250405", "EAC2134250406", "EAC2134250407", "EAC2134250408", "EAC2134250409", "EAC2134250410", "EAC2134250411", "EAC2134250412", "EAC2134250413", "EAC2134250414", "EAC2134250415", "EAC2134250416", "EAC2134250417", "EAC2134250418", "EAC2134250419"]
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
results = [executor.submit(getStatus, x) for x in casenumbers]
print(results)
This is not working and I get nothing printed in the terminal. How can I improve this code? Thanks.
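For context on why nothing shows up: executor.submit returns Future objects, so print(results) only prints the futures themselves, and any exception raised inside getStatus stays hidden until .result() is called. A minimal sketch of collecting the results (reusing the casenumbers list and the getStatus function above, with the rest of the setup assumed unchanged):

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    # submit() returns Future objects; keep a mapping back to the case number
    futures = {executor.submit(getStatus, cn): cn for cn in casenumbers}
    for future in concurrent.futures.as_completed(futures):
        try:
            # result() re-raises any exception thrown inside getStatus
            print(future.result())
        except Exception as exc:
            print(f"{futures[future]} failed: {exc}")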
I'm trying to scrape some data off ESPN and run some calculations on the scraped data. Ideally, I would like to iterate through a dataframe, grab the player's name with Selenium, send the player's name into the search box, and tell Selenium to click the player's name. I was able to do this successfully with one player. I'm not quite sure how to iterate through all the players in my dataframe.
The second part of the code is where I'm struggling. For some reason I am not able to get the data; Selenium isn't able to find any of the elements. I don't think I'm doing it properly. If I am able to scrape the required data, I would like to plug it into a calculation and append the calculated projected points to my dataframe, dfNBA.
Can someone please help me with my code and point me in the right direction? I'm trying to get better at writing efficient Python code, but right now I'm stuck.
Thanks
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#sample data
pp = {'Player Name':['Donovan Mitchell', 'Kawhi Leonard', 'Rudy Gobert', 'Paul George','Reggie Jackson', 'Jordan Clarkson'],
'Fantasy Score': [46.0, 50.0, 40.0, 44.0, 25.0, 26.5]}
#Creating a dataframe from dictionary
dfNBA = pd.DataFrame(pp)
#Scraping ESPN
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.espn.com/")
#Clicking the search button
driver.find_element_by_xpath("//a[#id='global-search-trigger']").click()
#sending data to the search button
driver.find_element_by_xpath("//input[#placeholder='Search Sports, Teams or Players...']").send_keys(dfNBA.iloc[0,:].values[0])
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".search_results__details")))
playerPage = driver.find_element_by_css_selector(".search_results__details").click()
#Scraping data from last 10 games
points = driver.find_element_by_xpath(".//div[@class='Table__TD']")[13]
#rebs = driver.find_element_by_xpath("//*[@id='fittPageContainer']/div[2]/div[5]/div/div[1]/div[1]/section/div/div[3]/div/div/div[2]/table/tbody/tr[1]/td[7]")
#asts = driver.find_element_by_xpath("//*[@id='fittPageContainer']/div[2]/div[5]/div/div[1]/div[1]/section/div/div[3]/div/div/div[2]/table/tbody/tr[1]/td[8]")
#blks = driver.find_element_by_xpath("//*[@id='fittPageContainer']/div[2]/div[5]/div/div[1]/div[1]/section/div/div[3]/div/div/div[2]/table/tbody/tr[1]/td[9]")
#stls = driver.find_element_by_xpath("//*[@id='fittPageContainer']/div[2]/div[5]/div/div[1]/div[1]/section/div/div[3]/div/div/div[2]/table/tbody/tr[1]/td[10]")
#tnvrs = driver.find_element_by_xpath("//*[@id='fittPageContainer']/div[2]/div[5]/div/div[1]/div[1]/section/div/div[3]/div/div/div[2]/table/tbody/tr[1]/td[12]")
#projectedPoints = points+(rebs*1.2)+(asts*1.5)+(blks*3)+(stls*3)-(tnvrs*1)
print(points)
I think Selenium is a bit of overkill when there's a viable API option.
Give this a try. Note that in the overview, the L10 games figure refers to the last 10 regular-season games. My code here does the last 10 games, which includes playoffs. If you only want the regular season, let me know and I can adjust it. I also added a variable here so that if you wanted, for example, just the last 5 games, or the last 15 games, etc., you could do that too.
import requests
import pandas as pd
previous_games = 10
pp = {'Player Name':['Donovan Mitchell', 'Kawhi Leonard', 'Rudy Gobert', 'Paul George','Reggie Jackson', 'Jordan Clarkson'],
'Fantasy Score': [46.0, 50.0, 40.0, 44.0, 25.0, 26.5]}
#Creating a dataframe from dictionary
dfNBA = pd.DataFrame(pp)
search_api = 'https://site.api.espn.com/apis/search/v2'
for idx, row in dfNBA.iterrows():
    playerName = row['Player Name']
    payload = {'query': '%s' % playerName}
    results = requests.get(search_api, params=payload).json()['results']
    for each in results:
        if each['type'] == 'player':
            playerID = each['contents'][0]['uid'].split('a:')[-1]
            break
    player_api = 'https://site.web.api.espn.com/apis/common/v3/sports/basketball/nba/athletes/%s/gamelog' % playerID
    payload = {'season': '2021'}
    jsonData_player = requests.get(player_api, params=payload).json()
    # Scraping data from last x games
    last_x_gameIDs = list(jsonData_player['events'].keys())
    last_x_gameIDs.sort()
    last_x_gameIDs = last_x_gameIDs[-1*previous_games:]
    gamelog_dict = {}
    seasonTypes = jsonData_player['seasonTypes']
    for gameID in last_x_gameIDs:
        for each in seasonTypes:
            categories = each['categories']
            for category in categories:
                if category['type'] == 'total':
                    continue
                events = category['events']
                for event in events:
                    if gameID == event['eventId']:
                        gamelog_dict[gameID] = event['stats']
    labels = jsonData_player['labels']
    # Aggregate totals
    for k, v in gamelog_dict.items():
        v = dict(zip(labels, v))
        gamelog_dict[k] = v
    stats = pd.DataFrame(gamelog_dict.values())
    points = stats['PTS'].astype(float).sum() / previous_games
    rebs = stats['REB'].astype(float).sum() / previous_games
    asts = stats['AST'].astype(float).sum() / previous_games
    blks = stats['BLK'].astype(float).sum() / previous_games
    stls = stats['STL'].astype(float).sum() / previous_games
    tnvrs = stats['TO'].astype(float).sum() / previous_games
    projectedPoints = float(points) + (float(rebs)*1.2) + (float(asts)*1.5) + (float(blks)*3) + (float(stls)*3) - (float(tnvrs)*1)
    print('%s: %.02f' % (playerName, projectedPoints))
Output:
Donovan Mitchell: 42.72
Kawhi Leonard: 52.25
Rudy Gobert: 38.47
Paul George: 44.18
Reggie Jackson: 24.21
Jordan Clarkson: 25.88
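If the goal from the question is to append the projection to the dataframe rather than just print it, one extra line at the end of the loop body would do it (the column name 'Projected Points' is just an example):

    dfNBA.loc[idx, 'Projected Points'] = projectedPoints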
Here's some code to accomplish (I think) what you want. You need to wait for the table elements to appear, fix your xpath, and choose the right elements from the table array.
pp = {'Player Name':['Donovan Mitchell', 'Kawhi Leonard', 'Rudy Gobert', 'Paul George','Reggie Jackson', 'Jordan Clarkson'],
'Fantasy Score': [46.0, 50.0, 40.0, 44.0, 25.0, 26.5]}
#Creating a dataframe from dictionary
dfNBA = pd.DataFrame(pp)
#Scraping ESPN
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get("https://www.espn.com/")
#Clicking the search button
driver.find_element_by_xpath("//a[#id='global-search-trigger']").click()
#sending data to the search button
driver.find_element_by_xpath("//input[#placeholder='Search Sports, Teams or Players...']").send_keys(dfNBA.iloc[0,:].values[0])
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".search_results__details")))
playerPage = driver.find_element_by_css_selector(".search_results__details").click()
#Scraping data from last 10 games
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//td[@class='Table__TD']")))
points = driver.find_elements_by_xpath("//td[@class='Table__TD']")[12].text
rebs = driver.find_elements_by_xpath("//td[@class='Table__TD']")[6].text
asts = driver.find_elements_by_xpath("//td[@class='Table__TD']")[7].text
blks = driver.find_elements_by_xpath("//td[@class='Table__TD']")[8].text
stls = driver.find_elements_by_xpath("//td[@class='Table__TD']")[9].text
tnvrs = driver.find_elements_by_xpath("//td[@class='Table__TD']")[11].text
projectedPoints = float(points)+(float(rebs)*1.2)+(float(asts)*1.5)+(float(blks)*3)+(float(stls)*3)-(float(tnvrs)*1)
print(projectedPoints)
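The question also asks how to run this for every player in the dataframe. A rough sketch of wrapping the same steps in a loop (untested; it assumes the home page is reloaded before each new search and that the first search result is always the right player):

projected = []
for name in dfNBA['Player Name']:
    driver.get("https://www.espn.com/")
    driver.find_element_by_xpath("//a[@id='global-search-trigger']").click()
    search_box = driver.find_element_by_xpath("//input[@placeholder='Search Sports, Teams or Players...']")
    search_box.send_keys(name)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".search_results__details")))
    driver.find_element_by_css_selector(".search_results__details").click()
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//td[@class='Table__TD']")))
    cells = driver.find_elements_by_xpath("//td[@class='Table__TD']")
    # same column positions as above: points, rebounds, assists, blocks, steals, turnovers
    pts, rebs, asts, blks, stls, tnvrs = (float(cells[i].text) for i in (12, 6, 7, 8, 9, 11))
    projected.append(pts + rebs*1.2 + asts*1.5 + blks*3 + stls*3 - tnvrs*1)
dfNBA['Projected Points'] = projected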