I want to scrape Home team and Away team from this page https://www.flashscore.com/match/hY5c1Bhh/#match-summary/match-summary
# Get HomeTeam
_ht = driver.find_element_by_xpath('//*[contains(@class, "home")]')
ht = _ht.find_element_by_xpath('//*[contains(@class, "participantName")]')
_homeName = ht.text
# Get AwayTeam
_at = driver.find_element_by_xpath('//*[contains(@class, "away")]')
at = _at.find_element_by_xpath('//*[contains(@class, "participantName")]')
_awayName = at.text
Output
Longford
Longford
Try storing both of them in a list like this:
teams = driver.find_elements(By.CSS_SELECTOR, "div[class^='participantName'] a")
print("Home team : ", teams[0].text)
print("Away team : ", teams[1].text)
You are missing the dot (.) when trying to locate an element inside another element. Without the leading dot, the XPath searches the whole document instead of the current element's subtree, which is why both lookups return the first match on the page and you get "Longford" twice.
So your code should be
# Get HomeTeam
_ht = driver.find_element_by_xpath('//*[contains(@class, "home")]')
ht = _ht.find_element_by_xpath('.//*[contains(@class, "participantName")]')
_homeName = ht.text
# Get AwayTeam
_at = driver.find_element_by_xpath('//*[contains(@class, "away")]')
at = _at.find_element_by_xpath('.//*[contains(@class, "participantName")]')
_awayName = at.text
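Note that the find_element_by_xpath methods are deprecated and have been removed in recent Selenium 4 releases. Here is a minimal sketch of the same fix using the By API (assuming the Flashscore class names are unchanged):
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.flashscore.com/match/hY5c1Bhh/#match-summary/match-summary")

# The leading . keeps each lookup inside its container element
home = driver.find_element(By.XPATH, '//*[contains(@class, "home")]')
home_name = home.find_element(By.XPATH, './/*[contains(@class, "participantName")]').text
away = driver.find_element(By.XPATH, '//*[contains(@class, "away")]')
away_name = away.find_element(By.XPATH, './/*[contains(@class, "participantName")]').text
print(home_name, "-", away_name)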
Related
I'm trying to scrape product prices from a website, and both the real price and the monthly payment quota have exactly the same class, so I can't figure out how to get only the main price.
This is the main price: "879.990"
This is the monthly payment quota: "39.990"
This is the URL: https://listado.mercadolibre.cl/macbook#D[A:macbook]
#THIS GETS ALL THE NAMES AND STORES THEM IN A LIST
prod = soup.find_all('h2', class_='ui-search-item__title shops__item-title')
productos = list()
count = 0
for i in prod:
    if count < 33:
        productos.append(i.text)
    else:
        break
    count += 1
size = len(productos) + 1
#print(size)
#print(productos, len(productos))
print(productos)
#THIS GETS ALL THE PRICES AND STORES THEM IN A LIST
pri = soup.find_all('span', class_="price-tag-fraction")
precios = list()
count = 0
for i in pri:
    if count < 33:
        precios.append(i.text)
    else:
        break
    count += 1
#print(precios)
prices = [item.split(',') for item in precios]
Here is the output
You can filter out the other prices using CSS selectors
# filsel = 'span.price-tag-fraction:not(span.ui-search-installments span):not(s.price-tag__disabled span)'
emiSp_sel = 'span.ui-search-installments span' # monthly
disab_sel = 's.price-tag__disabled span' # crossed out
filsel = f'span.price-tag-fraction:not({emiSp_sel}):not({disab_sel})'
pri = [p.get_text() for p in soup.select(filsel)]
or using a lambda with find_all
pri = soup.find_all(
    lambda p: p.name == 'span' and 'price-tag-fraction' in p.get('class', '')
    and p.find_parent('span', {'class': 'ui-search-installments'}) is None
    and p.find_parent('s', {'class': 'price-tag__disabled'}) is None
)
or even by combining a list comprehension with your current method
pri = [
    p for p in soup.find_all('span', class_="price-tag-fraction")
    if p.find_parent('span', {'class': 'ui-search-installments'}) is None
    and p.find_parent('s', {'class': 'price-tag__disabled'}) is None
]
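For reference, a minimal end-to-end sketch of the first approach (assuming the MercadoLibre listing still uses these class names):
import requests
from bs4 import BeautifulSoup

url = 'https://listado.mercadolibre.cl/macbook#D[A:macbook]'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Keep only the main price fractions: exclude installment and crossed-out prices
filsel = ('span.price-tag-fraction'
          ':not(span.ui-search-installments span)'
          ':not(s.price-tag__disabled span)')
precios = [p.get_text() for p in soup.select(filsel)]
print(precios)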
I am running Selenium code on the website DNCA to scrape some of the document links. I am trying to get the link for each value in the drop-down for each section shown on this page. My code works fine, but when I run the same code with headless = True, I get the following error:
ElementClickInterceptedException: element click intercepted: Element <li data-original-index="0">...</li> is not clickable at point (226, 250). Other element would receive the click: <div class="col-md-12">...</div>
(Session info: headless chrome=104.0.5112.81)
Code:
def get_active_row(active_tab, fund_id):
    active_row = active_tab.find_elements(By.XPATH, ".//tr[@style='' or @style='display: table-row;'][@fund-id = '{}']".format(fund_id))
    try:
        assert len(active_row) == 1
        active_row = active_row[0]
        return active_row
    except AssertionError as asserr:
        print(asserr, ' -- More than one active row for the fund id: ', fund_id)
        sys.exit(1)
    except Exception as err:
        print(err, ' -- fund id:', fund_id)
        sys.exit(1)
def scrap(driver):
    tab_list = driver.find_element(By.XPATH, "//ul[contains(@role, 'tablist')]")
    tab_list_names = tab_list.find_elements(By.XPATH, './/li')
    data_list = []
    for loc, tab_name in enumerate(tab_list_names):
        if loc < 20:
            tab_name.click()
            html = driver.page_source
            soup = BeautifulSoup(html)
            bs_active_tab = soup.find('div', {'class': 'tab-pane table-datas active'})
            bs_headers = bs_active_tab.find('thead')
            headers = [i.text for i in bs_headers.find_all('td')]
            active_tab = driver.find_element(By.XPATH, "//div[contains(@class, 'tab-pane table-datas active')]")
            unique_fund_ids = [i_fund.get_attribute('fund-id') for i_fund in active_tab.find_elements(By.XPATH, ".//tr[@style]") if i_fund.get_attribute('fund-id') != '-']
            lookup = set()
            unique_fund_ids = [x for x in unique_fund_ids if x not in lookup and lookup.add(x) is None]
            for fund_id in unique_fund_ids:  # Iterate over each fund
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_list = [i.text for i in active_row.find_elements(By.XPATH, './/li')]
                for pos, isin_val in enumerate(isin_list):
                    isin_selected = active_row.find_elements(By.XPATH, './/li')[pos]
                    isin_selected.click()
                    active_row = get_active_row(active_tab, fund_id)
                    fund_name = ''
                    for pos_inner, td in enumerate(active_row.find_elements(By.XPATH, ".//td")):
                        a_tag = td.find_elements(By.XPATH, ".//a")
                        if len(a_tag) == 1:
                            a_tag = a_tag[0]
                            if pos_inner == 0:
                                fund_name = a_tag.text
                            link = a_tag.get_attribute('href')
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], link])
                        else:
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], ''])
                    active_row = get_active_row(active_tab, fund_id)
                    active_row.find_element(By.XPATH, './/button').click()
                isin_selected_to_close = active_row.find_elements(By.XPATH, './/li')[0]
                isin_selected_to_close.click()
            tlg_tr_tab = active_tab.find_element(By.XPATH, ".//tr[@fund-id='-']")
            for tlg_pos_inner, tlg_td in enumerate(tlg_tr_tab.find_elements(By.XPATH, ".//td")):
                tlg_a_tag = tlg_td.find_elements(By.XPATH, ".//a")
                if len(tlg_a_tag) == 1:
                    tlg_a_tag = tlg_a_tag[0]
                    tlg_link = tlg_a_tag.get_attribute('href')  # Get document link
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], tlg_link])
                else:
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], ''])
    dataset_links = pd.DataFrame(data_list, columns=['Tab', 'Fund Name', 'ISIN', 'Type', 'Link'])
    driver.quit()
Can someone please explain to me why it works fine with headless = False but not with headless = True?
In headless mode the default screen size is very small, significantly smaller than the screen size in regular mode.
So, to overcome this problem, you need to set the window size.
It can be done in the following ways:
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
Or just
driver.set_window_size(1920, 1080)
Both approaches should work.
I prefer the first way :)
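If setting the window size alone doesn't resolve the ElementClickInterceptedException, a common fallback is to scroll the element into view and click it with JavaScript so that no overlapping element can intercept the click. A sketch, using the data-original-index attribute from your error message (adjust the selector to your page):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait for the drop-down entry, scroll it to the centre, then click via JS
item = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "li[data-original-index='0']"))
)
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", item)
driver.execute_script("arguments[0].click();", item)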
I am currently using Selenium to take the product information from Schneider Electric, and this is the error I am receiving:
selenium.common.exceptions.NoSuchElementException: Message:
no such element: Unable to locate element:
{"method":"xpath","selector":"/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li[1]/product-card/article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3"}
Currently, the website I am trying to pull this information from is this URL:
https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=0&Nrpp=12
The XPath is for the description of their products, which according to my inspection and findings is this:
/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li[1]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3
Any ideas??
Current Code:
def page_function():
    driver.get('https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=12&Nrpp=12')
    driver.maximize_window()
    # gets the amount of items in the search bar
    print("Number of products:", 69)
    # for loop to read the product name and descriptions
    # product = driver.find_element(By.CSS_SELECTOR, ".search-item")
    # product = product.text
    # print(product)
    pr = "]/product-card//article/div/div[2]/div[1]/pes-product-price/p/span[1]"
    nam = "]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[1]/a"
    des = "]/product-card//article/div/div[1]/product-card-main-info//div/pes-router-link[2]/a/h3"
    # des_path = "#search-items > .search-item .details > a > .row.pt-5.pb-sm-5 > .multilines-3.text-truncate-multilines.xs-single-col-8.col-12 > .font-weight-bold.text-dark"
    follow_loop = range(1, 70)
    for x in follow_loop:
        y = x
        if (x > 61):
            y = x - 60
        elif (x > 49):
            y = x - 48
        elif (x > 37):
            y = x - 36
        elif (x > 25):
            y = x - 24
        elif (x > 13):
            y = x - 12
        else:
            print("")
        if ((x % 13) == 0):
            driver.delete_all_cookies()
            next_arrow = driver.find_element(By.CLASS_NAME, "page-links__arrow page-links__arrow--next js-page-link js-page-link-next")
            driver.execute_script("arguments[0].click();", next_arrow)
        xpath = "/html/body/div[2]/main/div[5]/ul/li/div/div/div/div/div/ul/li[1]/div/div/div[2]/div[2]/section/div/product-cards-wrapper//div/ul/li["
        xpath += str(y)
        xpath += des
        driver.implicitly_wait(5)
        description.append(driver.find_element(By.XPATH, xpath))
        xpath2 = xpath.replace(des, '')
        xpath2 += pr
        unit_price.append(driver.find_element(By.XPATH, xpath2).text)
        xpath3 = xpath2.replace(pr, '')
        xpath3 += nam
        name.append(driver.find_element(By.XPATH, xpath3).text)
The product description is within a #shadow-root (open)
Solution
To extract the desired text you need to use shadowRoot.querySelector(), and you can use the following locator strategy:
driver.get("https://www.se.com/us/en/product-range/63426-powerlogic-accusine-pcs%2B/?N=4176697776&No=0&Nrpp=12")
time.sleep(5)
description = driver.execute_script('''return document.querySelector("product-cards-wrapper.hydrated").shadowRoot.querySelector("product-card.hydrated").shadowRoot.querySelector("product-card-main-info.hydrated").shadowRoot.querySelector("pes-router-link.description.hydrated a > h3")''')
print(description.text)
Console Output:
Active harmonic filter - 60 A 380..480 V AC - IP00 enclosure
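As an alternative, Selenium 4 exposes a shadow_root property on WebElement, so you can walk the nested shadow roots without execute_script. A sketch, assuming the host tag names shown above are still current:
from selenium.webdriver.common.by import By

# Hop from shadow host to shadow host (Selenium 4+, Chromium-based browsers)
wrapper = driver.find_element(By.CSS_SELECTOR, "product-cards-wrapper.hydrated")
card = wrapper.shadow_root.find_element(By.CSS_SELECTOR, "product-card.hydrated")
info = card.shadow_root.find_element(By.CSS_SELECTOR, "product-card-main-info.hydrated")
title = info.shadow_root.find_element(By.CSS_SELECTOR, "pes-router-link.description.hydrated a > h3")
print(title.text)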
References
You can find a couple of relevant detailed discussions in:
How to locate the First name field within shadow-root (open) within the website https://www.virustotal.com using Selenium and Python
How to get past a cookie agreement page using Python and Selenium?
Unable to locate the Sign In element within #shadow-root (open) using Selenium and Python
Here is a picture (sorry) of the HTML that I am trying to parse:
I am using this line:
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
Thinking that I'd get the 1st child of the class statText and the outcome would be 53%.
But it's not. I get "Loading..." and none of the data that I was trying to use and display.
The full code I have so far:
soup = BeautifulSoup(source, 'lxml')
home_team = soup.find('div', class_='tname-home').a.text
away_team = soup.find('div', class_='tname-away').a.text
home_score = soup.select_one('.current-result .scoreboard:nth-child(1)').text
away_score = soup.select_one('.current-result .scoreboard:nth-child(2)').text
print("The home team is " + home_team, "and they scored " + home_score)
print()
print("The away team is " + away_team, "and they scored " + away_score)
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
print(home_stats)
Which currently does print the home and away team and the number of goals they scored. But I can't seem to get any of the statistical content from this site.
My output plan is to have:
[home_team] had 53% ball possession and [away_team] had 47% ball possession
However, I would like to remove the "%" symbols from the parse (but that's not essential). My plan is to use these numbers for more stats later on, so the % symbol gets in the way.
Apologies for the noob question - this is the absolute beginning of my Pythonic journey. I have scoured the internet and StackOverflow and just can not find this situation - I also possibly don't know exactly what I am looking for either.
Thanks kindly for your help! May your answer be the one I pick as "correct" ;)
Assuming that this is the website that you are trying to scrape, here is the complete code to scrape all the stats:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.scoreboard.com/en/match/SO3Fg7NR/#match-statistics;0')
pg = driver.page_source  # Gets the source code of the page
driver.close()

soup = BeautifulSoup(pg, 'html.parser')  # Creates a soup object
statrows = soup.find_all('div', class_="statTextGroup")  # Finds all the div tags with class statTextGroup -- these div tags contain the stats

# Scrapes the team names
teams = soup.find_all('a', class_="participant-imglink")
teamslst = []
for x in teams:
    team = x.text.strip()
    if team != "":
        teamslst.append(team)

stats_dict = {}
count = 0
for x in statrows:
    txt = x.text
    final_txt = ""
    stat = ""
    alphabet = False
    percentage = False
    # Extracts the numbers from the text
    for c in txt:
        if c in '0123456789':
            final_txt += c
        else:
            if alphabet == False:
                final_txt += "-"
                alphabet = True
            if c != "%":
                stat += c
            else:
                percentage = True
    values = final_txt.split('-')
    # Appends the values to the dictionary
    for x in values:
        if stat in stats_dict.keys():
            if percentage == True:
                stats_dict[stat].append(x + "%")
            else:
                stats_dict[stat].append(int(x))
        else:
            if percentage == True:
                stats_dict[stat] = [x + "%"]
            else:
                stats_dict[stat] = [int(x)]
    count += 1
    if count == 15:
        break

index = [teamslst[0], teamslst[1]]
# Creates a pandas DataFrame out of the dictionary
df = pd.DataFrame(stats_dict, index=index).T
print(df)
Output:
Burnley Southampton
Ball Possession 53% 47%
Goal Attempts 10 5
Shots on Goal 2 1
Shots off Goal 4 2
Blocked Shots 4 2
Free Kicks 11 10
Corner Kicks 8 2
Offsides 2 1
Goalkeeper Saves 0 2
Fouls 8 10
Yellow Cards 1 0
Total Passes 522 480
Tackles 15 12
Attacks 142 105
Dangerous Attacks 44 29
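Since you mentioned that the % symbol gets in the way of later calculations, here is a small follow-up sketch (assuming df is the DataFrame printed above) that strips it and prints the sentence from your output plan:
# Strip the trailing % from the possession row and cast to integers
poss = df.loc['Ball Possession'].str.rstrip('%').astype(int)
print(f"{df.columns[0]} had {poss.iloc[0]}% ball possession "
      f"and {df.columns[1]} had {poss.iloc[1]}% ball possession")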
Hope that this helps!
P.S.: I actually wrote this code for a different question, but I didn't post it there as an answer had already been posted! I didn't know that it would come in handy now! Anyway, I hope that my answer does what you need.
I have the below DataFrame that I saved to excel using the pandas library:
Report No. Score Specifications
26-013RN42 >=1000 WaterSense certified
26-013RN42 >=1000 Single-Flush HET
26-013RN42 >=1000 Floor Mounted
26-013RN42 >=1000 2 Piece Unit
26-013RN42 >=1000 Round
26-013RN42 >=1000 Standard
26-013RN42 >=1000 Gravity
26-013RN42 >=1000 Floor Outlet
26-013RN42 >=1000 Flapper size 3in
26-013RN42 >=1000 Rough-in: 10"
26-013RN42 >=1000 Insulated: No
As you can see the "Report No." column and the "Score" column are all the same value but the "Specifications" columns are all different.
What I was hoping to do was combine all of the values under the "Specifications" column into one row as seen below:
Report No. Score Specifications
26-013RN42 >=1000 WaterSense certified, Single-Flush HET, Floor Mounted, 2 Piece Unit, Round, Standard, Gravity, Floor Outlet, Flapper size 3in, Rough-in: 10", Insulated: No
EDIT:
Here is my input code. The purpose of this code is to go to a website, scrape data and organize it into a table. Didn't post it before as it is a tad messy and I know there are ways for it to be more efficient. Please let me know if you have any suggestions on how to improve the code!
python:
url2 = 'https://www.map-testing.com/map-search/?start=3&searchOptions=AllResults'
urlh2 = requests.get(url2)
info2 = urlh2.text
soup = BeautifulSoup(info2, 'html.parser')
toilets = soup.find_all('div', attrs={'class': 'search-result'})
testlist = []
datalist = []
for s in toilets[0].stripped_strings:
    datalist.append(s)
dict = {}
count = 0
for info in datalist[:9]:
    if count == 0:
        dict[info] = datalist[count + 1]
        count += 1
    elif (count % 2) == 1:
        count += 1
        continue
    elif (count % 2) == 0:
        dict[info] = datalist[count + 1]
        count += 1
specs = datalist[11:22]
dict['Specifications'] = specs
df = pd.DataFrame(dict)
Using BeautifulSoup to scrape the HTML page data, and using the pandas library to convert the scraped records into a DataFrame:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url2 = 'https://www.map-testing.com/map-search/?start=3&searchOptions=AllResults'
urlh2 = requests.get(url2)
soup = BeautifulSoup(urlh2.text, 'html.parser')
results = soup.find_all('div', attrs={'class': 'search-result'})

jsonData = []
for row_obj in results:
    data = {}
    row = row_obj.find("div")
    # scrape Manufacturer
    manufacturer = row.find("div", string="Manufacturer")
    data['Manufacturer'] = manufacturer.find_next('div').text.strip()
    # scrape Model Name
    modelName = row.find("div", string="Model Name")
    data['Model Name'] = modelName.find_next('div').text.strip()
    # scrape Model Number
    modelNumber = row.find("div", string="Model Number")
    data['Model Number'] = modelNumber.find_next('div').text.strip()
    # scrape MaP Report No.
    maPReportNo = row.find("div", string="MaP Report No.")
    data['MaP Report No.'] = maPReportNo.find_next('div').text.strip()
    # scrape MaP Flush Score
    maPFlushScore = row.find("div", string="MaP Flush Score")
    data['MaP Flush Score'] = maPFlushScore.find_next('div').text.strip()
    # scrape Specifications
    specifications = row.find_all("li")
    data['Specifications'] = ",".join(i.text.strip() for i in specifications)
    jsonData.append(data)

df = pd.DataFrame(jsonData)
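If you prefer to keep your own scraping code, you can instead collapse your original DataFrame (the one with one row per specification) with a groupby and a string join, then write it back to Excel. A sketch with the column names from your table (the output file name is just an example):
# Combine all Specifications values into one comma-separated cell per report
combined = (df.groupby(['Report No.', 'Score'], as_index=False)['Specifications']
              .agg(', '.join))
combined.to_excel('map_results.xlsx', index=False)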