I have a simple question that can presumably be solved very easily.
However, I have now spent some time trying to extract the four lines of information shown here:
see html structure here
I first try to access the <ul _ngcontent-xl-byg-c79="" class="short ng-star-inserted"> item and then loop over the <li _ngcontent-xl-byg-c79="" class="table-row ng-star-inserted"> items in order to store the embedded information in my dataframe (columns are 'Mærke', 'Produkttype', 'Serie', and 'Model').
What am I doing wrong? My problem is that the four lines have the same "class" name, which gives me the same output in all four loops.
This is my code:
from selenium import webdriver
import pandas as pd
# Open a Chrome session driven by the local chromedriver binary.
browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
# Load the product page.
browser.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")
# One list per DataFrame column.
brand = []
product = []
series = []
model = []
# Walk every <li> row of the short attribute list.
for i in browser.find_elements_by_xpath("//ul[@class='short ng-star-inserted']/li"):
    # NOTE(review): an XPath that starts with "//" is evaluated against the
    # whole document, not just the current <li>; ".//" would stay inside `i`.
    for p in i.find_elements_by_xpath("//span[@class='attribute-name']"):
        # '?' is the still-unknown class name this question is asking about.
        # NOTE(review): find_elements_* returns a list, which has no `.text`.
        brand.append(i.find_elements_by_class_name('?').text)
        product.append(i.find_elements_by_class_name('?').text)
        series.append(i.find_elements_by_class_name('?').text)
        model.append(i.find_elements_by_class_name('?').text)
# Assemble the collected columns into the result frame.
df = pd.DataFrame()
df['brand'] = brand
df['product'] = product
df['series'] = series
df['model'] = model
Any help is very appreciated!!
Try like below and confirm:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Start Chrome and load the product page.
driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")
wait = WebDriverWait(driver, 30)
# Cookie pop-up: the accept button is matched by either of its two labels.
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@aria-label='Accept all' or @aria-label = 'Accepter alle']"))).click()
# Every <li> of the short attribute list is one "name : value" row.
options = driver.find_elements_by_xpath("//div[@class='row-column']//ul[contains(@class,'short')]/li")
for opt in options:
    # A leading "." makes the XPath relative to `opt`, so each row yields its
    # own name/value instead of the first match on the page.
    attribute = opt.find_element_by_xpath("./span[@class='attribute-name']").text
    value = opt.find_element_by_xpath("./*[contains(@class,'ng-star-inserted')]").text
    print(f"{attribute} : {value}")
Mærke : Knauf Insulation
Produkttype : Murfilt
Serie : ECOBATT
Materiale : Glasmineraluld
Related
I'm not sure if this is practically valid, but I was wondering if there's a way in Selenium to wait for an <a> tag (one out of two) based on its href value or on the text that follows the opening tag (its link text).
What I'm trying to do is open the page https://www.coingecko.com/en/exchanges, iterate through the exchange links, visit each one of them, and then click on the 'About' tab of each newly opened page, as it contains the info to be extracted. The code actually worked until about halfway through, when it failed to identify the tab properly and raised StaleElementException and elementNotFound, because I located the tab with driver.find_element_by_text.
The problem is that the position of the 'About' tab changes from one page to the other, so it's either //ul[@role='tablist']/li[3] or li[2], and that's why I'm trying to wait for and click on the right element based on its href value, since the href of one of the <a> tags on the page contains the text #about ---> //ul[@role='tablist']/li[3]/a
Apologies if it wasn't straightforward but I was trying to pinpoint what the issue was until recently :)
This is the code that I've attempted so far; I'd be grateful if anyone could point me in the right direction.
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
# Path to the chromedriver binary (a plain string, despite the name).
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
num_of_pages = 4
exchanges_list = []
names_list = []
websites_list = []
emails_list = []
years_list = []
countries_list = []
twitter_list = []
for i in range(num_of_pages):
    url = 'https://www.coingecko.com/en/exchanges?page=' + str(i+1)
    driver.get(url)
    links = driver.find_elements_by_xpath("//tbody[@data-target='exchanges-list.tableRows']/tr/td[2]/div/span[2]/a")
    # Read the hrefs now, before the driver navigates away from this page.
    links = [anchor.get_attribute('href') for anchor in links]
    time.sleep(0.5)
    for link in links:
        driver.get(link)
        wait = WebDriverWait(driver, 2)
        # NOTE(review): this condition inspects the element's "value"
        # attribute, not its href -- this wait is what the question is about.
        wait.until(EC.text_to_be_present_in_element_value((By.XPATH, "//ul[@role='tablist']/li[position()=2 or position()=3]/a"), '#about'))
        # Reset per exchange so a failed lookup never reuses the values
        # scraped from the previous page.
        name = website = email = year_est = inc_country = twitter = None
        try:
            name = driver.find_element_by_xpath("//div[@class='exchange-details-header-content']/div/h1").text
            website = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[8]/a").get_attribute('href')
            email = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[9]/a").get_attribute('href')
            year_est = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[10]").text
            inc_country = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[12]").text
            twitter = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[16]/div[2]/div[2]/a").get_attribute('title')
        except Exception as exc:
            # Best effort: keep going, but say which page was incomplete
            # instead of silently swallowing the error.
            print('could not read all details from {}: {}'.format(link, exc))
        print('---------------')
        print('exchange name is : {}'.format(name))
        print('exchange website is : {}'.format(website))
        print('exchange email is : {}'.format(email))
        print('exchange established in year: {}'.format(year_est))
        print('exchange incorporated in : {}'.format(inc_country))
        print('exchange twitter handle is: {}'.format(twitter))
        # Always append to every list so the columns stay the same length;
        # zip() below truncates to the shortest list.
        names_list.append(name)
        websites_list.append(website)
        emails_list.append(email)
        years_list.append(year_est)
        countries_list.append(inc_country)
        twitter_list.append(twitter)
df = pd.DataFrame(list(zip(names_list, websites_list,emails_list, years_list, countries_list, twitter_list)), columns=['Ex_Names', 'Website', 'Support Email', 'Inc Year', 'Inc Country', 'Twitter Handle' ])
# DataFrame.to_csv returns None when given a path; the name is kept only for
# compatibility with the original script.
CoinGecko2_data = df.to_csv('CoinGecko4.csv', index=False)
If you know the href, just wait for: //a[contains(@href, 'my-href')]
I am not sure if there is a built-in one, but you can create your own custom wait. Here is an example:
https://seleniumbyexamples.github.io/waitcustom
I want to harvest information using beautiful soup and python3 from a table within a given website .
I have also tried to use XPath method but still cannot get a way to obtain the data.
coaches = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
# Parse inside the `with` block so the HTTP response is closed as soon as
# the document has been read.
with urlopen(coaches) as coachespage:
    soup = BeautifulSoup(coachespage, features="html.parser")
# All <tbody> elements carrying the grid's id.
data = soup.find_all("tbody", {"id": "JGrid-az-com-1031-tbody"})
def crawler(table):
    """Return the text of the first ``<td>`` cell found in *table*.

    *table* is an iterable of elements that expose ``find_all``.  Each
    element's ``<tr>`` rows are visited in order, then each row's ``<td>``
    cells.

    Returns:
        The ``text`` of the first cell encountered, or ``None`` when no
        element contains any cell.

    Raises:
        ValueError: with message ``"no data"`` if walking an element fails;
            the underlying error is attached as ``__cause__``.
    """
    for mytable in table:
        try:
            for tr in mytable.find_all('tr'):
                for td in tr.find_all('td'):
                    # NOTE(review): returning here yields only the very first
                    # cell; collecting every cell would change the return
                    # type, so the original contract is kept.
                    return td.text
        except Exception as err:
            # Keep the ValueError contract, but chain the real cause and let
            # KeyboardInterrupt/SystemExit through.
            raise ValueError("no data") from err
    return None
# Run the crawler over the matched <tbody> elements and show what it returns.
print(crawler(data))
If you use selenium to make the selections and then pd.read_html the page_source to get the table, this allows javascript to run and populate values
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
url = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
driver = webdriver.Chrome()
try:
    driver.get(url)
    ele = driver.find_element_by_css_selector('.az-triggers-panel a')  # distance dropdown
    # Bring the dropdown into view before clicking it.
    driver.execute_script("arguments[0].scrollIntoView();", ele)
    ele.click()
    # Wait until the option can actually be clicked, not merely until it is
    # present in the DOM.
    option = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "comboOption-az-com-1015-8")))  # any distance
    option.click()
    driver.find_element_by_css_selector('.az-btn-text').click()
    time.sleep(5)  # seek better wait condition for page update
    # page_source now contains the JavaScript-populated table.
    tables = pd.read_html(driver.page_source)
finally:
    # Always release the browser, even if a lookup or wait above failed.
    driver.quit()
I am trying to grab the stock symbol from this page.
This is my code:
from selenium import webdriver
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
# Start Chrome through the chromedriver binary at this path.
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
browser.get(url)
# Hand the page source to pandas, which returns the tables it finds as a
# list of DataFrames.
dfs = pd.read_html(browser.page_source)
print(dfs)
browser.close()
This is the output:
dfs
[ 0
0 加入至心水組合:請先登入或註冊成為會員, Empty DataFrame
Columns: [沒有符合以上篩選條件的股票。]
Index: [], 0
0 加入至心水組合:請先登入或註冊成為會員]
I know it's JavaScript and I already use Selenium. How come I can't get the table? And how do I get the stock symbols on the page as shown below? Thanks.
Additional info: After opening the link, choose the 2nd option from the GREEN drop-down list; then the above table will be shown.
One way is as follows
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
driver = webdriver.Chrome()
try:
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'option')))
    # select the second dropdown option by its value attribute whose value is mb
    driver.find_element_by_css_selector('[value=mb]').click()
    # wait for blue button to be clickable and click
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '[href*=submit]'))).click()
    # select table
    table = driver.find_element_by_css_selector('.dt960')
    # transfer html of table to pandas read_html which handles tables
    df = pd.read_html(table.get_attribute('outerHTML'))[0]  # grab the table
    df2 = df.drop(df.columns[0], axis=1).dropna(how='all')  # lose the nan column and rows
    # Chain the clean-up and rebind the result: reset_index() returns a new
    # frame, so calling it without assignment left the index unchanged.
    df2 = (
        df2.rename(columns=df.iloc[0])  # set headers same as row 1
        .drop(df.index[0])              # lose row 1
        .reset_index(drop=True)         # re-index
    )
    print(df2)
finally:
    # Close the browser even when a wait or lookup above fails.
    driver.quit()
I'm trying to get the URLs from a PLP and visit each of them to extract certain keywords from the PDP and dump them into a JSON file. However, the list only contains 1 item. I suspect the website is trying to block the action. I use this program once a month to see what new features are added in new items.
The code between the "***" markers is the part I am having trouble with. It returns the correct value but only 1 item. How can I get more data? In the example below I am only getting the product names to keep it simple.
sample url: "https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3"
Actual element
<div class="exp-product-wall clearfix">
::before
<div class="grid-item fullSize" data-pdpurl="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf" data-column-index="0" data-item-index="1">
<div class="grid-item-box">
<div class="grid-item-content">
<div class="grid-item-image">
<div class="grid-item-image-wrapper sprite-sheet sprite-index-1">
<a href="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf">
<img src="https://images.nike.com/is/image/DotCom/pwp_sheet2?$NIKE_PWPx3$&$img0=BQ8928_001&$img1=BQ8928_003&$img2=BQ8928_005">
Below working code
import selenium
import json
import time
import re
import string
import requests
import bs4
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
domain = 'website url goes here'
def prepare_driver(url):
    '''Return a Chrome WebDriver that has loaded *url*.

    Waits up to 10 seconds for an element with class ``product-name`` to be
    present, then sleeps two more seconds before returning the driver.

    If loading the page or the wait fails, the browser is quit before the
    exception propagates, so no orphaned Chrome process is left behind.
    '''
    driver = webdriver.Chrome(executable_path='location to chromedriver')
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(
            (By.CLASS_NAME, 'product-name ')))
        time.sleep(2)
    except Exception:
        driver.quit()
        raise
    return driver
def fill_form(driver, search_argument):
    '''Wait up to 10 seconds for the ``product-name`` elements to be present.

    *search_argument* is accepted for interface compatibility but is not
    used: the code that typed it into the search field (id ``q``) and clicked
    the ``search__submit`` button was disabled in the original, so this
    function only waits and returns ``None``.
    '''
    WebDriverWait(driver, timeout=10).until(
        EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'product-name ')))
def scrape_results(driver, n_results):
    '''Return the scraped data of at most *n_results* products.

    Collects product URLs from the ``data-pdpurl`` attribute on the current
    page, then calls ``scrape_product_data`` for the first *n_results* of
    them.  When fewer URLs were collected, fewer results are returned
    instead of raising ``IndexError``.
    '''
    products_urls = []
    # *** start of the part the question is about ***
    # NOTE(review): the inner XPath starts with "//", so it is evaluated
    # against the whole document, and find_element returns only its first
    # match -- one URL per container element.
    for product_title in driver.find_elements_by_xpath('//div[@class="exp-gridwall-content clearfix"]'):
        products_urls.append(product_title.find_element_by_xpath(
            '//div[@class="grid-item fullSize"]').get_attribute('data-pdpurl'))
    # *** end of the part the question is about ***
    # max() keeps a negative n_results meaning "nothing", as range() did.
    wanted = products_urls[:max(n_results, 0)]
    return [scrape_product_data(driver, url) for url in wanted]
def scrape_product_data(driver, product_url, wait_seconds=12):
    '''Visit a product page and extract its data.

    Args:
        driver: WebDriver to use.  When ``None``, a driver is created with
            ``prepare_driver`` and quit again before returning.
        product_url: URL of the product page to load.
        wait_seconds: Seconds to sleep after loading the page, before the
            title is read.  Defaults to the original fixed 12-second pause.

    Returns:
        A dict with the key ``product_name`` holding the ``textContent`` of
        the ``<h1 id="pdp_product_title">`` element.
    '''
    owns_driver = driver is None
    if owns_driver:
        driver = prepare_driver(product_url)
    try:
        driver.get(product_url)
        time.sleep(wait_seconds)
        # Get the product name
        title = driver.find_element_by_xpath('//h1[@id="pdp_product_title"]')
        return {'product_name': title.get_attribute('textContent')}
    finally:
        # A driver created here is never handed back to the caller, so it
        # must be closed here.
        if owns_driver:
            driver.quit()
if __name__ == '__main__':
    # Bind the name first: if prepare_driver() raises, the finally clause
    # must not fail with a NameError that hides the real error.
    driver = None
    try:
        driver = prepare_driver(domain)
        products_data = scrape_results(driver, 2)
        # ensure_ascii=False keeps non-ASCII text (e.g. Japanese) readable
        # instead of escaping it.
        products_data = json.dumps(products_data, indent=4, ensure_ascii=False)
        # Explicit UTF-8: the platform default encoding may be unable to
        # encode the unescaped characters.
        with open('data.json', 'w', encoding='utf-8') as f:
            f.write(products_data)
    finally:
        if driver is not None:
            driver.quit()
Desired Output in json:
[
{
"product_name": "Nike Epic React Flyknit 2",
"descr": "The Nike Epic React Flyknit 2 takes a step up from its predecessor with smooth, lightweight performance and a bold look. An updated Flyknit upper conforms to your foot with a minimal, supportive design. Underfoot, durable Nike React technology defies the odds by being both soft and responsive, for comfort that lasts as long as you can run."
},
{
"product_name": "Nike Zoom Fly SP Fast Nathan Bell",
"descr": "The Nike Zoom Fly SP Fast Nathan Bell is part of a collaboration with artist Nathan Bell, featuring hand-drawn graphics that celebrate running as a competition with yourself. It's designed to meet the demands of your toughest tempo runs, long runs and race day with a responsive construction that turns the pressure of each stride into energy return for the next."
}
]
You can easily get the urls with requests. I targeted the data-pdpurl attribute. In the selenium loop you may need to add some handling of requests for location. A short wait is needed during loop to prevent false claims of product not available.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
d = webdriver.Chrome()
results = []
try:
    # The listing page is fetched with requests; only the product pages are
    # loaded in the browser.
    r = requests.get('https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3')
    soup = bs(r.content, 'lxml')
    products = []
    listings = soup.select('.grid-item')
    for listing in listings:
        url = listing['data-pdpurl']
        title = listing.select_one('.product-display-name').text
        row = {'title': title,
               'url': url}
        products.append(row)
    for product in products:
        url = product['url']
        try:
            # Load the page once, inside the try, so a failed navigation is
            # reported for this URL instead of aborting the whole run.
            d.get(url)
            desc = WebDriverWait(d, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".description-preview")))
            results.append({'product_name': product['title'],
                            'descr': desc.text})
        except Exception as e:
            print(e, url)
        finally:
            # Short pause between product pages.
            time.sleep(1)
finally:
    # Close the browser even if the listing request or parsing fails.
    d.quit()
print(results)
I am a newbie with Python and trying to retrieve data within
this Site using Python version 3.6.0
There are 2 dropdowns and second's data depends on the first's selection.
First: 'Organizasyon Adi'
Second: 'UEVCB Adi'
All options from the source is like:
<option value="0" selected="selected">TÜMÜ</option> #this is default value when we open the page
<option value="10374">1461 TRABZON ELEKTRİK ÜRETİM A.Ş</option>
<option value="9426">2M ELEKTRİK ÜRETİM SANAYİ VE TİCARET ANONİM ŞİRKETİ</option>
These are the options for the first dropdown, and there are almost 800 of them.
We can't see the second dropdown's options without inspecting the page unless the second dropdown box is clicked. (Both dropdowns open a search box when clicked.)
Second Dropdown opens a list of units for selected organisation.
When options from two Dropdowns are selected it generates a table data and we're trying to get data for all units.
I couldn't manage to scrape the data for all units with one program, so I decided to scrape them individually.
With this code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)
# First dropdown: open it, type into its filter box and confirm with Enter.
organisation = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_label']")
organisation.click()
dropdown1 = driver.find_element_by_xpath(".//*[@id='j_idt102:distributionId_filter']")
dropdown1.send_keys('1461')
dropdown1.send_keys(Keys.ENTER)  # same key code as u'\ue007'
# NOTE(review): the second dropdown's label is located but never clicked,
# unlike the first one -- this is the step the question reports failing.
unit = driver.find_element_by_id('j_idt102:uevcb_label')
dropdown2 = driver.find_element_by_xpath(".//*[@id='j_idt102:uevcb_filter']")
dropdown2.send_keys('SAMA')
dropdown2.send_keys(Keys.ENTER)
apply = driver.find_element_by_xpath("//*[@id='j_idt102:goster']")
apply.click()
time.sleep(5)
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed.
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find_all('table')[0]
rows = table.find_all('tr')[1:]  # skip the first row
# Output column name -> index of the <td> it is read from (index 2 is not
# exported).
column_index = {
    '01.Date': 0,
    '02.Hour': 1,
    '03.NaturalGas': 3,
    '04.Wind': 4,
    '05.Lignite': 5,
    '06.Hard_Coal': 6,
    '07.ImportedCoal': 7,
    '08.Geothermal': 8,
    '09.Hydro_Dam': 9,
    '10.Naphta': 10,
    '11.Biomass': 11,
    '12.River': 12,
    '13.Other': 13,
}
data = {name: [] for name in column_index}
for row in rows:
    cols = row.find_all('td')
    for name, index in column_index.items():
        data[name].append(cols[index].get_text())
df = pd.DataFrame(data)
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter('//192.168.0.102/Data/kgup.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
time.sleep(5)
driver.close()
By this code we can select from first dropdown using search function and Enter key.
When it comes to second, it generates ImportError: sys.meta_path is None, Python is likely shutting down
How should I handle this?
Thanks.
Your code seems to be sensitive to StaleElementException as well as to the exception Element is not clickable at point.... Try the code below for the web-scraping part and let me know the result:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
wait = WebDriverWait(driver, 20)
driver.maximize_window()


def _wait_until_hidden(element_id):
    """Block until the element with *element_id* is no longer visible."""
    wait.until_not(EC.visibility_of_element_located((By.ID, element_id)))


def _when_clickable(element_id):
    """Return the element with *element_id* once it is clickable."""
    return wait.until(EC.element_to_be_clickable((By.ID, element_id)))


_wait_until_hidden('j_idt15')  # wait until modal disappeared

# Organization drop-down: open it, then filter and confirm with Enter.
_when_clickable('j_idt102:distributionId_label').click()
_when_clickable('j_idt102:distributionId_filter').send_keys('1461' + Keys.ENTER)
_wait_until_hidden('j_idt179_modal')  # wait until modal disappeared

# Unit drop-down: same sequence as above.
_when_clickable('j_idt102:uevcb_label').click()
_when_clickable('j_idt102:uevcb_filter').send_keys('SAMA' + Keys.ENTER)

_when_clickable('j_idt102:goster').click()  # click Apply
_wait_until_hidden('j_idt15')  # wait until modal disappeared
soup = BeautifulSoup(driver.page_source)
....