How do I get the stock symbol by Selenium? - python

I am trying to grab the stock symbol from this page.
This is my code:
from selenium import webdriver
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
browser.get(url)
dfs = pd.read_html(browser.page_source)
print(dfs)
browser.close()
This is the output:
dfs
[ 0
0 加入至心水組合:請先登入或註冊成為會員, Empty DataFrame
Columns: [沒有符合以上篩選條件的股票。]
Index: [], 0
0 加入至心水組合:請先登入或註冊成為會員]
I know it's javascript and I used Selenium already. How come I can't get the table? And how do I get the stock symbol in the page as shown below? Thanks.
Additonal info: After clicking the link, choose the 2nd one from the GREEN drop-down list, then the above table will be shown.

One way is as follows
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
driver = webdriver.Chrome()
driver.get(url)
WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'option')))
# select the second dropdown option by its value attribute whose value is mb
driver.find_element_by_css_selector('[value=mb]').click()
#wait for blue button to be clickable and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '[href*=submit]'))).click()
#select table
table = driver.find_element_by_css_selector('.dt960')
#transfer html of table to pandas read_html which handles tables
df = pd.read_html(table.get_attribute('outerHTML'))[0] #grab the table
df2 = df.drop(df.columns[0], axis=1).dropna(how='all') #lose the nan column and rows
df2.rename(columns=df.iloc[0], inplace = True) #set headers same as row 1
df2.drop(df.index[0], inplace = True) #lose row 1
df2.reset_index(drop=True) #re-index
print(df2)
driver.quit()

Related

Get information from webpage with the same class names (Python Selenium)

I have a simple question that assumingly can be solved very easily.
I however used some time now to extract the four lines of information as shown here:
see html structure here
I first try to access the <ul _ngcontent-xl-byg-c79="" class="short ng-star-inserted" item to then loop over the <li _ngcontent-xl-byg-c79="" class="table-row ng-star-inserted"> items in order to store the embedded information in my dataframe (columns are 'Mærke', 'Produkttype', 'Serie', and 'Model').
What do I do wrong? My problem is that the four lines have the same "class" name, which gives me the same output in all four loops.
This is my code:
from selenium import webdriver
import pandas as pd
# Activate web browser: External control
browser = webdriver.Chrome(r'C:\Users\KristerJens\Downloads\chromedriver_win32\chromedriver')
# Get webpage
browser.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")
# Get information
brand= []
product= []
series=[]
model=[]
for i in browser.find_elements_by_xpath("//ul[#class='short ng-star-inserted']/li"):
for p in i.find_elements_by_xpath("//span[#class='attribute-name']"):
brand.append(i.find_elements_by_class_name('?').text)
product.append(i.find_elements_by_class_name('?').text)
series.append(i.find_elements_by_class_name('?').text)
model.append(i.find_elements_by_class_name('?').text)
df = pd.DataFrame()
df['brand'] = brand
df['product'] = product
df['series'] = series
df['model'] = model
Any help is very appreciated!!
Try like below and confirm:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.xl-byg.dk/shop/knauf-insulation-ecobatt-murfilt-190-mm-2255993")
wait = WebDriverWait(driver,30)
# Cookie pop-up
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[#aria-label='Accept all' or #aria-label = 'Accepter alle']"))).click()
options = driver.find_elements_by_xpath("//div[#class='row-column']//ul[contains(#class,'short')]/li")
for opt in options:
attribute = opt.find_element_by_xpath("./span[#class='attribute-name']").text # Use a "." in the xpath to find element within in an element
value = opt.find_element_by_xpath("./*[contains(#class,'ng-star-inserted')]").text
print(f"{attribute} : {value}")
Mærke : Knauf Insulation
Produkttype : Murfilt
Serie : ECOBATT
Materiale : Glasmineraluld

Appending table values from a webpage to csv

I want to get the table from webpage
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'
options.add_argument("--headless");
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
driver.get("https://munafasutra.com/nse/dividends")
file_object = open('divident.csv', 'a')
output table
How to get the first table and their values?
You have to look at the HTML path and locate the WebElement that is gathering that first table (Clicking "Inspect" when you click right-button of the mouse can do the work).
You can save that webelement using the following line of code:
first_table = driver.find_element_by_xpath("//div[#id = 'co']//table[1]") # The [1] is not really necessary as when using **find_element_by_xpath** will only look for the first element.
Then, if you look at how data is organized inside that table, you can observe each row is gathered by a tr WebElement. Therefore, if you wish to write it in a csv file, I would suggest to write row by row with the following code:
rows = first_table.find_elements_by_xpath("./tbody/tr")
for row in rows:
entries_of_the_row = row.find_elements_by_xpath("./td")
row_to_csv = []
for entry in entries_of_the_row:
row_to_csv.append(entry.text)
file_object.write(f"{row_to_csv[0]}, {row_to_csv[1]}, {row_to_csv[2]}, {row_to_csv[3]}, {row_to_csv[4]}\n")
file_object.close()
You can use below XPATH to retrieve the first table value :
//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td
Something like this :
driver.get("https://munafasutra.com/nse/dividends")
first_table = driver.find_elements(By.XPATH, "//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td")
for first in first_table:
print(first.text)
You can use BeautifulSoup to get the table data. Selenium is not required if you just want to extract web page data.
You need to import below packages :
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
You can Extract HTML of the table using below code (soup variable will contain the HTML code of the entire page):
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html')
Below is the code to extract HTML for the 1st table (here table is the tag value and in [] contains the index of the table we want to extract data of):
first_table = soup.find_all('table')[0]
You can also add attributes to distinctly identify the table along with tag name.
Below is the code to extract all the rows in the selected table :
all_rows = first_table.findAll("tr")
Use the below code to write the data in csv file :
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
table_to_csv = csv.writer(f)
for row in all_rows:
row_data = []
for cell in row.findAll(["td", "th"]):
row_data.append(cell.get_text())
table_to_csv.writerow(row_data)
Below is the complete code to extract 1st table data to csv :
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html')
first_table = soup.find_all('table')[0]
all_rows = first_table.findAll("tr")
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
table_to_csv = csv.writer(f)
for row in all_rows:
row_data = []
for cell in row.findAll(["td", "th"]):
row_data.append(cell.get_text())
table_to_csv.writerow(row_data)

Scraping data from HTML table usin xpath and LXML or selenium

I need to extract data from HTML table from this website:
https://1x2.lucksport.com/result_en.shtml?dt=2019-04-12&cid=156
I use Python, selenium and lxml with xpath
I want to extract each match odds
The problem is that each match is in 2 row
two : tr class="dtd2", then come two: tr class="dtd1"
I need the xpath that allow to extract the first row and his following row
driver.get(u)
t = html.fromstring(driver.page_source)
for i in t.xpath('//*[#id="odds_tb"]/table/tbody/tr[#class="dtd2"]/td[1]/text()'):
A more verbose method
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
import pandas as pd
import copy
d = webdriver.Chrome()
d.get('https://1x2.lucksport.com/result_en.shtml?dt=2019-04-12&cid=156')
WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#odds_tb tr[class]")))
soup = bs(d.page_source, 'lxml')
rows = soup.select('#odds_tb tr[class]')
results = []
i = 1
headers = ['Competition', 'Date', 'Match' ,'OddsType', 'Home Win', 'Draw', 'Away Win', 'Result']
for row in rows[1:]:
cols = [td.text for td in row.select('td')]
if (i % 2 == 1):
record = {'Competition' : cols[0],
'Date' : cols[1],
'Match' : ' v '.join([cols[2], cols[6]]),
'OddsType' : 'average early odds',
'Home Win' : cols[3],
'Draw' : cols[4],
'Away Win' : cols[5],
'Result' : cols[7]}
else:
record['OddsType'] = 'average live odds'
record['Home Win'] = cols[0]
record['Draw'] = cols[1]
record['Away Win'] = cols[2]
results.append(copy.deepcopy(record))
i+=1
df = pd.DataFrame(results, columns = headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig',index = False )
d.quit()
You can use both selenium and pandas to get the table info.
from selenium import webdriver
import time
import pandas as pd
driver = webdriver.Chrome()
driver.get("https://1x2.lucksport.com/result_en.shtml?dt=2019-04-12&cid=156")
time.sleep(3)
htmlcontent=driver.page_source
tables=pd.read_html(htmlcontent)
print(tables[14])
It looks like you want to iterate the odd trs and then include the "next" tr.
In css that looks like:
.dtd1:nth-child(odd),.dtd2:nth-child(odd)
You can get odds with xpath too, just add:
[position() mod 2 = 1]

Cannot extract the html table

I want to harvest information using beautiful soup and python3 from a table within a given website .
I have also tried to use XPath method but still cannot get a way to obtain the data.
coaches = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
coachespage = urlopen(coaches)
soup = BeautifulSoup(coachespage,features="html.parser")
data = soup.find_all("tbody", { "id" : "JGrid-az-com-1031-tbody" })
def crawler(table):
for mytable in table:
try:
rows = mytable.find_all('tr')
for tr in rows:
cols = tr.find_all('td')
for td in cols:
return(td.text)
except:
raise ValueError("no data")
print(crawler(data))
If you use selenium to make the selections and then pd.read_html the page_source to get the table, this allows javascript to run and populate values
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
url = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
driver = webdriver.Chrome()
driver.get(url)
ele = driver.find_element_by_css_selector('.az-triggers-panel a') #distance dropdown
driver.execute_script("arguments[0].scrollIntoView();", ele)
ele.click()
option = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.ID, "comboOption-az-com-1015-8"))) # any distance
option.click()
driver.find_element_by_css_selector('.az-btn-text').click()
time.sleep(5) #seek better wait condition for page update
tables = pd.read_html(driver.page_source)

Scraping from dropdown menus with Python

I am a newbie with Python and trying to retrieve data within
this Site using Python version 3.6.0
There are 2 dropdowns and second's data depends on the first's selection.
First: 'Organizasyon Adi'
Second: 'UEVCB Adi'
All options from the source is like:
<option value="0" selected="selected">TÜMÜ</option> #this is default value when we open the page
<option value="10374">1461 TRABZON ELEKTRİK ÜRETİM A.Ş</option>
<option value="9426">2M ELEKTRİK ÜRETİM SANAYİ VE TİCARET ANONİM ŞİRKETİ</option>
These are options for firs Dropdown and there are almost 800 options.
We cant see the second Dropdowns options without inspecting the page unless the second Dropdown box is clicked. (Both dropdowns opens a searchbox when clicked.)
Second Dropdown opens a list of units for selected organisation.
When options from two Dropdowns are selected it generates a table data and we're trying to get data for all units.
I couldn't make it to scrap data for all units with one program, so i decided to scrap them individually.
With this code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
import time
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)
organisation = driver.find_element_by_xpath(".//*[#id='j_idt102:distributionId_label']")
organisation.click()
dropdown1 = driver.find_element_by_xpath(".//*[#id='j_idt102:distributionId_filter']")
dropdown1.send_keys('1461')
dropdown1.send_keys(u'\ue007')
unit = driver.find_element_by_id('j_idt102:uevcb_label')
dropdown2 = driver.find_element_by_xpath(".//*[#id='j_idt102:uevcb_filter']")
dropdown2.send_keys('SAMA')
dropdown2.send_keys(u'\ue007')
apply= driver.find_element_by_xpath("//*[#id='j_idt102:goster']")
apply.click()
time.sleep(5)
soup = BeautifulSoup(driver.page_source)
table = soup.find_all('table')[0]
rows = table.find_all('tr')[1:]
data = {
'01.Date' : [],
'02.Hour' : [],
'03.NaturalGas' : [],
'04.Wind' : [],
'05.Lignite' : [],
'06.Hard_Coal' : [],
'07.ImportedCoal' : [],
'08.Geothermal' : [],
'09.Hydro_Dam' : [],
'10.Naphta' : [],
'11.Biomass' : [],
'12.River' : [],
'13.Other' : []
}
for row in rows:
cols = row.find_all('td')
data['01.Date'].append( cols[0].get_text() )
data['02.Hour'].append( cols[1].get_text() )
data['03.NaturalGas'].append( cols[3].get_text() )
data['04.Wind'].append( cols[4].get_text() )
data['05.Lignite'].append( cols[5].get_text() )
data['06.Hard_Coal'].append( cols[6].get_text() )
data['07.ImportedCoal'].append( cols[7].get_text() )
data['08.Geothermal'].append( cols[8].get_text() )
data['09.Hydro_Dam'].append( cols[9].get_text() )
data['10.Naphta'].append( cols[10].get_text() )
data['11.Biomass'].append( cols[11].get_text() )
data['12.River'].append( cols[12].get_text() )
data['13.Other'].append( cols[13].get_text() )
df = pd.DataFrame( data )
writer = pd.ExcelWriter('//192.168.0.102/Data/kgup.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()
time.sleep(5)
driver.close()
By this code we can select from first dropdown using search function and Enter key.
When it comes to second, it generates ImportError: sys.meta_path is None, Python is likely shutting down
How should I handle this?
Thanks.
Your code seem to be sensitive to StaleElementException as well as to exception Element is not clickable at point.... Try below code for web-scraping part and let me know the result:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://seffaflik.epias.com.tr/transparency/uretim/planlama/kgup.xhtml' #
driver = webdriver.Chrome()
driver.get(url)
wait = WebDriverWait(driver, 20)
driver.maximize_window()
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:distributionId_label'))).click() # organization drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:distributionId_filter'))).send_keys('1461' + u'\ue007') # select required
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt179_modal'))) # wait until modal disappeared
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:uevcb_label'))).click() # unit drop-down
wait.until(EC.element_to_be_clickable((By.ID, 'j_idt102:uevcb_filter'))).send_keys('SAMA' + u'\ue007') # select unit
wait.until(EC.element_to_be_clickable((By.ID,'j_idt102:goster'))).click() # click Apply
wait.until_not(EC.visibility_of_element_located((By.ID,'j_idt15'))) # wait until modal disappeared
soup = BeautifulSoup(driver.page_source)
....

Categories

Resources