I want to harvest information from a table on a given website using Beautiful Soup and Python 3.
I have also tried an XPath-based approach but still cannot find a way to obtain the data.
from urllib.request import urlopen
from bs4 import BeautifulSoup

coaches = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
coachespage = urlopen(coaches)
soup = BeautifulSoup(coachespage, features="html.parser")
data = soup.find_all("tbody", {"id": "JGrid-az-com-1031-tbody"})
def crawler(table):
    for mytable in table:
        try:
            rows = mytable.find_all('tr')
            for tr in rows:
                cols = tr.find_all('td')
                for td in cols:
                    return td.text
        except:
            raise ValueError("no data")

print(crawler(data))
If you use Selenium to make the selections and then pass driver.page_source to pd.read_html to get the table, the JavaScript is allowed to run and populate the values.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
url = 'https://www.badmintonengland.co.uk/coach/find-a-coach'
driver = webdriver.Chrome()
driver.get(url)
ele = driver.find_element_by_css_selector('.az-triggers-panel a') #distance dropdown
driver.execute_script("arguments[0].scrollIntoView();", ele)
ele.click()
option = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.ID, "comboOption-az-com-1015-8"))) # any distance
option.click()
driver.find_element_by_css_selector('.az-btn-text').click()
time.sleep(5) #seek better wait condition for page update
tables = pd.read_html(driver.page_source)
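pd.read_html returns a list of DataFrames, so a short follow-up (a sketch; it assumes the coach grid is the first table parsed) is:

coach_df = tables[0]  # assumption: the coach grid is the first parsed table
print(coach_df.head())
driver.quit()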
I created a crawler to scrape binance.com using Python and Selenium. The problem is that I want to scrape all the crypto names and their prices from this website, but I can't: I am only able to scrape the data shown on the current page. Is there any way to grab all the data without using any API?
Code trials:
import schedule
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
def getData():
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    driver.get("https://www.binance.com/en/markets")
    # driver.implicitly_wait(5)
    bitcoins = driver.find_elements(By.XPATH, "//div[contains(@class, 'css-1ap5wc6')]")
    # prMarCap = driver.find_elements(By.XPATH, "//div[@class='css-leyy1t']//div[contains(text(), '$')]")
    pr = driver.find_elements(By.XPATH, "//div[@class='css-leyy1t']//div[@class='css-ydcgk2']")
    MarCap = driver.find_elements(By.XPATH, "//div[@class='css-s779xv']")
    mycoin = []
    myprice = []
    mymarcap = []
    for bit in bitcoins:
        # print(bit.text)
        mycoin.append(bit.text)
    for MC in MarCap:
        # print(MC.text)
        mymarcap.append(MC.text)
    for price in pr:
        # print(price.text)
        myprice.append(price.text)
    final = zip(mycoin, myprice, mymarcap)
    for data in list(final):
        print(data)
schedule.every(1).seconds.do(getData)
while True:
    schedule.run_pending()
    time.sleep(1)
There is no need to scrape it with Selenium, as you can get the data by calling the Binance API.
Here is code that saves all <CRYPTO>-BUSD prices to out.json:
import json
from collections import defaultdict
import requests
data = requests.get("https://www.binance.com/bapi/asset/v2/public/asset-service/product/get-products?includeEtf=true").json()['data']
crypto = defaultdict(dict)
for value in data:
    if value['s'].endswith("BUSD"):
        crypto[value['b']] = {"HIGH": value['h'], "LOW": value['l'], "CURRENT": value['c']}

with open("out.json", "w") as f:
    f.write(json.dumps(crypto))
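As a quick sanity check, you can print one entry back (a sketch; 'BTC' assumes a BTC-BUSD pair is present in the feed):

print(crypto["BTC"])  # e.g. {'HIGH': '...', 'LOW': '...', 'CURRENT': '...'}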
To extract and print the crypto names and their prices from the Binance website, you need to induce WebDriverWait for visibility_of_all_elements_located(), and you can use the following locator strategy:
driver.get("https://www.binance.com/en/markets")
texts = [my_elem.text for my_elem in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@id='tabContainer']//div[@direction='ltr']//div[@data-area='left']//div[@data-bn-type='text']")))]
coins = [texts[i] for i in range(len(texts)) if i % 3 == 1]
prices = [texts[i] for i in range(len(texts)) if i % 3 == 2]
print (coins)
print (prices)
Console Output:
['Bitcoin', 'Ethereum', 'TetherUS', 'BNB', 'USD Coin', 'Ripple', 'Cardano', 'Solana', 'Terra', 'Avalanche', 'BUSD', 'Polkadot', 'Dogecoin', 'SHIBA INU', 'TerraUSD', 'Polygon', 'Wrapped Bitcoin', 'Cosmos', 'Litecoin', 'Uniswap']
['$37,988.60000', '$2,612.69000', '$1.00000', '$363.99000', '$0.99970', '$0.71888', '$0.87973', '$89.56000', '$75.21000', '$74.41000', '$0.99970', '$16.83000', '$0.12316', '$0.00002', '$1.00000', '$1.45000', '$37,989.82000', '$27.26000', '$102.87000', '$9.54000']
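To pair each coin with its price (a small sketch, assuming the two lists stay aligned as in the output above):

for coin, price in zip(coins, prices):
    print(f"{coin}: {price}")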
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Code trial:
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Below is to crawl data from a webpage with two different dropdowns
try:
    driver = webdriver.Chrome('./chromedriver')
    driver.get('https://price.joinsland.joins.com/theme/index_theme.asp?sisaegbn=T05')
    select1 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='sido']"))))
    for item1 in select1.options:
        item1.click()
        select2 = Select(WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))
        for item2 in select2.options:
            item2.click()
            time.sleep(2)
            # below is to get the attained data into an excel file
            table = driver.find_element_by_class_name('tbl_box')
            tbody = table.find_element_by_tag_name('tbody')
            rows = tbody.find_elements_by_tag_name('tr')
            total = []
            result = []
            for index, value in enumerate(rows):
                body = value.find_elements_by_tag_name('td')
                for i in range(len(body)):
                    data = body[i].text
                    result.append(data)
                total.append(result)
                result = []
            df = pd.DataFrame.from_records(total)
            df.to_excel('text.xlsx')
except Exception as e:
    print(e)
finally:
    driver.quit()
I have edited this code thanks to the lovely comment below, but I still get the same error:
Message: stale element reference: element is not attached to the page document
I roughly understand why this message shows up, but I still have no clear idea of how to fix it. I would deeply appreciate any comments! Many thanks in advance!
This is what I figured out, but I'm not sure if the code is right or not; I don't know Python.
# get select
select1 = Select(driver.find_element_by_xpath('//select[@name="sido"]'))
# get all options from select
options1 = select1.options
for opt1 in options1:
    # select the option which has the value of opt1
    select1.select_by_value(opt1.get_attribute("value"))
    time.sleep(5)
    select2 = Select(driver.find_element_by_xpath('//select[@name="gugun"]'))
    options2 = select2.options
    for opt2 in options2:
        select2.select_by_value(opt2.get_attribute("value"))
        time.sleep(4)
To select all the <option> elements from the two different drop-down select elements within the website https://price.joinsland.joins.com/theme/index_theme.asp?sisaegbn=T05, you need to induce WebDriverWait for element_to_be_clickable(), and you can use the following XPath-based locator strategies:
driver.get('https://price.joinsland.joins.com/theme/index_theme.asp?sisaegbn=T05')
select1 = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='sido']"))))
for item1 in select1.options:
    item1.click()
    select2 = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='gugun']"))))
    for item2 in select2.options:
        item2.click()
        time.sleep(3)  # perform your web-crawling here
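If you still hit the stale element reference error from the question, one workaround (a sketch, reusing the sido locator from the question) is to re-locate the <select> on every pass and pick options by index instead of holding on to old option elements:

sido = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='sido']"))))
for i in range(len(sido.options)):
    # re-find the dropdown after every page update so the reference is fresh
    sido = Select(driver.find_element_by_xpath("//select[@name='sido']"))
    sido.select_by_index(i)
    time.sleep(2)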
With my code I am able to get the first heading of the project, and I want the subheading (FSI Details) to be printed as well; I am not able to get this second heading using BeautifulSoup. I tried the nth-child reference approach.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import time
import pandas as pd
import os
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2").text.strip()
print(Project_title)
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find_all("h2")[1].text.strip()
print(Project_title1)  # the (FSI Details) heading should be printed here
You can try the CSS selector :contains("FSI Details"), which selects the element containing the string "FSI Details". This code prints the labels and values of the "FSI Details" section:
import requests
from bs4 import BeautifulSoup
url = 'https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=BPUvrrjIzYs%2f2hwYj1YIOfflh9NisZW6zTns2KLjHBZn6cbQ008s91nzlFrDxVvLwR1vAeLID0%2bo%2bD0H0Z6o2t%2b5P%2b%2fbBOcHCbMQHU8gkwdNZJnbbfu6N7mWSpgKXt4AiQyzuEpoDE7FX6HZypqsGXz4ObYD4KpyRzCsFJaWTgA%3d'
soup = BeautifulSoup(requests.get(url).text, 'lxml')
fsi_content = soup.select_one('.x_title:contains("FSI Details") + .x_content')
print('{: <160}{: <8}'.format('Label', 'Value'))
print('-' * 168)
for label, text in zip(fsi_content.select('label'), fsi_content.select('div:has(> label) + div')):
    print('{: <160}{: <8}'.format(label.get_text(strip=True), text.get_text(strip=True)))
Prints:
Label Value
------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Built-up-Area as per Proposed FSI (In sqmts) ( Proposed but not sanctioned) ( As soon as approved, should be immediately updated in Approved FSI) 0
Built-up-Area as per Approved FSI (In sqmts) 11566.50
TotalFSI 11566.50
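Building on the same selectors, you could also collect the section into a dict keyed by label (a sketch; the 'TotalFSI' key is taken from the output above):

fsi = {label.get_text(strip=True): value.get_text(strip=True)
       for label, value in zip(fsi_content.select('label'),
                               fsi_content.select('div:has(> label) + div'))}
print(fsi.get('TotalFSI'))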
Further reading:
CSS Selectors Reference
I am trying to grab the stock symbol from this page.
This is my code:
from selenium import webdriver
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
browser.get(url)
dfs = pd.read_html(browser.page_source)
print(dfs)
browser.close()
This is the output:
dfs
[ 0
0 加入至心水組合:請先登入或註冊成為會員, Empty DataFrame
Columns: [沒有符合以上篩選條件的股票。]
Index: [], 0
0 加入至心水組合:請先登入或註冊成為會員]
I know it's JavaScript, and I used Selenium already. How come I can't get the table? And how do I get the stock symbols shown on the page? Thanks.
Additional info: after clicking the link, choose the 2nd option from the GREEN drop-down list, and the above table will then be shown.
One way is as follows:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://stock360.hkej.com/StockScreener/profession/tab/profile'
driver = webdriver.Chrome()
driver.get(url)
WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'option')))
# select the second dropdown option by its value attribute whose value is mb
driver.find_element_by_css_selector('[value=mb]').click()
#wait for blue button to be clickable and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '[href*=submit]'))).click()
#select table
table = driver.find_element_by_css_selector('.dt960')
#transfer html of table to pandas read_html which handles tables
df = pd.read_html(table.get_attribute('outerHTML'))[0] #grab the table
df2 = df.drop(df.columns[0], axis=1).dropna(how='all') #lose the nan column and rows
df2.rename(columns=df.iloc[0], inplace = True) #set headers same as row 1
df2.drop(df.index[0], inplace = True) #lose row 1
df2.reset_index(drop=True) #re-index
print(df2)
driver.quit()
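To get at the stock symbols from the question, you can then read them out of df2 (a sketch; it assumes the symbol sits in the first column after cleanup, so check df2.columns first):

print(df2.columns)        # inspect the real headers first
symbols = df2.iloc[:, 0]  # assumption: the symbol/code is the first column
print(symbols.tolist())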
I want to retrieve all the information from a table on a dynamic website and I have the following code for it:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import sys
reload(sys)
import re
import csv
from time import sleep
sys.setdefaultencoding('utf-8') #added since it would give error for certain values when using str(i)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
prefs = {'profile.managed_default_content_settings.images':2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)
maxcr = 1379
listofrows = []
url = "http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT"
print(url)
driver.get(url)
wait = WebDriverWait(driver,10)
# Trying to get the table
tableloadwait = (wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body"))))
table = driver.find_elements_by_css_selector(".panel-body")
print(table)
RowsOfTable = table.get_attribute("tr")
However, I keep getting an error and it doesn't work so far. How do I retrieve the information from the table?
Thanks a lot!
error:
RowsOfTable = table.get_attribute("tr")
AttributeError: 'list' object has no attribute 'get_attribute'
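The traceback points at the cause: find_elements_by_css_selector returns a list, so get_attribute has to be called on a single element. A minimal sketch of that fix, keeping the question's selector:

panel = table[0]  # find_elements returned a list; take the first .panel-body
for row in panel.find_elements_by_css_selector('tr'):
    print(row.get_attribute('innerText'))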
Here is the code to get the product details:
tableloadwait = (wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".panel-body"))))
driver.find_element_by_xpath("//span[contains(.,'Product Details')]").click()
rows = driver.find_elements_by_xpath("//span[contains(.,'Product Details')]/ancestor::div[@class='accordion-top-border']//tr[(@ng-repeat='attr in attributes' or @ng-repeat='field in fields') and @class='visible-xs']")
for rowNum in range(len(rows)):
    print(rows[rowNum].get_attribute('innerText'))
driver.quit()
You may have to trim or split the values as per your requirement.
If you would like to get the data based on the row text, use the below:
upcData = driver.find_element_by_xpath("//strong[.='UPC']/parent::td").get_attribute('innerText').replace('UPC','').replace('\n','').replace(' ','')
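For example, instead of chained replace calls, each row's innerText can be split into a name/value pair (a sketch to run before driver.quit(); it assumes the label and value are newline-separated):

for row in rows:
    name, _, value = row.get_attribute('innerText').partition('\n')
    print(name.strip(), '=', value.strip())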
Expand the accordion with the appropriate + button first, then select the table. Add waits for the items to be present. Change the expandSigns index to 2 if you want the other table.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT'
driver = webdriver.Chrome()
driver.get(url)
expandSigns = WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".glyphicon-plus")))
expandSigns[1].click()
WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "td")))
table = driver.find_element_by_css_selector('table')
html = table.get_attribute('outerHTML')
df = pd.read_html(html)
print(df)
driver.quit()
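Note that pd.read_html returns a list of DataFrames, so a short usage follow-up (a sketch; it assumes the expanded table parsed as the first entry) is:

details = df[0]  # assumption: the expanded table is the first DataFrame
print(details.head())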
If you need to scrape, not test, you can use requests to get the data. The code below is an example of how you can get data from the page.
import requests
import re
# Return header page(html) to get token and list key
response = requests.get("http://biggestbook.com/ui/catalog.html#/itemDetail?itemId=HERY4832YER01&uom=CT")
# Get token using regular expression
productRecommToken = re.search("'productRecommToken','(.+)'", response.text)[1]
# Get list of keys using regular expression
listKey = re.search("'listKey',\\['(.*?)'\\]", response.text)[1].split("','")
# Create header with token
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Referer': 'http://biggestbook.com/ui/catalog.html',
    'Origin': 'http://biggestbook.com',
    'DNT': '1',
    'token': productRecommToken,
    'BiggestBook-Handle-Errors-Generically': 'true',
}
# Create parameters with list keys and search values
params = (
    ('listKey', listKey),
    ('uom', 'CT'),
    ('vc', 'n'),
    ('win', 'HERY4832YER01'),
)
# Return json with all details about the product
response = requests.get('https://api.essendant.com/digital/digitalservices/search/v1/items',
                        headers=headers,
                        params=params)
data = response.json()
# Get items from the json; there could be more than one
items = data["items"]
# Iterate and get the details you need. Inspect "data" to see all possible details you can get
for i in items:
    print(i["manufacturer"])
    print(i["description"])
    print(i["actualPrice"])
    # Get attributes
    attributes = i["attributes"]
    # Example of how you can get one specific attribute
    thickness = list(filter(lambda d: d['name'] == 'Thickness', attributes))[0]["value"]
    # Print all attributes as name = value
    for a in attributes:
        print(f"{a['name']} = {a['value']}")