Scraping dynamic data selenium - Unable to locate element - python

I am very new to scraping and have a question. I am scraping Worldometers COVID data. As it is dynamic, I am doing it with Selenium.
The code is the following:
from selenium import webdriver
import time

URL = "https://www.worldometers.info/coronavirus/"

# Start the driver
driver = webdriver.Chrome(executable_path=r"C:\Webdriver\chromedriver.exe")

# Hit the URL and wait for 10 seconds
driver.get(URL)
time.sleep(10)

# Find elements by class name
data = driver.find_elements_by_class_name("odd" and "even")

# Loop over the rows
for d in data:
    country = d.find_element_by_xpath(".//*[@id='main_table_countries_today']").text
    print(country)
Current output:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//*[@id='main_table_countries_today']"}
(Session info: chrome=96.0.4664.45)

To scrape the table from the Worldometers COVID data page you need to induce WebDriverWait for visibility_of_element_located() and, using a pandas DataFrame, you can use the following locator strategy:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
options = Options()
options.add_argument("start-maximized")
s = Service('C:\\BrowserDrivers\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get("https://www.worldometers.info/coronavirus/")
data = WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#main_table_countries_today"))).get_attribute("outerHTML")
df = pd.read_html(data)
print(df)
driver.quit()
Console Output:
[ # Country,Other TotalCases NewCases ... Deaths/1M pop TotalTests Tests/ 1M pop Population
0 NaN World 264359298 632349.0 ... 673.3 NaN NaN NaN
1 1.0 USA 49662381 89259.0 ... 2415.0 756671013.0 2267182.0 3.337495e+08
2 2.0 India 34609741 3200.0 ... 336.0 643510926.0 459914.0 1.399198e+09
3 3.0 Brazil 22118782 12910.0 ... 2865.0 63776166.0 297051.0 2.146975e+08
4 4.0 UK 10329074 53945.0 ... 2124.0 364875273.0 5335159.0 6.839070e+07
.. ... ... ... ... ... ... ... ... ...
221 221.0 Samoa 3 NaN ... NaN NaN NaN 2.002800e+05
222 222.0 Saint Helena 2 NaN ... NaN NaN NaN 6.103000e+03
223 223.0 Micronesia 1 NaN ... NaN NaN NaN 1.167290e+05
224 224.0 Tonga 1 NaN ... NaN NaN NaN 1.073890e+05
225 NaN Total: 264359298 632349.0 ... 673.3 NaN NaN NaN
[226 rows x 15 columns]]
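Note that pd.read_html() returns a list of DataFrames, so to work with the table itself you index into that list. A minimal follow-up sketch (column labels taken from the output above):
countries = df[0]  # pd.read_html() returns a list; the country table is the first element
print(countries[["Country,Other", "TotalCases", "NewCases"]].head())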

Related

Python Selenium cannot find a way to scroll down (checked other responses and none seem to work)

I'm currently trying to scrape the Kaggle rankings and the page is an infinite loading scroll. I would like to get at least the first 2000 ranked Kagglers, so to solve this I've created this script:
No matter what I do I don't see the browser scrolling and the lista_parseada list always has a length of 20. Can somebody help with this? Thanks!!
My code below:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import re
import pandas as pd
import numpy as np
import time
from datetime import date

# Notebook rankings url
url = 'https://www.kaggle.com/rankings?group=notebooks&page=1&pageSize=20'
wait_delay = 10  # seconds
scroll_pause_time = 2  # seconds

firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument('-private')
driver = webdriver.Firefox(options=firefox_options)

# load page
driver.get(url)
try:
    WebDriverWait(driver, wait_delay).until(EC.presence_of_element_located((By.ID, 'site-content')))
    print("Page is ready!")
except Exception as e:
    print(e)
    print("Loading took too much time!")

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_pass = 0
while scroll_pass < 10:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(scroll_pause_time)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
    scroll_pass += 1

lista = driver.find_elements_by_xpath('//div[@role="button"]//div[@class="leaderboards__name"]/p/a')
lista_parseada = [link.get_attribute('href') for link in lista]
print(len(lista_parseada))
driver.close()
Kaggle has an API, so it's better to use it:
import requests
import json
import pandas as pd

def post_call(url: str, headers: dict, data: dict) -> dict:
    response = requests.post(url=url, headers=headers, json=data)
    response.raise_for_status()
    return response.json()

url = 'https://www.kaggle.com/api/i/users.ProgressionService/GetUserRankings'
headers = {
    "content-type": "application/json",
    "cookie": "CSRF-TOKEN=VxwvCfDJ7O7KAEn8EWH0HeK9uT-G89SyETB0-hq9mZZhVsjDDIFJAh4OOhIUFjymST0kO8oX43sl86ZuOudHOoxHlPWV-krcTXNUlSgOQA;",
    "x-xsrf-token": "CfDJ7O7VujnuKA6ZuOudEn8ExwsAkR8eU_RQRaWH0HLuA2qYIkNHMeUOWequ-h2j0YuQNki8aAxC0j5tYvo9fI9fL-j9yzhevhI4MPdC9DRHLWnA"
}
tdf = []
for i in range(1, 101):
    data = {
        "group": "ACHIEVEMENT_SUMMARY_TYPE_NOTEBOOKS",
        "page": i,
        "pageSize": 20
    }
    df = pd.json_normalize(post_call(url, headers, data)['list'])
    # use drop(columns=["value1", "value2"]) to exclude unnecessary values
    tdf.append(df.drop(columns=["thumbnailUrl"]))

# reset indexes
print(pd.concat(tdf).reset_index(drop=True))
Output df with 2000 users:
currentRanking displayName userId userUrl tier points joinTime totalGoldMedals totalSilverMedals totalBronzeMedals
0 1 Chris Deotte 1723677 /cdeotte GRANDMASTER 4943 2018-03-14T22:51:30.630Z 71.0 17.0 3.0
1 2 Marília Prata 3012786 /mpwolke MASTER 3621 2019-03-29T19:09:20.750Z 12.0 39.0 450.0
2 3 Abhishek Thakur 5309 /abhishek GRANDMASTER 3169 2011-01-12T03:44:52Z 65.0 28.0 24.0
3 4 AmbrosM 7917824 /ambrosm GRANDMASTER 2737 2021-07-16T18:36:58.170Z 28.0 8.0 8.0
4 5 Y.Nakama 1695531 /yasufuminakama GRANDMASTER 2630 2018-03-06T11:56:37.560Z 37.0 9.0 6.0
... ... ... ... ... ... ... ... ... ... ...
1995 1996 ayoub chaoui 6625407 /ayoubchaoui EXPERT 51 2021-01-30T15:31:19.840Z NaN 1.0 6.0
1996 1997 micheldc55 6646082 /micheldc55 EXPERT 51 2021-02-02T18:58:13.170Z NaN NaN 5.0
1997 1998 Hugo R. V. Angulo 6910521 /hugovallejo EXPERT 51 2021-03-10T18:29:25.247Z NaN 1.0 7.0
1998 1999 Dina Nabil 7213495 /dinanabil811 EXPERT 51 2021-04-18T11:09:01.470Z NaN NaN 5.0
1999 2000 Naser Al-qaydeh 7424338 /naseralqaydeh EXPERT 51 2021-05-15T13:16:16.093Z NaN NaN 8.0
Cookies and the other info can be found on the "Network" tab in DevTools.
In the cookies you only need "CSRF-TOKEN".
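If you want to keep the result, a small follow-up sketch (the filename is just an example):
result = pd.concat(tdf).reset_index(drop=True)
result.to_csv("kaggle_notebook_rankings.csv", index=False)  # example filename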

How to find the attribute and element id by selenium.webdriver?

I am learning web scraping since I need it for my work. I wrote the following code:
from selenium import webdriver
import pandas as pd

chromedriver = '/home/es/drivers/chromedriver'
driver = webdriver.Chrome(chromedriver)
driver.implicitly_wait(30)
driver.get('http://crdd.osdd.net/raghava/hemolytik/submitkey_browse.php?ran=1955')
df = pd.read_html(driver.find_element_by_id("table.example.display.datatable").get_attribute('example'))[0]
However, it is showing the following error:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="table.example.display.datatable"]"}
(Session info: chrome=103.0.5060.134)
Then I inspected the table that I want to scrape on this page.
What is the attribute that needs to be passed to the get_attribute() function in the following line?
df = pd.read_html(driver.find_element_by_id("table.example.display.datatable").get_attribute('example'))[0]
What should I write in driver.find_element_by_id?
EDITED:
Some tables have lots of records spread across multiple pages.
For example, this page has 2,246 entries, shown 100 entries per page. When I tried to web-scrape it, there were only 320 entries in df, with record IDs from 1232-1713, which means it picked up entries from a few of the middle pages rather than going from the first page through to the last.
What can we do in such cases?
You need to get the outerHTML property of the table first, then parse it with pandas.
You also need to wait for the element to be visible. Use an explicit wait like WebDriverWait().
driver.get('http://crdd.osdd.net/raghava/hemolytik/submitkey_browse.php?ran=1955')
table=WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"table#example")))
tableRows=table.get_attribute("outerHTML")
df = pd.read_html(tableRows)[0]
print(df)
Import the libraries below.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import pandas as pd
Output:
ID PMID YEAR ... DSSP Natural Structure Final Structure
0 1643 16137634 2005 ... CCCCCCCCCCCSCCCC NaN NaN
1 1644 16137634 2005 ... CCTTSCCSSCCCC NaN NaN
2 1645 16137634 2005 ... CTTTCGGGHHHHHHHHCC NaN NaN
3 1646 16137634 2005 ... CGGGTTTHHHHHHHGGGC NaN NaN
4 1647 16137634 2005 ... CCSCCCSSCHHHHHHHHHTTC NaN NaN
5 1910 16730859 2006 ... CCCCCCCSSCCSHHHHHHHHTTHHHHHHHHSSCCC NaN NaN
6 1911 16730859 2006 ... CCSCC NaN NaN
7 1912 16730859 2006 ... CCSSSCSCC NaN NaN
8 1913 16730859 2006 ... CCCSSCCSSCCSHHHHHTTHHHHTTTCSCC NaN NaN
9 1914 16730859 2006 ... CCSHHHHHHHHHHHHHCCCC NaN NaN
10 2110 11226440 2001 ... CCCSSCCCBTTBTSSSSSSCSCC NaN NaN
11 3799 9204560 1997 ... CCSSCC NaN NaN
12 4149 16137634 2005 ... CCHHHHHHHHHHHC NaN NaN
[13 rows x 17 columns]
If you want to select table by #id you need
driver.find_element_by_id("example")
By.CSS:
driver.find_element_by_css_selector("table#example")
By.XPATH:
driver.find_element_by_xpath("//table[#id='example'])
If you want to extract #id value you need
.get_attribute('id')
Since there is not much sense in searching by #id just to extract that same #id, you might use another attribute of the table node:
driver.find_element_by_xpath("//table[@aria-describedby='example_info']").get_attribute('id')

How to capture data from table using selenium python?

I need to capture the table from the link:
https://fr.tradingeconomics.com/country-list/rating
I tried the following code but I don't get any response
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url = "https://fr.tradingeconomics.com/country-list/rating"
driver.get(my_url)
#actions = ActionChains(driver)
WebDriverWait(driver, 50).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "table table-hover")))
trs = driver.find_elements(By.TAG_NAME, "tr")
print(len(trs))
countries = []
for tr in trs:
    country = {}
    items = tr.find_elements(By.TAG_NAME, "td")
    for item in items:
        country_name = item.find_element(By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_ctl01_GridView1']/tbody/tr[2]/td[1]")
        country['country_name'] = country_name.get_attribute('text')
        s_and_p = item.find_element(By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_ctl01_GridView1']/tbody/tr[2]/td[2]")
        country['S&P'] = s_and_p.get_attribute("text")
        moodys = item.find_element(By.XPATH, "//*[@id='ctl00_ContentPlaceHolder1_ctl01_GridView1']/tbody/tr[2]/td[3]")
        country['Moody\'s'] = moodys.get_attribute("text")
    countries.append(country)
    print(country)
Any help would be appreciated. Thank you.
As the URL isn't dynamic, you can easily grab the table data using pandas alone.
import pandas as pd
url='https://fr.tradingeconomics.com/country-list/rating'
df = pd.read_html(url)[0]
print(df)
Output:
Unnamed: 0 S&P Moody's Fitch DBRS TE
0 Albanie B+ B1 NaN NaN 35.0
1 Andorre BBB Baa2 BBB+ NaN 62.0
2 Angola B- B3 B- NaN 23.0
3 Argentine CCC+ Ca CCC CCC 15.0
4 Arménie B+ Ba3 B+ NaN 14.0
.. ... ... ... ... ... ...
151 Uruguay BBB Baa2 BBB- BBB (low) 55.0
152 Ouzbékistan BB- B1 BB- NaN 38.0
153 Venezuela NaN C RD NaN 11.0
154 Vietnam BB Ba3 BB NaN 43.0
155 Zambie SD Ca RD NaN 30.0
[156 rows x 6 columns]
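The country names come through under an unnamed header, so if you want a proper column label you can rename it (the name "Country" here is just a choice):
df = df.rename(columns={"Unnamed: 0": "Country"})
print(df.head())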
You have to use innerText, not text. Also, the first tr does not contain any td elements, which is the reason you are not getting anything in the response.
Selenium solution:
Code:
driver.maximize_window()
wait = WebDriverWait(driver, 30)
my_url = "https://fr.tradingeconomics.com/country-list/rating"
driver.get(my_url)
#actions = ActionChains(driver)
table = WebDriverWait(driver, 50).until(EC.visibility_of_element_located((By.XPATH, "//table[@class='table table-hover']")))
trs = table.find_elements(By.XPATH, ".//tr")
print(len(trs))
countries = []
for tr in trs:
    tds = tr.find_elements(By.XPATH, ".//td[not(self::th)]")
    for td in tds:
        print(td.get_attribute('innerText'))
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

Scraping table from Python Beautifulsoup

I tried to scrape a table from this website: https://stockrow.com/VRTX/financials/income/quarterly
I am using Python in Google Colab and I'd like to have the dates as columns (e.g. 2020-06-30, etc.). I used code like this:
import urllib.request
import bs4 as bs

source = urllib.request.urlopen('https://stockrow.com/VRTX/financials/income/quarterly').read()
soup = bs.BeautifulSoup(source, 'lxml')
table = soup.find_all('table')
However, I cannot get the tables. I am a bit new to scraping, so I looked at other Stack Overflow pages but couldn't solve the problem. Can you please help me? That would be much appreciated.
You can use their API to load the data:
import requests
import json
import pandas as pd

indicators_url = 'https://stockrow.com/api/indicators.json'
data_url = 'https://stockrow.com/api/companies/VRTX/financials.json?ticker=VRTX&dimension=Q&section=Income+Statement'

indicators = {i['id']: i for i in requests.get(indicators_url).json()}

all_data = []
for d in requests.get(data_url).json():
    d['id'] = indicators[d['id']]['name']
    all_data.append(d)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Prints:
id 2020-06-30 2020-03-31 2019-12-31 2019-09-30 2019-06-30 ... 2011-12-31 2011-09-30 2011-06-30 2011-03-31 2010-12-31 2010-09-30
0 Consolidated Net Income/Loss 837270000.0 602753000.0 583234100.0 57518000.0 267427000.0 ... 188141000.0 228452000.0 -199318000.0 -176096000.0 -180392000.0 -208957000.0
1 EPS (Basic, from Continuous Ops) 3.2248 2.3199 2.2654 0.2239 1.044 ... 0.9374 1.109 -0.9751 -0.8703 -0.8966 -1.0402
2 Net Profit Margin 0.5492 0.3978 0.4127 0.0606 0.2841 ... 0.2816 0.3354 -1.5213 -2.3906 -2.7531 -8.7816
3 Gross Profit 1339965000.0 1352610000.0 1228253000.0 817914000.0 805553000.0 ... 533213000.0 620794000.0 105118000.0 70996000.0 62475000.0 20567000.0
4 Income Tax Provision -12500000.0 54781000.0 93716000.0 13148000.0 59711000.0 ... 22660000.0 -27842000.0 24448000.0 0.0 NaN 0.0
5 Operating Income 718033000.0 720224100.0 551464400.0 99333000.0 269960000.0 ... 223901900.0 215707000.0 -165890000.0 -159899000.0 -166634000.0 -199588000.0
6 EBIT 718033000.0 720224100.0 551464700.0 99333000.0 269960000.0 ... 223901900.0 215707000.0 -165890000.0 -159899000.0 -166634000.0 -199588000.0
7 EPS (Diluted, from Cont. Ops) 3.1787 2.2874 2.2319 0.2208 1.0293 ... 1.0011 1.0415 -0.9751 -0.8703 -0.8966 -1.0402
8 EBITDA 744730000.0 747045000.0 577720400.0 125180000.0 297658000.0 ... 233625900.0 223457000.0 -157181000.0 -151041000.0 -158429000.0 -192830000.0
9 EPS (Basic, Consolidated) 3.2248 2.3199 2.2654 0.2239 1.044 ... 0.9374 1.109 -0.9751 -0.8703 -0.8966 -1.0402
10 EBT 824770000.0 657534000.0 676950000.0 70666000.0 327138000.0 ... 210801000.0 200610000.0 -174870000.0 -176096000.0 -180392000.0 -208957000.0
11 Operating Cash Flow Margin 0.6812 0.5384 0.3156 0.3525 0.4927 ... 0.8941 0.0651 -1.8894 -2.5336 -2.535 -6.8918
12 EBT margin 0.541 0.434 0.479 0.0744 0.3475 ... 0.3742 0.3043 -1.5283 -2.3906 -2.7531 -8.7816
13 EBIT Margin 0.471 0.4754 0.3902 0.1046 0.2868 ... 0.3975 0.3272 -1.4498 -2.1707 -2.5431 -8.3878
14 Income from Continuous Operations 837270000.0 602753000.0 583234000.0 57518000.0 267427000.0 ... 188141000.0 228452000.0 -199318000.0 -176096000.0 -180392000.0 -208957000.0
15 R&D Expenses 420928000.0 448528000.0 480011000.0 555948000.0 379091000.0 ... 186438000.0 189052000.0 173604000.0 158612000.0 168888000.0 170434000.0
16 Non-operating Interest Expenses 13871000.0 14136000.0 14249000.0 14548000.0 14837000.0 ... 11659000.0 7059000.0 6962000.0 12001000.0 7686000.0 3951000.0
17 EBITDA Margin 0.4885 0.4931 0.4088 0.1318 0.3162 ... 0.4147 0.339 -1.3737 -2.0505 -2.4179 -8.1038
18 Non-operating Income/Expense 106737000.0 -62690000.0 125485000.0 -28667000.0 57178000.0 ... -13101000.0 -15097000.0 -8980000.0 -16197000.0 -13758000.0 -9369000.0
19 EPS (Basic) 3.22 2.32 2.26 0.22 1.04 ... 0.76 1.06 -0.85 -0.87 -0.9 -1.04
20 Gross Margin 0.879 0.8927 0.8691 0.8611 0.8558 ... 0.9465 0.9417 0.9187 0.9638 0.9535 0.8643
21 Revenue 1524485000.0 1515107000.0 1413265000.0 949828000.0 941293000.0 ... 563340000.0 659200000.0 114424000.0 73662000.0 65524000.0 23795000.0
22 Shares (Diluted, Average) 263403000.0 263515000.0 262108000.0 260473000.0 259822000.0 ... 217602000.0 219349000.0 204413000.0 202329000.0 201355000.0 200887000.0
23 Cost of Revenue 184520000.0 162497000.0 185012000.0 131914000.0 135740000.0 ... 30127000.0 38406000.0 9306000.0 2666000.0 3049000.0 3228000.0
24 SG&A Expenses 191804000.0 182258000.0 195277000.0 159674000.0 156502000.0 ... 121881000.0 110654000.0 96663000.0 71523000.0 62478000.0 48855000.0
25 EPS (Diluted, Consolidated) 3.1787 2.2874 2.2319 0.2208 1.0293 ... 1.0011 1.0415 -0.9751 -0.8703 -0.8966 -1.0402
26 Revenue Growth 0.6196 0.765 0.6242 0.2107 0.2515 ... 7.5975 26.7033 2.6185 2.2842 0.9335 -0.0466
27 Shares (Basic, Weighted) 259637000.0 259815000.0 256728000.0 256946000.0 256154000.0 ... 204891000.0 206002000.0 204413000.0 202329000.0 200402000.0 200887000.0
28 Income after Tax 837270000.0 602753000.0 583234000.0 57518000.0 267427000.0 ... 188141000.0 228452000.0 -199318000.0 -176096000.0 -180392000.0 -208957000.0
29 EPS (Diluted) 3.18 2.29 2.23 0.22 1.03 ... 0.74 1.02 -0.85 -0.87 -0.9 -1.04
30 Net Income Common 837270000.0 602753000.0 583234100.0 57518000.0 267427000.0 ... 158629000.0 221110000.0 -174069000.0 -176096000.0 -180392000.0 -208957000.0
31 Shares (Diluted, Weighted) 263403000.0 263515000.0 260673000.0 260473000.0 259822000.0 ... 208807000.0 219349000.0 204413000.0 202329000.0 200402000.0 200887000.0
32 Non-Controlling Interest NaN NaN NaN NaN NaN ... 29512000.0 7342000.0 -25249000.0 0.0 NaN 0.0
33 Dividends (Preferred) NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN
34 EPS (Basic, from Discontinued Ops) NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN
35 EPS (Diluted, from Disc. Ops) NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN
36 Income from Discontinued Operations NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN
[37 rows x 41 columns]
And it saves data.csv.
Or download their XLSX from that page:
url = 'https://stockrow.com/api/companies/VRTX/financials.xlsx?dimension=Q&section=Income%20Statement&sort=desc'
df = pd.read_excel(url)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(df)
The first problem is that the table is loaded via JavaScript, so BeautifulSoup does not find it: it isn't there yet at the moment of parsing. To solve this you'll need to use Selenium.
The second problem is that there is no table tag in the HTML; it uses grid formatting.
Since you're using Google Colab, you'll need to install the Selenium web driver there (code taken from this answer):
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
After that you can load the page and parse it:
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# load page via selenium
wd.get("https://stockrow.com/VRTX/financials/income/quarterly")
# wait 5 seconds until element with class mainGrid will be loaded
grid = WebDriverWait(wd, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'mainGrid')))
# parse content of the grid
soup = BeautifulSoup(grid.get_attribute('innerHTML'), 'lxml')
# access grid cells, your logic should be here
for tag in soup.find_all('div', {'class': 'financials-value'}):
print(tag)
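To get plain values out of those cells rather than raw tags, here is a minimal sketch; it only relies on the financials-value class already used above, and how you group the values into rows depends on the page's grid markup:
values = [tag.get_text(strip=True) for tag in soup.find_all('div', {'class': 'financials-value'})]
print(len(values), values[:10])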

Selenium code can not catch the table from Chrome

I am using Selenium to parse
https://www.worldometers.info/coronavirus/
as shown below, but I get an attribute error and the table variable remains empty. What is the reason? I use Chrome 80. Are the tags right?
AttributeError: 'NoneType' object has no attribute 'tbody'
from selenium import webdriver
import bs4
browser = webdriver.Chrome()
browser.get("https://www.worldometers.info/coronavirus/")
html = bs4.BeautifulSoup(browser.page_source, "html.parser")
table = html.find("table",class_="table table-bordered table-hover main_table_countries dataTable no-footer") #
Wherever I have table tags, I find it easier to use pandas to capture the table.
import pandas as pd
url = 'https://www.worldometers.info/coronavirus/'
table = pd.read_html(url)[0]
Output:
print(table)
Country,Other TotalCases ... Tot Cases/1M pop Tot Deaths/1M pop
0 China 81093 ... 56.00 2.0
1 Italy 63927 ... 1057.00 101.0
2 USA 43734 ... 132.00 2.0
3 Spain 35136 ... 751.00 49.0
4 Germany 29056 ... 347.00 1.0
.. ... ... ... ... ...
192 Somalia 1 ... 0.06 NaN
193 Syria 1 ... 0.06 NaN
194 Timor-Leste 1 ... 0.80 NaN
195 Turks and Caicos 1 ... 26.00 NaN
196 Total: 378782 ... 48.60 2.1
[197 rows x 10 columns]
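If you do want to keep the Selenium route from the question, one hedged alternative is to hand the rendered page source to pandas instead of hunting for the table by its long class string:
from selenium import webdriver
import pandas as pd

browser = webdriver.Chrome()
browser.get("https://www.worldometers.info/coronavirus/")
# read_html parses every table in the rendered HTML; the country table is the first one
table = pd.read_html(browser.page_source)[0]
print(table.head())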
