I am currently trying to set up a web scraper in Python for the following webpage:
https://understat.com/team/Juventus/2018
specifically for the 'team-players jTable'
I have managed to scrape the table successfully with BeautifulSoup and selenium, but there are hidden columns (accessible via the options popup window) that I can't initialize and include in my scraping.
Anyone know how to change this?
import urllib.request
from bs4 import BeautifulSoup
import lxml
import re
import requests
from selenium import webdriver
import pandas as pd
import random
import datetime

# Target page: Understat team stats. The table is rendered client-side by
# JavaScript, so a plain requests.get() would not contain the row data --
# the page must be rendered by a real browser first.
base_url = 'https://understat.com/team/Juventus/2018'
url = base_url

# Render the page with headless Chrome, then snapshot the live DOM.
# (The original code also fetched the page with requests and built a soup
# from it, but that soup was immediately overwritten below -- dead work.)
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('/Users/kylecaron/Desktop/souptest/chromedriver', options=options)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()  # release the browser once the HTML snapshot is captured

# Column headers come from the <th class="sort"> cells of the players table.
headers = soup.find('div', attrs={'class': 'players jTable'}).find('table').find_all('th', attrs={'class': 'sort'})
headers_list = [header.get_text(strip=True) for header in headers]

# One inner list per <tr>, one stripped string per <td>.
body = soup.find('div', attrs={'class': 'players jTable'}).table.tbody
all_rows_list = [
    [td.get_text(strip=True) for td in tr.find_all('td')]
    for tr in body.find_all('tr')
]

# Fixed header list matching the columns visible by default on the page.
headers_list = ['№', 'Player', 'Positions', 'Apps', 'Min', 'G', 'A', 'Sh90', 'KP90', 'xG', 'xA', 'xG90', 'xA90']
xg_df = pd.DataFrame(all_rows_list, columns=headers_list)
If you navigate to the website, there are hidden table columns such as 'XGChain'. I want all of these hidden elements scraped, but having trouble doing it.
Best,
Kyle
Here you go. You could still use BeautifulSoup to iterate through the tr and td tags, but I always find pandas much easier to get tables, as it does the work for you.
from selenium import webdriver
import pandas as pd

url = 'https://understat.com/team/Juventus/2018'
driver = webdriver.Chrome()
driver.get(url)

# Open the options popup that controls which table columns are visible.
# NOTE: XPath attribute tests use '@' ('#id' is invalid XPath syntax).
driver.find_element_by_xpath('//*[@id="team-players"]/div[1]/button/i').click()

# Tick the checkboxes of the columns that are hidden by default.
hidden = [7, 12, 14, 15, 17, 19, 20, 21, 22, 23, 24]
for val in hidden:
    x_path = '//*[@id="team-players"]/div[2]/div[2]/div/div[%s]/div[2]/label' % val
    driver.find_element_by_xpath(x_path).click()

# Apply the filter so the table re-renders with every column present.
driver.find_element_by_xpath('//*[@id="team-players"]/div[2]/div[3]/a[2]').click()

# Let pandas parse every <table> in the rendered page source.
tables = pd.read_html(driver.page_source)
data = tables[1]
# DataFrame.rename returns a NEW frame -- assign it back.  The original
# discarded the result, so the columns were never actually renamed.
data = data.rename(columns={'Unnamed: 22': "Yellow_Cards", "Unnamed: 23": "Red_Cards"})
driver.close()
Output:
print (data.columns)
Index(['№', 'Player', 'Pos', 'Apps', 'Min', 'G', 'NPG', 'A', 'Sh90', 'KP90',
'xG', 'NPxG', 'xA', 'xGChain', 'xGBuildup', 'xG90', 'NPxG90', 'xA90',
'xG90 + xA90', 'NPxG90 + xA90', 'xGChain90', 'xGBuildup90',
'Yellow_Cards', 'Red_Cards'],
dtype='object')
print (data)
№ Player ... Yellow_Cards Red_Cards
0 1.0 Cristiano Ronaldo ... 2 0
1 2.0 Mario Mandzukic ... 3 0
2 3.0 Paulo Dybala ... 1 0
3 4.0 Federico Bernardeschi ... 2 0
4 5.0 Blaise Matuidi ... 2 0
5 6.0 Rodrigo Bentancur ... 5 1
6 7.0 Juan Cuadrado ... 2 0
7 8.0 Leonardo Bonucci ... 1 0
8 9.0 Miralem Pjanic ... 4 0
9 10.0 Sami Khedira ... 0 0
10 11.0 Giorgio Chiellini ... 1 0
11 12.0 Medhi Benatia ... 2 0
12 13.0 Douglas Costa ... 2 1
13 14.0 Emre Can ... 2 0
14 15.0 Mattia Perin ... 1 0
15 16.0 Mattia De Sciglio ... 0 0
16 17.0 Wojciech Szczesny ... 0 0
17 18.0 Andrea Barzagli ... 0 0
18 19.0 Alex Sandro ... 3 0
19 20.0 Daniele Rugani ... 1 0
20 21.0 Moise Kean ... 0 0
21 22.0 João Cancelo ... 2 0
22 NaN NaN ... 36 2
[23 rows x 24 columns]
Related
I'm currently trying to scrape the Kaggle rankings, and the page is an infinite loading scroll. I would like to get at least the first 2000 ranked Kagglers, and so to solve this I've created this script:
No matter what I do I don't see the browser scrolling and the lista_parseada list always has a length of 20. Can somebody help with this? Thanks!!
My code below:
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import re
import pandas as pd
import numpy as np
import time
from datetime import date

# Notebook rankings url
url = 'https://www.kaggle.com/rankings?group=notebooks&page=1&pageSize=20'
wait_delay = 10  # seconds to wait for the initial page load
scroll_pause_time = 2  # seconds to let new rows render after each scroll

firefox_options = webdriver.FirefoxOptions()
firefox_options.add_argument('-private')
driver = webdriver.Firefox(options=firefox_options)

# Load the page and wait until the main content container exists in the DOM.
driver.get(url)
try:
    WebDriverWait(driver, wait_delay).until(
        EC.presence_of_element_located((By.ID, 'site-content')))
    print("Page is ready!")
except Exception as e:
    print(e)
    print("Loading took too much time!")

# Scroll to the bottom repeatedly (at most 10 passes); stop early once the
# document height no longer grows, i.e. no more rows were lazy-loaded.
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_pass = 0
while scroll_pass < 10:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
    scroll_pass += 1

# XPath attribute tests need '@' -- '[#role=...]' is invalid XPath, so the
# original selector could never match the newly loaded rows.
lista = driver.find_elements_by_xpath(
    '//div[@role="button"]//div[@class="leaderboards__name"]/p/a')
lista_parseada = [link.get_attribute('href') for link in lista]
print(len(lista_parseada))
driver.close()
Kaggle has an api, so it's better to use it:
import requests
import json
import pandas as pd
def post_call(url: str, headers: dict, data: dict) -> dict:
    """POST *data* as JSON to *url* and return the decoded JSON response.

    Raises requests.HTTPError for any non-2xx status code.
    """
    resp = requests.post(url=url, headers=headers, json=data)
    resp.raise_for_status()
    return resp.json()
url = 'https://www.kaggle.com/api/i/users.ProgressionService/GetUserRankings'
headers = {
    "content-type": "application/json",
    "cookie": "CSRF-TOKEN=VxwvCfDJ7O7KAEn8EWH0HeK9uT-G89SyETB0-hq9mZZhVsjDDIFJAh4OOhIUFjymST0kO8oX43sl86ZuOudHOoxHlPWV-krcTXNUlSgOQA;",
    "x-xsrf-token": "CfDJ7O7VujnuKA6ZuOudEn8ExwsAkR8eU_RQRaWH0HLuA2qYIkNHMeUOWequ-h2j0YuQNki8aAxC0j5tYvo9fI9fL-j9yzhevhI4MPdC9DRHLWnA"
}

# Fetch 100 pages of 20 users each (2000 users total), flatten every JSON
# payload into a DataFrame, and drop the columns we do not need.
frames = []
for page in range(1, 101):
    payload = {
        "group": "ACHIEVEMENT_SUMMARY_TYPE_NOTEBOOKS",
        "page": page,
        "pageSize": 20
    }
    page_df = pd.json_normalize(post_call(url, headers, payload)['list'])
    # use drop(columns=["value1", "value2"]) to exclude unnecessary values
    frames.append(page_df.drop(columns=["thumbnailUrl"]))

# Stack every page into one frame with a fresh 0..N-1 index.
print(pd.concat(frames).reset_index(drop=True))
Output df with 2000 users:
currentRanking displayName userId userUrl tier points joinTime totalGoldMedals totalSilverMedals totalBronzeMedals
0 1 Chris Deotte 1723677 /cdeotte GRANDMASTER 4943 2018-03-14T22:51:30.630Z 71.0 17.0 3.0
1 2 Marília Prata 3012786 /mpwolke MASTER 3621 2019-03-29T19:09:20.750Z 12.0 39.0 450.0
2 3 Abhishek Thakur 5309 /abhishek GRANDMASTER 3169 2011-01-12T03:44:52Z 65.0 28.0 24.0
3 4 AmbrosM 7917824 /ambrosm GRANDMASTER 2737 2021-07-16T18:36:58.170Z 28.0 8.0 8.0
4 5 Y.Nakama 1695531 /yasufuminakama GRANDMASTER 2630 2018-03-06T11:56:37.560Z 37.0 9.0 6.0
... ... ... ... ... ... ... ... ... ... ...
1995 1996 ayoub chaoui 6625407 /ayoubchaoui EXPERT 51 2021-01-30T15:31:19.840Z NaN 1.0 6.0
1996 1997 micheldc55 6646082 /micheldc55 EXPERT 51 2021-02-02T18:58:13.170Z NaN NaN 5.0
1997 1998 Hugo R. V. Angulo 6910521 /hugovallejo EXPERT 51 2021-03-10T18:29:25.247Z NaN 1.0 7.0
1998 1999 Dina Nabil 7213495 /dinanabil811 EXPERT 51 2021-04-18T11:09:01.470Z NaN NaN 5.0
1999 2000 Naser Al-qaydeh 7424338 /naseralqaydeh EXPERT 51 2021-05-15T13:16:16.093Z NaN NaN 8.0
Cookies and other info can be found on the "Network" tab in "DevTools"
In cookies you only need "CSRF-TOKEN"
I am trying to scrape tables from Rotowire, but pd.read_html is only returning the headers.
import pandas as pd

# read_html returns one DataFrame per <table> found in the static HTML.
tables = pd.read_html("http://www.rotowire.com/daily/mlb/optimizer.htm?site=DraftKings&sport=MLB")
# for idx, table in enumerate(tables):
#     print("***************************")
#     print(idx)
#     print(table)
tables[5]
Output:
Player Team Position Salary Fpts. Val Min. % Max. % Exposure
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
No idea what table you want, but you're not going to get anything from the static html response as the page is rendered through javascript. They do have some data you can access though. You'd have to work out the parameters:
import pandas as pd
import requests

# Query the JSON endpoint the optimizer page calls internally; the visible
# HTML table is rendered by JavaScript, so the static response is empty.
endpoint = 'https://www.rotowire.com/daily/tables/optimizer-mlb.php'
params = {
    'siteID': '1',
    'slateID': '6441',
    'projSource': 'RotoWire',
    'rst': 'RotoWire',
}
jsonData = requests.get(endpoint, params=params).json()
df = pd.DataFrame(jsonData)
Output:
print(df)
id playerID rotoPlayerID ... ie_green_lights ie_matchup_notes ie_volatility
0 12739 11095 12739 ... 0 0
1 10510 4081 10510 ... 0 0
2 16036 5163 16036 ... 0 0
3 14194 10827 14194 ... 0 0
4 14865 15463 14865 ... 0 0
.. ... ... ... ... ... ... ...
687 14444 11330 14444 ... 0 0
688 14440 18894 14440 ... 0 0
689 14439 18905 14439 ... 0 0
690 14435 5058 14435 ... 0 0
691 17921 18828 17921 ... 0 0
[692 rows x 99 columns]
I have scraped information with the results of the 2016 Chess Olympiad, using the following code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the 2016 Chess Olympiad results page (a single GET; the original
# issued the request twice and discarded the first response).
url = 'https://www.olimpbase.org/2016/2016te14.html'
page = requests.get(url)
print(page)
soup = BeautifulSoup(page.text, 'lxml')

# Subset the HTML to only the results table (the one with border="1").
table = soup.find('table', attrs={'border': '1'})
print(table)

# Grab the column headers: cells 1..11 of class "bog" (first eleven columns).
headers = []
for i in table.find_all('td', class_='bog')[1:12]:
    headers.append(i.text.strip())

# Create an empty dataframe with those headers, then fill it row by row.
df = pd.DataFrame(columns=headers)
# Data rows start at the fourth <tr>; the earlier rows belong to the headers.
for j in table.find_all('tr')[3:]:
    row_data = j.find_all('td')
    row = [td.text for td in row_data][0:11]
    df.loc[len(df)] = row
I want to do the same thing for the results of 2014 and 2012 (the Olympiads are normally played every two years), automatically. I have advanced the code halfway, but I really don't know how to continue. This is what I've done so far.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the 2016 results page once (the original issued the GET twice and
# threw away the first response).
url = 'https://www.olimpbase.org/2016/2016te14.html'
page = requests.get(url)
print(page)
soup = BeautifulSoup(page.text, 'lxml')

# Subset the HTML to only the results table we need.
table = soup.find('table', attrs={'border': '1'})
print(table)

# Gather the column headers of the table (first eleven columns).
headers = []
for i in table.find_all('td', class_='bog')[1:12]:
    headers.append(i.text.strip())

# Create an empty dataframe using those column headers.
df = pd.DataFrame(columns=headers)

# Years to download: 2012, 2014, 2016 (Olympiads are held every two years).
start_year = 2012
i = 2
end_year = 2016
def download_chess(start_year):
    """Download the team results table for one Olympiad year and append its
    rows to the module-level dataframe ``df``.

    Bug fixed: the original parsed the module-level ``page`` (always the 2016
    response) and iterated the module-level ``table``, ignoring the response
    it had just fetched -- so every call appended the same 2016 rows.  This
    version parses the page it actually downloads.
    """
    url = f'https://www.olimpbase.org/{start_year}/{start_year}te14.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # Locate this year's results table in the freshly fetched page.
    table = soup.find('table', attrs={'border': '1'})
    # Data rows start at the fourth <tr>; earlier rows are header rows.
    for j in table.find_all('tr')[3:]:
        row_data = j.find_all('td')
        row = [td.text for td in row_data][0:11]
        df.loc[len(df)] = row
# Download every Olympiad from start_year through end_year inclusive,
# stepping by i years: calls download_chess(2012), (2014) and (2016),
# exactly like the original while-loop plus its trailing call.
for start_year in range(start_year, end_year + 1, i):
    download_chess(start_year)
I don't have much experience so I don't quite understand the logic of writing filenames. I hope you can help me.
The following will retrieve information for a range of years - in this case, 2000 -- 2018, and save each table to csv as well:
import requests
import pandas as pd

# Every Olympiad year from 2000 through 2018 inclusive (held biennially).
for y in years if False else range(2000, 2019, 2):
    try:
        # The results grid is the second <table> on each year's page.
        df = pd.read_html(f'https://www.olimpbase.org/{y}/{y}te14.html')[1]
        # Row index 2 holds the real column names; rows 3+ hold the data.
        new_header = df.iloc[2]
        df = df[3:]
        df.columns = new_header
        print(df)
        df.to_csv(f'chess_olympics_{y}.csv')
    except Exception as e:
        # Report the failing year but keep processing the remaining years.
        print(y, 'error', e)
This will print out the results table for each year:
no.
team
Elo
flag
code
pos.
pts
Buch
MP
gms
nan
+
=
-
nan
+
=
-
nan
%
Eloav
Elop
ind.medals
3
1
Russia
2685
nan
RUS
1
38
457.5
20
56
nan
8
4
2
nan
23
30
3
nan
67.9
2561
2694
1 - 0 - 2
4
2
Germany
2604
nan
GER
2
37
455.5
22
56
nan
10
2
2
nan
21
32
3
nan
66.1
2568
2685
0 - 0 - 2
5
3
Ukraine
2638
nan
UKR
3
35½
457.5
21
56
nan
8
5
1
nan
18
35
3
nan
63.4
2558
2653
1 - 0 - 0
6
4
Hungary
2661
nan
HUN
4
35½
455.5
21
56
nan
8
5
1
nan
22
27
7
nan
63.4
2570
2665
0 - 0 - 0
7
5
Israel
2652
nan
ISR
5
34½
463.5
20
56
nan
7
6
1
nan
17
35
4
nan
61.6
2562
2649
0 - 0 - 0
[...]
Relevant documentation for pandas: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
I'm more than a noob in Python; I'm trying to get some tables from this page:
https://www.basketball-reference.com/wnba/boxscores/202208030SEA.html
Using pandas and the command pd.read_html I'm able to get most of them, but not the "Line Score" and the "Four Factors". If I print all the tables (there are 19), these two are missing; inspecting with Chrome they seem to be tables, and I can also get them in Excel by importing from the web.
What am i missing here?
Any help appreciated, thanks!
If you look at the page source (not by inspecting), you'd see those tables are within the comments of the html. You can either a) edit the html str and remove the <!-- and --> from the html, then let pandas parse, or b) use bs4 to pull out the comments, then parse the tables that way.
I'll show you both options:
Option 1: Remove the comment tags from the page source
import requests
import pandas as pd

url = 'https://www.basketball-reference.com/wnba/boxscores/202208030SEA.html'
# Strip the HTML comment markers so the tables hidden inside comments
# become visible to pandas' parser.
page_text = requests.get(url).text
response = page_text.replace("<!--", "").replace("-->", "")
dfs = pd.read_html(response, header=1)
Output:
You can see you now have 21 tables, with the 4th and 5th tables the ones in question.
print(len(dfs))
for each in dfs[3:5]:
print('\n\n', each, '\n')
21
Unnamed: 0 1 2 3 4 T
0 Minnesota Lynx 18 14 22 23 77
1 Seattle Storm 30 26 22 11 89
Unnamed: 0 Pace eFG% TOV% ORB% FT/FGA ORtg
0 MIN 97.0 0.507 16.1 14.3 0.101 95.2
1 SEA 97.0 0.579 11.8 9.7 0.114 110.1
Option 2: Pull out comments with bs4
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd

url = 'https://www.basketball-reference.com/wnba/boxscores/202208030SEA.html'
result = requests.get(url).text
data = BeautifulSoup(result, 'html.parser')

# Parse the tables visible in the static HTML.  Reuse the page we already
# downloaded instead of letting read_html fetch the URL a second time.
dfs = pd.read_html(result, header=1)

# The "Line Score" / "Four Factors" tables live inside HTML comments, so
# pull every comment node out and try to parse a table from each one.
comments = data.find_all(string=lambda text: isinstance(text, Comment))
other_tables = []
for each in comments:
    if '<table' in str(each):
        try:
            other_tables.append(pd.read_html(str(each), header=1)[0])
        except ValueError:
            # read_html raises ValueError when the fragment holds no table;
            # the original bare except would also have hidden real errors.
            continue
Output:
for each in other_tables:
print(each, '\n')
Unnamed: 0 1 2 3 4 T
0 Minnesota Lynx 18 14 22 23 77
1 Seattle Storm 30 26 22 11 89
Unnamed: 0 Pace eFG% TOV% ORB% FT/FGA ORtg
0 MIN 97.0 0.507 16.1 14.3 0.101 95.2
1 SEA 97.0 0.579 11.8 9.7 0.114 110.1
I am using selenium to parse from
https://www.worldometers.info/coronavirus/
and doing the following, I get an attribute error and the table variable remains empty — what is the reason?
I use Chrome 80. Are the tags right ?
AttributeError: 'NoneType' object has no attribute 'tbody'
from selenium import webdriver
import bs4

# Render the page in Chrome, then hand the live DOM to BeautifulSoup.
browser = webdriver.Chrome()
browser.get("https://www.worldometers.info/coronavirus/")
html = bs4.BeautifulSoup(browser.page_source, "html.parser")
# Look up the countries table by its full class string.
table = html.find(
    "table",
    class_="table table-bordered table-hover main_table_countries dataTable no-footer",
)
Wherever I have table tags, I find it easier to use pandas to capture the table.
import pandas as pd

url = 'https://www.worldometers.info/coronavirus/'
# read_html returns a list with one DataFrame per <table> on the page;
# the main countries table is the first one.
all_tables = pd.read_html(url)
table = all_tables[0]
Output:
print(table)
Country,Other TotalCases ... Tot Cases/1M pop Tot Deaths/1M pop
0 China 81093 ... 56.00 2.0
1 Italy 63927 ... 1057.00 101.0
2 USA 43734 ... 132.00 2.0
3 Spain 35136 ... 751.00 49.0
4 Germany 29056 ... 347.00 1.0
.. ... ... ... ... ...
192 Somalia 1 ... 0.06 NaN
193 Syria 1 ... 0.06 NaN
194 Timor-Leste 1 ... 0.80 NaN
195 Turks and Caicos 1 ... 26.00 NaN
196 Total: 378782 ... 48.60 2.1
[197 rows x 10 columns]