I've been working with BeautifulSoup lately. I'm trying to get the data from https://www.pro-football-reference.com/teams/mia/2000_roster.htm site. Specifically all I want is the player name and 'gs' (games started).
However, when doing it, it's only returning the 1st ('Starters') table data. I'm actually not interested in that top table at all, I want the 2nd table titled 'Roster'.
Here's the code I was using. As I said, I don't really need anything other than the player name and games started; I was just practicing and learning BeautifulSoup.
import pandas as pd
import requests
import bs4
# Download the roster page and parse the HTML exactly as `requests` receives it.
# The URL has to be one string literal: a literal split across two physical
# lines is a SyntaxError.
alpha = requests.get(
    'https://www.pro-football-reference.com/teams/mia/2000_roster.htm')
beta = bs4.BeautifulSoup(alpha.text, 'lxml')

# Position labels: every <th data-stat="pos">, minus the first match and
# any empty strings.
gama = beta.findAll('th', {'data-stat': 'pos'})
position = [th.text for th in gama][1:]
position = list(filter(None, position))

# Player names: every <td data-stat="player">, minus the first match and the
# two section-heading rows that are not players.
gama = beta.findAll('td', {'data-stat': 'player'})
player = [td.text for td in gama][1:]
section_headings = ('Defensive Starters', 'Special Teams Starters')
player = [name for name in player if name not in section_headings]

# Ages and games started, with empty cells dropped.
gama = beta.findAll('td', {'data-stat': 'age'})
age = list(filter(None, [td.text for td in gama]))

gama = beta.findAll('td', {'data-stat': 'gs'})
gs = list(filter(None, [td.text for td in gama]))

# pandas raises ValueError here if the four lists differ in length.
target = pd.DataFrame(
    {
        'player_name': player,
        'position': position,
        'gs': gs,
        'age': age,
    })
Anyone see where I'm going wrong? Or maybe an alternative way to go about it?
To get the content of that table you need a browser simulator, because that portion of the response is generated dynamically. Data from the first table is easily accessible without one, though. I used Selenium in this case:
from bs4 import BeautifulSoup
from selenium import webdriver
# Load the page in a real browser and parse the rendered page source.
driver = webdriver.Chrome()
try:
    page_url = "https://www.pro-football-reference.com/teams/mia/2000_roster.htm"
    driver.get(page_url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The second ".table_outer_container" on the page is the one iterated here.
    table = soup.select(".table_outer_container")[1]
    for items in table.select("tr"):
        # Indexing with [0] raises IndexError for a row that lacks the cell.
        player = items.select("[data-stat='player']")[0].text
        gs = items.select("[data-stat='gs']")[0].text
        print(player, gs)
finally:
    # Always close the browser, even when parsing fails part-way through.
    driver.quit()
Partial output:
Player GS
Trace Armstrong* 0
John Bock 1
Tim Bowens 15
Lorenzo Bromell 0
Autry Denson 0
Mark Dixon 15
Kevin Donnalley 16
If some rows lack those cells and you hit an IndexError, use the version below, which falls back to an empty string instead of raising:
from bs4 import BeautifulSoup
from selenium import webdriver
# Load the page in a real browser and parse the rendered page source.
driver = webdriver.Chrome()
try:
    page_url = "https://www.pro-football-reference.com/teams/mia/2000_roster.htm"
    driver.get(page_url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The second ".table_outer_container" on the page is the one iterated here.
    table = soup.select(".table_outer_container")[1]
    for items in table.select("tr"):
        # select_one returns None when the row has no such cell, so each
        # selector is evaluated once and missing cells become "".
        player_cell = items.select_one("[data-stat='player']")
        gs_cell = items.select_one("[data-stat='gs']")
        player = player_cell.text if player_cell is not None else ""
        gs = gs_cell.text if gs_cell is not None else ""
        print(player, gs)
finally:
    # Always close the browser, even when parsing fails part-way through.
    driver.quit()
Related
I am trying to extract the balance sheet for an example ticker "MSFT" (Microsoft) from Yahoo Finance.
Using Selenium to click on the button "Expand All" before any scraping is done. This part seems to work.
By the way, when the Chrome web driver is launched, I manually click on the button(s) to accept or reject cookies. In a later step, I plan to add some more code so that this part is also automated. My question is though not on this one now.
Below is what the code currently looks like.
# for scraping the balance sheet from Yahoo Finance
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    """Scrape the expanded Yahoo Finance balance-sheet page for *ticker*.

    Opens the page in Chrome, clicks "Expand All", then parses the rendered
    HTML with BeautifulSoup.

    Args:
        ticker: Ticker symbol interpolated into the page URL, e.g. "MSFT".

    Returns:
        A tuple ``(idx, val)``: the text of every ``<title>`` element found
        inside ``fin-row`` divs, and the integer value (times 1000) of every
        ``<span>`` found inside ``fin-col`` divs.

    Raises:
        ValueError: if a header span is not an ``%m/%d/%Y`` date or a
            ``fin-col`` span is not a comma-grouped integer.
    """
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # XPath attribute tests are written with "@" ("#" is not valid XPath).
        # The very long timeout leaves room for dismissing the cookie dialog
        # by hand before the button becomes clickable.
        WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
            By.XPATH,
            "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()
        # content whole page in html format
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # The HTML has been captured (or the attempt failed); close the browser.
        driver.quit()

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))
    df = pd.DataFrame(columns=col)

    # the following code returns an empty list for index (why?)
    # and values in a list that need actually be in a DataFrame
    idx = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        for h in div.find_all('title'):
            text = h.get_text()
            idx.append(text)
    val = []
    for div in soup.find_all('div', attrs={'data-test': 'fin-col'}):
        for h in div.find_all('span'):
            num = int(h.get_text().replace(",", "")) * 1000
            val.append(num)

    # if the above part is commented out and this block is used instead
    # the following code manages to work well until the row "Cash Equivalents"
    # that is because there are no entries for years 2020 and 2019 on this row
    #
    # for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
    #     i = 0
    #     idx = ""
    #     val = []
    #     for h in div.find_all('span'):
    #         if i % 5 == 0:
    #             idx = h.get_text()
    #         else:
    #             num = int(h.get_text().replace(",", "")) * 1000
    #             val.append(num)
    #         i += 1
    #     row = pd.DataFrame([val], columns=col, index=[idx])
    #     df = pd.concat([df, row], axis=0)
    return idx, val


get_balance_sheet_from_yfinance("MSFT")
I could not get the data scraped from the expanded table in a usable tabular format. Instead, the function above returns what I managed to scrape from the webpage. There are some additional comments in the code.
Could you give me some ideas on how to properly extract the data and put it into a DataFrame object with index which should be the text under the "Breakdown" column? Basically, the DataFrame should look like the snapshot below, with what is under the first column in there being the index.
balance-sheet-df
I've spent a long time on this; I hope it helps. Basically, your function now returns a DataFrame with the following format:
2022-06-29 2021-06-29 2020-06-29 2019-06-29
Total Assets 364,840,000 333,779,000 301,311,000 286,556,000
Current Assets 169,684,000 184,406,000 181,915,000 175,552,000
Cash, Cash Equivalents & Short Term Investments 104,749,000 130,334,000 136,527,000 133,819,000
Cash And Cash Equivalents 13,931,000 14,224,000 13,576,000 11,356,000
Cash 8,258,000 7,272,000 - -
... ... ... ... ...
Tangible Book Value 87,720,000 84,477,000 67,915,000 52,554,000
Total Debt 61,270,000 67,775,000 70,998,000 78,366,000
Net Debt 35,850,000 43,922,000 49,751,000 60,822,000
Share Issued 7,464,000 7,519,000 7,571,000 7,643,000
Ordinary Shares Number 7,464,000 7,519,000 7,571,000 7,643,000
and here's the final code:
# for scraping the balance sheet from Yahoo Finance
from time import sleep
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup
# importing selenium to click on the "Expand All" button before scraping the financial statements
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def get_balance_sheet_from_yfinance(ticker):
    """Scrape the expanded Yahoo Finance balance sheet into a DataFrame.

    Opens the page in Chrome, clicks "Expand All", then parses the rendered
    HTML with BeautifulSoup.

    Args:
        ticker: Ticker symbol interpolated into the page URL, e.g. "MSFT".

    Returns:
        A DataFrame indexed by row label with one column per date parsed
        from the header row. Cells hold the displayed strings; cells with no
        value are filled with ``''`` or ``'-'``.

    Raises:
        ValueError: if a header span is not an ``%m/%d/%Y`` date.
    """
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"
    options = Options()
    options.add_argument("start-maximized")
    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # XPath attribute tests are written with "@" ("#" is not valid XPath).
        WebDriverWait(driver, 3600).until(EC.element_to_be_clickable((
            By.XPATH,
            "//section[@data-test='qsp-financial']//span[text()='Expand All']"))).click()
        # content whole page in html format
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # The HTML has been captured (or the attempt failed); close the browser.
        driver.quit()

    # get the column headers (i.e. 'Breakdown' row)
    div = soup.find_all('div', attrs={'class': 'D(tbhg)'})
    if len(div) < 1:
        print("Fail to retrieve table column header")
        exit(0)

    # get the list of columns from the column headers
    col = []
    for h in div[0].find_all('span'):
        text = h.get_text()
        if text != "Breakdown":
            col.append(datetime.strptime(text, "%m/%d/%Y"))

    # Pad every row to the number of date columns actually found on the
    # page rather than a fixed 4.
    n_cols = len(col)

    row = {}
    for div in soup.find_all('div', attrs={'data-test': 'fin-row'}):
        head = div.find('span').get_text()
        filled = n_cols
        for h in div.find_all('span'):
            text = h.get_text()
            if not text:
                # An empty span is neither a label nor a value.
                continue
            if text.replace(',', '').isdigit() or text.startswith('-'):
                # A value cell belonging to the current label.
                row[head].append(text)
                filled += 1
            else:
                # A new label: pad the previous row, then start a new one.
                while filled < n_cols:
                    row[head].append('')
                    filled += 1
                head = text
                row[head] = []
                filled = 0

    # Rows that ended short (e.g. the last one seen) are padded with '-'.
    for values in row.values():
        while len(values) < n_cols:
            values.append('-')

    df = pd.DataFrame(columns=col, index=list(row), data=list(row.values()))
    print(df)
    return df


get_balance_sheet_from_yfinance("MSFT")
I've removed some of the unused code and added a new scraping method, but I have kept your method of getting the dates of all the columns.
if you have any questions don't hesitate to ask in the comments.
I am trying to iterate through player seasons on NBA.com and pull shooting statistics after each click of the season dropdown menu. After each click, I get the error message "list index out of range" for:
headers = table[1].findAll('th')
It seems to me that the page doesn't load all the way before the source data is saved.
Looking at other similar questions, I have tried calling browser.implicitly_wait() in each loop iteration, but I am still getting the same error. It also doesn't seem that the browser waits after the first iteration of the loop.
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
# `Chrome` is used below but never imported in this snippet.
from selenium.webdriver import Chrome

player_id = str(1629216)
url = 'https://www.nba.com/stats/player/' + player_id + "/shooting/"
browser = Chrome(executable_path='/usr/local/bin/chromedriver')
browser.get(url)

# The season dropdown, located by its absolute XPath.
select = Select(browser.find_element_by_xpath('/html/body/main/div/div/div/div[4]/div/div/div/div/div[1]/div[1]/div/div/label/select'))
options = select.options

for index in range(len(options)):
    select.select_by_index(index)
    # NOTE: implicitly_wait only sets the timeout used by element lookups; it
    # does not pause here, so page_source may be read before the table loads.
    browser.implicitly_wait(5)
    src = browser.page_source
    parser = BeautifulSoup(src, "lxml")
    table = parser.findAll("div", attrs={"class": "nba-stat-table__overflow"})

    # Column headers: skip the first <th>, drop multi-line header texts and
    # append the two columns added by hand.
    headers = table[1].findAll('th')
    headerlist = [h.text.strip() for h in headers[1:]]
    headerlist = [a for a in headerlist if '\n' not in a]
    headerlist.append('AST%')
    headerlist.append('UAST%')

    # Row labels come from the cells with class "first".
    row_labels = table[1].findAll("td", {"class": "first"})
    row_labels_list = [r.text.strip() for r in row_labels]

    # Data rows: every <tr> after the first, without its leading label cell.
    rows = table[1].findAll('tr')[1:]
    player_stats = [[td.getText().strip() for td in tr.findAll('td')[1:]]
                    for tr in rows]

    df = pd.DataFrame(data=player_stats, columns=headerlist,
                      index=row_labels_list)
    print(df)
I found my own answer. I used time.sleep(1) at the top of the loop to give the browser a second to load all the way. Without this delay, the pages source code did not have the appropriate table that I am scraping.
Responding to those who answered: I did not want to go the API route, although I have seen people scrape nba.com using that method. table[1] is the correct table; the page source just needed a chance to load after each step through the season dropdown.
# Body of the per-season loop; the enclosing `for index in ...` header is not
# part of this excerpt.
# Switch the dropdown to the season at this index.
select.select_by_index(index)
# Pause for one second before reading the page source.
time.sleep(1)
src = browser.page_source
parser = BeautifulSoup(src, "lxml")
# All containers with class "nba-stat-table__overflow"; the second is used below.
table = parser.findAll("div", attrs = {"class":"nba-stat-table__overflow"})
headers = table[1].findAll('th')
# Skip the first header cell and strip whitespace from the rest.
headerlist = [h.text.strip() for h in headers[1:]]
# Drop header texts that still contain a newline.
headerlist = [a for a in headerlist if not '\n' in a]
# Two column names are appended by hand.
headerlist.append('AST%')
headerlist.append('UAST%')
# Row labels are taken from the cells with class "first".
row_labels = table[1].findAll("td", {"class": "first"})
row_labels_list = [r.text.strip() for r in row_labels[0:]]
# Every <tr> after the first is treated as a data row.
rows = table[1].findAll('tr')[1:]
# For each data row, collect the stripped text of every cell except the first.
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')[1:]] for i in range(len(rows))]
df = pd.DataFrame(data=player_stats, columns=headerlist, index = row_labels_list)
print(df)
I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' in 'a' in 'div' like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent':'Mozilla/5.0'}


def get_soup():
    """Download `page` with the custom User-Agent and return it parsed."""
    # The context manager closes the session's connections once the
    # response body has been read.
    with requests.Session() as session:
        pageTree = session.get(page, headers=headers)
        return BeautifulSoup(pageTree.content, 'html.parser')


pageSoup = get_soup()


def print_links():
    """this function scrapes the job title links"""
    for div in pageSoup.find_all('div', class_='title'):
        anchor = div.a
        # Skip title divs with no <a> child or no href, instead of failing
        # with TypeError / KeyError.
        if anchor is None or not anchor.has_attr('href'):
            continue
        print(anchor['href'])
I am trying to make a list but my result is simply text and does not seem to be a link like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
    """this function scrapes the job titles

    Returns:
        A list of dicts, one per ``<div class="title">`` in ``pageSoup``,
        with keys ``'title'`` (text) and ``'href'`` (link target, or None
        when the div has no anchor carrying an href).
    """
    jobs = []
    for title_div in pageSoup.find_all('div', class_='title'):
        # find('href') would search for a *tag* named "href" and never
        # match; the link is the <a> element that carries an href attribute.
        link = title_div.find('a', href=True)
        if link:
            jobs.append({'title': link.text,
                         'href': link.attrs['href']})
        else:
            jobs.append({'title': title_div.text, 'href': None})
    return jobs
I would regex out the required info from the returned HTML and construct each URL from the parameters the page's JavaScript uses to build it dynamically. Interestingly, the total number of listings is different when using requests than when using a browser. You can manually enter the number of listings, e.g. 6175 (currently), or use the number returned by the request (which is lower, so you miss some results). You could also use selenium to get the correct initial result count. You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can introduce a limit parameter to increase results_per_page up to 50 e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
Furthermore, it seems that it is possible to retrieve more results than are actually given as the total results count on the webpage.
Python, with 10 listings per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
# Pattern for the `jobmap[n]= {...}` object literals embedded in the page.
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
# Pattern for the search UID, which becomes the `tk` parameter of each job URL.
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    # Count reported in the page's description meta tag (kept for reference).
    number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',',''))
    #number_of_pages = math.ceil(number_of_listings/listings_per_page)
    number_of_pages = math.ceil(6175/listings_per_page) #manually calculated

    for page in range(1, number_of_pages + 1):
        if page > 1:
            # `start` is a zero-based offset: page 2 begins at 10, page 3 at
            # 20, ... (10*page-1 would give 19, 29, ... and skip results).
            offset = listings_per_page * (page - 1)
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(offset))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]

        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {
                'title': data['title'],
                'company': data['cmp'],
                'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3',
            }
            final[counter] = row
            counter += 1

# `final` maps a running index to a row dict, so transpose to one row per job.
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
If you want to use selenium to get correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Read the listing count from a rendered (headless) browser session.
options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options = options)
try:
    d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',',''))
finally:
    # Close the browser even if the count could not be read.
    d.quit()

# Pattern for the `jobmap[n]= {...}` object literals embedded in the page.
p = re.compile(r"jobmap\[\d+\]= ({.*?})")
# Pattern for the search UID, which becomes the `tk` parameter of each job URL.
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    # Use the count obtained through selenium instead of a hand-entered one.
    number_of_pages = math.ceil(number_of_listings / listings_per_page)

    for page in range(1, number_of_pages + 1):
        if page > 1:
            # `start` is a zero-based offset: page 2 begins at 10, page 3 at
            # 20, ... (10*page-1 would give 19, 29, ... and skip results).
            offset = listings_per_page * (page - 1)
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(offset))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]

        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {
                'title': data['title'],
                'company': data['cmp'],
                'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3',
            }
            final[counter] = row
            counter += 1

# `final` maps a running index to a row dict, so transpose to one row per job.
df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig',index = False )
I've been working with BeautifulSoup lately. I'm trying to get the data from https://www.pro-football-reference.com/teams/mia/2000_roster.htm site. Specifically all I want is the player name and 'gs' (games started).
However, when doing it, it's only returning the 1st ('Starters') table data. I'm actually not interested in that top table at all, I want the 2nd table titled 'Roster'.
Here's the code I was using. As I said, I don't really need anything other than the player name and games started; I was just practicing and learning BeautifulSoup.
import pandas as pd
import requests
import bs4
# Download the roster page and parse the HTML exactly as `requests` receives it.
# The URL has to be one string literal: a literal split across two physical
# lines is a SyntaxError.
alpha = requests.get(
    'https://www.pro-football-reference.com/teams/mia/2000_roster.htm')
beta = bs4.BeautifulSoup(alpha.text, 'lxml')

# Position labels: every <th data-stat="pos">, minus the first match and
# any empty strings.
gama = beta.findAll('th', {'data-stat': 'pos'})
position = [th.text for th in gama][1:]
position = list(filter(None, position))

# Player names: every <td data-stat="player">, minus the first match and the
# two section-heading rows that are not players.
gama = beta.findAll('td', {'data-stat': 'player'})
player = [td.text for td in gama][1:]
section_headings = ('Defensive Starters', 'Special Teams Starters')
player = [name for name in player if name not in section_headings]

# Ages and games started, with empty cells dropped.
gama = beta.findAll('td', {'data-stat': 'age'})
age = list(filter(None, [td.text for td in gama]))

gama = beta.findAll('td', {'data-stat': 'gs'})
gs = list(filter(None, [td.text for td in gama]))

# pandas raises ValueError here if the four lists differ in length.
target = pd.DataFrame(
    {
        'player_name': player,
        'position': position,
        'gs': gs,
        'age': age,
    })
Anyone see where I'm going wrong? Or maybe an alternative way to go about it?
To get the content of that table you need a browser simulator, because that portion of the response is generated dynamically. Data from the first table is easily accessible without one, though. I used Selenium in this case:
from bs4 import BeautifulSoup
from selenium import webdriver
# Load the page in a real browser and parse the rendered page source.
driver = webdriver.Chrome()
try:
    page_url = "https://www.pro-football-reference.com/teams/mia/2000_roster.htm"
    driver.get(page_url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The second ".table_outer_container" on the page is the one iterated here.
    table = soup.select(".table_outer_container")[1]
    for items in table.select("tr"):
        # Indexing with [0] raises IndexError for a row that lacks the cell.
        player = items.select("[data-stat='player']")[0].text
        gs = items.select("[data-stat='gs']")[0].text
        print(player, gs)
finally:
    # Always close the browser, even when parsing fails part-way through.
    driver.quit()
Partial output:
Player GS
Trace Armstrong* 0
John Bock 1
Tim Bowens 15
Lorenzo Bromell 0
Autry Denson 0
Mark Dixon 15
Kevin Donnalley 16
If some rows lack those cells and you hit an IndexError, use the version below, which falls back to an empty string instead of raising:
from bs4 import BeautifulSoup
from selenium import webdriver
# Load the page in a real browser and parse the rendered page source.
driver = webdriver.Chrome()
try:
    page_url = "https://www.pro-football-reference.com/teams/mia/2000_roster.htm"
    driver.get(page_url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The second ".table_outer_container" on the page is the one iterated here.
    table = soup.select(".table_outer_container")[1]
    for items in table.select("tr"):
        # select_one returns None when the row has no such cell, so each
        # selector is evaluated once and missing cells become "".
        player_cell = items.select_one("[data-stat='player']")
        gs_cell = items.select_one("[data-stat='gs']")
        player = player_cell.text if player_cell is not None else ""
        gs = gs_cell.text if gs_cell is not None else ""
        print(player, gs)
finally:
    # Always close the browser, even when parsing fails part-way through.
    driver.quit()
I have found many references that show how to scroll the entire webpage, but I want to scroll one particular section. I am working on marketwatch.com, in the "Latest News" tab. How can I scroll just this tab using Selenium WebDriver?
Below is my code which returns the heading of the news but keeps repeating same headings.
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver
# Running total of headlines printed so far.
count = 0
browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")
# The page source is captured and parsed once, here; `soup` and `arkodiv`
# below refer to this snapshot.
pageSource = browser.page_source
soup = BeautifulSoup(pageSource, 'lxml')
arkodiv = soup.find("ol", class_="viewport")
# NOTE(review): indentation was lost in this listing; the statements that
# follow are presumably the body of this loop - confirm against the original.
while browser.find_element_by_tag_name('ol'):
# Scrolls the whole window to the bottom of the document.
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
div = list(arkodiv.find_all('div', class_= "nv-details"))
heading = []
Data_11 = list(soup.find_all("div", class_ = "nv-text-cont"))
# List items that carry a `timestamp` attribute.
datetime = list(arkodiv.find_all("li", timestamp = True))
for sa in datetime:
sh = sa.find("div", class_ = "nv-text-cont")
# Only items whose text container holds an <a> with a class attribute are used.
if sh.find("a", class_ = True):
di = sh.text.strip()
# Drop any non-ASCII characters from the headline.
di = di.encode('ascii', 'ignore').decode('ascii')
else:
continue
print di
heading.append((di))
count = count+1
# Membership test against the <ol> tag object itself.
if 'End of Results' in arkodiv:
print 'end'
break
else:
continue
print count
That happens because the script you are executing scrolls to the bottom of the page.
To keep scrolling inside the element fetching news you need to replace this:
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
with this:
browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
EDIT
This is the complete working solution:
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver
count = 0
# Headlines already seen. Created once, outside the loop, so it de-duplicates
# across scroll iterations instead of being reset each time.
heading = set()
browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")

while browser.find_element_by_tag_name('ol'):
    # Re-parse the page on every pass so newly loaded items are picked up.
    pageSource = browser.page_source
    soup = BeautifulSoup(pageSource, 'lxml')
    arkodiv = soup.find("ol", class_="viewport")

    # Scroll inside the news list element rather than the whole window.
    browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
    time.sleep(0.5)

    for sa in arkodiv.find_all("li", timestamp=True):
        sh = sa.find("div", class_="nv-text-cont")
        # Skip items without a text container or without a classed link.
        if sh is None or not sh.find("a", class_=True):
            continue
        di = sh.text.strip()
        # Drop any non-ASCII characters from the headline.
        di = di.encode('ascii', 'ignore').decode('ascii')
        if di in heading:
            # Already printed on an earlier pass.
            continue
        heading.add(di)
        print(di)
        count = count + 1

    # `in` on a Tag tests its child nodes, not its text; search the
    # rendered text for the end marker instead.
    if 'End of Results' in arkodiv.get_text():
        print('end')
        break

print(count)
EDIT 2
You may also want to change how you store the headings, since the way you currently do it keeps duplicates inside the list. I changed it to a set so that doesn't happen.