BeautifulSoup: Can't Access Info Within TD - python

I'm looking at the following website:
https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859
I want to extract the name of each university and the href associated with it. So for the first entry, I'd like to get Stanford and https://modules.ussquash.com/ssm/pages/leagues/Team_Information.asp?id=18564
I've gotten to the point where I have all of the TDs, using BeautifulSoup. I'm just having difficulty extracting the school and its href.
Here's my attempt:
def main():
    """Fetch the league page and print the td cells found in each table row."""
    r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
    data = r.text
    soup = BeautifulSoup(data)
    # Work with the second table on the page.
    table = soup.find_all('table')[1]
    # Every tr of that table except the first one.
    rows = table.find_all('tr')[1:]
    for row in rows:
        # All td cells of the current row.
        cols = row.find_all('td')
        print(cols)
When I try to access cols[0], I get:
IndexError: list index out of range
Any idea how to fix this would be awesome!
Thanks

The first two tr's are in the thead, which has no td tags, so you want to skip the first two tr's:
rows = table.find_all('tr')[2:]
To get what you want, we can simplify using css selectors:
# Only the second table on the page is needed, so stop searching after two.
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
    # anchor we want is inside the first td
    a = row.select_one("td a")  # or a = row.find("td").a
    if a is None:
        # select_one returns None when the row has no link; nothing to print.
        continue
    print(a.text, a["href"])
Also the href is a relative path so you need to join it to a base url:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def main():
    """Print each team's name and the absolute URL of its team page.

    The hrefs on the league page are relative, so each one is joined to
    ``base`` before it is printed.
    """
    base = "https://modules.ussquash.com/ssm/pages/leagues/"
    r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
    data = r.text
    soup = BeautifulSoup(data)
    # Only the second table on the page is needed, so stop searching after two.
    table = soup.find_all('table', limit=2)[1]
    # skip first two tr's
    rows = table.select("tr + tr + tr")
    for row in rows:
        a = row.select_one("td a")
        if a is None:
            # select_one returns None when the row has no link; skip it.
            continue
        print(a.text, urljoin(base, a["href"]))

Related

How to get specific table from HTML

We have form 10-k of several companies. We want to get Earnings tables (Item 6) from the HTML. The structure of the form changes for the companies.
For e.g
url1= 'https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm'
url2='https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm'
We need to get the table in Item 6 Consolidated Financial data.
One way we tried is based on string search for Item 6, getting all the text from Item 6 to Item 7 then get the tables as following:
doc10K = requests.get(url2)
st6 =doc10K.text.lower().find("item 6")
end6 = doc10K.text.lower().find("item 7")
# get text from item 6 and remove the currency sign
item6 = doc10K.text[st6:end6].replace('$','')
Tsoup = bs.BeautifulSoup(item6, 'lxml')
# Extract all tables from the response
html_tables =Tsoup.find_all('table')
This approach doesn't work for all the forms. E.g. With KSS, we are not able to find string 'Item6'. Ideal output will be the table given in Item 6.
petezurich is right, but the marker is not fully positioned.
# You can try this, too. The start parameter can be a list, just match any one of the above
from simplified_scrapy.simplified_doc import SimplifiedDoc

doc10K = requests.get(url2)
doc = SimplifiedDoc(doc10K.text)
# Start from the last occurrence of the section heading; fall back to the
# shorter heading when the longer one is not present.
start = doc.html.rfind('Selected Consolidated Financial Data')
if start < 0:
    start = doc.html.rfind('Selected Financial Data')
# "Item 7" may be written with a regular space or a non-breaking space (\xa0).
tables = doc.getElementsByTag('table', start=start, end=['Item 7', 'Item\xa07'])
for table in tables:
    for tr in table.trs:
        for td in tr.tds:
            print(td.text)
            # print(td.unescape()) #Replace HTML entity
The string item 6 seems to contain either a space or a non breaking space.
Try this cleaned code:
import requests
from bs4 import BeautifulSoup
url1= 'https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm'
url2='https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm'
doc10K = requests.get(url2)
# Lower-case the document once; every search below is case-insensitive.
text_lower = doc10K.text.lower()
st6 = text_lower.find("item 6")
# Found "item 6"? If not, search with an underscore instead.
if st6 == -1:
    st6 = text_lower.find("item_6")
if st6 == -1:
    # A -1 start would silently slice from the last character of the page.
    raise ValueError("could not find the start of Item 6 in " + url2)
# Look for "item 7" only after the start of Item 6, so that an earlier
# mention cannot produce an empty slice.
end6 = text_lower.find("item 7", st6)
if end6 == -1:
    # A -1 end would silently drop only the last character of the page.
    raise ValueError("could not find the start of Item 7 in " + url2)
# Cut out the Item 6 section and remove the currency sign.
item6 = doc10K.text[st6:end6].replace('$', '')
soup = BeautifulSoup(item6, 'lxml')
html_tables = soup.find_all('table')
With bs4 4.7.1+ you can use :contains and :has to specify the appropriate matching patterns for the table based on the HTML. You can use the CSS Or syntax so that either of the two patterns shown below is matched.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
urls = ['https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm','https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm']
from io import StringIO

# Two alternative patterns separated by a comma (CSS "or"); select_one
# returns the first element in the document that matches either of them.
TABLE_SELECTOR = (
    'table:contains("Item 6") ~ div:has(table) table, '
    'p:contains("Selected Consolidated Financial Data") ~ div:has(table) table'
)

with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        match = soup.select_one(TABLE_SELECTOR)
        if match is None:
            # Without this guard read_html would be handed the text "None".
            print("No matching table found for", url)
            continue
        # Wrap the markup in a file-like object rather than passing literal HTML.
        table = pd.read_html(StringIO(str(match)))[0]
        table.dropna(axis=0, how='all', inplace=True)  # drop rows that are entirely empty
        table.dropna(axis=1, how='all', inplace=True)  # drop columns that are entirely empty
        table.fillna(' ', inplace=True)
        table.rename(columns=table.iloc[0], inplace=True)  # set headers same as row 1
        table.drop(table.index[0:2], inplace=True)  # lose the first two rows (header row and the row after it)
        table.reset_index(drop=True, inplace=True)  # re-index
        print(table)

Beautiful Soup scrape table with table breaks

I'm trying to scrape a table into a dataframe. My attempt only returns the table name and not the data within rows for each region.
This is what i have so far:
from bs4 import BeautifulSoup as bs4
import requests
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
table_regions = soup.find('table', {'class': "t4"})
regions = table_regions.find_all('tr')
for row in regions:
print row
ideal outcome i'd like to get:
region | price
---------------|-------
new england | 2.59
new york city | 2.52
Thanks for any assistance.
If you check your HTML response (soup), you will see that the table tag you get in this line, table_regions = soup.find('table', {'class': "t4"}), is closed before the rows that contain the information you need (the ones that contain the td's with the class names: up dn d1 and s1).
So how about using the raw td tags like this:
from bs4 import BeautifulSoup as bs4
import requests
import pandas as pd
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
# Every tr in the document, regardless of which table it belongs to.
a = soup.find_all('tr')
# Only the a[42:50] slice is used; each row becomes the list of its td strings.
rows = [[td.string for td in tr.find_all('td')] for tr in a[42:50]]
df = pd.DataFrame(rows, columns=['Region','Price_1', 'Percent_change_1', 'Price_2', 'Percent_change_2', 'Spark Spread'])
Notice that I use just the a[42:50] slice of the results because a contains all the tr's of the website. You can use the rest too if you need to.

Python Data Scraper

I wrote the following line of code
#!/usr/bin/python
#weather.scraper
from bs4 import BeautifulSoup
import urllib
def main():
    """Weather scraper: fetch the KPHL monthly history page and print the matching tables."""
    r = urllib.urlopen("https://www.wunderground.com/history/airport/KPHL/2016/1/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1").read()
    soup = BeautifulSoup(r, "html.parser")
    # find_all returns a list, so `table` holds every table with this class.
    table = soup.find_all("table", class_="responsive airport-history-summary-table")
    # These two searches cover the whole document and are not used below.
    tr = soup.find_all("tr")
    td = soup.find_all("td")
    print table
if __name__ == "__main__":
    main()
When I print the table i get all the html (td, tr, span, etc.) as well. How can I print the content of the table (tr, td) without the html?
THANKS!
You have to use .getText() method when you want to get a content. Since find_all returns a list of elements, you have to choose one of them (td[0]).
Or you can do for example:
# For every tr in the document, print a marker line and then the text of
# its td cells joined by "|".
for tr in soup.find_all("tr"):
    print '>>>> NEW row <<<<'
    print '|'.join([x.getText() for x in tr.find_all('td')])
The loop above prints for each row cell next to cell.
Note that you do find all td's and all tr's your way but you probably want to get just those in table.
If you want to look for elements inside the table, you have to do this:
table.find('tr') instead of soup.find('tr'), so that BeautifulSoup will be looking for tr's in the table instead of the whole HTML.
YOUR CODE MODIFIED (according to your comment that there are more tables):
#!/usr/bin/python
#weather.scraper
from bs4 import BeautifulSoup
import urllib
def main():
    """Weather scraper: print every table of the KPHL monthly history page, one row per line."""
    r = urllib.urlopen("https://www.wunderground.com/history/airport/KPHL/2016/1/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1").read()
    soup = BeautifulSoup(r, "html.parser")
    # All tables in the document, not only the summary table.
    tables = soup.find_all("table")
    for table in tables:
        print '>>>>>>> NEW TABLE <<<<<<<<<'
        # Search inside the current table only, not the whole document.
        trs = table.find_all("tr")
        for tr in trs:
            # for each row of current table, write it using | between cells
            # (newlines inside a cell are removed first)
            print '|'.join([x.get_text().replace('\n','') for x in tr.find_all('td')])
if __name__ == "__main__":
    main()

Getting table from HTML file with Python

game_link = "http://espn.go.com/nba/playbyplay?gameId=400579510&period=0"
game_source = urlopen(game_link)
game_html = game_source.read()
game_source.close();
row = BeautifulSoup(game_html, "html.parser")
pieces = list(row.children)
I need to get the game log rows from the above link, but my code above gives me the whole HTML text. How can I extract the tables and turn them into single rows (pieces)?
You could try BeautifulSoup.findAll and supply the tag and any other attributes you may know about the tags you are looking for. After looking at the page it looks like you're looking for all <tr> tags with the class even. So you could use soup.findAll("tr", attrs = {"class": "even"}). For example.
import urllib.request
from bs4 import BeautifulSoup
game_link = "http://espn.go.com/nba/playbyplay?gameId=400579510&period=0"
# The with-block closes the response even if read() raises.
with urllib.request.urlopen(game_link) as game_source:
    game_html = game_source.read()
soup = BeautifulSoup(game_html, "html.parser")
# find all instances of a row with class "even"
rows = soup.findAll("tr", attrs={"class": "even"})
for row in rows:
    # do work
    print(row)
You would still need to parse the html for each row. The following is a very "crude" example.
def parse_row(row):
    """Turn one table row into a dict of the text of its first four cells.

    Args:
        row: A parsed ``tr`` element; its ``td`` children are looked up
            with ``findAll``.

    Returns:
        A dict with the keys "time", "team1", "score" and "team2" holding
        the text of the first four cells, or None when the row has fewer
        than four cells.
    """
    cols = row.findAll("td")  # get each column in the row
    # ignore timeouts (rows with fewer than four cells), this is just an example
    if len(cols) < 4:
        return None
    return {
        "time": cols[0].get_text(),
        "team1": cols[1].get_text(),
        "score": cols[2].get_text(),
        "team2": cols[3].get_text(),
    }
# Parse every row and keep only the ones parse_row did not reject
# (it returns None for rows it ignores).
parsed_rows = [parsed for parsed in map(parse_row, rows) if parsed]

I do not quite understand how to parse the Yahoo NHL Page

Here is my code so far:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = urlopen("http://sports.yahoo.com/nhl/scoreboard?d=2013-04-01")
content = url.read()
soup = BeautifulSoup(content)
print (soup.prettify)
table = soup.find('table')
rows = table.findAll('tr')
for tr in rows:
cols = tr.findAll('td')
for td in cols:
text = td.findAll('yspscores')
for yspscores in td:
print (yspscores)
The problem I've been having is that the HTML for that yahoo page has the table data in this context: <td class="yspscores">
I do not quite understand how to reference it in my code. My goal is to print out the scores and name of the teams that the score corresponds to.
You grabbed the first table, but there is more than one table on that page. In fact, there are 46 tables.
You want to find the tables with the scores class:
# Visit only the tables with class "scores", then every row inside them,
# and print the text of each td cell with class "yspscores".
for table in soup.find_all('table', class_='scores'):
    for row in table.find_all('tr'):
        for cell in row.find_all('td', class_='yspscores'):
            print(cell.text)
Note that searching for a specific class is done with the class_ keyword argument.

Categories

Resources