I have been trying to extract out the contents inside a table on a website.
descriptions = []
sources = []
values = []
site = 'https://www.eia.gov/todayinenergy/prices.php' #address of the site
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.execute_script("document.body.style.zoom='100%'")
driver.get(site)
soup_1 = bs(driver.page_source, 'lxml') #clean up the site using beautiful soup
tables = soup_1.find_all('tbody') #script of interest
print(len(tables)) #count the scripts
for table in tables:
rows = table.find_all('tr')
print(len(rows))
for row in rows:
description = row.find('td', class_='s1')
descriptions.append(descri_clean)
source = row.find('td', class_='s2')
sources.append(source_clean)
value = row.find('td', class_='d1') #find the row that gives the data
values.append(value_clean) #compile it all together
driver.close()
I have been trying to get clean text form the table however the data extracted looks like this.
<td class="s1" rowspan="3">Crude Oil<br/> ($/barrel)</td>
While i want something like just ''Crude Oil ($/barrel)
When i tried
description = row.find('td', class_='s1').text.renderContents()
descriptions.append(descri_clean)
The error showed up
AttributeError: 'NoneType' object has no attribute 'renderContents'
You can use just requests. You can filter out your values by doing string matching on expected values for certain class attributes when looping table rows. I set the two tables of interest into separate variables which are lists of the rows within those tables. The tables on the page each have their own distinct class identifier for the table number e.g. t1, t2 ......
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.eia.gov/todayinenergy/prices.php')
soup = bs(r.content, 'lxml')
table1 = soup.select('.t1 tr')
table2 = soup.select('.t2 tr')
for item in table1:
if 'Crude Oil ($/barrel) - Nymex Apr' in item.text:
rowInfo = [td.text for td in item.select('td')]
print(rowInfo)
elif 'Ethanol ($/gallon) - CBOT Apr' in item.text:
rowInfo = [td.text for td in item.select('td')]
print(rowInfo)
for item in table2:
if len(item.select('td')) == 4:
header = item.select_one('td.s1').text
if item.select_one('td.s2'):
if item.select_one('td.s2').text in ['WTI','Brent','Louisiana Light','Los Angeles'] and header in ['Crude Oil ($/barrel)','Gasoline (RBOB) ($/gallon)']:
rowInfo = [td.text for td in item.select('td')]
print(rowInfo)
Related
I am trying to scrape table data from this page on the PGA stats website. I am grabbing the player name, and the code seems to be working, but it only returns the last value "Patrick Rodgers" from the site. What am I doing wrong here?
This is the html it's grabbing.
Here is my source code:
#Get URL and Parse
url = 'https://www.pgatour.com/content/pgatour/stats/stat.02674.y2020.eon.t027.html'
results = requests.get(url)
soup = BeautifulSoup(results.text, 'html.parser')
#Find data
sg_ttg = soup.find('table', id = 'statsTable')
#Get data
for player in sg_ttg.find_all('tbody'):
rows = player.find_all('tr')
for row in rows:
playername = row.find('td', class_= 'player-name').text
To get all the data, try to define list and append each value:
players = []
for table in sg_ttg.find_all('tbody'):
rows = table.find_all('tr')
for row in rows:
player = row.find('td', class_= 'player-name').text.strip()
players.append(player)
print(players)
I'm trying to scrape a table into a dataframe. My attempt only returns the table name and not the data within rows for each region.
This is what i have so far:
from bs4 import BeautifulSoup as bs4
import requests
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
table_regions = soup.find('table', {'class': "t4"})
regions = table_regions.find_all('tr')
for row in regions:
print row
ideal outcome i'd like to get:
region | price
---------------|-------
new england | 2.59
new york city | 2.52
Thanks for any assistance.
If you check your html response (soup) you will see that the table tag you get in this line table_regions = soup.find('table', {'class': "t4"}) its closed up before the rows that contain the information you need (the ones that contain the td's with the class names: up dn d1 and s1.
So how about using the raw td tags like this:
from bs4 import BeautifulSoup as bs4
import requests
import pandas as pd
url = 'https://www.eia.gov/todayinenergy/prices.php'
r = requests.get(url)
soup = bs4(r.text, "html.parser")
a = soup.find_all('tr')
rows = []
subel = []
for tr in a[42:50]:
b = tr.find_all('td')
for td in b:
subel.append(td.string)
rows.append(subel)
subel = []
df = pd.DataFrame(rows, columns=['Region','Price_1', 'Percent_change_1', 'Price_2', 'Percent_change_2', 'Spark Spread'])
Notice that I use just the a[42:50] slice of the results because a contains all the td's of the website. You can use the rest too if you need to.
First, there is a leading "1" in the return string and I'm encountering trouble iterating passed it - I've tried using [0:]: method and get stuck somewhere. I'd like to skip drop it or skip it to get to the second value that is the id val. the scraped table
Additionally, in trying to format returned items from table for storage - I've been getting index out of range errors. I've been using def store().
import requests
from bs4 import BeautifulSoup
import MySQLdb
#mysql portion
mydb = MySQLdb.connect(host='****',
user= '****',
passwd='****',
db='****')
cur = mydb.cursor()
def store (id, ticker):
cur.execute('INSERT IGNORE INTO TEST (id, ticker) VALUES (\"%s\", \"%s\")',(id, ticker))
cur.connection.commit()
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
main_div = soup.find('div', attrs = {'id':'screener-content'})
table = main_div.find('table')
sub = table.findAll('tr')
cells = sub[5].findAll('td')
for cell in cells:
link = cell.a
if link is not None:
link = link.get_text()
id = link[0]
ticker = link[1]
store(id, ticker)
print(link)
I don't know what you real try to do but this works for me
import requests
from bs4 import BeautifulSoup
base_url = 'http://finviz.com/screener.ashx?v=152&s=ta_topgainers&o=price&c=0,1,2,3,4,5,6,24,25,63,64,65,66,67'
html = requests.get(base_url)
soup = BeautifulSoup(html.content, "html.parser")
rows = soup.find_all('tr', class_=["table-dark-row-cp", "table-light-row-cp"])
for row in rows:
columns = row.find_all('td')
id_ = columns[0].a.get_text()
ticker = columns[1].a.get_text()
company = columns[2].a.get_text()
sector = columns[3].a.get_text()
industry = columns[4].a.get_text()
print(id_, ticker, company, sector, industry)
Or event with a
for row in rows:
columns = row.find_all('a')
id_ = columns[0].get_text()
ticker = columns[1].get_text()
company = columns[2].get_text()
sector = columns[3].get_text()
industry = columns[4].get_text()
print(id_, ticker, company, sector, industry)
BTW: you can also use CSS selector
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr[class]')
or
rows = soup.select('#screener-content table[bgcolor="#d3d3d3"] tr')
# skip first row with headers
rows = rows[1:]
I'm looking at the following website:
https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859
I want to extract the name of each university and the href associated with it. So for the first entry, I'd like to get Stanford and https://modules.ussquash.com/ssm/pages/leagues/Team_Information.asp?id=18564
I've gotten to the point where I have all of the TDs, using BeautifulSoup. I'm just having difficulty extracting the school and its href.
Here's my attempt:
def main():
r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
data = r.text
soup = BeautifulSoup(data)
table = soup.find_all('table')[1]
rows = table.find_all('tr')[1:]
for row in rows:
cols = row.find_all('td')
print(cols)
When I try to access cols[0], I get:
IndexError: list index out of range
Any idea how to fix this would be awesome!
Thanks
The first two tr's are in the thead which have no td tags, you want to skip the first two tr's:
rows = table.find_all('tr')[2:]
To get what you want, we can simplify using css selectors:
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
# anchor we want is inside the first td
a = row.select_one("td a") # or a = row.find("td").a
print(a.text,a["href"])
Also the href is a relative path so you need to join it to a base url:
import requests
from bs4 import BeautifulSoup
from urllib.urlparse import urljoin
def main():
base = "https://modules.ussquash.com/ssm/pages/leagues/"
r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
data = r.text
soup = BeautifulSoup(data)
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
a = row.select_one("td a")
print(a.text, urljoin(base, a["href"]))
Here is my code so far:
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = urlopen("http://sports.yahoo.com/nhl/scoreboard?d=2013-04-01")
content = url.read()
soup = BeautifulSoup(content)
print (soup.prettify)
table = soup.find('table')
rows = table.findAll('tr')
for tr in rows:
cols = tr.findAll('td')
for td in cols:
text = td.findAll('yspscores')
for yspscores in td:
print (yspscores)
The problem I've been having is that the HTML for that yahoo page has the table data in this context: <td class="yspscores">
I do not quite understand how to reference it in my code. My goal is to print out the scores and name of the teams that the score corresponds to.
You grabbed the first table, but there is more than one table on that page. In fact, there are 46 tables.
You want to find the tables with the scores class:
for table in soup.find_all('table', class_='scores'):
for row in table.find_all('tr'):
for cell in row.find_all('td', class_='yspscores'):
print(cell.text)
Note that searching for a specific class is done with the class_ keyword argument.