Getting table from HTML file with Python - python

game_link = "http://espn.go.com/nba/playbyplay?gameId=400579510&period=0"
game_source = urlopen(game_link)
game_html = game_source.read()
game_source.close();
row = BeautifulSoup(game_html, "html.parser")
pieces = list(row.children)
I need to get the game log rows from the above link, but my code gives me the whole HTML text. How can I extract the tables and turn them into single rows (pieces)?

You could try BeautifulSoup.findAll and supply the tag and any other attributes you know about the tags you are looking for. After looking at the page, it looks like you want all <tr> tags with the class even, so you could use soup.findAll("tr", attrs={"class": "even"}). For example:
import urllib.request
from bs4 import BeautifulSoup

game_link = "http://espn.go.com/nba/playbyplay?gameId=400579510&period=0"
game_source = urllib.request.urlopen(game_link)
game_html = game_source.read()
game_source.close()

soup = BeautifulSoup(game_html, "html.parser")

# find all instances of a row with class "even"
rows = soup.findAll("tr", attrs={"class": "even"})
for row in rows:
    # do work
    print(row)
You would still need to parse the html for each row. The following is a very "crude" example.
def parse_row(row):
    cols = row.findAll("td")  # get each column in the row
    # ignore timeouts, this is just an example
    if len(cols) < 4:
        return None
    else:
        return {
            "time": cols[0].get_text(),
            "team1": cols[1].get_text(),
            "score": cols[2].get_text(),
            "team2": cols[3].get_text()
        }

parsed_rows = []
for row in rows:
    parsed = parse_row(row)
    if parsed:
        parsed_rows.append(parsed)
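If the end goal is a game log you can work with elsewhere, a minimal follow-up sketch (my addition, assuming the dictionary keys returned by parse_row above) is to dump parsed_rows to a CSV with the standard library:

import csv

# write the parsed play-by-play rows to a CSV file;
# the field names match the keys returned by parse_row above
with open("game_log.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["time", "team1", "score", "team2"])
    writer.writeheader()
    writer.writerows(parsed_rows)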

Related

Beautiful Soup to Scrape Data from Static Webpages

I am trying to get values from a table on multiple static webpages. It is the verb conjugation data for Korean verbs here: https://koreanverb.app/
My Python script uses Beautiful Soup. The goal is to grab all conjugations from multiple URL inputs and output the data to a CSV file.
Conjugations are stored on the page in a table with class "table-responsive", under table rows with class "conjugation-row". There are multiple "conjugation-row" table rows on each page. My script is somehow only grabbing the first table row with class "conjugation-row".
Why isn't the for loop grabbing all the td elements with class "conjugation-row"? I would appreciate a solution that grabs all tr with class "conjugation-row". I tried using job_elements = results.find("tr", class_="conjugation-row"), but I get the following error:
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
Furthermore, when I do get the data and output it to a CSV file, the data is in separate rows as expected but leaves empty spaces: it places the data rows for the second URL at the index after all the data rows for the first URL.
See code here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

# create csv file
outfile = open("scrape.csv", "w", newline='')
writer = csv.writer(outfile)

## define first URL to grab conjugation names
url1 = 'https://koreanverb.app/?search=%ED%95%98%EB%8B%A4'

# define dataframe columns
df = pd.DataFrame(columns=['conjugation name'])

# get URL content
response = requests.get(url1)
soup = BeautifulSoup(response.content, 'html.parser')

# get table with all verb conjugations
results = soup.find("div", class_="table-responsive")

##### GET CONJUGATIONS AND APPEND TO CSV

# define URLs
urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']

# loop to get data
for url in urls:
    response = requests.get(url)
    soup2 = BeautifulSoup(response.content, 'html.parser')
    # get table with all verb conjugations
    results2 = soup2.find("div", class_="table-responsive")
    # get dictionary form of verb/adjective
    verb_results = soup2.find('dl', class_='dl-horizontal')
    verb_title = verb_results.find('dd')
    verb_title_text = verb_title.text
    job_elements = results2.find_all("tr", class_="conjugation-row")
    for job_element in job_elements:
        conjugation_name = job_element.find("td", class_="conjugation-name")
        conjugation_korean = conjugation_name.find_next_sibling("td")
        conjugation_name_text = conjugation_name.text
        conjugation_korean_text = conjugation_korean.text
        data_column = pd.DataFrame({'conjugation name': [conjugation_name_text],
                                    verb_title_text: [conjugation_korean_text],
                                    })
        #data_column = pd.DataFrame({verb_title_text: [conjugation_korean_text]})
        df = df.append(data_column, ignore_index=True)

# save to csv
df.to_csv('scrape.csv')
outfile.close()
print('Verb Conjugations Collected and Appended to CSV, one per column')
Get all the job_elements using find_all() since find() only returns the first occurrence and iterate over them in a for loop like below.
job_elements = results.find_all("tr", class_="conjugation-row")
for job_element in job_elements:
    conjugation_name = job_element.find("td", class_="conjugation-name")
    conjugation_korean = conjugation_name.find_next_sibling("td")
    conjugation_name_text = conjugation_name.text
    conjugation_korean_text = conjugation_korean.text
    # append element to data
    df2 = pd.DataFrame([[conjugation_name_text, conjugation_korean_text]], columns=['conjugation_name', 'conjugation_korean'])
    df = df.append(df2)
The error comes from calling find() on a ResultSet (a list of elements) instead of on a single element.
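A minimal illustration of the difference (the markup here is made up just for the example):

from bs4 import BeautifulSoup

html = "<table><tr class='conjugation-row'><td>a</td></tr><tr class='conjugation-row'><td>b</td></tr></table>"
soup = BeautifulSoup(html, "html.parser")

first_row = soup.find("tr", class_="conjugation-row")      # a single Tag
all_rows = soup.find_all("tr", class_="conjugation-row")   # a ResultSet (list-like)

print(first_row.find("td").text)  # fine: a Tag supports find()
# all_rows.find("td")             # AttributeError: ResultSet object has no attribute 'find'
for row in all_rows:              # iterate over the ResultSet instead
    print(row.find("td").text)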
As your script is growing, I made some modifications, such as moving the parsing into a get_conjugations() function and using names that are easier to understand. First, conjugation_names and conjugation_korean_names are added as pandas DataFrame columns, and then the other columns (korean0, korean1, ...) are added subsequently.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# function to parse the html data & get conjugations
def get_conjugations(url):
    # set return lists
    conjugation_names = []
    conjugation_korean_names = []
    # get html text
    html = requests.get(url).text
    # parse the html text
    soup = BeautifulSoup(html, 'html.parser')
    # get table
    table = soup.find("div", class_="table-responsive")
    table_rows = table.find_all("tr", class_="conjugation-row")
    for row in table_rows:
        conjugation_name = row.find("td", class_="conjugation-name")
        conjugation_korean = conjugation_name.find_next_sibling("td")
        conjugation_names.append(conjugation_name.text)
        conjugation_korean_names.append(conjugation_korean.text)
    # return both lists
    return conjugation_names, conjugation_korean_names

# create csv file
outfile = open("scrape.csv", "w", newline='')

urls = ['https://koreanverb.app/?search=%ED%95%98%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A8%B9%EB%8B%A4',
        'https://koreanverb.app/?search=%EB%A7%88%EC%8B%9C%EB%8B%A4']

# define dataframe columns
df = pd.DataFrame(columns=['conjugation_name', 'conjugation_korean', 'korean0', 'korean1'])

conjugation_names, conjugation_korean_names = get_conjugations(urls[0])
df['conjugation_name'] = conjugation_names
df['conjugation_korean'] = conjugation_korean_names

for index, url in enumerate(urls[1:]):
    conjugation_names, conjugation_korean_names = get_conjugations(url)
    # set column name
    column_name = 'korean' + str(index)
    df[column_name] = conjugation_korean_names

# save to csv
df.to_csv('scrape.csv')
outfile.close()

# print done
print('Export to CSV Complete')
Output:
,conjugation_name,conjugation_korean,korean0,korean1
0,declarative present informal low,해,먹어,마셔
1,declarative present informal high,해요,먹어요,마셔요
2,declarative present formal low,한다,먹는다,마신다
3,declarative present formal high,합니다,먹습니다,마십니다
...
Note:
This assumes that elements in different URLs are in same order.
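If that ordering assumption ever breaks, one possible hedge (a sketch of my own, not part of the answer above) is to build a small DataFrame per URL and merge on the conjugation name instead of relying on row position:

import pandas as pd

# assumes urls and get_conjugations() from the code above
frames = []
for i, url in enumerate(urls):
    names, korean = get_conjugations(url)
    frames.append(pd.DataFrame({'conjugation_name': names, 'korean' + str(i): korean}))

# merge on the conjugation name so rows line up even if the order differs per page
merged = frames[0]
for frame in frames[1:]:
    merged = merged.merge(frame, on='conjugation_name', how='outer')

merged.to_csv('scrape.csv', index=False)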

How to get specific table from HTML

We have Form 10-K filings for several companies. We want to get the Earnings tables (Item 6) from the HTML. The structure of the form changes from company to company.
For example:
url1= 'https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm'
url2='https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm'
We need to get the table in Item 6, Consolidated Financial Data.
One way we tried is based on a string search for Item 6: get all the text from Item 6 to Item 7, then extract the tables, as follows:
import requests
import bs4 as bs

doc10K = requests.get(url2)

st6 = doc10K.text.lower().find("item 6")
end6 = doc10K.text.lower().find("item 7")

# get text from item 6 and remove the currency sign
item6 = doc10K.text[st6:end6].replace('$', '')

Tsoup = bs.BeautifulSoup(item6, 'lxml')

# Extract all tables from the response
html_tables = Tsoup.find_all('table')
This approach doesn't work for all the forms. E.g. with KSS, we are not able to find the string 'Item 6'. The ideal output would be the table given in Item 6.
petezurich is right, but the marker is not always positioned the same way.
# You can try this, too. The start parameter can be a list; any one of the entries will match.
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc

doc10K = requests.get(url2)
doc = SimplifiedDoc(doc10K.text)

start = doc.html.rfind('Selected Consolidated Financial Data')
if start < 0:
    start = doc.html.rfind('Selected Financial Data')

tables = doc.getElementsByTag('table', start=start, end=['Item 7', 'Item 7'])
for table in tables:
    trs = table.trs
    for tr in trs:
        tds = tr.tds
        for td in tds:
            print(td.text)
            # print(td.unescape())  # Replace HTML entity
The string item 6 seems to contain either a space or a non-breaking space.
Try this cleaned-up code:
import requests
from bs4 import BeautifulSoup

url1 = 'https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm'
url2 = 'https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm'

doc10K = requests.get(url2)

st6 = doc10K.text.lower().find("item 6")
# if "item 6" was not found, search again with an underscore
if st6 == -1:
    st6 = doc10K.text.lower().find("item_6")

end6 = doc10K.text.lower().find("item 7")
item6 = doc10K.text[st6:end6].replace('$', '')

soup = BeautifulSoup(item6, 'lxml')
html_tables = soup.find_all('table')
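Another way to handle the non-breaking space (just a sketch, assuming the marker really is "item", some whitespace character, then the number) is to search with a regular expression instead of a literal string, since \s also matches \xa0 in Python 3:

import re
import requests

doc10K = requests.get(url2)

# treat any whitespace (including a non-breaking space) between "item" and the number as equivalent
match6 = re.search(r'item\s*6', doc10K.text, flags=re.IGNORECASE)
match7 = re.search(r'item\s*7', doc10K.text, flags=re.IGNORECASE)
if match6 and match7:
    item6 = doc10K.text[match6.start():match7.start()].replace('$', '')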
With bs4 4.7.1+ you can use :contains and :has to specify the appropriate matching patterns for the table based on the HTML. You can use CSS Or syntax so that either of the two patterns shown below is matched.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

urls = ['https://www.sec.gov/Archives/edgar/data/794367/000079436719000038/m-0202201910xk.htm',
        'https://www.sec.gov/Archives/edgar/data/885639/000156459019009005/kss-10k_20190202.htm']

with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = pd.read_html(str(soup.select_one('table:contains("Item 6") ~ div:has(table) table, p:contains("Selected Consolidated Financial Data") ~ div:has(table) table')))[0]
        table.dropna(axis=0, how='all', inplace=True)
        table.dropna(axis=1, how='all', inplace=True)
        table.fillna(' ', inplace=True)
        table.rename(columns=table.iloc[0], inplace=True)  # set headers same as row 1
        table.drop(table.index[0:2], inplace=True)  # drop the first two rows
        table.reset_index(drop=True, inplace=True)  # re-index
        print(table)

How to scrape embedded integers on a website

I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to work out a way of reliably identifying and scraping the relationship between the dataset title and the like count, as it is embedded in the HTML.
I have used a scraper previously to get information about the resource urls. In that case I was able to capture the last child a of parent h3 with a parent having class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the url scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse

json_api_links = []
data_sets = []

def get_links(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    base = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))
    links = [base + item['href'] if item['href'][0] == '/' else item['href'] for item in soup.select(css_selector)]
    return links

results = []
#debug = []

with requests.Session() as s:
    for page in range(1, 2):  # set number of pages
        links = get_links(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-item h3 a:last-child')
        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)
            #debug.append((link, data))

    resources = list(set([item.replace('opendata', '') for sublist in json_api_links for item in sublist]))  # can just leave as set

    for link in resources:
        try:
            r = s.get(link).json()  # entire package info
            data_sets.append(r)
            title = r['result']['title']  # certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except:
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

with open('data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Resource Url'])
    for row in results:
        w.writerow(row)
My desired output would pair each dataset title with its like count.
The approach is pretty straightforward. The website contains the required elements inside <li> tags, so what you need to do is get the source of each <li> tag and fetch the heading, which has a certain class; the same goes for the like count.
The catch with the like count is that its text contains some noise. To fix that, you can use a regular expression to extract the digits ('\d+') from the likes text. The following code gives the desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd

source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text, 'lxml')

element = sp.find_all('li', {'class': "dataset-item"})

heading = []
likeList = []

for i in element:
    try:
        header = i.find('a', {'class': "searchpartnership-url-analytics"})
        heading.append(header.text)
    except:
        header = i.find('a')
        heading.append(header.text)
    like = i.find('span', {'id': 'likes-count'})
    likeList.append(re.findall(r'\d+', like.text)[0])

data = {'Title': heading, 'Likes': likeList}
df = pd.DataFrame(data)
print(df)
Hope it helped!
You could use the following.
I am using a css selector with Or syntax to retrieve title and likes as one list (as every publication has both). I then use slicing to separate titles from likes.
from bs4 import BeautifulSoup as bs
import requests
import csv

def get_titles_and_likes(s, url, css_selector):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    info = [item.text.strip() for item in soup.select(css_selector)]
    titles = info[::2]
    likes = info[1::2]
    return list(zip(titles, likes))

results = []

with requests.Session() as s:
    for page in range(1, 10):  # set number of pages
        data = get_titles_and_likes(s, 'https://data.nsw.gov.au/data/dataset?page={}'.format(page), '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count')
        results.append(data)

results = [i for item in results for i in item]

with open(r'data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Likes'])
    for row in results:
        w.writerow(row)

BeautifulSoup: Can't Access Info Within TD

I'm looking at the following website:
https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859
I want to extract the name of each university and the href associated with it. So for the first entry, I'd like to get Stanford and https://modules.ussquash.com/ssm/pages/leagues/Team_Information.asp?id=18564
I've gotten to the point where I have all of the TDs, using BeautifulSoup. I'm just having difficulty extracting the school and its href.
Here's my attempt:
def main():
    r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
    data = r.text
    soup = BeautifulSoup(data)
    table = soup.find_all('table')[1]
    rows = table.find_all('tr')[1:]
    for row in rows:
        cols = row.find_all('td')
        print(cols)
When I try to access cols[0], I get:
IndexError: list index out of range
Any idea how to fix this would be awesome!
Thanks
The first two tr's are in the thead which have no td tags, you want to skip the first two tr's:
rows = table.find_all('tr')[2:]
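If you'd rather not hard-code how many header rows to skip, a small defensive variant (my addition) is to ignore any row that has no td cells:

for row in table.find_all('tr'):
    cols = row.find_all('td')
    if not cols:  # header rows in the thead only contain th cells
        continue
    print(cols)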
To get what you want, we can simplify using css selectors:
table = soup.find_all('table', limit=2)[1]
# skip first two tr's
rows = table.select("tr + tr + tr")
for row in rows:
    # the anchor we want is inside the first td
    a = row.select_one("td a")  # or a = row.find("td").a
    print(a.text, a["href"])
Also the href is a relative path so you need to join it to a base url:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def main():
    base = "https://modules.ussquash.com/ssm/pages/leagues/"
    r = requests.get('https://modules.ussquash.com/ssm/pages/leagues/League_Information.asp?leagueid=1859')
    data = r.text
    soup = BeautifulSoup(data)
    table = soup.find_all('table', limit=2)[1]
    # skip first two tr's
    rows = table.select("tr + tr + tr")
    for row in rows:
        a = row.select_one("td a")
        print(a.text, urljoin(base, a["href"]))
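For reference, urljoin resolves the relative href against the base like this (using the link from the question):

from urllib.parse import urljoin

base = "https://modules.ussquash.com/ssm/pages/leagues/"
href = "Team_Information.asp?id=18564"  # a relative href as it appears in the table
print(urljoin(base, href))
# https://modules.ussquash.com/ssm/pages/leagues/Team_Information.asp?id=18564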

How do I find an element with a certain class in a web page with BeautifulSoup?

I have tried to find a table with class "data" in a web page with this code.
import urllib2
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(urllib2.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read())
rows = soup.findAll("table.data")
print rows
However, I am getting none for rows even though I am sure that a table with class "data" exists on that page. What is the proper way to find an element with class "data" on a web page with BeautifulSoup?
If you want to pick up the rows, you'll need the following:
import urllib2
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup(urllib2.urlopen('http://www.cbssports.com/nba/draft/mock-draft').read())

# if there's only one table with class = data
table = soup.find('table', attrs={'class': 'data'})

# if there are multiple tables with class = data,
# suppose you need the n-th table of the list returned
table = soup.findAll('table', attrs={'class': 'data'})[n]

rows = table.findAll('tr')  # gives all the rows, you can set attrs to filter
Then you can also iterate through the columns:
for row in rows:
    cols = row.findAll('td')
    ...
You want something like
rows = soup.find_all('table', attrs = {"class": "data"})
instead of your current line (tested). The class of an element is an attribute, so you filter by attribute in find_all. This line returns a large table element from your sample page.
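For what it's worth, with the newer bs4 package (rather than the old BeautifulSoup 3 module used in the question) the same filter can be written with the class_ keyword or a CSS selector; a quick sketch using requests:

import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.cbssports.com/nba/draft/mock-draft').text
soup = BeautifulSoup(html, 'html.parser')

tables = soup.find_all('table', class_='data')  # class_ avoids clashing with the Python keyword
tables = soup.select('table.data')              # equivalent CSS selector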
