Combine dictionary output of iterator function into single dictionary - python

I'm trying to web scrape the QS website (using code I got from here) and want to combine the output of the for loop below into a single dictionary, so that I can turn it into a dataframe using pandas.
I've tried pulling the data out as lists and using .append, but with no luck, and I haven't been able to convert the dictionaries to a df because the return statement only gives back the last output.
Python code:
def get_entries():
    import requests

    # 2023 Masters in Finance Ranking Indicators URL
    url = "https://www.topuniversities.com//sites//default//files//qs-rankings-data//en//3827348_indicators.txt?rjbmn2"
    headers = {
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    def Entries(entry):
        from bs4 import BeautifulSoup as Soup
        try:
            return {
                "Name": Soup(entry["uni"], "html.parser").select_one(".uni-link").find(text=True, recursive=False).strip(),
                "Programme": Soup(entry["uni"], "html.parser").select_one(".uni-link").find("span").find(text=True, recursive=False),
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "html.parser").select_one(".td-wrap-in").get_text(strip=True)
            }
        except:
            return {
                "Name": Soup(entry["uni"], "html.parser").select_one(".uni-link").find(text=True, recursive=False).strip(),
                "Programme": Soup(entry["uni"], "html.parser").select_one(".uni-link").find("span"),
                # Finds span tags instead of text for instances where there is no programme
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "html.parser").select_one(".td-wrap-in").get_text(strip=True)
            }

    yield from map(Entries, response.json()["data"])


def dicts():
    from itertools import islice
    for entry in get_entries():
        print(entry)
    return

pandas.DataFrame can read data from generators. For example:
import pandas as pd

def gen():
    from itertools import product, tee
    yield from [{'a': i, 'b': j} for i, j in product(*tee(range(3)))]

df = pd.DataFrame(gen())
print(df)
Output:
a b
0 0 0
1 0 1
2 0 2
3 1 0
4 1 1
5 1 2
6 2 0
7 2 1
8 2 2
So I think that the code you have can be simplified like:
def dicts():
    from pandas import DataFrame
    return DataFrame(get_entries())
Or just one line:
df = pd.DataFrame(get_entries())
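If you just want to sanity-check a few rows before building the full frame, here is a minimal sketch using islice (which the question already imports), assuming get_entries() works as posted:
from itertools import islice
import pandas as pd

# Build a DataFrame from only the first five yielded dicts instead of the whole feed.
preview = pd.DataFrame(islice(get_entries(), 5))
print(preview)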

Not all of the entries in the returned JSON structure contain a 'uni' key. With the code as shown in the question, this will result in an AttributeError exception.
If I understand what you're trying to do correctly, then you can build a "master" dictionary as a superset of all the dictionaries constructed by get_entries(), which you can then pass to a DataFrame constructor.
This may help:
import requests
import pandas as pd
from bs4 import BeautifulSoup as Soup


def get_entries():
    def Entries(entry):
        try:
            uni_link = Soup(entry['uni'], 'lxml').select_one('.uni-link')
            return {
                'name': uni_link.find(text=True, recursive=False).strip(),
                "Programme": uni_link.find("span").find(text=True, recursive=False),
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "lxml").select_one(".td-wrap-in").get_text(strip=True)
            }
        except AttributeError:
            return {}

    (r := requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/en/3827348_indicators.txt?rjbmn2')).raise_for_status()
    yield from map(Entries, r.json()['data'])


# Merge the per-entry dicts into one "master" dict of column name -> list of values
result = {}
for d in get_entries():
    for k, v in d.items():
        result.setdefault(k, []).append(v)

df = pd.DataFrame(result)
print(df)
Output:
name Programme Rank Overall Score Value for Money Thought Leadership Employability Diversity Alumni Outcomes
0 HEC Paris Master in International Finance 1 96.5 100 99.1 98.3 71.7 100
1 Oxford (Said) MSc Financial Economics 2 96.4 99.1 93.8 100 78.5 100
2 London Business School MSc in Financial Analysis 3 95.1 96.1 93.5 99.4 73.5 99.9
3 MIT (Sloan) Master of Finance 4 94.4 96.6 94.1 97.2 70.9 100
4 UCLA (Anderson) Master of Financial Engineering 5 91.5 96.1 95 90.9 60.4 100
.. ... ... ... ... ... ... ... ... ...
161 Tulsa (Collins) MS in Finance 184 52.1 24 21.4 28.4 34.5
162 Universidad Esan Master of Finance 186 62.6 29.6 36.7 34.7 20
163 University of South Florida MS in Finance 189 54.7 38.6 25.7 50.7 34.5
164 Vilnius University Finance and Banking 190 33 47.2 48 43.7 27.9
165 hsc Master of Science in Finance 192 72.8 32.9 32.6 50 21.9
Note:
Code changed to use lxml as that's generally faster than the default html.parser

Related

How to drop subheaders in wikipedia tables?

I am trying to web scrape a Wikipedia table into a dataframe. In the Wikipedia table, I want to drop Population density, Land Area, and specifically Population (Rank). In the end I want to keep State or territory and just Population (People).
https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density
Here is my code:
wiki = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wiki)
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})
df=pd.read_html(str(indiatable))
df=pd.DataFrame(df[0])
data = df.drop(["Population density","Population"["Rank"],"Land area"], axis=1)
wikidata = data.rename(columns={"State or territory": "State","Population": "Population"})
print (wikidata.head())
How do I reference that specific subtable header so that I can drop the Rank under Population?
Note: There is no expected result in your question, so you may have to make some adjustments to your headers. Assuming you want to rename People to Population (rather than renaming Population itself), I changed that.
To reach your goal, simply set the header parameter while reading the HTML so that only the second header row is used; then you do not need to drop it separately:
df=pd.read_html(str(indiatable),header=1)[0]
df = df.rename(columns={"State or territory": "State","People": "Population"}).drop(['Rank'], axis=1)
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wiki)
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})
df=pd.read_html(str(indiatable),header=1)[0]
df = df.rename(columns={"State or territory": "State","People": "Population"}).drop(['Rank'], axis=1)
Output
State | Rank(all) | Rank(50 states) | permi2 | perkm2 | Population | Rank.1 | mi2 | km2
District of Columbia | 1 | — | 11295 | 4361 | 689545 | 56 | 61 | 158
New Jersey | 2 | 1 | 1263 | 488 | 9288994 | 46 | 7354 | 19046.8
Rhode Island | 3 | 2 | 1061 | 410 | 1097379 | 51 | 1034 | 2678
Puerto Rico | 4 | — | 960 | 371 | 3285874 | 49 | 3515 | 9103.8
Massachusetts | 5 | 3 | 901 | 348 | 7029917 | 45 | 7800 | 20201.9
Connecticut | 6 | 4 | 745 | 288 | 3605944 | 48 | 4842 | 12540.7
Guam | 7 | — | 733 | 283 | 153836 | 52 | 210 | 543.9
American Samoa | 8 | — | 650 | 251 | 49710 | 55 | 77 | 199.4

beautiful soup find_all() not returning all elements

I am trying to scrape this website using bs4. Using inspect on a particular car ad tile, I figured out what I need to scrape in order to get the title and the link to the car's page.
I am making use of the find_all() function of the bs4 library, but the issue is that it's not scraping the required info for all the cars. It returns info for only about 21, whereas it's clearly visible on the website that there are about 2410 cars.
The relevant code:
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen
import re
import requests
url = 'https://www.cardekho.com/used-cars+in+bangalore'
req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = bs(webpage,"html.parser")
tags = page_soup.find_all("div","title")
print(len(tags))
How do I get info on all of the cars present on the page?
P.S. - Just want to point out one thing: all the cars aren't displayed at once. More car info gets loaded as you scroll down. Could it be because of that? Not sure.
Ok, I've written up some sample code to show you how it can be done. Although the site has a convenient API that we can leverage, the first page is not available through the API but is embedded in a script tag in the HTML code, which requires additional processing to extract. After that it is simply a matter of getting the JSON data from the API, parsing it into Python dictionaries and appending the car entries to a list. The link to the API can be found when inspecting network activity in Chrome or Firefox while scrolling the site.
from bs4 import BeautifulSoup
import re
import json
from subprocess import check_output
import requests
import time
from tqdm import tqdm  # tqdm just adds a progress bar, https://pypi.org/project/tqdm/

cars = []  # empty list to which we will append the car dicts from the json data

url = 'https://www.cardekho.com/used-cars+in+bangalore'
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.content.decode('utf-8'), "html.parser")

# find the section with the json data: look for a script tag with type application/ld+json
# and take the next sibling, which is the one with the data we need (see page source code)
s = soup.find('script', {"type": "application/ld+json"}).next_sibling

# strip the text of unnecessary strings and wrap it so node can print the json; taken from:
# https://stackoverflow.com/questions/54991571/extract-json-from-html-script-tag-with-beautifulsoup-in-python/54992015#54992015
js = 'window = {};\n' + s.text.strip() + ';\nprocess.stdout.write(JSON.stringify(window.__INITIAL_STATE__));'

with open('temp.js', 'w') as f:  # save the string to a javascript file
    f.write(js)

# execute the file with node, which returns the json data that is then loaded with json.loads
data_site = json.loads(check_output(['node', 'temp.js']))

for i in data_site['items']:  # append all cars from the first page to the list 'cars'
    cars.append(i)

# 'pagefrom' in the api call is 20, 40, 60, etc., so create a range and loop over it
for page in tqdm(range(20, data_site['total_count'], 20)):
    r = requests.get(f"https://www.cardekho.com/api/v1/usedcar/search?&cityId=105&connectoid=&lang_code=en&regionId=0&searchstring=used-cars%2Bin%2Bbangalore&pagefrom={page}&sortby=updated_date&sortorder=asc&mink=0&maxk=200000&dealer_id=&regCityNames=&regStateNames=", headers={'User-Agent': 'Mozilla/5.0'})
    data = r.json()
    for i in data['data']['cars']:  # append all cars from this page to the list 'cars'
        cars.append(i)
    time.sleep(5)  # wait a few seconds to avoid overloading the site
This will result in cars being a list of dictionaries. The car names can be found in the vid key, and the urls are present in the vlink key.
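For example, once cars is populated, the titles and links could be pulled out along these lines (a sketch; the vlink values above look like relative paths, so prepending the site root here is an assumption):
# Collect (name, url) pairs from the scraped car dicts, using the keys shown in the output below.
pairs = [(car.get("vid"), "https://www.cardekho.com" + car.get("vlink", ""))
         for car in cars]
print(pairs[:5])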
You can load it into a pandas dataframe to explore the data:
import pandas as pd
df = pd.DataFrame(cars)
df.head() will output (I omitted the images column for readability):
  | loc | myear | bt | ft | km | it | pi | pn | pu | dvn | ic | ucid | sid | ip | oem | model | vid | city | vlink | p_numeric | webp_image | position | pageNo | centralVariantId | isExpiredModel | modelId | isGenuine | is_ftc | seller_location | utype | views | tmGaadiStore | cls
0 | Koramangala | 2014 | SUV | Diesel | 30,000 | 0 | https://images10.gaadicdn.com/usedcar_image/320x240/used_car_2206305_1614944913.jpg | | 9.9 Lakh | Mahindra XUV500 W6 2WD | 13 | 3019084 | 9509A09F1673FE2566DF59EC54AAC05B | 1 | Mahindra | Mahindra XUV500 | Mahindra XUV500 2011-2015 W6 2WD | Bangalore | /used-car-details/used-Mahindra-XUV500-2011-2015-W6-2WD-cars-Bangalore_9509A09F1673FE2566DF59EC54AAC05B.htm | 990000 | https://images10.gaadicdn.com/usedcar_image/320x240webp/2021/used_car_2206305_1614944913.webp | 1 | 1 | 3822 | True | 570 | 0 | 0 | {'address': 'BDA Complex, 100 Feet Rd, 3rd Block, Koramangala 3 Block, Koramangala, Bengaluru, Karnataka 560034, Bangalore', 'lat': 12.931, 'lng': 77.6228} | Dealer | 235 | False |
1 | Marathahalli Colony | 2017 | SUV | Petrol | 30,000 | 0 | https://images10.gaadicdn.com/usedcar_image/320x240/used_car_2203506_1614754307.jpeg | | 7.85 Lakh | Ford Ecosport 1.5 Petrol Trend BSIV | 14 | 3015331 | 2C0E4C4E543D4792C1C3186B361F718B | 1 | Ford | Ford Ecosport | Ford Ecosport 2015-2021 1.5 Petrol Trend BSIV | Bangalore | /used-car-details/used-Ford-Ecosport-2015-2021-1.5-Petrol-Trend-BSIV-cars-Bangalore_2C0E4C4E543D4792C1C3186B361F718B.htm | 785000 | https://images10.gaadicdn.com/usedcar_image/320x240webp/2021/used_car_2203506_1614754307.webp | 2 | 1 | 6086 | True | 175 | 0 | 0 | {'address': '2, Varthur Rd, Ayyappa Layout, Chandra Layout, Marathahalli, Bengaluru, Karnataka 560037, Marathahalli Colony, Bangalore', 'lat': 12.956727624875453, 'lng': 77.70174980163576} | Dealer | 495 | False |
2 | Yelahanka | 2020 | SUV | Diesel | 13,969 | 0 | https://images10.gaadicdn.com/usedcar_image/320x240/usedcar_11_276591614316705_1614316747.jpg | | 41 Lakh | Toyota Fortuner 2.8 4WD AT | 12 | 3007934 | BBC13FB62DF6840097AA62DDEA05BB04 | 1 | Toyota | Toyota Fortuner | Toyota Fortuner 2016-2021 2.8 4WD AT | Bangalore | /used-car-details/used-Toyota-Fortuner-2016-2021-2.8-4WD-AT-cars-Bangalore_BBC13FB62DF6840097AA62DDEA05BB04.htm | 4100000 | https://images10.gaadicdn.com/usedcar_image/320x240webp/2021/usedcar_11_276591614316705_1614316747.webp | 3 | 1 | 7618 | True | 364 | 0 | 0 | {'address': 'Sonnappanahalli Kempegowda Intl Airport Road Jala Uttarahalli Hobli, Yelahanka, Bangalore, Karnataka 560064', 'lat': 13.1518821, 'lng': 77.6220694} | Dealer | 516 | False |
3 | Byatarayanapura | 2017 | Sedans | Diesel | 18,000 | 0 | https://images10.gaadicdn.com/usedcar_image/320x240/used_car_2202297_1615013237.jpg | | 35 Lakh | Mercedes-Benz E-Class E250 CDI Avantgarde | 15 | 3013606 | 4553943A967049D873712AFFA5F65A56 | 1 | Mercedes-Benz | Mercedes-Benz E-Class | Mercedes-Benz E-Class 2009-2012 E250 CDI Avantgarde | Bangalore | /used-car-details/used-Mercedes-Benz-E-Class-2009-2012-E250-CDI-Avantgarde-cars-Bangalore_4553943A967049D873712AFFA5F65A56.htm | 3500000 | https://images10.gaadicdn.com/usedcar_image/320x240webp/2021/used_car_2202297_1615013237.webp | 4 | 1 | 4611 | True | 674 | 0 | 0 | {'address': 'NO 19, Near Traffic Signal, Byatanarayanapura, International Airport Road, Byatarayanapura, Bangalore, Karnataka 560085', 'lat': 13.0669588, 'lng': 77.5928756} | Dealer | 414 | False |
4 | nan | 2015 | Sedans | Diesel | 80,000 | 0 | https://stimg.cardekho.com/pwa/img/noimage.svg | | 12.5 Lakh | Skoda Octavia Elegance 2.0 TDI AT | 1 | 3002709 | 156E5F2317C0A3A3BF8C03FFC35D404C | 1 | Skoda | Skoda Octavia | Skoda Octavia 2013-2017 Elegance 2.0 TDI AT | Bangalore | /used-car-details/used-Skoda-Octavia-2013-2017-Elegance-2.0-TDI-AT-cars-Bangalore_156E5F2317C0A3A3BF8C03FFC35D404C.htm | 1250000 | | 5 | 1 | 3092 | True | 947 | 0 | 0 | {'lat': 0, 'lng': 0} | Individual | 332 | False |
Or if you wish to explode the dict in seller_location to columns, you can load it with df = pd.json_normalize(cars).
You can save all data to a csv file: df.to_csv('output.csv')
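As a small illustration of what json_normalize does with the nested seller_location dicts, here is a minimal sketch on hand-made sample records (the keys mirror the output above; this is not the full scrape):
import pandas as pd

# json_normalize flattens nested dicts into dotted columns,
# e.g. seller_location.lat / seller_location.lng, instead of one dict-valued column.
sample = [
    {"vid": "Mahindra XUV500 2011-2015 W6 2WD", "seller_location": {"lat": 12.931, "lng": 77.6228}},
    {"vid": "Skoda Octavia 2013-2017 Elegance 2.0 TDI AT", "seller_location": {"lat": 0, "lng": 0}},
]
flat = pd.json_normalize(sample)
print(flat.columns.tolist())  # ['vid', 'seller_location.lat', 'seller_location.lng']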

How to properly place web-scraped data into a pandas data frame?

Problem: I have used BeautifulSoup to scrape a Wikipedia page for the meat consumption per capita for each country in the world. Having trouble putting it into a data frame using Pandas - my data frame is coming up blank.
Wikipedia page: https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption
Goal: Place web scraped data into a data frame
Code:
url_meat1 = 'https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption'
page = urllib.request.urlopen(url_meat1)
soup = BeautifulSoup(page, "lxml")  # parse the HTML from our URL into the BeautifulSoup parse tree format
print(soup.prettify())  # print results of the web page scrape

table_meat1 = soup.find('table', class_='wikitable sortable')

A = []
B = []
C = []
for row in table_meat1.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

df_meat1 = pd.DataFrame(A, columns=['Country'])
df_meat1['kg/person (2009)'] = B
df_meat1['kg/person (2017)'] = C
df_meat1
I get a blank data frame as the result...
Replace your for loop with this for loop:
for row in table_meat1.findAll('tr'):
    cells = row.find_all('td')
    if len(cells) == 4:
        A.append(cells[0].a['title'])
        B.append(cells[2].find(text=True))
        C.append(cells[3].find(text=True).strip())
Output:
Country kg/person (2009) kg/person (2017)
0 Albania None
1 Algeria 19.5 17.33
2 American Samoa 26.8
3 Angola 22.4
4 Antigua and Barbuda 84.3
.. ... ... ...
183 Venezuela 76.8
184 Vietnam 49.9 52.90
185 Yemen 17.9
186 Zambia 12.3
187 Zimbabwe 21.3 13.64
[188 rows x 3 columns]
Same data in a csv file:
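A minimal sketch of writing that file, assuming the df_meat1 frame built above:
# Write the scraped table to disk; index=False drops the row numbers.
df_meat1.to_csv('meat_consumption.csv', index=False)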

bs4 find table by id, returning 'None'

Not sure why this isn't working :( I'm able to pull other tables from this page, just not this one.
import requests
from bs4 import BeautifulSoup as soup

url = requests.get("https://www.basketball-reference.com/teams/BOS/2018.html",
                   headers={'User-Agent': 'Mozilla/5.0'})
page = soup(url.content, 'html')
table = page.find('table', id='team_and_opponent')
print(table)
Appreciate the help.
The page is dynamic. So you have 2 options in this case.
Side note: if you see <table> tags, you often don't need BeautifulSoup at all; pandas can do that work for you (and it actually uses bs4 under the hood) via pd.read_html().
1) Use selenium to first render the page, and THEN you can use BeautifulSoup to pull out the <table> tags
2) Those tables are within the comment tags in the html. You can use BeautifulSoup to pull out the comments, then just grab the ones with 'table'.
I chose option 2.
import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd

url = 'https://www.basketball-reference.com/teams/BOS/2018.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# the tables are hidden inside HTML comments, so pull out all comment nodes first
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

tables = []
for each in comments:
    if 'table' in each:
        try:
            tables.append(pd.read_html(each)[0])
        except:
            continue
I don't know which particular table you want, but they are all there in the list of tables.
Output:
print (tables[1])
Unnamed: 0 G MP FG FGA ... STL BLK TOV PF PTS
0 Team 82.0 19805 3141 6975 ... 604 373 1149 1618 8529
1 Team/G NaN 241.5 38.3 85.1 ... 7.4 4.5 14.0 19.7 104.0
2 Lg Rank NaN 12 25 25 ... 23 18 15 17 20
3 Year/Year NaN 0.3% -0.9% -0.0% ... -2.1% 9.7% 5.6% -4.0% -3.7%
4 Opponent 82.0 19805 3066 6973 ... 594 364 1159 1571 8235
5 Opponent/G NaN 241.5 37.4 85.0 ... 7.2 4.4 14.1 19.2 100.4
6 Lg Rank NaN 12 3 12 ... 7 6 19 9 3
7 Year/Year NaN 0.3% -3.2% -0.9% ... -4.7% -14.4% 1.6% -5.6% -4.7%
[8 rows x 24 columns]
or
print (tables[18])
Rk Unnamed: 1 Salary
0 1 Gordon Hayward $29,727,900
1 2 Al Horford $27,734,405
2 3 Kyrie Irving $18,868,625
3 4 Jayson Tatum $5,645,400
4 5 Greg Monroe $5,000,000
5 6 Marcus Morris $5,000,000
6 7 Jaylen Brown $4,956,480
7 8 Marcus Smart $4,538,020
8 9 Aron Baynes $4,328,000
9 10 Guerschon Yabusele $2,247,480
10 11 Terry Rozier $1,988,520
11 12 Shane Larkin $1,471,382
12 13 Semi Ojeleye $1,291,892
13 14 Abdel Nader $1,167,333
14 15 Daniel Theis $815,615
15 16 Demetrius Jackson $92,858
16 17 Jarell Eddie $83,129
17 18 Xavier Silas $74,159
18 19 Jonathan Gibson $44,495
19 20 Jabari Bird $0
20 21 Kadeem Allen $0
There is no table with the id team_and_opponent on that page; rather, there is a span tag with this id. You can get results by changing the id.
This data is probably loaded dynamically (with JavaScript).
You should take a look at Web-scraping JavaScript page with Python.
For that you can use Selenium or requests-html, which support JavaScript.
import requests
import bs4

url = requests.get("https://www.basketball-reference.com/teams/BOS/2018.html",
                   headers={'User-Agent': 'Mozilla/5.0'})
soup = bs4.BeautifulSoup(url.text, "lxml")
page = soup.select(".table_outer_container")
for i in page:
    print(i.text)
You will get your desired output.
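If you would rather render the JavaScript as suggested above, a rough Selenium sketch could look like this (assumptions: Chrome and a matching chromedriver are installed, and whether a table with id team_and_opponent actually appears after rendering still needs to be verified):
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # assumes chromedriver is installed and on PATH
driver.get("https://www.basketball-reference.com/teams/BOS/2018.html")
soup = BeautifulSoup(driver.page_source, "html.parser")  # parse the fully rendered page
driver.quit()

table = soup.find("table", id="team_and_opponent")
print(table is not None)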

How can I scrape data from a url within a target url and amend everything to a single data frame in python?

I am trying to scrape data points from one webpage (A), but then scrape data from each individual data point's own webpage and combine all of the data into a single data frame for easy viewing.
This is for a daily data frame with four columns: Team, Pitcher, ERA, WHIP. The ERA and WHIP are found within the specific pitcher's url. For the data below, I have managed to scrape the team name as well as the starting pitcher name and organized both into a data frame (albeit incorrectly).
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

targetUrl = 'http://www.baseball-reference.com/previews/'
targetUrl_response = requests.get(targetUrl, timeout=5)
soup = BeautifulSoup(targetUrl_response.content, "html.parser")

teams = []
pitchers = []

for i in soup.find_all('tr'):
    if i.find_all('strong'):
        for link in i.find_all('strong'):
            if not re.findall(r'MLB Debut', link.text):
                teams.append(link.text)
    if i.find_all('a'):
        for link in i.find_all('a'):
            if not re.findall(r'Preview', link.text):
                pitchers.append(link.text)

print(df)
I'd like to add code to follow each pitcher's webpage, scrape the ERA and WHIP, then amend the data to the same data frame as team and pitcher name. Is this even possible?
Output so far:
0
Aaron Sanchez TOR
CC Sabathia NYY
Steven Matz NYM
Zach Eflin PHI
Lucas Giolito CHW
Eduardo Rodriguez BOS
Brad Keller KCR
Adam Plutko CLE
Julio Teheran ATL
Jon Lester CHC
Clayton Kershaw LAD
Zack Greinke ARI
Jon Gray COL
Drew Pomeranz SFG
A few things off the bat (see what I did there :-) ): the sports-reference.com pages are dynamic. You're able to pull SOME of the tables straight away, but if there are multiple tables, you'll find them inside comment tags within the HTML source. So that might be an issue later if you want more data from the page.
The second thing is I notice you are pulling <tr> tags, which means there are <table> tags, and pandas can do the heavy work for you as opposed to iterating through with bs4. It's a simple pd.read_html() function. HOWEVER, it won't pull out those links, just strictly the text. So in this case, iterating with BeautifulSoup is the way to go (I'm just mentioning it for future reference).
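For reference, a quick sketch of what pd.read_html alone would give you here (text only, no hrefs), assuming the previews page still serves plain <table> tags:
import requests
import pandas as pd

# read_html parses every <table> on the page into a DataFrame,
# but only the cell text survives -- the pitcher links (href attributes) are lost.
html = requests.get('http://www.baseball-reference.com/previews/',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=5).text
tables = pd.read_html(html)
print(len(tables))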
There's still more work to do, as a couple of the guys didn't have links or didn't return an ERA or WHIP. You'll also have to account for the case where a guy is traded or changes leagues, since there might be multiple ERAs for the same 2019 season. But this should get you going:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

targetUrl = 'http://www.baseball-reference.com/previews/'
targetUrl_response = requests.get(targetUrl, timeout=5)
soup = BeautifulSoup(targetUrl_response.content, "html.parser")

teams = []
pitchers = []
era_list = []
whip_list = []

for i in soup.find_all('tr'):
    if i.find_all('strong'):
        for link in i.find_all('strong'):
            if not re.findall(r'MLB Debut', link.text):
                teams.append(link.text)
    if i.find_all('a'):
        for link in i.find_all('a'):
            if not re.findall(r'Preview', link.text):
                try:
                    # follow the pitcher's link and pull his 2019 ERA/WHIP from his stats table
                    url_link = link['href']
                    pitcher_table = pd.read_html(url_link)[0]
                    pitcher_table = pitcher_table[(pitcher_table['Year'] == '2019') & (pitcher_table['Lg'].isin(['AL', 'NL']))]
                    era = round(pitcher_table.iloc[0]['ERA'], 2)
                    whip = round(pitcher_table.iloc[0]['WHIP'], 2)
                except:
                    era = 'N/A'
                    whip = 'N/A'
                pitchers.append(link.text)
                era_list.append(era)
                whip_list.append(whip)
                print('%s\tERA: %s\tWHIP: %s' % (link.text, era, whip))

df = pd.DataFrame(list(zip(pitchers, teams, era_list, whip_list)), columns=['Pitcher', 'Team', 'ERA', 'WHIP'])
print(df)
Output:
print (df)
Pitcher Team ERA WHIP
0 Walker Lockett NYM 23.14 2.57
1 Jake Arrieta PHI 4.12 1.38
2 Logan Allen SDP 0 0.71
3 Jimmy Yacabonis BAL 4.7 1.44
4 Clayton Richard TOR 7.46 1.74
5 Glenn Sparkman KCR 3.62 1.25
6 Shane Bieber CLE 3.86 1.08
7 Carson Fulmer CHW 6.35 1.94
8 David Price BOS 3.39 1.1
9 Jesse Chavez TEX N/A N/A
10 Jordan Zimmermann DET 6.03 1.37
11 Max Scherzer WSN 2.62 1.06
12 Trevor Richards MIA 3.54 1.25
13 Max Fried ATL 4.03 1.34
14 Adbert Alzolay CHC 2.25 0.75
15 Marco Gonzales SEA 4.38 1.37
16 Zach Davies MIL 3.06 1.36
17 Trevor Williams PIT 4.12 1.19
18 Gerrit Cole HOU 3.54 1.02
19 Blake Snell TBR 4.4 1.24
20 Kyle Gibson MIN 4.18 1.25
21 Chris Bassitt OAK 3.64 1.17
22 Jack Flaherty STL 4.24 1.18
23 Ross Stripling LAD 3.08 1.17
24 Robbie Ray ARI 3.87 1.34
25 Chi Chi Gonzalez COL N/A N/A
26 Madison Bumgarner SFG 4.28 1.24
27 Tyler Mahle CIN 4.17 1.2
28 Andrew Heaney LAA 5.68 1.14
