How to properly place web-scraped data into a pandas data frame? - python

Problem: I have used BeautifulSoup to scrape a Wikipedia page for the meat consumption per capita of each country in the world. I'm having trouble putting it into a data frame using pandas: my data frame is coming up blank.
Wikipedia page: https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption
Goal: Place web-scraped data into a data frame
Code:
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup

url_meat1 = 'https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption'
page = urllib.request.urlopen(url_meat1)
soup = BeautifulSoup(page, "lxml")  # parse the HTML from our URL into the BeautifulSoup parse tree format
print(soup.prettify())  # print results of the web page scrape

table_meat1 = soup.find('table', class_='wikitable sortable')

A = []
B = []
C = []
for row in table_meat1.findAll('tr'):
    cells = row.findAll('td')
    if len(cells) == 3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

df_meat1 = pd.DataFrame(A, columns=['Country'])
df_meat1['kg/person (2009)'] = B
df_meat1['kg/person (2017)'] = C
df_meat1
I get a blank data frame...

Replace your for loop with the one below. The table's data rows actually contain four td cells (not three), and the country name is most reliably read from the title attribute of the link in the first cell:
for row in table_meat1.findAll('tr'):
    cells = row.find_all('td')
    if len(cells) == 4:
        A.append(cells[0].a['title'])
        B.append(cells[2].find(text=True))
        C.append(cells[3].find(text=True).strip())
Output:
Country kg/person (2009) kg/person (2017)
0 Albania None
1 Algeria 19.5 17.33
2 American Samoa 26.8
3 Angola 22.4
4 Antigua and Barbuda 84.3
.. ... ... ...
183 Venezuela 76.8
184 Vietnam 49.9 52.90
185 Yemen 17.9
186 Zambia 12.3
187 Zimbabwe 21.3 13.64
[188 rows x 3 columns]
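As an aside, for a plain wikitable like this one, pd.read_html can usually build the DataFrame in a single step. A minimal sketch, where the match string (and which table comes back first) is an assumption about the live page:

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_meat_consumption'
# match narrows the result to tables whose text matches the pattern
df_meat1 = pd.read_html(url, match='kg')[0]
print(df_meat1.head())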

Related

Combine dictionary output of iterator function into single dictionary

I'm trying to web scrape the QS website (using code I got from here) and want to combine the output of the for loop below into a single dictionary, so that I can turn it into a dataframe using pandas.
I've tried pulling the data out as lists and using .append, with no luck, and I haven't been able to convert the dictionaries to a DataFrame because the return statement only returns the last output.
Python code:
def get_entries():
    import requests

    # 2023 Masters in Finance Ranking Indicators URL
    url = "https://www.topuniversities.com//sites//default//files//qs-rankings-data//en//3827348_indicators.txt?rjbmn2"
    headers = {
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    def Entries(entry):
        from bs4 import BeautifulSoup as Soup
        try:
            return {
                "Name": Soup(entry["uni"], "html.parser").select_one(".uni-link").find(text=True, recursive=False).strip(),
                "Programme": Soup(entry["uni"], "html.parser").select_one(".uni-link").find("span").find(text=True, recursive=False),
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "html.parser").select_one(".td-wrap-in").get_text(strip=True)
            }
        except:
            return {
                "Name": Soup(entry["uni"], "html.parser").select_one(".uni-link").find(text=True, recursive=False).strip(),
                "Programme": Soup(entry["uni"], "html.parser").select_one(".uni-link").find("span"),
                # Finds the span tag instead of its text for instances where there is no programme
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "html.parser").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "html.parser").select_one(".td-wrap-in").get_text(strip=True)
            }

    yield from map(Entries, response.json()["data"])

def dicts():
    from itertools import islice
    for entry in get_entries():
        print(entry)
    return
pandas.DataFrame can read data from generators. For example:
def gen():
    from itertools import product, tee
    yield from [{'a': i, 'b': j} for i, j in product(*tee(range(3)))]

df = pd.DataFrame(gen())
print(df)
Output:
a b
0 0 0
1 0 1
2 0 2
3 1 0
4 1 1
5 1 2
6 2 0
7 2 1
8 2 2
So I think the code you have can be simplified to:
def dicts():
    from pandas import DataFrame
    return DataFrame(get_entries())
Or just one line:
df = pd.DataFrame(get_entries())
Not all of the entries in the returned JSON structure contain a 'uni' key. With the code as shown in the question, this will result in an AttributeError exception.
If I understand what you're trying to do correctly then you can build a "master" dictionary as a superset of all the dictionaries constructed by get_entries() which you can then pass to a DataFrame constructor.
This may help:
import requests
import pandas as pd
from bs4 import BeautifulSoup as Soup

def get_entries():
    def Entries(entry):
        try:
            uni_link = Soup(entry['uni'], 'lxml').select_one('.uni-link')
            return {
                'name': uni_link.find(text=True, recursive=False).strip(),
                "Programme": uni_link.find("span").find(text=True, recursive=False),
                "Rank": entry["overall_rank"],
                "Overall Score": Soup(entry["overall"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Value for Money": Soup(entry["ind_1"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Thought Leadership": Soup(entry["ind_2"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Employability": Soup(entry["ind_27"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Diversity": Soup(entry["ind_29"], "lxml").select_one(".td-wrap-in").get_text(strip=True),
                "Alumni Outcomes": Soup(entry["ind_33"], "lxml").select_one(".td-wrap-in").get_text(strip=True)
            }
        except AttributeError:
            return {}

    (r := requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/en/3827348_indicators.txt?rjbmn2')).raise_for_status()
    yield from map(Entries, r.json()['data'])

result = {}
for d in get_entries():
    for k, v in d.items():
        result.setdefault(k, []).append(v)

df = pd.DataFrame(result)
print(df)
Output:
name Programme Rank Overall Score Value for Money Thought Leadership Employability Diversity Alumni Outcomes
0 HEC Paris Master in International Finance 1 96.5 100 99.1 98.3 71.7 100
1 Oxford (Said) MSc Financial Economics 2 96.4 99.1 93.8 100 78.5 100
2 London Business School MSc in Financial Analysis 3 95.1 96.1 93.5 99.4 73.5 99.9
3 MIT (Sloan) Master of Finance 4 94.4 96.6 94.1 97.2 70.9 100
4 UCLA (Anderson) Master of Financial Engineering 5 91.5 96.1 95 90.9 60.4 100
.. ... ... ... ... ... ... ... ... ...
161 Tulsa (Collins) MS in Finance 184 52.1 24 21.4 28.4 34.5
162 Universidad Esan Master of Finance 186 62.6 29.6 36.7 34.7 20
163 University of South Florida MS in Finance 189 54.7 38.6 25.7 50.7 34.5
164 Vilnius University Finance and Banking 190 33 47.2 48 43.7 27.9
165 hsc Master of Science in Finance 192 72.8 32.9 32.6 50 21.9
Note:
The code was changed to use lxml, as that's generally faster than the default html.parser.

I must pass two-dimensional input. Three-dimensional list to dataframe (Python)

I'm web scraping data and want to put it into a data frame for analysis.
I have a three-dimensional list that comes out of my scrape and I can't figure out how to get it into a data frame. I know I need to make it two-dimensional (249, 4) from the three-dimensional list (1, 249, 4).
import numpy as np
import pandas as pd

table_countryCodes = pd.read_html("https://www.iban.com/country-codes")
np.reshape(table_countryCodes, (249, 4))
df_countryCodes = pd.DataFrame(table_countryCodes)
print(df_countryCodes)
Error: ValueError: Must pass 2-d input. shape=(1, 249, 4)
How can I fix this?
Here is a sample of the three-dimensional list from the web scrape for context:
Country Alpha-2 code Alpha-3 code Numeric
American Samoa AS ASM 16
Andorra AD AND 20
Angola AO AGO 24
Anguilla AI AIA 660
pd.read_html reads all HTML tables on a page into a list of DataFrame objects. Since your page contains only one table, you can extract it with
df = table_countryCodes[0]
print(df)
which gives us
Country Alpha-2 code Alpha-3 code Numeric
0 Afghanistan AF AFG 4
1 Åland Islands AX ALA 248
2 Albania AL ALB 8
3 Algeria DZ DZA 12
4 American Samoa AS ASM 16
.. ... ... ... ...
244 Wallis and Futuna WF WLF 876
245 Western Sahara EH ESH 732
246 Yemen YE YEM 887
247 Zambia ZM ZMB 894
248 Zimbabwe ZW ZWE 716
[249 rows x 4 columns]
You simply need:
pd.DataFrame(table_countryCodes[0])
i.e. add [0] to select the first and only element in table_countryCodes, which has the shape you need.
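As an aside, when a page holds several tables, pd.read_html can filter them for you: its match parameter keeps only tables whose text matches the given string or regex. A minimal sketch (the match string is an assumption based on the sample above):

import pandas as pd

# Keep only tables containing "Alpha-2 code"; the result is still a list
tables = pd.read_html("https://www.iban.com/country-codes", match="Alpha-2 code")
df_countryCodes = tables[0]
print(df_countryCodes.head())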

How to drop subheaders in wikipedia tables?

I am trying to web scrape a wikipedia table into a dataframe. In the wikipedia table, I want to drop Population density, Land area, and specifically Population (Rank). In the end I want to keep State or territory and just Population (People).
https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density
Here is my code:
wiki = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wiki)
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})
df=pd.read_html(str(indiatable))
df=pd.DataFrame(df[0])
data = df.drop(["Population density","Population"["Rank"],"Land area"], axis=1)
wikidata = data.rename(columns={"State or territory": "State","Population": "Population"})
print (wikidata.head())
How do I reference that subtable header specifically, so I can drop the Rank column under Population?
Note: There is no expected result in your question, so you may have to make some adjustments to your headers. Assuming you'd like to rename People to Population rather than keep Population as-is, I changed that.
To get to your goal, simply set the header parameter while reading the HTML so that only the second header row is used; then you do not need to drop it separately:
df=pd.read_html(str(indiatable),header=1)[0]
df = df.rename(columns={"State or territory": "State","People": "Population"}).drop(['Rank'], axis=1)
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wiki)
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})
df=pd.read_html(str(indiatable),header=1)[0]
df = df.rename(columns={"State or territory": "State","People": "Population"}).drop(['Rank'], axis=1)
Output
State                  Rank(all)  Rank(50 states)  permi2  perkm2  Population  Rank.1   mi2      km2
District of Columbia           1                —   11295    4361      689545      56    61      158
New Jersey                     2                1    1263     488     9288994      46  7354  19046.8
Rhode Island                   3                2    1061     410     1097379      51  1034     2678
Puerto Rico                    4                —     960     371     3285874      49  3515   9103.8
Massachusetts                  5                3     901     348     7029917      45  7800  20201.9
Connecticut                    6                4     745     288     3605944      48  4842  12540.7
Guam                           7                —     733     283      153836      52   210    543.9
American Samoa                 8                —     650     251       49710      55    77    199.4
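If you'd rather not count header rows for the header parameter, another option is to let pd.read_html keep its default parsing: with a two-row header it builds a MultiIndex for the columns, which you can flatten by dropping the top level. A sketch, assuming the live page still uses a two-row header:

import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density"
df = pd.read_html(url)[0]

# Two header rows come back as a MultiIndex; inspect the labels first
print(df.columns.tolist())

# Flatten by keeping only the second header row
df.columns = df.columns.droplevel(0)

After that, the rename and drop from the answer above apply unchanged.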

How can I scrape data from a url within a target url and append everything to a single data frame in python?

I am trying to scrape data points from one webpage (A), but then scrape data from each individual data point's own webpage and combine all of the data into a single data frame for easy viewing.
This is for a daily data frame with four columns: Team, Pitcher, ERA, WHIP. The ERA and WHIP are found within the specific pitcher's url. For the data below, I have managed to scrape the team name as well as the starting pitcher name and organized both into a data frame (albeit incorrectly).
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

targetUrl = 'http://www.baseball-reference.com/previews/'
targetUrl_response = requests.get(targetUrl, timeout=5)
soup = BeautifulSoup(targetUrl_response.content, "html.parser")

teams = []
pitchers = []
for i in soup.find_all('tr'):
    if i.find_all('strong'):
        for link in i.find_all('strong'):
            if not re.findall(r'MLB Debut', link.text):
                teams.append(link.text)
    if i.find_all('a'):
        for link in i.find_all('a'):
            if not re.findall(r'Preview', link.text):
                pitchers.append(link.text)
print (df)
I'd like to add code to follow each pitcher's webpage, scrape the ERA and WHIP, then append that data to the same data frame as the team and pitcher name. Is this even possible?
Output so far:
0
Aaron Sanchez TOR
CC Sabathia NYY
Steven Matz NYM
Zach Eflin PHI
Lucas Giolito CHW
Eduardo Rodriguez BOS
Brad Keller KCR
Adam Plutko CLE
Julio Teheran ATL
Jon Lester CHC
Clayton Kershaw LAD
Zack Greinke ARI
Jon Gray COL
Drew Pomeranz SFG
Few things off the bat (see what I did there :-) ): the sports-reference.com pages are dynamic. You're able to pull SOME of the tables straightforwardly, but if there are multiple tables, you'll find them under comment tags within the HTML source. So that might be an issue later if you want more data from the page.
The second thing I notice is you are pulling <tr> tags, which means there are <table> tags, and pandas can do the heavy work for you with a simple pd.read_html() function, as opposed to iterating through with bs4. HOWEVER, it won't pull out those links, just strictly the text, so in this case iterating with BeautifulSoup is the way to go (I'm just mentioning it for future reference).
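On the first point, if you ever need one of the comment-wrapped tables, a common approach (a sketch, not verified against this exact page) is to pull the comment nodes out with bs4 and re-parse any that contain a table:

import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

html = requests.get('http://www.baseball-reference.com/previews/', timeout=5).content
soup = BeautifulSoup(html, 'html.parser')

# Extract every comment node; re-parse the ones that hold a <table>
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if '<table' in comment:
        hidden_tables = pd.read_html(str(comment))  # list of DataFrames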
There's still more work to do, as a couple of the guys didn't have links or didn't return an ERA or WHIP. You'll also have to account for a guy being traded or changing leagues, where there might be multiple ERAs for the same 2019 season. But this should get you going:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

targetUrl = 'http://www.baseball-reference.com/previews/'
targetUrl_response = requests.get(targetUrl, timeout=5)
soup = BeautifulSoup(targetUrl_response.content, "html.parser")

teams = []
pitchers = []
era_list = []
whip_list = []
for i in soup.find_all('tr'):
    if i.find_all('strong'):
        for link in i.find_all('strong'):
            if not re.findall(r'MLB Debut', link.text):
                teams.append(link.text)
    if i.find_all('a'):
        for link in i.find_all('a'):
            if not re.findall(r'Preview', link.text):
                try:
                    # Follow the pitcher's own page and read his stats table
                    url_link = link['href']
                    pitcher_table = pd.read_html(url_link)[0]
                    pitcher_table = pitcher_table[(pitcher_table['Year'] == '2019') & (pitcher_table['Lg'].isin(['AL', 'NL']))]
                    era = round(pitcher_table.iloc[0]['ERA'], 2)
                    whip = round(pitcher_table.iloc[0]['WHIP'], 2)
                except:
                    era = 'N/A'
                    whip = 'N/A'
                pitchers.append(link.text)
                era_list.append(era)
                whip_list.append(whip)
                print ('%s\tERA: %s\tWHIP: %s' % (link.text, era, whip))

df = pd.DataFrame(list(zip(pitchers, teams, era_list, whip_list)), columns = ['Pitcher', 'Team', 'ERA', 'WHIP'])
print (df)
Output:
Pitcher Team ERA WHIP
0 Walker Lockett NYM 23.14 2.57
1 Jake Arrieta PHI 4.12 1.38
2 Logan Allen SDP 0 0.71
3 Jimmy Yacabonis BAL 4.7 1.44
4 Clayton Richard TOR 7.46 1.74
5 Glenn Sparkman KCR 3.62 1.25
6 Shane Bieber CLE 3.86 1.08
7 Carson Fulmer CHW 6.35 1.94
8 David Price BOS 3.39 1.1
9 Jesse Chavez TEX N/A N/A
10 Jordan Zimmermann DET 6.03 1.37
11 Max Scherzer WSN 2.62 1.06
12 Trevor Richards MIA 3.54 1.25
13 Max Fried ATL 4.03 1.34
14 Adbert Alzolay CHC 2.25 0.75
15 Marco Gonzales SEA 4.38 1.37
16 Zach Davies MIL 3.06 1.36
17 Trevor Williams PIT 4.12 1.19
18 Gerrit Cole HOU 3.54 1.02
19 Blake Snell TBR 4.4 1.24
20 Kyle Gibson MIN 4.18 1.25
21 Chris Bassitt OAK 3.64 1.17
22 Jack Flaherty STL 4.24 1.18
23 Ross Stripling LAD 3.08 1.17
24 Robbie Ray ARI 3.87 1.34
25 Chi Chi Gonzalez COL N/A N/A
26 Madison Bumgarner SFG 4.28 1.24
27 Tyler Mahle CIN 4.17 1.2
28 Andrew Heaney LAA 5.68 1.14
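A possible follow-up (my suggestion, not part of the original answer): the 'N/A' strings leave ERA and WHIP as object columns, so coercing them to numeric turns the placeholders into NaN and lets the columns sort and aggregate properly:

df['ERA'] = pd.to_numeric(df['ERA'], errors='coerce')
df['WHIP'] = pd.to_numeric(df['WHIP'], errors='coerce')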

How to convert list to pandas DataFrame?

I use BeautifulSoup to get some data from a webpage:
import pandas as pd
import requests
from bs4 import BeautifulSoup
res = requests.get("http://www.nationmaster.com/country-info/stats/Media/Internet-users")
soup = BeautifulSoup(res.content,'html5lib')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
df.head()
But df is a list, not the pandas DataFrame as I expected from using pd.read_html.
How can I get pandas DataFrame out of it?
You can use read_html with your url:
df = pd.read_html("http://www.nationmaster.com/country-info/stats/Media/Internet-users")[0]
And then if necessary remove GRAPH and HISTORY columns and replace NaNs in column # by forward filling:
df = df.drop(['GRAPH','HISTORY'], axis=1)
df['#'] = df['#'].ffill()
print(df.head())
# COUNTRY AMOUNT DATE
0 1 China 389 million 2009
1 2 United States 245 million 2009
2 3 Japan 99.18 million 2009
3 3 Group of 7 countries (G7) average (profile) 80.32 million 2009
4 4 Brazil 75.98 million 2009
print(df.tail())
# COUNTRY AMOUNT DATE
244 214 Niue 1100 2009
245 =215 Saint Helena, Ascension, and Tristan da Cunha 900 2009
246 =215 Saint Helena 900 2009
247 217 Tokelau 800 2008
248 218 Christmas Island 464 2001
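Alternatively, if you want to keep the BeautifulSoup step from the question, just index the single-element list that read_html returns:

df = pd.read_html(str(table))[0]
df.head()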
