I am having some trouble trying to cleanly iterate through a table of sold property listings using BeautifulSoup.
In this example:
- Some rows in the main table are irrelevant (like "set search filters")
- The rows have unique IDs
- I have tried getting the rows using a style attribute, but this did not return results.
What would be the best approach to get just the rows for sold properties out of that table?
The end goal is to pluck out the sold price, date of sale, number of bedrooms/bathrooms/car spaces, and land area, and append these to a pandas dataframe.
from bs4 import BeautifulSoup
import requests

# Globals
headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
url = 'http://house.ksou.cn/p.php?q=West+Footscray%2C+VIC'

r = requests.get(url, headers=headers)
c = r.content
soup = BeautifulSoup(c, "html.parser")

prop_table = soup.find('table', id="mainT")
#prop_table = soup.find('table', {"font-size" : "13px"})
#prop_table = soup.select('.addr')  # Pluck out the listings
rows = prop_table.findAll('tr')
for row in rows:
    print(row.text)
This HTML is tricky to parse because it doesn't have a fixed structure. Unfortunately, I don't have pandas installed, so I only print the data to the screen:
import requests
from bs4 import BeautifulSoup

url = 'http://house.ksou.cn/p.php?q=West+Footscray&p={page}&s=1&st=&type=&count=300&region=West+Footscray&lat=0&lng=0&sta=vic&htype=&agent=0&minprice=0&maxprice=0&minbed=0&maxbed=0&minland=0&maxland=0'

data = []
for page in range(0, 2):  # <-- increase to number of pages you want to crawl
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'html.parser')

    for table in soup.select('table[id^="r"]'):
        name = table.select_one('span.addr').text
        price = table.select_one('span.addr').find_next('b').get_text(strip=True).split()[-1]
        sold = table.select_one('span.addr').find_next('b').find_next_sibling(text=True).replace('in', '').replace('(Auction)', '').strip()

        beds = table.select_one('img[alt="Bed rooms"]')
        beds = beds.find_previous_sibling(text=True).strip() if beds else '-'
        bath = table.select_one('img[alt="Bath rooms"]')
        bath = bath.find_previous_sibling(text=True).strip() if bath else '-'
        car = table.select_one('img[alt="Car spaces"]')
        car = car.find_previous_sibling(text=True).strip() if car else '-'

        land = table.select_one('b:contains("Land size:")')
        land = land.find_next_sibling(text=True).split()[0] if land else '-'
        building = table.select_one('b:contains("Building size:")')
        building = building.find_next_sibling(text=True).split()[0] if building else '-'

        data.append([name, price, sold, beds, bath, car, land, building])

# print the data
print('{:^25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format('Name', 'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'))
for row in data:
    print('{:<25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format(*row))
Prints:
Name Price Sold Beds Bath Car Land Building
51 Fontein Street $770,000 07 Dec 2019 - - - - -
50 Fontein Street $751,000 07 Dec 2019 - - - - -
9 Wellington Street $1,024,999 Dec 2019 2 1 1 381 -
239 Essex Street $740,000 07 Dec 2019 2 1 1 358 101
677a Barkly Street $780,000 Dec 2019 4 1 - 380 -
23A Busch Street $800,000 30 Nov 2019 3 1 1 215 -
3/2-4 Dyson Street $858,000 Nov 2019 3 2 - 378 119
3/101 Stanhope Street $803,000 30 Nov 2019 2 2 2 168 113
2/4 Rondell Avenue $552,500 30 Nov 2019 2 - - 1,088 -
3/2 Dyson Street $858,000 30 Nov 2019 3 2 2 378 -
9 Vine Street $805,000 Nov 2019 2 1 2 318 -
39 Robbs Road $957,000 23 Nov 2019 2 2 - 231 100
29 Robbs Road $1,165,000 Nov 2019 2 1 1 266 -
5 Busch Street $700,000 Nov 2019 2 1 1 202 -
46 Indwe Street $730,000 16 Nov 2019 3 1 1 470 -
29/132 Rupert Street $216,000 16 Nov 2019 1 1 1 3,640 -
11/10 Carmichael Street $385,000 15 Nov 2019 2 1 1 1,005 -
2/16 Carmichael Street $515,000 14 Nov 2019 2 1 1 112 -
4/26 Beaumont Parade $410,000 Nov 2019 2 1 1 798 -
5/10 Carmichael Street $310,000 Nov 2019 1 1 1 1,004 -
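If pandas is available on your side, the collected data list drops straight into a DataFrame; a minimal sketch (the column names simply mirror the print header above):

import pandas as pd

# Build a DataFrame from the list collected above and append further pages as needed
df = pd.DataFrame(data, columns=['Name', 'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'])
print(df.head())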
I am a novice at this, but I've been trying to scrape data from a website (https://awards.decanter.com/DWWA/2022/search/wines?competitionType=DWWA) and I keep coming up empty. I've tried BeautifulSoup and Scrapy, but I can't get the text out.
Eventually I want to get the row of each individual wine in the table into a dataframe/csv (from all pages) but currently I can't even get the first wine producer name.
If you inspect the webpage, all the details are in <td> tags with no id or class.
My BeautifulSoup attempt
URL = 'https://awards.decanter.com/DWWA/2022/search/wines?competitionType=DWWA'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
soup2 = soup.prettify()
producer = soup2.find_all('td').get_text()
print(producer)
Which is throwing the error:
producer = soup2.find_all('td').get_text()
AttributeError: 'str' object has no attribute 'find_all'
My Scrapy attempt
winedf = pd.DataFrame()

class WineSpider(scrapy.Spider):
    name = 'wine_spider'

    def start_requests(self):
        dwwa_url = "https://awards.decanter.com/DWWA/2022/search/wines?competitionType=DWWA"
        yield scrapy.Request(url=dwwa_url, callback=self.parse_front)

    def parse_front(self, response):
        table = response.xpath('//*[@id="root"]/div/div[2]/div[4]/div[2]/table')
        page_links = table.xpath('//*[@id="root"]/div/div[2]/div[4]/div[2]/div[2]/div[1]/ul/li[3]/a[contains(@class, "dwwa-page-link")]/@href')
        links_to_follow = page_links.extract()
        for url in links_to_follow:
            yield response.follow(url=url, callback=self.parse_pages)

    def parse_pages(self, response):
        wine_name = Selector(response=response).xpath('//*[@id="root"]/div/div[2]/div[4]/div[2]/table/tbody/tr[1]/td[1]/text()').get()
        wine_name_ext = wine_name.extract().strip()
        winedf.append(wine_name_ext)

        medal = Selector(response=response).xpath('//*[@id="root"]/div/div[2]/div[4]/div[2]/table/tbody/tr[1]/td[4]/text()').get()
        medal_ext = medal.extract().strip()
        winedf.append(medal_ext)
Which produces an empty df.
Any help would be greatly appreciated.
Thank you!
When you load a site you want to scrape, always inspect what it loads with the network monitor. In this case you can see that it loads the data dynamically from an API. This means that you can skip scraping altogether and load the data directly from the API into pandas:
import pandas as pd
df = pd.read_json('https://decanterresultsapi.decanter.com/api/DWWA/2022/wines/search?competitionType=DWWA')
Which gives all 14858 items:
| | producer | name | id | competition | award | score | country | region | subRegion | vintage | color | style | priceBandLetter | competitionYear | competitionType |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Yealands Estate Wines | Babydoll Sauvignon Blanc | 706484 | DWWA 2022 | 7 | 88 | New Zealand | Marlborough | Not Applicable | 2021 | White | Still - Dry (below 5 g/L residual sugar) | A | 2022 | DWWA |
| 1 | Yealands Estate Wines | Reserve Pinot Gris | 706478 | DWWA 2022 | 7 | 86 | New Zealand | Marlborough | Not Applicable | 2021 | White | Still - Dry (below 5 g/L residual sugar) | B | 2022 | DWWA |
| 2 | Yealands Estate Wines | Babydoll Pinot Gris | 706479 | DWWA 2022 | 7 | 87 | New Zealand | Marlborough | Not Applicable | 2021 | White | Still - Dry (below 5 g/L residual sugar) | A | 2022 | DWWA |
| 3 | Yealands Estate Wines | Reserve Chardonnay | 705165 | DWWA 2022 | 6 | 90 | New Zealand | Hawke's Bay | Not Applicable | 2021 | White | Still - Dry (below 5 g/L residual sugar) | B | 2022 | DWWA |
| 4 | Yealands Estate Wines | Reserve Sauvignon Blanc | 706486 | DWWA 2022 | 6 | 90 | New Zealand | Marlborough | Awatere Valley | 2021 | White | Still - Dry (below 5 g/L residual sugar) | B | 2022 | DWWA |
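Since the eventual goal is a dataframe/csv of every wine, the frame can be written out directly; a minimal sketch (the filename is arbitrary):

# Persist all rows to disk; select a subset of columns first if you only need some fields
df.to_csv('dwwa_2022_wines.csv', index=False)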
Try:
import pandas as pd
url = "https://decanterresultsapi.decanter.com/api/DWWA/2022/wines/search?competitionType=DWWA"
df = pd.read_json(url)
# print last items in df:
print(df.tail().to_markdown())
Prints:
| | producer | name | id | competition | award | score | country | region | subRegion | vintage | color | style | priceBandLetter | competitionYear | competitionType |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14853 | Telavi Wine Cellar | Marani | 718257 | DWWA 2022 | 7 | 86 | Georgia | Kakheti | Kindzmarauli | 2021 | Red | Still - Medium (between 19 and 44 g/L residual sugar) | B | 2022 | DWWA |
| 14854 | Štrigova | Muškat Žuti | 716526 | DWWA 2022 | 7 | 87 | Croatia | Continental | Zagorje - Međimurje | 2021 | White | Still - Medium (between 19 and 44 g/L residual sugar) | C | 2022 | DWWA |
| 14855 | Kopjar | Muscat žUti | 717754 | DWWA 2022 | 7 | 86 | Croatia | Continental | Zagorje - Međimurje | 2021 | White | Still - Medium (between 19 and 44 g/L residual sugar) | C | 2022 | DWWA |
| 14856 | Cleebronn-Güglingen | Blanc De Noir Fein & Fruchtig | 719836 | DWWA 2022 | 7 | 87 | Germany | Württemberg | Not Applicable | 2021 | White | Still - Medium (between 19 and 44 g/L residual sugar) | B | 2022 | DWWA |
| 14857 | Winnice Czajkowski | Thoma 8 Grand Selection | 719891 | DWWA 2022 | 6 | 90 | Poland | Not Applicable | Not Applicable | 2021 | White | Still - Medium (between 19 and 44 g/L residual sugar) | D | 2022 | DWWA |
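As a side note on the AttributeError in the BeautifulSoup attempt: prettify() returns a plain string, so find_all has to be called on the soup object itself. Even then, this page builds its table with JavaScript, so the static HTML most likely contains no td cells, which is why the API approach above is the way to go. A small sketch using the soup object from the question:

# find_all() belongs on the BeautifulSoup object, not on the prettified string.
# Because the table is rendered client-side, this will most likely print an empty list.
producers = [td.get_text(strip=True) for td in soup.find_all('td')]
print(producers)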
I have been trying to scrape all data from the first page to the last page, but it returns only the first page as the output. How can I solve this? Below is my code:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

pages = np.arange(2, 1589, 20)
for page in pages:
    page = requests.get("https://estateintel.com/app/projects/search?q=%7B%22sectors%22%3A%5B%22residential%22%5D%7D&page=" + str(page))
    sleep(randint(2, 10))
    soup = BeautifulSoup(page.content, 'html.parser')
    lists = soup.find_all('div', class_="project-card-vertical h-full flex flex-col rounded border-thin border-inactive-blue overflow-hidden pointer")
    for list in lists:
        title = list.find('p', class_="project-location text-body text-base mb-3").text.replace('\n', '').strip()
        location = list.find('span', class_="text-gray-1").text.replace('\n', '').strip()
        status = list.find('span', class_="text-purple-1 font-bold").text.replace('\n', '').strip()
        units = list.find('span', class_="text-body font-semibold").text.replace('\n', '').strip()
        info = [title, location, status, units]
        print(info)
The page is loaded dynamically via an API, so with a regular GET request you will always get the first page. You need to study how the page communicates with the server and find the request you need. I wrote an example for review.
import json
import requests

def get_info(page):
    url = f"https://services.estateintel.com/api/v2/properties?type[]=residential&page={page}"
    headers = {
        'accept': 'application/json',
        'authorization': 'false',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'
    }
    response = requests.request("GET", url, headers=headers)
    json_obj = json.loads(response.text)
    for data in json_obj['data']:
        print(data['name'])
        print(data['area'], data['state'])
        print(data['status'])
        print(data['size']['value'], data['size']['unit'])
        print('------')

for page in range(1, 134):
    get_info(page)
You can choose the fields you need (this is just an example) and also add them to a dataframe. Output:
Twin Oaks Apartment
Kilimani Nairobi
Completed
0 units
------
Duchess Park
Lavington Nairobi
Completed
62 units
------
Greenvale Apartments
Kileleshwa Nairobi
Completed
36 units
------
The Urban apartments & Suites
Osu Greater Accra
Completed
28 units
------
Chateau Towers
Osu Greater Accra
Completed
120 units
------
Cedar Haus Gardens
Oluyole Oyo
Under Construction
38 units
------
10 Agoro Street
Oluyole Oyo
Completed
1 units
..............
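To collect the same fields into a pandas DataFrame instead of printing them, a minimal sketch reusing the same API endpoint and field layout as above (the 'size' formatting is just one possible choice):

import pandas as pd
import requests

headers = {
    'accept': 'application/json',
    'authorization': 'false',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'
}

rows = []
for page in range(1, 134):
    url = f"https://services.estateintel.com/api/v2/properties?type[]=residential&page={page}"
    for item in requests.get(url, headers=headers).json()['data']:
        rows.append({
            'name': item['name'],
            'area': item['area'],
            'state': item['state'],
            'status': item['status'],
            'size': f"{item['size']['value']} {item['size']['unit']}",
        })

df = pd.DataFrame(rows)
print(df.head())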
I think it is working well, but it needs time to sleep between requests. Just in case, you could select your elements more specifically, e.g. with CSS selectors, and store the information in a list of dicts instead of just printing it.
Example
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

data = []
for page in range(1, 134):
    print(page)
    page = requests.get("https://estateintel.com/app/projects/search?q=%7B%22sectors%22%3A%5B%22residential%22%5D%7D&page=" + str(page))
    sleep(randint(2, 10))
    soup = BeautifulSoup(page.content, 'html.parser')

    for item in soup.select('div.project-grid > a'):
        data.append({
            'title': item.h3.text.strip(),
            'location': item.find('span', class_="text-gray-1").text.strip(),
            'status': item.find('span', class_="text-purple-1 font-bold").text.strip(),
            'units': item.find('span', class_="text-body font-semibold").text.strip()
        })

pd.DataFrame(data)
Output
| | title | location | status | units |
|---|---|---|---|---|
| 0 | Twin Oaks Apartment | Kilimani, Nairobi | Completed | Size: -- |
| 1 | Duchess Park | Lavington, Nairobi | Completed | Size: 62 units |
| 2 | Greenvale Apartments | Kileleshwa, Nairobi | Completed | Size: 36 units |
| 3 | The Urban apartments & Suites | Osu, Greater Accra | Completed | Size: 28 units |
| 4 | Chateau Towers | Osu, Greater Accra | Completed | Size: 120 units |
| 5 | Cedar Haus Gardens | Oluyole, Oyo | Under Construction | Size: 38 units |
| 6 | 10 Agoro Street | Oluyole, Oyo | Completed | Size: 1 units |
| 7 | Villa O | Oluyole, Oyo | Completed | Size: 2 units |
| 8 | Avenue Road Apartments | Oluyole, Oyo | Completed | Size: 6 units |
| 9 | 15 Alafia Street | Oluyole, Oyo | Completed | Size: 4 units |
| 10 | 12 Saint Mary Street | Oluyole, Oyo | Nearing Completion | Size: 8 units |
| 11 | RATCON Estate | Oluyole, Oyo | Completed | Size: -- |
| 12 | 1 Goodwill Road | Oluyole, Oyo | Completed | Size: 4 units |
| 13 | Anike's Court | Oluyole, Oyo | Completed | Size: 3 units |
| 14 | 9 Adeyemo Quarters | Oluyole, Oyo | Completed | Size: 4 units |
| 15 | Marigold Residency | Nairobi West, Nairobi | Under Construction | Size: -- |
| 16 | Kings Distinction | Kilimani, Nairobi | Completed | Size: -- |
| 17 | Riverview Apartments | Kyumvi, Machakos | Completed | Size: -- |
| 18 | Serene Park | Kyumvi, Machakos | Under Construction | Size: -- |
| 19 | Gitanga Duplexes | Lavington, Nairobi | Under Construction | Size: 36 units |
| 20 | Westpointe Apartments | Upper Hill, Nairobi | Completed | Size: 254 units |
| 21 | 10 Olaoluwa Street | Oluyole, Oyo | Under Construction | Size: 12 units |
| 22 | Rosslyn Grove | Nairobi West, Nairobi | Under Construction | Size: 90 units |
| 23 | 7 Kamoru Ajimobi Street | Oluyole, Oyo | Completed | Size: 2 units |
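The units column comes back as strings like "Size: 62 units" or "Size: --"; if you need numeric values, a small clean-up sketch on the frame built above:

# Strip the "Size:" prefix and "units" suffix, turning "--" into missing values
df = pd.DataFrame(data)
df['units'] = (df['units']
               .str.replace('Size:', '', regex=False)
               .str.replace('units', '', regex=False)
               .str.strip()
               .replace('--', pd.NA))
df['units'] = pd.to_numeric(df['units'], errors='coerce')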
# pip install trio httpx pandas
import trio
import httpx
import pandas as pd

allin = []
keys1 = ['name', 'area', 'state']
keys2 = ['value', 'unit']

async def scraper(client, page):
    client.params = client.params.merge({'page': page})
    r = await client.get('/properties')
    allin.extend([[i.get(k, 'N/A') for k in keys1] +
                  [i['size'].get(b, 'N/A') for b in keys2]
                  for i in r.json()['data']])

async def main():
    async with httpx.AsyncClient(timeout=None, base_url='https://services.estateintel.com/api/v2') as client, trio.open_nursery() as nurse:
        client.params = {
            'type[]': 'residential'
        }
        for page in range(1, 3):
            nurse.start_soon(scraper, client, page)

    df = pd.DataFrame(allin, columns=[keys1 + keys2])
    print(df)

if __name__ == "__main__":
    trio.run(main)
Output:
0 Cedar Haus Gardens Oluyole Oyo 38 units
1 10 Agoro Street Oluyole Oyo 1 units
2 Villa O Oluyole Oyo 2 units
3 Avenue Road Apartments Oluyole Oyo 6 units
4 15 Alafia Street Oluyole Oyo 4 units
5 12 Saint Mary Street Oluyole Oyo 8 units
6 RATCON Estate Oluyole Oyo 0 units
7 1 Goodwill Road Oluyole Oyo 4 units
8 Anike's Court Oluyole Oyo 3 units
9 9 Adeyemo Quarters Oluyole Oyo 4 units
10 Marigold Residency Nairobi West Nairobi 0 units
11 Riverview Apartments Kyumvi Machakos 0 units
12 Socian Villa Apartments Kileleshwa Nairobi 36 units
13 Kings Pearl Residency Lavington Nairobi 55 units
14 Touchwood Gardens Kilimani Nairobi 32 units
15 Panorama Apartments Upper Hill Nairobi 0 units
16 Gitanga Duplexes Lavington Nairobi 36 units
17 Serene Park Kyumvi Machakos 25 units
18 Kings Distinction Kilimani Nairobi 48 units
19 Twin Oaks Apartment Kilimani Nairobi 0 units
20 Duchess Park Lavington Nairobi 70 units
21 Greenvale Apartments Kileleshwa Nairobi 36 units
22 The Urban apartments & Suites Osu Greater Accra 28 units
23 Chateau Towers Osu Greater Accra 120 units
I am trying to retrieve football squad data from multiple Wikipedia pages and put it in a pandas DataFrame. One example of the source is this [link][1], but I want to do this for all the pages between 1930 and 2018.
The code that I will show used to work in Python 2 and I'm trying to adapt it to Python 3. The information on every page is spread across multiple tables with 7 columns, all in the same format.
The code used to crash but now runs. The only problem is that it produces an empty .csv file.
Just to give more context, these are the specific changes I made:
Python 2
path = os.path.join('.cache', hashlib.md5(url).hexdigest() + '.html')
Python 3
path = os.path.join('.cache', hashlib.sha256(url.encode('utf-8')).hexdigest() + '.html')
Python 2
with open(path, 'w') as fd:
Python 3
with open(path, 'wb') as fd:
Python 2
years = range(1930,1939,4) + range(1950,2015,4)
Python 3: note that here I also changed the range so I could include World Cup 2018
years = list(range(1930,1939,4)) + list(range(1950,2019,4))
This is the whole chunk of code. If somebody can spot where the problem is and suggest a solution, I would be very thankful.
import os
import hashlib
import requests
from bs4 import BeautifulSoup
import pandas as pd

if not os.path.exists('.cache'):
    os.makedirs('.cache')

ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/15612.1.29.41.4'
session = requests.Session()

def get(url):
    '''Return cached lxml tree for url'''
    path = os.path.join('.cache', hashlib.sha256(url.encode('utf-8')).hexdigest() + '.html')
    if not os.path.exists(path):
        print(url)
        response = session.get(url, headers={'User-Agent': ua})
        with open(path, 'wb') as fd:
            fd.write(response.text.encode('utf-8'))
    return BeautifulSoup(open(path), 'html.parser')

def squads(url):
    result = []
    soup = get(url)
    year = url[29:33]
    for table in soup.find_all('table', 'sortable'):
        if "wikitable" not in table['class']:
            country = table.find_previous("span", "mw-headline").text
            for tr in table.find_all('tr')[1:]:
                cells = [td.text.strip() for td in tr.find_all('td')]
                cells += [country, td.a.get('title') if td.a else 'none', year]
                result.append(cells)
    return result

years = list(range(1930, 1939, 4)) + list(range(1950, 2019, 4))

result = []
for year in years:
    url = "http://en.wikipedia.org/wiki/" + str(year) + "_FIFA_World_Cup_squads"
    result += squads(url)

Final_result = pd.DataFrame(result)
Final_result.to_csv('/Users/home/Downloads/data.csv', index=False, encoding='iso-8859-1')
[1]: https://en.wikipedia.org/wiki/2018_FIFA_World_Cup_squads
To get information about each team for the years 1930-2018 you can use the next example:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/{}_FIFA_World_Cup_squads"

dfs = []
for year in range(1930, 2019):
    print(year)
    soup = BeautifulSoup(requests.get(url.format(year)).content, "html.parser")
    tables = soup.find_all(
        lambda tag: tag.name == "table"
        and tag.select_one('th:-soup-contains("Pos.")')
    )
    for table in tables:
        for tag in table.select('[style="display:none"]'):
            tag.extract()
        df = pd.read_html(str(table))[0]
        df["Year"] = year
        df["Country"] = table.find_previous(["h3", "h2"]).span.text
        dfs.append(df)

df = pd.concat(dfs)
print(df)
df.to_csv("data.csv", index=False)
Prints:
...
13 14 FW Moussa Konaté 3 April 1993 (aged 25) 28 Amiens 2018 Senegal 10.0
14 15 FW Diafra Sakho 24 December 1989 (aged 28) 12 Rennes 2018 Senegal 3.0
15 16 GK Khadim N'Diaye 5 April 1985 (aged 33) 26 Horoya 2018 Senegal 0.0
16 17 MF Badou Ndiaye 27 October 1990 (aged 27) 20 Stoke City 2018 Senegal 1.0
17 18 FW Ismaïla Sarr 25 February 1998 (aged 20) 16 Rennes 2018 Senegal 3.0
18 19 FW M'Baye Niang 19 December 1994 (aged 23) 7 Torino 2018 Senegal 0.0
19 20 FW Keita Baldé 8 March 1995 (aged 23) 19 Monaco 2018 Senegal 3.0
20 21 DF Lamine Gassama 20 October 1989 (aged 28) 36 Alanyaspor 2018 Senegal 0.0
21 22 DF Moussa Wagué 4 October 1998 (aged 19) 10 Eupen 2018 Senegal 0.0
22 23 GK Alfred Gomis 5 September 1993 (aged 24) 1 SPAL 2018 Senegal 0.0
and saves data.csv (screenshot from LibreOffice not shown).
Just tested: you have no data because the "wikitable" class is present in every table.
You can replace "not in" with "in":
if "wikitable" in table["class"]:
...
And your BeautifulSoup data will be there
Once you change this condition, you will have a problem with this line:
cells += [country, td.a.get('title') if td.a else 'none', year]
This is because td is not defined at that point. I'm not quite sure what the aim of these lines is, but you can define tds beforehand and use them afterwards:
tds = tr.find_all('td')
cells += ...
In general, you can add breakpoints in your code to easily identify where the problem is.
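Putting those two fixes together, a possible corrected version of squads() might look like the sketch below; the intent of the extra columns is assumed here (in particular, which cell's link title to keep):

def squads(url):
    result = []
    soup = get(url)
    year = url[29:33]
    for table in soup.find_all('table', 'sortable'):
        if "wikitable" in table['class']:  # keep, rather than skip, the wikitable tables
            country = table.find_previous("span", "mw-headline").text
            for tr in table.find_all('tr')[1:]:
                tds = tr.find_all('td')  # define the cells once so they can be reused below
                cells = [td.text.strip() for td in tds]
                # assumption: the link title of interest sits in the last cell of the row
                cells += [country, tds[-1].a.get('title') if tds and tds[-1].a else 'none', year]
                result.append(cells)
    return result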
So I have been able to scrape the first 50 teams in the team rankings webpage from 247sports.
I was able to get the following results:
index Rank Team Total Recruits Average Rating Total Rating
0 0 1 Ohio State 17 94.35 286.75
1 10 11 Alabama 10 94.16 210.61
2 8 9 Georgia 11 93.38 219.60
3 31 32 Clemson 8 92.02 161.74
4 3 4 LSU 14 91.92 240.57
5 4 5 Oklahoma 13 91.81 229.03
6 22 23 USC 9 91.60 174.69
7 11 12 Texas A&M 11 91.59 203.03
8 1 2 Notre Dame 18 91.01 250.35
9 2 3 Penn State 18 90.04 243.95
10 6 7 Texas 14 90.04 222.03
11 14 15 Missouri 12 89.94 196.37
12 7 8 Oregon 15 89.91 220.66
13 5 6 Florida State 15 89.88 224.51
14 25 26 Florida 10 89.15 167.89
15 37 38 North Carolina 9 88.94 152.79
16 9 10 Michigan 16 88.76 216.07
17 33 34 UCLA 10 88.49 160.00
18 23 24 Kentucky 11 88.46 173.12
19 12 13 Rutgers 14 88.44 198.56
20 19 20 Indiana 12 88.41 181.20
21 49 50 Washington 8 88.21 132.55
22 20 21 Oklahoma State 13 88.18 177.91
23 43 44 Ole Miss 10 87.80 143.35
24 44 45 California 9 87.78 141.80
25 17 18 Arkansas 15 87.75 188.64
26 16 17 South Carolina 15 87.61 190.84
27 32 33 Georgia Tech 11 87.30 161.33
28 35 36 Tennessee 11 87.25 157.77
29 39 40 NC State 11 87.18 150.18
30 46 47 SMU 9 87.08 138.50
31 36 37 Wisconsin 11 87.00 157.55
32 21 22 Mississippi State 15 86.96 177.33
33 24 25 West Virginia 13 86.78 171.72
34 30 31 Northwestern 14 86.76 162.66
35 40 41 Maryland 12 86.31 149.77
36 15 16 Virginia Tech 18 86.23 191.06
37 18 19 Baylor 19 85.90 184.68
38 13 14 Boston College 22 85.88 197.15
39 26 27 Michigan State 14 85.85 167.60
40 29 30 Cincinnati 14 85.68 164.90
41 34 35 Minnesota 13 85.55 159.35
42 28 29 Iowa State 14 85.54 166.50
43 48 49 Virginia 10 85.39 133.93
44 45 46 Arizona 11 85.27 140.90
45 41 42 Pittsburgh 12 85.10 147.58
46 47 48 Duke 13 85.02 137.40
47 27 28 Vanderbilt 16 85.01 166.77
48 38 39 Purdue 13 84.83 152.55
49 42 43 Illinois 13 84.15 143.86
From the following script:
import requests
import pandas as pd
from bs4 import BeautifulSoup

year = '2022'
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeTeamRankings/'
print(url)

# Add the `user-agent` otherwise we will get blocked when sending the request
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}
response = requests.get(url, headers=headers).content
soup = BeautifulSoup(response, "html.parser")

data = []
for tag in soup.find_all("li", class_="rankings-page__list-item"):
    rank = tag.find('div', {'class': 'primary'}).text.strip()
    team = tag.find('div', {'class': 'team'}).find('a').text.strip()
    total_recruits = tag.find('div', {'class': 'total'}).find('a').text.split(' ')[0].strip()
    # five_stars = tag.find('div',{'class':'gold'}).text.strip()
    # four_stars = tag.find('div',{'class':'gold'}).text.strip()
    # three_stars = tag.find('div',{'class':'metrics'}).text.strip()
    avg_rating = tag.find('div', {'class': 'avg'}).text.strip()
    total_rating = tag.find('div', {'class': 'points'}).text.strip()

    data.append(
        {
            "Rank": rank,
            "Team": team,
            "Total Recruits": total_recruits,
            # "Five-Star Recruits": five_stars,
            # "Four-Star Recruits": four_stars,
            # "Three-Star Recruits": three_stars,
            "Average Rating": avg_rating,
            "Total Rating": total_rating
        }
    )

df = pd.DataFrame(data)
df[['Rank', 'Total Recruits', 'Average Rating', 'Total Rating']] = df[['Rank', 'Total Recruits', 'Average Rating', 'Total Rating']].apply(pd.to_numeric)
df.sort_values('Average Rating', ascending=False).reset_index()
# soup
However, I would like to achieve three things.
I would like to grab the data from the "5-stars", "4-stars", "3-stars" columns in the webpage.
I would like to not just get the first 50 schools, but also tell the webpage to click "load more" enough times so that I can get the table with ALL schools in it.
I want to not only get the 2022 team rankings, but every team ranking that 247sports has to offer (2000 through 2024).
I tried to give it a go with the script below, but I keep getting just the top-50 schools output over and over in the print(row) portion of the code.
print(datetime.datetime.now().time())

# years = ['2000', '2001', '2002', '2003', '2004',
#          '2005', '2006', '2007', '2008', '2009',
#          '2010', '2011', '2012', '2013', '2014',
#          '2015', '2016', '2017', '2018', '2019',
#          '2020', '2021', '2022', '2023']
years = ['2022']

rows = []
page_totals = []
# recruits_final = []
for year in years:
    url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeTeamRankings/'
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}
    page = 0
    while True:
        page += 1
        payload = {'Page': '%s' % page}
        response = requests.get(url, headers=headers, params=payload)
        soup = BeautifulSoup(response.text, 'html.parser')
        tags = soup.find_all('li', {'class': 'rankings-page__list-item'})
        if len(tags) == 0:
            print('Page: %s' % page)
            page_totals.append(page)
            break
        continue_loop = True
        while continue_loop == True:
            for tag in tags:
                if tag.text.strip() == 'Load More':
                    continue_loop = False
                    continue
                # primary_rank = tag.find('div',{'class':'rank-column'}).find('div',{'class':'primary'}).text.strip()
                # try:
                #     other_rank = tag.find('div',{'class':'rank-column'}).find('div',{'class':'other'}).text.strip()
                # except:
                #     other_rank = ''
                rank = tag.find('div', {'class': 'primary'}).text.strip()
                team = tag.find('div', {'class': 'team'}).find('a').text.strip()
                total_recruits = tag.find('div', {'class': 'total'}).find('a').text.split(' ')[0].strip()
                # five_stars = tag.find('div',{'class':'gold'}).text.strip()
                # four_stars = tag.find('div',{'class':'gold'}).text.strip()
                # three_stars = tag.find('div',{'class':'metrics'}).text.strip()
                avg_rating = tag.find('div', {'class': 'avg'}).text.strip()
                total_rating = tag.find('div', {'class': 'points'}).text.strip()
                try:
                    team = athlete.find('div', {'class': 'status'}).find('img')['title']
                except:
                    team = ''
                row = {'Rank': rank,
                       'Team': team,
                       'Total Recruits': total_recruits,
                       'Average Rating': avg_rating,
                       'Total Rating': total_rating,
                       'Year': year}
                print(row)
                rows.append(row)

recruits = pd.DataFrame(rows)
print(datetime.datetime.now().time())
Any assistance on this is truly appreciated. Thanks in advance.
First, you can extract the year ranges from the dropdown with BeautifulSoup (no need to click the button, as the dropdown is already on the page), then navigate to each link with selenium, using the latter to interact with the "load more" toggle, and finally scrape the resulting tables:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time, urllib.parse, re

d = webdriver.Chrome('path/to/chromedriver')
d.get((url:='https://247sports.com/Season/2022-Football/CompositeTeamRankings/'))
result = {}
for i in soup(d.page_source, 'html.parser').select('.rankings-page__header-nav > .rankings-page__nav-block .flyout_cmp.year.tooltip li a'):
    if (y:=int(i.get_text(strip=True))) > 1999:
        d.get(urllib.parse.urljoin(url, i['href']))
        while d.execute_script("""return document.querySelector('a[data-js="showmore"]') != null"""):
            d.execute_script("""document.querySelector('a[data-js="showmore"]').click()""")
            time.sleep(1)
        result[y] = [{"Rank": i.select_one('div.wrapper .rank-column .other').get_text(strip=True),
                      "Team": i.select_one('.team').get_text(strip=True),
                      "Total": i.select_one('.total').get_text(strip=True).split()[0],
                      "5-Stars": i.select_one('.star-commits-list li:nth-of-type(1) div').get_text(strip=True),
                      "4-Stars": i.select_one('.star-commits-list li:nth-of-type(2) div').get_text(strip=True),
                      "3-Stars": i.select_one('.star-commits-list li:nth-of-type(3) div').get_text(strip=True),
                      "Ave": i.select_one('.avg').get_text(strip=True),
                      "Points": i.select_one('.points').get_text(strip=True),
                      }
                     for i in soup(d.page_source, 'html.parser').select("""ul[data-js="rankings-list"].rankings-page__list li.rankings-page__list-item""")]
result stores all the team rankings for a given year, 2000-2024 (list(result) produces [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000]). To convert the results to a pandas.DataFrame:
import pandas as pd
df = pd.DataFrame([{'Year':a, **i} for a, b in result.items() for i in b])
print(df)
Output:
Year Rank Team Total 5-Stars 4-Stars 3-Stars Ave Points
0 2024 N/A Iowa 1 0 0 0 0.00 0.00
1 2024 N/A Florida State 3 0 0 0 0.00 0.00
2 2024 N/A BYU 1 0 0 0 0.00 0.00
3 2023 1 Georgia 4 0 4 0 93.86 93.65
4 2023 3 Notre Dame 2 1 1 0 95.98 51.82
... ... ... ... ... ... ... ... ... ...
3543 2000 N/A NC State 18 0 0 0 70.00 0.00
3544 2000 N/A Colorado State 14 0 0 0 70.00 0.00
3545 2000 N/A Oregon 27 0 0 0 70.00 0.00
3546 2000 N/A California 25 0 0 0 70.00 0.00
3547 2000 N/A Texas Tech 20 0 0 0 70.00 0.00
[3548 rows x 9 columns]
Edit: instead of using selenium, you can send requests to the API endpoints that the site uses to retrieve and display the ranking data:
import requests, pandas as pd
from bs4 import BeautifulSoup as soup

def extract_rankings(source):
    return [{"Rank": i.select_one('div.wrapper .rank-column .other').get_text(strip=True),
             "Team": i.select_one('.team').get_text(strip=True),
             "Total": i.select_one('.total').get_text(strip=True).split()[0],
             "5-Stars": i.select_one('.star-commits-list li:nth-of-type(1) div').get_text(strip=True),
             "4-Stars": i.select_one('.star-commits-list li:nth-of-type(2) div').get_text(strip=True),
             "3-Stars": i.select_one('.star-commits-list li:nth-of-type(3) div').get_text(strip=True),
             "Ave": i.select_one('.avg').get_text(strip=True),
             "Points": i.select_one('.points').get_text(strip=True),
             }
            for i in soup(source, 'html.parser').select("""li.rankings-page__list-item""")]

def year_rankings(year):
    page, results = 1, []
    vals = extract_rankings(requests.get(f'https://247sports.com/Season/{year}-Football/CompositeTeamRankings/?ViewPath=~%2FViews%2FSkyNet%2FInstitutionRanking%2F_SimpleSetForSeason.ascx&Page={page}', headers={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}).text)
    while vals:
        results.extend(vals)
        page += 1
        vals = extract_rankings(requests.get(f'https://247sports.com/Season/{year}-Football/CompositeTeamRankings/?ViewPath=~%2FViews%2FSkyNet%2FInstitutionRanking%2F_SimpleSetForSeason.ascx&Page={page}', headers={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}).text)
    return results

results = {y: year_rankings(y) for y in range(2000, 2025)}
df = pd.DataFrame([{'Year': a, **i} for a, b in results.items() for i in b])
print(df)
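If numeric columns are needed for sorting or analysis (as with pd.to_numeric in the question), a small follow-up sketch; errors='coerce' is used because some ranks come back as 'N/A':

num_cols = ['Total', '5-Stars', '4-Stars', '3-Stars', 'Ave', 'Points']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')
df['Rank'] = pd.to_numeric(df['Rank'], errors='coerce')  # 'N/A' becomes NaN
df = df.sort_values(['Year', 'Ave'], ascending=[False, False]).reset_index(drop=True)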
Background
I have five years of NO2 measurement data, in CSV files, one file for every location and year. I have loaded all the files into pandas dataframes in the same format:
Date Hour Location NO2_Level
0 01/01/2016 00 Street 18
1 01/01/2016 01 Street 39
2 01/01/2016 02 Street 129
3 01/01/2016 03 Street 76
4 01/01/2016 04 Street 40
Goal
For each dataframe count the number of times NO2_Level is greater than 150 and output this.
So I wrote a loop that creates all the dataframes from the right directories and cleans them appropriately.
Problem
Whatever I've tried produces results that I know on inspection are incorrect, e.g.:
- the count value for every location in a given year is the same (possible but unlikely)
- for a year where I know there should be a positive count, every location returns 0
What I've tried
I have tried a lot of approaches to getting this value for each dataframe, such as making the column a series:
NO2_Level = pd.Series(df['NO2_Level'])
count = (NO2_Level > 150).sum()
Using pd.count():
count = df[df['NO2_Level'] >= 150].count()
These two approaches have gotten closest to what I want to output
Example to test on
data = {'Date': ['01/01/2016', '01/02/2016', '01/03/2016', '01/04/2016', '01/05/2016'],
        'Hour': ['00', '01', '02', '03', '04'],
        'Location': ['Street', 'Street', 'Street', 'Street', 'Street'],
        'NO2_Level': [18, 39, 129, 76, 40]}
df = pd.DataFrame(data=data)
NO2_Level = pd.Series(df['NO2_Level'])
count = (NO2_Level > 150).sum()
count
Expected Outputs
So from this, I'm trying to get a single output line for each dataframe that was made, in the format Location, year, count (of the condition):
Kirkstall Road,2013,47
Haslewood Close,2013,97
...
Jack Lane Hunslet,2015,158
So the above example would produce
Street, 2016, 1
Actual
Every year produces the same result for each location, and for some years (2014) the count doesn't seem to work at all when, on inspection, it should be positive:
Kirkstall Road,2013,47
Haslewood Close,2013,47
Tilbury Terrace,2013,47
Corn Exchange,2013,47
Temple Newsam,2014,0
Queen Street Morley,2014,0
Corn Exchange,2014,0
Tilbury Terrace,2014,0
Haslewood Close,2015,43
Tilbury Terrace,2015,43
Corn Exchange,2015,43
Jack Lane Hunslet,2015,43
Norman Rows,2015,43
Hopefully this helps.
import pandas as pd

ddict = {
    'Date': ['2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-02'],
    'Hour': ['00', '01', '02', '03', '04', '02'],
    'Location': ['Street', 'Street', 'Street', 'Street', 'Street', 'Street'],
    'N02_Level': [19, 39, 129, 76, 40, 151],
}
df = pd.DataFrame(ddict)

# Convert dates to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Make a Year column
df['Year'] = df['Date'].apply(lambda x: x.strftime('%Y'))

# Group by location and year, count rows where N02_Level > 150
df1 = df[df['N02_Level'] > 150].groupby(['Location', 'Year']).size().reset_index(name='Count')

# Iterate the results
for i in range(len(df1)):
    loc = df1['Location'][i]
    yr = df1['Year'][i]
    cnt = df1['Count'][i]
    print(f'{loc},{yr},{cnt}')

### To not use f-strings
for i in range(len(df1)):
    print('{loc},{yr},{cnt}'.format(loc=df1['Location'][i], yr=df1['Year'][i], cnt=df1['Count'][i]))
Sample data:
Date Hour Location N02_Level
0 2016-01-01 00 Street 19
1 2016-01-01 01 Street 39
2 2016-01-01 02 Street 129
3 2016-01-01 03 Street 76
4 2016-01-01 04 Street 40
5 2016-01-02 02 Street 151
Output:
Street,2016,1
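Since the question describes several dataframes (one per location and year), the same idea extends to all of them by concatenating first; a minimal sketch, assuming a list called dataframes holding the frames loaded from the CSV files and day-first dates as in the sample:

import pandas as pd

# `dataframes` is assumed to be the list of per-location/per-year frames,
# all with Date/Hour/Location/NO2_Level columns as in the question
all_df = pd.concat(dataframes, ignore_index=True)
all_df['Year'] = pd.to_datetime(all_df['Date'], dayfirst=True).dt.strftime('%Y')

counts = (all_df[all_df['NO2_Level'] > 150]
          .groupby(['Location', 'Year'])
          .size()
          .reset_index(name='Count'))

for loc, yr, cnt in counts.itertuples(index=False):
    print(f'{loc},{yr},{cnt}')

Note that location/year combinations with no exceedances simply won't appear in counts; reindex against the full set of combinations if explicit zeros are needed.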
Here is a solution with a randomly generated sample:
import numpy as np
import pandas as pd

def random_dates(start, end, n):
    start_u = start.value // 10 ** 9
    end_u = end.value // 10 ** 9
    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

location = ['street', 'avenue', 'road', 'town', 'campaign']
df = pd.DataFrame({'Date': random_dates(pd.to_datetime('2015-01-01'), pd.to_datetime('2018-12-31'), 20),
                   'Location': np.random.choice(location, 20),
                   'NOE_level': np.random.randint(low=130, high=200, size=20)})

# Keep only year for Date
df['Date'] = df['Date'].dt.strftime("%Y")
print(df)

df = df.groupby(['Location', 'Date'])['NOE_level'].apply(lambda x: (x > 150).sum()).reset_index(name='count')
print(df)
Example df generated:
Date Location NOE_level
0 2018 town 191
1 2017 campaign 187
2 2017 town 137
3 2016 avenue 148
4 2017 campaign 195
5 2018 town 181
6 2018 road 187
7 2018 town 184
8 2016 town 155
9 2016 street 183
10 2018 road 136
11 2017 road 171
12 2018 street 165
13 2015 avenue 193
14 2016 campaign 170
15 2016 street 132
16 2016 campaign 165
17 2015 road 161
18 2018 road 161
19 2015 road 140
output:
Location Date count
0 avenue 2015 1
1 avenue 2016 0
2 campaign 2016 2
3 campaign 2017 2
4 road 2015 1
5 road 2017 1
6 road 2018 2
7 street 2016 1
8 street 2018 1
9 town 2016 1
10 town 2017 0
11 town 2018 3