scrape tennis results including tournament for each match row - python

I want to scrape tennis matches results from this website
The results table I want has the columns: tournament_name match_time player_1 player_2 player_1_score player_2_score
This is an example
tournament_name match_time player_1 player_2 p1_set1 p2_set1
Roma / Italy 11:00 Krajinovic Filip Auger Aliassime Felix 6 4
Iasi (IX) / Romania 10:00 Bourgue Mathias Martineau Matteo 6 1
I can't associate each tournament name in the id="main_tour" cell with each row (one row is two class="match" or two class="match1" rows).
I tried this code:
import requests
from bs4 import BeautifulSoup

u = "http://www.tennisprediction.com/?year=2020&month=9&day=14"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

session = requests.Session()
r = session.get(u, timeout=30, headers=headers)
# print(r.status_code)
soup = BeautifulSoup(r.content, 'html.parser')

# NOTE(review): the page reportedly uses id="main_tour" for the tournament
# cells; the '#main_tur' selector below looks like a typo -- confirm against
# the actual HTML before relying on it.
for table in soup.select('#main_tur'):
    # Tournament label: first row's text up to the first '/'.
    tourn_value = [i.get_text(strip=True) for i in table.select('tr:nth-child(1)')][0].split('/')[0].strip()
    tourn_name = [i.get_text(strip=True) for i in table.select('tr td#main_tour')]
    # Each match spans two rows of the same class ('match' or 'match1').
    row = [i.get_text(strip=True) for i in table.select('.match')]
    row2 = [i.get_text(strip=True) for i in table.select('.match1')]
    print(tourn_value, tourn_name)

You can use this script to save the table to CSV in your format:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'http://www.tennisprediction.com/?year=2020&month=9&day=14'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

all_data = []
for time_cell in soup.select('.main_time'):
    # The two player cells follow the time cell in document order.
    first_player = time_cell.find_next(class_='main_player')
    second_player = first_player.find_next(class_='main_player')
    # Walk backwards to the tournament header this match belongs to.
    tournament = time_cell.find_previous(id='main_tour')

    match_row = time_cell.parent            # player 1's row
    next_row = match_row.find_next_sibling()  # player 2's row

    record = {
        'tournament_name': ' / '.join(a.text for a in tournament.select('a')),
        'match_time': time_cell.text,
        'player_1': first_player.get_text(strip=True, separator=' '),
        'player_2': second_player.get_text(strip=True, separator=' '),
    }
    # Set scores live in '.main_res' cells, one row per player.
    for n, cell in enumerate(match_row.select('.main_res'), start=1):
        record['player_1_set{}'.format(n)] = cell.get_text(strip=True)
    for n, cell in enumerate(next_row.select('.main_res'), start=1):
        record['player_2_set{}'.format(n)] = cell.get_text(strip=True)
    all_data.append(record)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)
Saves data.csv:
EDIT: To add Odd, Prob columns for both players:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'http://www.tennisprediction.com/?year=2020&month=9&day=14'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

all_data = []
for time_cell in soup.select('.main_time'):
    # Player cells follow the time cell; tournament header precedes it.
    first_player = time_cell.find_next(class_='main_player')
    second_player = first_player.find_next(class_='main_player')
    tournament = time_cell.find_previous(id='main_tour')

    match_row = time_cell.parent              # player 1's row
    next_row = match_row.find_next_sibling()  # player 2's row

    # Odds / win-probability cells: player 1's sit in the first row,
    # player 2's in the following row.
    odd1 = time_cell.find_next(class_='main_odds_m')
    odd2 = next_row.find_next(class_='main_odds_m')
    prob1 = time_cell.find_next(class_='main_perc')
    prob2 = next_row.find_next(class_='main_perc')

    record = {
        'tournament_name': ' / '.join(a.text for a in tournament.select('a')),
        'match_time': time_cell.text,
        'player_1': first_player.get_text(strip=True, separator=' '),
        'player_2': second_player.get_text(strip=True, separator=' '),
        'odd1': odd1.text,
        'prob1': prob1.text,
        'odd2': odd2.text,
        'prob2': prob2.text,
    }
    # Per-set scores, one '.main_res' cell per set.
    for n, cell in enumerate(match_row.select('.main_res'), start=1):
        record['player_1_set{}'.format(n)] = cell.get_text(strip=True)
    for n, cell in enumerate(next_row.select('.main_res'), start=1):
        record['player_2_set{}'.format(n)] = cell.get_text(strip=True)
    all_data.append(record)

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
print(df)

Andrej's solution is really nice and elegant. Accept his solution, but here was my go at it:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'http://www.tennisprediction.com/?year=2020&month=9&day=14'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

rows = []
# Each match is rendered as two consecutive <tr> rows (one per player),
# with blocks alternating between class="match" and class="match1".
for matchClass in ['match', 'match1']:
    # BUG FIX: the original hard-coded 'match' here, so class="match1" rows
    # were never scraped and the class="match" rows were collected twice.
    matches = soup.find_all('tr', {'class': matchClass})
    for idx, match in enumerate(matches):
        if idx % 2 != 0:
            # Odd-indexed rows are player 2's half of the previous match.
            continue
        row = {}
        tourny = match.find_previous('td', {'id': 'main_tour'}).text
        time = match.find('td', {'class': 'main_time'}).text
        p1 = match.find('td', {'class': 'main_player'})
        row.update({'tournament_name': tourny, 'match_time': time, 'player_1': p1.text})
        for set_idx, each_set in enumerate(match.find_all('td', {'class': 'main_res'})):
            row.update({'p1_set%d' % (set_idx + 1): each_set.text})
        # BUG FIX: match.find_next('td', ...) found player 1's own cell again,
        # which is why the original output showed player_2 == player_1.
        # Player 2 lives in the *next* row of the same class.
        p2_row = match.find_next_sibling('tr', {'class': matchClass})
        p2 = p2_row.find('td', {'class': 'main_player'})
        row.update({'player_2': p2.text})
        for set_idx, each_set in enumerate(p2_row.find_all('td', {'class': 'main_res'})):
            row.update({'p2_set%d' % (set_idx + 1): each_set.text})
        rows.append(row)

df = pd.DataFrame(rows)
Output:
print(df.head(10).to_string())
tournament_name match_time player_1 p1_set1 p1_set2 p1_set3 p1_set4 p1_set5 player_2 p2_set1 p2_set2 p2_set3 p2_set4 p2_set5
0 Roma / Italy prize / money : 5791 000 USD 11:10 Krajinovic Filip (SRB) (26) 6 7 Krajinovic Filip (SRB) (26) 4 5
1 Roma / Italy prize / money : 5791 000 USD 13:15 Dimitrov Grigor (BGR) (20) 7 6 Dimitrov Grigor (BGR) (20) 5 1
2 Roma / Italy prize / money : 5791 000 USD 13:50 Coric Borna (HRV) (32) 6 6 Coric Borna (HRV) (32) 4 4
3 Roma / Italy prize / money : 5791 000 USD 15:30 Humbert Ugo (FRA) (42) 6 7 Humbert Ugo (FRA) (42) 3 6 (5)
4 Roma / Italy prize / money : 5791 000 USD 19:00 Nishikori Kei (JPN) (34) 6 7 Nishikori Kei (JPN) (34) 4 6 (3)
5 Roma / Italy prize / money : 5791 000 USD 22:00 Travaglia Stefano (ITA) (87) 6 7 Travaglia Stefano (ITA) (87) 4 6 (4)
6 Iasi (IX) / Romania prize / money : 100 000 USD 10:05 Menezes Joao (BRA) (189) 6 6 Menezes Joao (BRA) (189) 4 4
7 Iasi (IX) / Romania prize / money : 100 000 USD 12:05 Cretu Cezar (2001) (ROU) 2 6 6 Cretu Cezar (2001) (ROU) 6 3 4
8 Iasi (IX) / Romania prize / money : 100 000 USD 14:35 Zuk Kacper (POL) (306) 6 6 Zuk Kacper (POL) (306) 2 0
9 Roma / Italy prize / money : 3452 000 USD 11:05 Pavlyuchenkova Anastasia (RUS) (32) 6 6 6 Pavlyuchenkova Anastasia (RUS) (32) 4 7 (5) 1

Related

How to scrape this football page?

https://fbref.com/en/partidas/25d5b9bd/Coritiba-Cuiaba-2022Julho25-Serie-A
I want to scrape the Team Stats, such as Possession and Shots on Target, and also what is below them, like Fouls, Corners...
What I have now is very over complicated code, basically stripping and splitting multiple times this string to grab the values I want.
#getting a general info dataframe with all matches
# getting a general info dataframe with all matches
championship_url = 'https://fbref.com/en/comps/24/1495/schedule/2016-Serie-A-Scores-and-Fixtures'
# BUG FIX: the original called requests.get(URL) -- `URL` is undefined; the
# variable defined above is `championship_url`.
data = requests.get(championship_url)
time.sleep(3)
matches = pd.read_html(data.text, match="Resultados e Calendários")[0]

# putting stats info in each match entry (this is an example match to test)
match_url = 'https://fbref.com/en/partidas/25d5b9bd/Coritiba-Cuiaba-2022Julho25-Serie-A'
data = requests.get(match_url)
time.sleep(3)
soup = BeautifulSoup(data.text, features='lxml')

# ID the match to merge later on
home_team = soup.find("h1").text.split()[0]
round_week = float(soup.find("div", {'id': 'content'}).text.split()[18].strip(')'))

# collecting stats
# NOTE(review): .split() turns these into *lists*, but the slicing code
# further down calls .replace(...) on `stats` as if it were the raw string
# (see the commented example) -- one of the two must change; confirm intent.
stats = soup.find("div", {"id": "team_stats"}).text.split()[5:]  # first part of stats with the progress bars
stats_extra = soup.find("div", {"id": "team_stats_extra"}).text.split()[2:]  # second part

all_stats = {'posse_casa': [], 'posse_fora': [], 'chutestotais_casa': [], 'chutestotais_fora': [],
             'acertopasses_casa': [], 'acertopasses_fora': [], 'chutesgol_casa': [], 'chutesgol_fora': [],
             'faltas_casa': [], 'faltas_fora': [], 'escanteios_casa': [], 'escanteios_fora': [],
             'cruzamentos_casa': [], 'cruzamentos_fora': [], 'contatos_casa': [], 'contatos_fora': [],
             'botedef_casa': [], 'botedef_fora': [], 'aereo_casa': [], 'aereo_fora': [],
             'defesas_casa': [], 'defesas_fora': [], 'impedimento_casa': [], 'impedimento_fora': [],
             'tirometa_casa': [], 'tirometa_fora': [], 'lateral_casa': [], 'lateral_fora': [],
             'bolalonga_casa': [], 'bolalonga_fora': [], 'Em casa': [home_team], 'Sem': [round_week]}

# not gonna copy everything but is kinda like this for each stat
# stats = '\nEstatísticas do time\n\n\nCoritiba \n\n\n\t\n\n\n\n\n\n\n\n\n\n Cuiabá\n\nPosse\n\n\n\n42%\n\n\n\n\n\n58%\n\n\n\n\nChutes ao gol\n\n\n\n2 of 4\xa0—\xa050%\n\n\n\n\n\n0%\xa0—\xa00 of 8\n\n\n\n\nDefesas\n\n\n\n0 of 0\xa0—\xa0%\n\n\n\n\n\n50%\xa0—\xa01 of 2\n\n\n\n\nCartões\n\n\n\n\n\n\n\n\n\n\n\n\n\n'
# first grabbing 42% possession
all_stats['posse_casa'] = stats.replace('\n', '').replace('\t', '')[20:].split('Posse')[1][:5].split('%')[0]
# grabbing 58% possession
all_stats['posse_fora'] = stats.replace('\n', '').replace('\t', '')[20:].split('Posse')[1][:5].split('%')[1]

all_stats_df = pd.DataFrame.from_dict(all_stats)
championship_data = matches.merge(all_stats_df, on=['Em casa', 'Sem'])
There are a lot of stats in that dic bc in previous championship years, FBref has all those stats, only in the current year championship there is only 12 of them to fill. I do intend to run the code in 5-6 different years, so I made a version with all stats, and in current year games I intend to fill with nothing when there's no stat in the page to scrap.
You can get Fouls, Corners and Offsides and 7 tables worth of data from that page with the following code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://fbref.com/en/partidas/25d5b9bd/Coritiba-Cuiaba-2022Julho25-Serie-A'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')


def team_stat(label):
    """Return (home, away) text values flanking the <div> whose text is *label*."""
    # IMPROVEMENT: the original ran the identical soup.find(...) twice per
    # stat (once for each sibling); look the label div up once instead.
    cell = soup.find('div', string=label)
    return cell.previous_sibling.text.strip(), cell.next_sibling.text.strip()


coritiba_fouls, cuiaba_fouls = team_stat('Fouls')
coritiba_corners, cuiaba_corners = team_stat('Corners')
coritiba_offsides, cuiaba_offsides = team_stat('Offsides')

print('Coritiba Fouls: ' + coritiba_fouls, 'Cuiaba Fouls: ' + cuiaba_fouls)
print('Coritiba Corners: ' + coritiba_corners, 'Cuiaba Corners: ' + cuiaba_corners)
print('Coritiba Offsides: ' + coritiba_offsides, 'Cuiaba Offsides: ' + cuiaba_offsides)

# pd.read_html pulls every <table> on the page (lineups, stats, etc.).
dfs = pd.read_html(r.text)
print('Number of tables: ' + str(len(dfs)))
for df in dfs:
    print(df)
    print('___________')
This will print in the terminal:
Coritiba Fouls: 16 Cuiaba Fouls: 12
Coritiba Corners: 4 Cuiaba Corners: 4
Coritiba Offsides: 0 Cuiaba Offsides: 1
Number of tables: 7
Coritiba (4-2-3-1) Coritiba (4-2-3-1).1
0 23 Alex Muralha
1 2 Matheus Alexandre
2 3 Henrique
3 4 Luciano Castán
4 6 Egídio Pereira Júnior
5 9 Léo Gamalho
6 11 Alef Manga
7 25 Bernanrdo Lemes
8 78 Régis
9 97 Valdemir
10 98 Igor Paixão
11 Bench Bench
12 21 Rafael William
13 5 Guillermo de los Santos
14 15 Matías Galarza
15 16 Natanael
16 18 Guilherme Biro
17 19 Thonny Anderson
18 28 Pablo Javier García
19 32 Bruno Gomes
20 44 Márcio Silva
21 52 Adrián Martínez
22 75 Luiz Gabriel
23 88 Hugo
___________
Cuiabá (4-1-4-1) Cuiabá (4-1-4-1).1
0 1 Walter
1 2 João Lucas
2 3 Joaquim
3 4 Marllon Borges
4 5 Camilo
5 6 Igor Cariús
6 7 Alesson
7 8 João Pedro Pepê
8 9 Valdívia
9 10 Rodriguinho Marinho
10 11 Rafael Gava
11 Bench Bench
12 12 João Carlos
13 13 Daniel Guedes
14 14 Paulão
15 15 Marcão Silva
16 16 Cristian Rivas
17 17 Gabriel Pirani
18 18 Jenison
19 19 André
20 20 Kelvin Osorio
21 21 Jonathan Cafu
22 22 André Luis
23 23 Felipe Marques
___________
Coritiba Cuiabá
Possession Possession
0 42% 58%
1 Shots on Target Shots on Target
2 2 of 4 — 50% 0% — 0 of 8
3 Saves Saves
4 0 of 0 — % 50% — 1 of 2
5 Cards Cards
6 NaN NaN
_____________
[....]

How to scrape all data from first page to last page using beautifulsoup

I have been trying to scrape all data from the first page to the last page, but it returns only the first page as the output. How can I solve this? Below is my code:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

pages = np.arange(2, 1589, 20)
for page in pages:
    # FIX: the original rebound the loop variable `page` to the Response
    # object, and shadowed the builtin `list` below; both renamed.
    response = requests.get(
        "https://estateintel.com/app/projects/search?q=%7B%22sectors%22%3A%5B%22residential%22%5D%7D&page=" + str(page))
    sleep(randint(2, 10))
    soup = BeautifulSoup(response.content, 'html.parser')
    cards = soup.find_all('div', class_="project-card-vertical h-full flex flex-col rounded border-thin border-inactive-blue overflow-hidden pointer")
    for card in cards:
        title = card.find('p', class_="project-location text-body text-base mb-3").text.replace('\n', '').strip()
        location = card.find('span', class_="text-gray-1").text.replace('\n', '').strip()
        status = card.find('span', class_="text-purple-1 font-bold").text.replace('\n', '').strip()
        units = card.find('span', class_="text-body font-semibold").text.replace('\n', '').strip()
        info = [title, location, status, units]
        print(info)
The page is loaded dynamically using the API. Therefore, with a regular GET request, you will always get the first page. You need to study how the page communicates with the browser and find the request you need, I wrote an example for review.
import json
import requests


def get_info(page):
    """Print name, location, status and size for one page of the properties API."""
    url = f"https://services.estateintel.com/api/v2/properties?type\\[\\]=residential&page={page}"
    headers = {
        'accept': 'application/json',
        'authorization': 'false',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    response = requests.request("GET", url, headers=headers)
    payload = json.loads(response.text)
    for entry in payload['data']:
        print(entry['name'])
        print(entry['area'], entry['state'])
        print(entry['status'])
        print(entry['size']['value'], entry['size']['unit'])
        print('------')


# Walk every page of the paginated API.
for page_number in range(1, 134):
    get_info(page_number)
You can choose the fields you need, this is just an example, also add to dataframe. Output:
Twin Oaks Apartment
Kilimani Nairobi
Completed
0 units
------
Duchess Park
Lavington Nairobi
Completed
62 units
------
Greenvale Apartments
Kileleshwa Nairobi
Completed
36 units
------
The Urban apartments & Suites
Osu Greater Accra
Completed
28 units
------
Chateau Towers
Osu Greater Accra
Completed
120 units
------
Cedar Haus Gardens
Oluyole Oyo
Under Construction
38 units
------
10 Agoro Street
Oluyole Oyo
Completed
1 units
..............
Think it is working well, but needs the time to sleep - Just in case, you could select your elements more specific e.g. with css selectors and store information in a list of dicts instead just printing it.
Example
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

# Collect one dict per project card, then build the frame in one go.
data = []
for page_no in range(1, 134):
    print(page_no)
    resp = requests.get("https://estateintel.com/app/projects/search?q=%7B%22sectors%22%3A%5B%22residential%22%5D%7D&page=" + str(page_no))
    sleep(randint(2, 10))
    soup = BeautifulSoup(resp.content, 'html.parser')
    for item in soup.select('div.project-grid > a'):
        record = {
            'title': item.h3.text.strip(),
            'location': item.find('span', class_="text-gray-1").text.strip(),
            'status': item.find('span', class_="text-purple-1 font-bold").text.strip(),
            'units': item.find('span', class_="text-body font-semibold").text.strip(),
        }
        data.append(record)

pd.DataFrame(data)
Output
title
location
status
units
0
Twin Oaks Apartment
Kilimani, Nairobi
Completed
Size: --
1
Duchess Park
Lavington, Nairobi
Completed
Size: 62 units
2
Greenvale Apartments
Kileleshwa, Nairobi
Completed
Size: 36 units
3
The Urban apartments & Suites
Osu, Greater Accra
Completed
Size: 28 units
4
Chateau Towers
Osu, Greater Accra
Completed
Size: 120 units
5
Cedar Haus Gardens
Oluyole, Oyo
Under Construction
Size: 38 units
6
10 Agoro Street
Oluyole, Oyo
Completed
Size: 1 units
7
Villa O
Oluyole, Oyo
Completed
Size: 2 units
8
Avenue Road Apartments
Oluyole, Oyo
Completed
Size: 6 units
9
15 Alafia Street
Oluyole, Oyo
Completed
Size: 4 units
10
12 Saint Mary Street
Oluyole, Oyo
Nearing Completion
Size: 8 units
11
RATCON Estate
Oluyole, Oyo
Completed
Size: --
12
1 Goodwill Road
Oluyole, Oyo
Completed
Size: 4 units
13
Anike's Court
Oluyole, Oyo
Completed
Size: 3 units
14
9 Adeyemo Quarters
Oluyole, Oyo
Completed
Size: 4 units
15
Marigold Residency
Nairobi West, Nairobi
Under Construction
Size: --
16
Kings Distinction
Kilimani, Nairobi
Completed
Size: --
17
Riverview Apartments
Kyumvi, Machakos
Completed
Size: --
18
Serene Park
Kyumvi, Machakos
Under Construction
Size: --
19
Gitanga Duplexes
Lavington, Nairobi
Under Construction
Size: 36 units
20
Westpointe Apartments
Upper Hill, Nairobi
Completed
Size: 254 units
21
10 Olaoluwa Street
Oluyole, Oyo
Under Construction
Size: 12 units
22
Rosslyn Grove
Nairobi West, Nairobi
Under Construction
Size: 90 units
23
7 Kamoru Ajimobi Street
Oluyole, Oyo
Completed
Size: 2 units
#pip install trio httpx pandas
# pip install trio httpx pandas
import trio
import httpx
import pandas as pd

allin = []
keys1 = ['name', 'area', 'state']
keys2 = ['value', 'unit']


async def scraper(client, page):
    """Fetch one API page and append [name, area, state, value, unit] rows to allin."""
    # BUG FIX: the original mutated the *shared* client.params from every
    # concurrent task, so parallel requests could race on the page number.
    # Per-request params are merged with the client's base params instead.
    r = await client.get('/properties', params={'page': page})
    allin.extend([[i.get(k, 'N/A') for k in keys1] +
                  [i['size'].get(b, 'N/A') for b in keys2]
                  for i in r.json()['data']])


async def main():
    async with httpx.AsyncClient(timeout=None, base_url='https://services.estateintel.com/api/v2') as client, trio.open_nursery() as nurse:
        client.params = {
            'type[]': 'residential'
        }
        for page in range(1, 3):
            nurse.start_soon(scraper, client, page)
    # All tasks have finished once the nursery block exits.
    # BUG FIX: columns=[keys1 + keys2] wrapped the names in an extra list;
    # pass the flat 5-element list so column labels are plain strings.
    df = pd.DataFrame(allin, columns=keys1 + keys2)
    print(df)


if __name__ == "__main__":
    trio.run(main)
Output:
0 Cedar Haus Gardens Oluyole Oyo 38 units
1 10 Agoro Street Oluyole Oyo 1 units
2 Villa O Oluyole Oyo 2 units
3 Avenue Road Apartments Oluyole Oyo 6 units
4 15 Alafia Street Oluyole Oyo 4 units
5 12 Saint Mary Street Oluyole Oyo 8 units
6 RATCON Estate Oluyole Oyo 0 units
7 1 Goodwill Road Oluyole Oyo 4 units
8 Anike's Court Oluyole Oyo 3 units
9 9 Adeyemo Quarters Oluyole Oyo 4 units
10 Marigold Residency Nairobi West Nairobi 0 units
11 Riverview Apartments Kyumvi Machakos 0 units
12 Socian Villa Apartments Kileleshwa Nairobi 36 units
13 Kings Pearl Residency Lavington Nairobi 55 units
14 Touchwood Gardens Kilimani Nairobi 32 units
15 Panorama Apartments Upper Hill Nairobi 0 units
16 Gitanga Duplexes Lavington Nairobi 36 units
17 Serene Park Kyumvi Machakos 25 units
18 Kings Distinction Kilimani Nairobi 48 units
19 Twin Oaks Apartment Kilimani Nairobi 0 units
20 Duchess Park Lavington Nairobi 70 units
21 Greenvale Apartments Kileleshwa Nairobi 36 units
22 The Urban apartments & Suites Osu Greater Accra 28 units
23 Chateau Towers Osu Greater Accra 120 units

BeautifulSoup - find + iterate through a table

I am having some trouble trying to cleanly iterate through a table of sold property listings using BeautifulSoup.
In this example
Some rows in the main table are irrelevant (like "set search filters")
The rows have unique IDs
Have tried getting the rows using a style attribute, but this did not return results.
What would be the best approach to get just the rows for sold properties out of that table?
End goal is to pluck out the sold price; date of sale; # bedrooms/bathrooms/car; land area and append into a pandas dataframe.
from bs4 import BeautifulSoup
import requests

# Globals
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
url = 'http://house.ksou.cn/p.php?q=West+Footscray%2C+VIC'

# FIX: the original fetched and re-parsed the same page twice back-to-back;
# one request/parse is enough.
r = requests.get(url, headers=headers)
c = r.content
soup = BeautifulSoup(c, "html.parser")

prop_table = soup.find('table', id="mainT")
rows = prop_table.findAll('tr')
for row in rows:
    print(row.text)
This HTML is tricky to parse, because it doesn't have fixed structure. Unfortunately, I don't have pandas installed, so I only print the data to the screen:
import requests
from bs4 import BeautifulSoup

url = 'http://house.ksou.cn/p.php?q=West+Footscray&p={page}&s=1&st=&type=&count=300&region=West+Footscray&lat=0&lng=0&sta=vic&htype=&agent=0&minprice=0&maxprice=0&minbed=0&maxbed=0&minland=0&maxland=0'

data = []
for page in range(0, 2):  # <-- increase to number of pages you want to crawl
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'html.parser')
    # Each listing is its own <table> whose id starts with "r".
    for table in soup.select('table[id^="r"]'):
        addr = table.select_one('span.addr')
        price_tag = addr.find_next('b')
        name = addr.text
        price = price_tag.get_text(strip=True).split()[-1]
        sold = price_tag.find_next_sibling(text=True).replace('in', '').replace('(Auction)', '').strip()

        # Bed/bath/car counts sit as text just before their icon images;
        # listings may omit any of them.
        beds_img = table.select_one('img[alt="Bed rooms"]')
        beds = beds_img.find_previous_sibling(text=True).strip() if beds_img else '-'
        bath_img = table.select_one('img[alt="Bath rooms"]')
        bath = bath_img.find_previous_sibling(text=True).strip() if bath_img else '-'
        car_img = table.select_one('img[alt="Car spaces"]')
        car = car_img.find_previous_sibling(text=True).strip() if car_img else '-'

        land_tag = table.select_one('b:contains("Land size:")')
        land = land_tag.find_next_sibling(text=True).split()[0] if land_tag else '-'
        building_tag = table.select_one('b:contains("Building size:")')
        building = building_tag.find_next_sibling(text=True).split()[0] if building_tag else '-'

        data.append([name, price, sold, beds, bath, car, land, building])

# print the data
print('{:^25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format('Name', 'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'))
for row in data:
    print('{:<25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format(*row))
Prints:
Name Price Sold Beds Bath Car Land Building
51 Fontein Street $770,000 07 Dec 2019 - - - - -
50 Fontein Street $751,000 07 Dec 2019 - - - - -
9 Wellington Street $1,024,999 Dec 2019 2 1 1 381 -
239 Essex Street $740,000 07 Dec 2019 2 1 1 358 101
677a Barkly Street $780,000 Dec 2019 4 1 - 380 -
23A Busch Street $800,000 30 Nov 2019 3 1 1 215 -
3/2-4 Dyson Street $858,000 Nov 2019 3 2 - 378 119
3/101 Stanhope Street $803,000 30 Nov 2019 2 2 2 168 113
2/4 Rondell Avenue $552,500 30 Nov 2019 2 - - 1,088 -
3/2 Dyson Street $858,000 30 Nov 2019 3 2 2 378 -
9 Vine Street $805,000 Nov 2019 2 1 2 318 -
39 Robbs Road $957,000 23 Nov 2019 2 2 - 231 100
29 Robbs Road $1,165,000 Nov 2019 2 1 1 266 -
5 Busch Street $700,000 Nov 2019 2 1 1 202 -
46 Indwe Street $730,000 16 Nov 2019 3 1 1 470 -
29/132 Rupert Street $216,000 16 Nov 2019 1 1 1 3,640 -
11/10 Carmichael Street $385,000 15 Nov 2019 2 1 1 1,005 -
2/16 Carmichael Street $515,000 14 Nov 2019 2 1 1 112 -
4/26 Beaumont Parade $410,000 Nov 2019 2 1 1 798 -
5/10 Carmichael Street $310,000 Nov 2019 1 1 1 1,004 -

Python cannot get table from stock exchange website

I used python 3 and beautiful soup 4 to parse the webpage from Hong Kong stock exchange. However, the table (ie: No. of listed companies...No. of listed H shares...) under "HONG KONG AND MAINLAND MARKET HIGHLIGHTS" cannot be extracted. Here is the link: "https://www.hkex.com.hk/Mutual-Market/Stock-Connect/Statistics/Hong-Kong-and-Mainland-Market-Highlights?sc_lang=en#select3=0&select2=10&select1=0"
Kindly advise.
My code:
import requests
from bs4 import BeautifulSoup
import csv
import sys
import os

result = requests.get("https://www.hkex.com.hk/Mutual-Market/Stock-Connect/Statistics/Hong-Kong-and-Mainland-Market-Highlights?sc_lang=en#select3=0&select2=10&select1=3")
result.raise_for_status()
result.encoding = "utf-8"

# Parse the raw page body and dump it for inspection.
soup = BeautifulSoup(result.content, 'lxml')
print(soup.prettify())
print(" ")
print("soup.pretty() printed")
print(" ")
wait = input("PRESS ENTER TO CONTINUE.")

# Look for every <table> in the static HTML (the highlights table is
# injected by JavaScript, so this list will not contain it).
table = soup.find_all('table')
print(table)
print(" ")
print("TABLE printed")
print(" ")
wait2 = input("PRESS ENTER TO CONTINUE.")
No need to render the page first, as you can get the data back in the json format. The tricky part is the json format is how to render the table (with the td tags and colspan tags, etc.). So there has to be a little work to be done to iterate through that, but not impossible to do:
import requests
import pandas as pd

# The highlights table is served as JSON by this endpoint; the page itself
# only renders it client-side.
url = 'https://www.hkex.com.hk/eng/csm/ws/Highlightsearch.asmx/GetData'
payload = {
    'LangCode': 'en',
    'TDD': '1',
    'TMM': '11',
    'TYYYY': '2019'}
jsonData = requests.get(url, params=payload).json()

final_df = pd.DataFrame()
for row in jsonData['data']:
    data_row = []
    for idx, colspan in enumerate(row['colspan']):
        colspan_int = int(colspan[0])
        # Repeat each cell so it covers every column its colspan occupies.
        data_row.append(row['td'][idx] * colspan_int)
    flat_list = [item for sublist in data_row for item in sublist]
    # BUG FIX: DataFrame.append is deprecated (removed in pandas 2.x);
    # concatenate instead.
    final_df = pd.concat([final_df, pd.DataFrame([flat_list])], sort=True).reset_index(drop=True)

# BUG FIX: the regex literal was broken across two source lines (a syntax
# error); it is one pattern. Also `date` was undefined -- build it from the
# requested day/month/year in the payload.
date = '{}/{}/{}'.format(payload['TDD'], payload['TMM'], payload['TYYYY'])
df = final_df[final_df[0].str.contains(r'Total market capitalisation(?!$)')].iloc[:, :2]
df['date'] = date
df.to_csv('file.csv', index=False)
Output:
print (final_df.to_string())
0 1 2 3 4 5 6
0 Hong Kong <br>Exchange (01/11/2019 ) Hong Kong <br>Exchange (01/11/2019 ) Shanghai Stock<br>Exchange (01/11/2019 ) Shanghai Stock<br>Exchange (01/11/2019 ) Shenzhen Stock<br>Exchange (01/11/2019 ) Shenzhen Stock<br>Exchange (01/11/2019 )
1 Main Board GEM A Share B Share A Share B Share
2 No. of listed companies 2,031 383 1,488 50 2,178 47
3 No. of listed H shares 256 22 n.a. n.a. n.a. n.a.
4 No. of listed red-chips stocks 170 5 n.a. n.a. n.a. n.a.
5 Total no. of listed securities 12,573 384 n.a. n.a. n.a. n.a.
6 Total market capitalisation<br>(Bil. dollars) HKD 31,956 HKD 109 RMB 32,945 RMB 81 RMB 22,237 RMB 50
7 Total negotiable <br>capitalisation (Bil. doll... n.a. n.a. RMB 28,756 RMB 81 RMB 16,938 RMB 49
8 Average P/E ratio (Times) 11.16 19.76 13.90 9.18 24.70 9.55
9 Total turnover <br>(Mil. shares) 196,082 560 15,881 15 22,655 14
10 Total turnover <br>(Mil. dollars) HKD 79,397 HKD 160 RMB 169,934 RMB 85 RMB 260,208 RMB 57
11 Total market turnover<br>(Mil. dollars) HKD 79,557 HKD 79,557 RMB 176,232 RMB 176,232 RMB 260,264 RMB 260,264

Scraping a table from the web omits certain values

I am working on a little coding project to help learn how webscraping works, and decided to extract a table from a fantasy football website I like, which can be found here. https://fantasydata.com/nfl/fantasy-football-leaders?position=1&team=1&season=2018&seasontype=1&scope=1&subscope=1&scoringsystem=2&aggregatescope=1&range=1
When I attempt to grab the table the first 10 rows come out okay, but starting with Brian Hill's row every value in my table comes up blank. I have inspected the web page as I usually do whenever I run into an issue, and the rows following Hill's seem to follow an identical structure to the ones before it. Any help with resolving the issue, and potentially explaining why it is happening in the first place, would be much appreciated!
import pandas
from bs4 import BeautifulSoup
from selenium import webdriver

URLA = 'https://fantasydata.com/nfl/fantasy-football-leaders?position='
URLB = '&team='
URLC = '&season='
URLD = '&seasontype=1&scope=1&subscope=1&scoringsystem=2&aggregatescope=1&range=3'
POSITIONNUMBER = [1, 6, 7]
TEAMNUMBER = [1]


def buildStatsTable(year):
    """Scrape the fantasy stats table for every team in TEAMNUMBER for *year*.

    Returns one pandas DataFrame with the columns listed in `headers`.
    """
    fullDF = pandas.DataFrame()
    fullLength = 0
    position = 1
    headers = ['Name', 'Team', 'Pos', 'GMS', 'PassingYards', 'PassingTDs', 'PassingINTs',
               'RushingYDs', 'RushingTDs', 'ReceivingRECs', 'ReceivingYDs', 'ReceivingTDs',
               'FUM LST', 'PPG', 'FPTS']
    for team in TEAMNUMBER:
        currURL = URLA + str(position) + URLB + str(team) + URLC + str(year) + URLD

        # Render the JS-driven page, then hand the DOM to BeautifulSoup.
        driver = webdriver.Chrome()
        driver.get(currURL)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.quit()

        # The grid renders player names and stat cells as two halves of the
        # same <tr role="row"> list; the second half starts at `offset`.
        row_tags = soup.findAll('tr', {'role': 'row'})
        length = len(row_tags)
        offset = length / 2
        maxCap = int((length - 1) / 2) + 1

        tableList = []
        for row_idx, name_row in enumerate(row_tags[2:maxCap]):
            player = name_row.get_text().split('\n', 2)[1]
            stats_row = row_tags[int(row_idx + offset + 1)]
            stat_values = [value.get_text() for value in stats_row.contents]
            tableList.append([player] + stat_values)

        fullLength += len(tableList)
        fullDF = fullDF.append(pandas.DataFrame(columns=headers, data=tableList))

    fullDF.index = list(range(0, fullLength))
    return fullDF


falcons = buildStatsTable(2018)
Actual Results (only showed the fist few columns to make the post shorter, the issue is consistent across every column)
Name Team Pos GMS PassingYards PassingTDs PassingINTs \
0 Matt Ryan ATL QB 16 4924 35 7
1 Julio Jones ATL WR 16 0 0 0
2 Calvin Ridley ATL WR 16 0 0 0
3 Tevin Coleman ATL RB 16 0 0 0
4 Mohamed Sanu ATL WR 16 5 1 0
5 Austin Hooper ATL TE 16 0 0 0
6 Ito Smith ATL RB 14 0 0 0
7 Justin Hardy ATL WR 16 0 0 0
8 Marvin Hall ATL WR 16 0 0 0
9 Logan Paulsen ATL TE 15 0 0 0
10 Brian Hill ATL RB
11 Devonta Freeman ATL RB
12 Russell Gage ATL WR
13 Eric Saubert ATL TE

Categories

Resources