How to scrape data from a paginated table? - python

I need help automating this web page: I want to get the data for all the players across the different pages.
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.mlb.com/es/stats/spring-training'
pagina = requests.get(url)
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})

encabezados = []
for i in table.find_all('th')[:18]:
    datos = i.find_all('button')
    for td in datos:
        titulo = td.text.strip()
        encabezados.append(titulo)

datos_mlb = pd.DataFrame(columns=encabezados)

nombres = []
for i in table.find_all('th')[18:]:
    datos = i.find_all('a')
    for td in datos:
        jugadores = td.text.strip()
        nombres.append(jugadores)

datos_mlb['JUGADOR'] = nombres

for fila in table.find_all('tr')[1:]:
    data = fila.find_all('td')
    data_fila = [td.text.strip() for td in data]
    largo = len(datos_mlb) - 1
    datos_mlb.iloc[:, 1:] = data_fila
I have managed to pull in most of the information; however, I cannot fill in the data correctly or iterate over all the pages.

Try using the structured data from the JSON response of the XHR request to create your dataframe. Inspect the network tab in your browser's devtools to get an idea of what parameters you should send and what you will get back:
import pandas as pd
import requests

data = []
for i in range(0, 175, 25):
    data.extend(
        requests.get(
            f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=25&offset={i}&sortStat=onBasePlusSlugging&order=desc',
            headers={'user-agent': 'Mozilla/5.0'}
        ).json()['stats']
    )

pd.DataFrame(data)
Output

     playerId        playerName  ...    type  atBatsPerHomeRun
0      502671  Paul Goldschmidt  ...  player               5.5
1      621439      Byron Buxton  ...  player               6.4
2      547180      Bryce Harper  ...  player              4.38
3      658668   Edward Olivares  ...  player             11.33
4      670351        Jose Rojas  ...  player                 9
..        ...               ...  ...     ...               ...
156    593871     Jorge Polanco  ...  player             32.00
157    676475     Alec Burleson  ...  player              -.--
158    608385      Jesse Winker  ...  player              -.--
159    641355    Cody Bellinger  ...  player              -.--
160    660162      Yoan Moncada  ...  player              -.--

[161 rows x 72 columns]
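The upper bound in range(0, 175, 25) is hardcoded for this particular season. If you would rather not know the total in advance, a hedged variant keeps paging until the API returns an empty stats list:

import pandas as pd
import requests

data, offset = [], 0
while True:
    # same endpoint as above, paging by 25 until the API runs dry
    chunk = requests.get(
        f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=25&offset={offset}&sortStat=onBasePlusSlugging&order=desc',
        headers={'user-agent': 'Mozilla/5.0'}
    ).json()['stats']
    if not chunk:  # empty page: we are past the last player
        break
    data.extend(chunk)
    offset += 25

df = pd.DataFrame(data)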

You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from that API.
Example:
import pandas as pd
import requests

api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
req = requests.get(api_url).json()

data = []
for item in req['stats']:
    playerName = item['playerName']
    data.append({
        'playerName': playerName
    })

df = pd.DataFrame(data)
print(df)
Output:
playerName
0 Paul Goldschmidt
1 Byron Buxton
2 Bryce Harper
3 Edward Olivares
4 Jose Rojas
.. ...
156 Jorge Polanco
157 Alec Burleson
158 Jesse Winker
159 Cody Bellinger
160 Yoan Moncada
[161 rows x 1 columns]

Related

Web scraping not showing any output

I'm new to web scraping and was trying to get a basic web-scraping script to work. The code runs just fine; the problem is that the CSV file ends up with no information in it, only the name of each column and no data. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv

def scrape_cars(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    cars = []
    for car_div in soup.find_all("div", class_="c-search-card"):
        car = {}
        car["title"] = car_div.find("h2").text.strip()
        car["price"] = car_div.find("div", class_="c-search-card__price").text.strip()
        car["location"] = car_div.find("div", class_="c-search-card__location").text.strip()
        car["year"] = car_div.find("div", class_="c-search-card__year").text.strip()
        car["km"] = car_div.find("div", class_="c-search-card__km").text.strip()
        car["color"] = car_div.find("div", class_="c-search-card__color").text.strip()
        car["carrosserie"] = car_div.find("div", class_="c-search-card__body-type").text.strip()
        car["puissance fiscale"] = car_div.find("div", class_="c-search-card__tax-horsepower").text.strip()
        car["boite"] = car_div.find("div", class_="c-search-card__transmission").text.strip()
        cars.append(car)
    return cars

url = "https://www.automobile.tn/fr/occasion"
cars = scrape_cars(url)

# write to CSV file
with open("cars.csv", "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "price", "location", "year", "km", "color", "carrosserie", "puissance fiscale", "boite"])
    writer.writeheader()
    for car in cars:
        writer.writerow(car)
This is what I get in the CSV file:
Here is one way of getting that information you're after:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from tqdm import tqdm  ## if using jupyter notebook: from tqdm.notebook import tqdm

big_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

for x in tqdm(range(1, 25)):  ## to get all cars set range to 266
    soup = bs(s.get(f'https://www.automobile.tn/fr/occasion/{x}').text, 'html.parser')
    cars = soup.select('div[class="occasion-item"]')
    for c in cars:
        title = c.select_one('h2').text.strip()
        price = c.select_one('div[class="price"]').text.strip()
        big_list.append((title, price))
        ## add other elements as needed

df = pd.DataFrame(big_list, columns=['title', 'price'])
# df.to_csv('various_cars.csv') ## uncomment to save as csv
print(df)
Result in terminal:
100%|██████████| 24/24 [00:25<00:00, 1.08it/s]
title price
0 Mazda CX-5 69 700 DT
1 Mercedes-Benz Classe E 53 000 DT
2 Mercedes-Benz Classe E 252 000 DT
3 Seat Arona 71 500 DT
4 Volkswagen Golf 7 47 000 DT
... ... ...
283 BMW Série 1 74 000 DT
284 BMW Série 3 135 000 DT
285 Volkswagen Golf 7 70 000 DT
286 Mercedes-Benz Classe C coupé 159 000 DT
287 Volkswagen Jetta 36 000 DT
288 rows × 2 columns

How to create DataFrame with columns based on scraped data?

import requests, re
from bs4 import BeautifulSoup

data = []
soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
        ).text
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])
    data.extend([e.select_one('[class="d8eab2cf7f c90c0a70d3 db63693c62"]') for e in soup.select('[data-testid="property-card"]')])

data
I am getting the names and reviews for all pages in a single flat list; I want this result in separate columns for names and reviews.
I want my result to look like this:
Actually, I couldn't quite understand your question; if you could show a sample of the dataframe you want, that would be great. But generally you can do it like this. For example, in this data the latitude and longitude are in the same column, and you can separate them into two columns with the split function (see the sketch after the script). Don't forget to add headers.
import requests
import pandas as pd  # needed for export_data below
from bs4 import BeautifulSoup as bs
from datetime import datetime

base_url = 'https://www.booking.com'
urlss = 'https://www.booking.com/searchresults.html?req_children=0&label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&group_children=0&dest_type=city&rows=15&aid=304142&dest_id=-2092174&nflt=ht_id%3D204&req_adults=2&no_rooms=1&group_adults=2'
data = []

def pars(url):
    r = requests.get(url)
    soup = bs(r.text, 'html.parser')
    foor = {}
    try:
        foor['description'] = soup.find('div', id='property_description_content').text
        foor['Title'] = soup.find('h2', class_='d2fee87262 pp-header__title').text
        x = soup.find_all('div', class_='a815ec762e ab06168e66')
        div_map = soup.select_one('#hotel_sidebar_static_map')
        if div_map:
            foor['x_lnge'] = div_map['data-atlas-latlng']
        for f in range(0, len(x)):
            foor[f'feature{f}'] = (x[f].text)
        data.append(foor)
    except:
        None

def general():
    r = requests.get(urlss)
    soup = bs(r.text, 'html.parser')
    x = soup.select('header > a')
    for f in x:
        urls = base_url + f['href']
        obj = {}
        obj['urls'] = urls
        print(urls)
        pars(urls)

f = []

def export_data(data):
    f = pd.DataFrame(data)
    f = f.drop_duplicates()
    presentday = datetime.now()
    pese = str(presentday)
    a = str(presentday)[0:10].replace('-', '_')
    f.to_excel(f'{a}booking.xlsx', index=False)

if __name__ == '__main__':
    general()
    export_data(data)
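As a hedged illustration of the latitude/longitude split mentioned above (assuming x_lnge holds comma-separated strings like '19.0760,72.8777', which is the data-atlas-latlng format):

import pandas as pd

df = pd.DataFrame(data)
# split the combined "lat,lng" string into two new columns;
# the comma-separated format is an assumption about data-atlas-latlng
df[['latitude', 'longitude']] = df['x_lnge'].str.split(',', expand=True)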
Simply adapt my answer from your previous question (https://stackoverflow.com/a/75270151/14460824) and select all the needed information from the detail pages.
Instead of extending the list with lists, append dicts.
Example
Be aware that this example breaks after the first iteration for demo purposes; simply remove the break from the loop to get all results. Also adapt the handling that checks whether elements are available (see the helper sketch after the output).
import requests, re
import pandas as pd
from bs4 import BeautifulSoup

data = []
soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent bond'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
        ).text
    )
    for e in soup.select('[data-testid="property-card"]'):
        data.append({
            'title': e.select_one('[data-testid="title"]').text,
            'score': e.select_one('[data-testid="review-score"]').contents[0].text if e.select_one('[data-testid="review-score"]') else None,
            'ratings': e.select_one('[data-testid="review-score"]').text.split()[-2],
            'address': e.select_one('[data-testid="address"]').text,
            'distance': e.select_one('[data-testid="distance"]').text
        })
    break
pd.DataFrame(data)
Output

    title                                                    score  ratings  address                  distance
0   Hotel Ariana Residency                                       7      179  Western Suburbs, Mumbai  17.7 km from centre
1   MAXX VALUE - HOTEL KOHINOOR CONTINENTAL                    7.4       12  Western Suburbs, Mumbai  16.7 km from centre
2   West End Hotel Opp Bombay Hospital                         7.1      168  South Mumbai, Mumbai     3.3 km from centre
3   The Leela Mumbai                                           8.6    2,536  Western Suburbs, Mumbai  16.6 km from centre
4   Marriott Executive Apartment - Lakeside Chalet, Mumbai     7.8      265  Powai, Mumbai            20.1 km from centre
..  ...                                                        ...      ...  ...                      ...
20  Taj Santacruz                                              8.8    2,980  Mumbai                   14.2 km from centre
21  Hotel Suncity Residency                                    6.9       56  Western Suburbs, Mumbai  17.3 km from centre
22  Niranta Transit Hotel Terminal 2 Arrivals/Landside         7.6    2,380  Andheri, Mumbai          15.5 km from centre
23  JW Marriott Mumbai Juhu                                    8.4    1,318  Juhu, Mumbai             14.8 km from centre
24  Hotel Bawa Continental                                     7.9      754  Juhu, Mumbai             14.7 km from centre
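To adapt the availability handling mentioned above, one hedged option is a small helper that returns None whenever a selector matches nothing (a sketch, reusing the data-testid selectors from the loop):

def safe_text(card, selector):
    # select_one returns None when nothing matches, so guard before reading .text
    el = card.select_one(selector)
    return el.text if el else None

# inside the property-card loop, each field then becomes one uniform line:
# data.append({
#     'title': safe_text(e, '[data-testid="title"]'),
#     'address': safe_text(e, '[data-testid="address"]'),
#     'distance': safe_text(e, '[data-testid="distance"]'),
# })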

How to get HTML changes after pressing button with Beautiful Soup and Requests

I want to get the HTML of this site, https://www.forebet.com/en/football-predictions, after pressing the More[+] button enough times to load all games. Each time the More[+] button at the bottom of the page is pressed, the HTML changes and shows more football games. How do I request the page with all the football games loaded?
from bs4 import BeautifulSoup
import requests

leagues = {"EPL", "UCL", "Es1", "De1", "Fr1", "Pt1", "It1", "UEL"}

class ForeBet:
    # gets all games from the leagues in `leagues`, returning the games as a string list
    # game format is League|Date|Hour|Home Team|Away Team|Prob Home|Prob Tie|Prob Away
    def get_games_and_probs(self):
        response = requests.get('https://www.forebet.com/en/football-prediction')
        soup = BeautifulSoup(response.text, 'html.parser')
        results = list()
        games = soup.findAll(class_='rcnt tr_0') + soup.findAll(class_='rcnt tr_1')
        for game in games:
            if leagues.__contains__(game.find(class_='shortTag').text.strip()):
                game = game.find(class_='shortTag').text + "|" + \
                       game.find(class_='date_bah').text.split(" ")[0] + "|" + \
                       game.find(class_='date_bah').text.split(" ")[1] + "|" + \
                       game.find(class_='homeTeam').text + "|" + \
                       game.find(class_='awayTeam').text + "|" + \
                       game.find(class_='fprc').findNext().text + "|" + \
                       game.find(class_='fprc').findNext().findNext().text + "|" + \
                       game.find(class_='fprc').findNext().findNext().findNext().text
                print(game)
                results.append(game)
        return results
As stated, requests and BeautifulSoup are used to fetch and parse data, not to interact with the site. To do that you need Selenium.
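If you do go the Selenium route, a minimal sketch might look like this (the button locator is an assumption; inspect the page for the real one):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get('https://www.forebet.com/en/football-predictions')
while True:
    try:
        # assumed locator for the More[+] button -- verify it in devtools
        more = driver.find_element(By.XPATH, '//*[contains(text(), "More")]')
        driver.execute_script('arguments[0].click();', more)
        time.sleep(2)  # give the extra games time to render
    except Exception:
        break  # button no longer found: everything is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()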
Your other option is to see if you can fetch the data directly, and whether there are parameters that let you make another request as if you had clicked More[+]. Does this do the trick for you?
import pandas as pd
import requests

results = pd.DataFrame()
i = 0
while True:
    print(i)
    url = 'https://m.forebet.com/scripts/getrs.php'
    payload = {
        'ln': 'en',
        'tp': '1x2',
        'in': '%s' % (i + 11),
        'ord': '0'}
    jsonData = requests.get(url, params=payload).json()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
    results = pd.concat([results, pd.DataFrame(jsonData[0])], sort=False).reset_index(drop=True)
    if max(results['id'].value_counts()) <= 1:
        i += 1
    else:
        results = results.drop_duplicates()
        break
Output:
print(results)
id pr_under ... country full_name
0 1473708 31 ... England Isthmian League
1 1473713 35 ... England Isthmian League
2 1473745 28 ... England Isthmian League
3 1473710 35 ... England Isthmian League
4 1473033 28 ... England Premier League 2
.. ... ... ... ... ...
515 1419208 47 ... Argentina Torneo Federal A
516 1419156 57 ... Argentina Torneo Federal A
517 1450589 50 ... Armenia Premier League
518 1450590 35 ... Armenia Premier League
519 1450591 52 ... Armenia Premier League
[518 rows x 73 columns]

How can you split cell data into different cells in python?

I'm trying to parse this webpage into a pandas dataframe to analyze, but the page is set up such that the table only has two columns of use, one with the name and the other containing all the other information as a single cell.
For example, with my code below:
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

url = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
table = soup.find('tbody')
td = table.find_all('td')

data = []
for element in td:
    sub_data = []
    for sub_element in element:
        try:
            sub_data.append(sub_element.get_text())
        except:
            continue
    data.append(sub_data)

dataFrame = pd.DataFrame(data=data)
df = dataFrame[[1, 3]]
df = df.dropna()
So df.iat[0,1] holds the program, defense year, advisor, dissertation title, and undergraduate institution all at once. The HTML only uses <br> and <strong> to separate these values, and I am wondering if there is any way to split this text into different columns, so the columns would be "name", "program", "defense year" and so on, instead of one cell containing all the information.
thank you so much!
After try: and before the sub_data.append line in your code you should split your sub_element text at the <br> boundaries. Note that a plain get_text() drops the tags, so pass it a separator to split on. You can try the following:

# get_text() strips the <br> tags, so insert an explicit separator to split on
sub_data_splitted = sub_element.get_text(separator="<br>").split("<br>")
# After that you are able to use each field of the data, i.e.
program = sub_data_splitted[0].split(":")[1]
defense_year = sub_data_splitted[1].split(":")[1]
advisor = sub_data_splitted[2].split(":")[1]
dissertation_title = sub_data_splitted[3].split(":")[1]
ug_institution = sub_data_splitted[4].split(":")[1]
You can do it like this:
Use .stripped_strings to get the list of strings from each <tr> of the table.
Since you only need the values and not the titles (like Name, Defense Year, etc.), use a list comprehension to select only the required values.
Append each list to a dataframe.
Here is how it is done.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = requests.get(URL)
soup = BeautifulSoup(page.text, "lxml")
t = soup.find('table').find('tbody')
trs = t.find_all('tr')

data = []
for i in trs:
    l = [x for i, x in enumerate(list(i.stripped_strings)) if i % 2 == 0]
    data.append(l)

df = pd.DataFrame(data=data)
0 ... 6
0 Abbott, PhD, Jason ... None
1 Adam, PhD, Gregory Charles ... None
2 Adhikari, PhD, Pramisha ... None
3 Al-Bassam, PhD, Jawdat M. H. ... None
4 Albertshofer, PhD, Klaus ... None
.. ... ... ...
682 Zhou, PhD, Jiacheng ... None
683 Zhou, PhD, Zhaohui (Sunny) ... None
684 Zhu, PhD, Ruyi ... None
685 Zhu, PhD, Yan ... None
686 Zuhl, PhD, Andrea M. ... None
[687 rows x 7 columns]
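If you want named columns instead of the default 0-6, you can assign headers yourself. The field order below is an assumption based on the page layout, so verify it against a few rows first:

# assumed column order -- check against the actual page before relying on it
df.columns = ['Name', 'Program', 'Defense Year', 'Advisor',
              'Dissertation Title', 'Undergraduate Institution', 'Other']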
Is this what you're trying to do?
import bs4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

url = "https://education.scripps.edu/alumni/graduate-alumni-list/index.html"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
table = soup.find('tbody')
td = table.find_all('td')

data = {}
names = []
prev_name = None
for element in td:
    sub_data = {}
    for sub_element in element:
        try:
            data[sub_element['alt']] = {}
            prev_name = sub_element['alt']
        except:
            sub_element = str(sub_element).replace('</strong>', '').replace('<br/>', '</strong>')
            temp = BeautifulSoup(sub_element)
            if len(temp.find_all('strong')) > 0:
                temp = [str(i.string) for i in temp.find_all('strong') if i.string is not None]
                temp = {i.split(':', 1)[0]: i.split(':', 1)[1] for i in temp if ':' in i}
                data[prev_name] = temp

df = pd.DataFrame(data=data)
df = df.T.reset_index()
df.rename(columns={'index': 'Name'}, inplace=True)

I cannot seem to get information from a page via XPath

I am currently creating different projects to grasp the concepts of web scraping. Right now I am trying to build a database of items from a shoe-selling site, but I can't seem to get the data in text form.
This is what I have tried:
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import numpy
import statistics
import pandas as pd

offset = 0
driver = webdriver.Chrome()
listo = []
while True:
    driver.get("https://stockx.com/sneakers?page={offset}".format(offset=offset))
    time.sleep(10)
    main_div = driver.find_elements_by_xpath('//*[@id="main-content"]/div[2]/div[2]/div/div')
    for div in main_div:
        links = div.find_elements_by_tag_name("a")
        for link in links:
            namer = link.get_attribute('href')
            print(namer)
            offset += 0.05
            listo.append(namer)
    namelist = sorted(set(listo))
    for hreflink in namelist:
        hreflinks = hreflink
        driver.get(hreflinks)
        time.sleep(10)
        LastsaleD = driver.find_elements_by_xpath('//*[@id="marketsummary"]/div[2]/div/div[1]/div[1]')
        print(LastsaleD).text
    if offset > 30:
        break
Using Selenium is overkill and less efficient here. The data is found in JSON format within the <script> tags of the source HTML. Just do a simple request of the site, pull out the relevant <script> with the JSON, then parse the JSON into rows to put into a table.
Also, why increment offset += 0.05? I understand your logic of adding it for every 20 items on the page, but why not just increment by 1 after the loop through the 20 items? What happens if for whatever reason you get 19 items returned, or 21? Then your increments will be off for the rest of the loop.
Anyways, here's the code. This will get you going.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}

at_end = False
offset = 0
rows = []
while at_end == False:
    offset += 1
    url = "https://stockx.com/sneakers?page={offset}".format(offset=offset)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.find_all('script', {'type': 'application/ld+json'})
    for script in scripts:
        jsonMatch = re.compile("{.*}")
        jsonStr = jsonMatch.search(str(script))[0]
        jsonData = json.loads(jsonStr)
        if jsonData['@type'] == 'OfferCatalog':
            break
    listings = jsonData['itemListElement']
    for listing in listings:
        item = listing['item']
        offers = item.pop('offers')
        item.update(offers)
        if item not in rows:
            rows.append(item)
        else:
            at_end = True
            continue
    print('Page: %s' % offset)

df = pd.DataFrame(rows)
Output:
print(df)
@type brand ... highPrice priceCurrency
0 AggregateOffer Jordan ... 165 GBP
1 AggregateOffer Jordan ... 226 GBP
2 AggregateOffer Jordan ... 321 GBP
3 AggregateOffer Jordan ... 159 GBP
4 AggregateOffer Jordan ... 190 GBP
.. ... ... ... ... ...
495 AggregateOffer Nike ... 230 GBP
496 AggregateOffer New Balance ... 159 GBP
497 AggregateOffer Nike ... 152 GBP
498 AggregateOffer Nike ... 162 GBP
499 AggregateOffer Nike ... 167 GBP
[500 rows x 14 columns]
