How to create DataFrame with columns based on scraped data? - python

import requests, re
from bs4 import BeautifulSoup

data = []
soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
        ).text
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])
    data.extend([e.select_one('[class="d8eab2cf7f c90c0a70d3 db63693c62"]') for e in soup.select('[data-testid="property-card"]')])

data
I am getting the names and reviews for all pages in a single list; I want the result in separate columns for names and reviews.
I want my result to look like this:

Actually I couldn't quite understand your question or what you want; if you could show a sample of the DataFrame you are after, that would be great. But generally you can do it like this. For example, in this data the latitude and longitude end up in the same column, and you can separate them into two columns with the split function (see the sketch after the code). Don't forget to add headers.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from datetime import datetime

base_url = 'https://www.booking.com'
urlss = 'https://www.booking.com/searchresults.html?req_children=0&label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&group_children=0&dest_type=city&rows=15&aid=304142&dest_id=-2092174&nflt=ht_id%3D204&req_adults=2&no_rooms=1&group_adults=2'
headers = {'user-agent': 'Mozilla/5.0'}  # the headers mentioned above; booking.com may block requests without a user-agent
data = []

def pars(url):
    r = requests.get(url, headers=headers)
    soup = bs(r.text, 'html.parser')
    foor = {}
    try:
        foor['description'] = soup.find('div', id='property_description_content').text
        foor['Title'] = soup.find('h2', class_='d2fee87262 pp-header__title').text
        x = soup.find_all('div', class_='a815ec762e ab06168e66')
        div_map = soup.select_one('#hotel_sidebar_static_map')
        if div_map:
            foor['x_lnge'] = div_map['data-atlas-latlng']
        for f in range(0, len(x)):
            foor[f'feature{f}'] = x[f].text
        data.append(foor)
    except Exception:
        pass

def general():
    r = requests.get(urlss, headers=headers)
    soup = bs(r.text, 'html.parser')
    x = soup.select('header > a')
    for f in x:
        urls = base_url + f['href']
        print(urls)
        pars(urls)

def export_data(data):
    f = pd.DataFrame(data)
    f = f.drop_duplicates()
    presentday = datetime.now()
    a = str(presentday)[0:10].replace('-', '_')
    f.to_excel(f'{a}booking.xlsx', index=False)

if __name__ == '__main__':
    general()
    export_data(data)
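The latitude/longitude split mentioned above isn't shown in the script itself; a minimal sketch (assuming x_lnge holds comma-separated strings like '19.0760,72.8777', as data-atlas-latlng provides) could look like this:

import pandas as pd

# hypothetical sample in the same 'lat,lng' format scraped above
df = pd.DataFrame({'x_lnge': ['19.0760,72.8777', '18.9388,72.8354']})

# split the combined column on the comma into two new columns
df[['latitude', 'longitude']] = df['x_lnge'].str.split(',', expand=True).astype(float)
print(df)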

Simply adapt my answer from your previous question https://stackoverflow.com/a/75270151/14460824 and select all the information you need from the detail pages.
Instead of extending the list with lists, append dicts.
Example
Be aware that this example breaks after the first iteration for demo purposes; simply remove the break from the loop to get all results, and also adapt the handling that checks whether elements are available.
import requests, re
import pandas as pd
from bs4 import BeautifulSoup

data = []
soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent bond'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
        ).text
    )
    for e in soup.select('[data-testid="property-card"]'):
        score = e.select_one('[data-testid="review-score"]')  # may be missing, so guard both fields
        data.append({
            'title': e.select_one('[data-testid="title"]').text,
            'score': score.contents[0].text if score else None,
            'ratings': score.text.split()[-2] if score else None,
            'address': e.select_one('[data-testid="address"]').text,
            'distance': e.select_one('[data-testid="distance"]').text
        })
    break

pd.DataFrame(data)
Output
    title                                                    score  ratings  address                  distance
0   Hotel Ariana Residency                                   7      179      Western Suburbs, Mumbai  17.7 km from centre
1   MAXX VALUE - HOTEL KOHINOOR CONTINENTAL                  7.4    12       Western Suburbs, Mumbai  16.7 km from centre
2   West End Hotel Opp Bombay Hospital                       7.1    168      South Mumbai, Mumbai     3.3 km from centre
3   The Leela Mumbai                                         8.6    2,536    Western Suburbs, Mumbai  16.6 km from centre
4   Marriott Executive Apartment - Lakeside Chalet, Mumbai   7.8    265      Powai, Mumbai            20.1 km from centre
..  ...                                                      ...    ...      ...                      ...
20  Taj Santacruz                                            8.8    2,980    Mumbai                   14.2 km from centre
21  Hotel Suncity Residency                                  6.9    56       Western Suburbs, Mumbai  17.3 km from centre
22  Niranta Transit Hotel Terminal 2 Arrivals/Landside       7.6    2,380    Andheri, Mumbai          15.5 km from centre
23  JW Marriott Mumbai Juhu                                  8.4    1,318    Juhu, Mumbai             14.8 km from centre
24  Hotel Bawa Continental                                   7.9    754      Juhu, Mumbai             14.7 km from centre

Related

How to scrape data from paginated table?

I need your help automating this web page: getting the data for all the players across the different pages.
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.mlb.com/es/stats/spring-training'
pagina = requests.get(url)
soup = BeautifulSoup(pagina.text, 'lxml')
table = soup.find('table', {'class': "bui-table is-desktop-sKqjv9Sb"})

encabezados = []
for i in table.find_all('th')[:18]:
    datos = i.find_all('button')
    for td in datos:
        titulo = td.text.strip()
        encabezados.append(titulo)

datos_mlb = pd.DataFrame(columns=encabezados)

nombres = []
for i in table.find_all('th')[18:]:
    datos = i.find_all('a')
    for td in datos:
        jugadores = td.text.strip()
        nombres.append(jugadores)
datos_mlb['JUGADOR'] = nombres

for fila in table.find_all('tr')[1:]:
    data = fila.find_all('td')
    data_fila = [td.text.strip() for td in data]
    datos_mlb.iloc[:, 1:] = data_fila
I have managed to get the vast majority of the information; however, I cannot fill the DataFrame correctly or iterate over all the pages.
Try to use the structured data from the JSON response of the XHR request to create your dataframe. Inspect the network tab in your browser's devtools to get an idea of what parameters you should send and what you will get back:
import pandas as pd
import requests

data = []
for i in range(0, 175, 25):
    data.extend(
        requests.get(
            f'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=25&offset={i}&sortStat=onBasePlusSlugging&order=desc',
            headers={'user-agent': 'Mozilla/5.0'}
        ).json()['stats']
    )

pd.DataFrame(data)
Output
     playerId  playerName         ...  type    atBatsPerHomeRun
0    502671    Paul Goldschmidt   ...  player  5.5
1    621439    Byron Buxton       ...  player  6.4
2    547180    Bryce Harper       ...  player  4.38
3    658668    Edward Olivares    ...  player  11.33
4    670351    Jose Rojas         ...  player  9
..   ...       ...                ...  ...     ...
156  593871    Jorge Polanco      ...  player  32.00
157  676475    Alec Burleson      ...  player  -.--
158  608385    Jesse Winker       ...  player  -.--
159  641355    Cody Bellinger     ...  player  -.--
160  660162    Yoan Moncada       ...  player  -.--

[161 rows x 72 columns]
You are not getting all the required data because the data is loaded dynamically via an API, so you have to pull the data from the API.
Example:
import pandas as pd
import requests

api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
req = requests.get(api_url).json()

data = []
for item in req['stats']:
    playerName = item['playerName']
    data.append({
        'playerName': playerName
    })

df = pd.DataFrame(data)
print(df)
Output:
            playerName
0     Paul Goldschmidt
1         Byron Buxton
2         Bryce Harper
3      Edward Olivares
4           Jose Rojas
..                 ...
156      Jorge Polanco
157      Alec Burleson
158       Jesse Winker
159     Cody Bellinger
160       Yoan Moncada

[161 rows x 1 columns]
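If you need more than playerName, note that each item in req['stats'] is already a flat dict of all the stats (that is how the first answer ends up with 72 columns), so the whole payload can go straight into a DataFrame. A short sketch:

import pandas as pd
import requests

api_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season=2022&sportId=1&stats=season&group=hitting&gameType=S&limit=161&offset=0&sortStat=onBasePlusSlugging&order=desc'
req = requests.get(api_url, headers={'user-agent': 'Mozilla/5.0'}).json()

# every item in req['stats'] is a flat dict, so no per-key loop is needed
df_all = pd.DataFrame(req['stats'])
print(df_all.shape)

# then keep only the columns you care about, e.g. the player names
print(df_all[['playerName']])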

Getting first (or a specific) td in BeautifulSoup with no class

I have one of those nightmare tables with no class given for the tr and td tags.
A sample page is here: https://system.gotsport.com/org_event/events/1271/schedules?age=19&gender=m
(You'll see in the code below that I'm getting multiple pages, but that's not the problem.)
I want the team name (nothing else) from each bracket. The output should be:
OCYS
FL Rush
Jacksonville FC
Atlanta United
SSA
Miami Rush Kendall SC
IMG
Tampa Bay United
etc.
I've been able to get every td in the specified tables. But every attempt to use [0] to get the first td of every row gives me an "index out of range" error.
The code is:
import requests
import csv
from bs4 import BeautifulSoup
batch_size = 2
urls = ['https://system.gotsport.com/org_event/events/1271/schedules?age=19&gender=m', 'https://system.gotsport.com/org_event/events/1271/schedules?age=17&gender=m']
# iterate through urls
for url in urls:
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
# iterate through leagues and teams
leagues = soup.find_all('table', class_='table table-bordered table-hover table-condensed')
for league in leagues:
row = ''
rows = league.find_all('tr')
for row in rows:
team = row.find_all('td')
teamName = team[0].text.strip()
print(teamName)
After a couple of hours of work, I feel like I'm just one syntax change away from getting this right. Yes?
You can use a CSS Selector nth-of-type(n). It works for both links:
import requests
from bs4 import BeautifulSoup

url = "https://system.gotsport.com/org_event/events/1271/schedules?age=19&gender=m"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

for tag in soup.select(".small-margin-bottom td:nth-of-type(1)"):
    print(tag.text.strip())
Output:
OCYS
FL Rush
Jacksonville FC
Atlanta United
SSA
...
...
Real Salt Lake U19
Real Colorado
Empire United Soccer Academy
Each bracket corresponds to one "panel", and within each panel the first table (the list of all teams) comes before the match tables.
def main():
    import requests
    from bs4 import BeautifulSoup

    url = "https://system.gotsport.com/org_event/events/1271/schedules?age=19&gender=m"
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for panel in soup.find_all("div", {"class": "panel-body"}):
        for row in panel.find("tbody").find_all("tr"):
            print(row.find("td").text.strip())
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
OCYS
FL Rush
Jacksonville FC
Atlanta United
SSA
Miami Rush Kendall SC
IMG
Tampa Bay United
Weston FC
Chargers SC
South Florida FA
Solar SC
RISE SC
...
I think the problem is the header of the table, which contains th elements instead of td elements. That leads to the index out of range error when you try to retrieve the first element of an empty list. Try adding a check for the length of team:

for row in rows:
    team = row.find_all('td')
    if len(team) > 0:
        teamName = team[0].text.strip()
        print(teamName)

It should print the team names.

Is there any way to select specific a tags using beautiful soup?

I'm using BeautifulSoup and trying to print the href of every a tag that contains a company's website URL, but my code is selecting other hrefs too. There are 71 company website links in total, and my code is not selecting all of them.
This is the source I'm extracting the data from.
Here is my code
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.constructionplacements.com/top-construction-companies-in-india-2020/'
name_data = []
website_data = []
print(url)
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Loop to select and print all company titles
for h in soup.select('h4'):
    print(h.text)
    name_data.append(h.text)

# Loop to select and print all company website urls
for w in soup.select('p em a'):
    print(w['href'])
    website_data.append(w['href'])

df = pd.DataFrame({
    'Company Title': name_data,
    'Website': website_data
})
print(df)
df.to_csv('ata.csv')
To get all links to companies, you can use this example:
import re
import requests
from bs4 import BeautifulSoup

url = 'https://www.constructionplacements.com/top-construction-companies-in-india-2020/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

for h4 in soup.find_all(lambda t: t.name == 'h4' and re.search(r'^\d+\s*\.', t.text)):
    print('{:<75} {}'.format(h4.text, h4.find_next('a')['href']))
Prints:
1. L&T Engineering & Construction Division (L&T ECC), Chennai http://www.lntecc.com/
2. Tata Projects Ltd, Mumbai http://www.tataprojects.com/
3. Shapoorji Pallonji & Co Ltd, Mumbai https://www.shapoorjipallonji.com/
4. GMR Group, Mumbai http://www.gmrgroup.in/
5. Hindustan Construction Company (HCC), Mumbai http://www.hccindia.com/
6. Afcons Infrastructure Limited, Mumbai http://www.shapoorjipallonji.com/
7. JMC Projects, Mumbai https://www.jmcprojects.com/
8. Gammon India Ltd, Mumbai http://www.gammonindia.com
9. IVRCL, Hyderabad http://www.ivrcl.com/
10. J Kumar Infra, Mumbai http://www.jkumar.com/
11. Gammon Infrastructure Projects Limited (GIPL), Mumbai http://www.gammoninfra.com/
12. Reliance Infrastructure http://www.rinfra.com
13. Ashoka Buildcon, Nashik https://ashokabuildcon.com/
14. B L Kashyap & Sons Ltd (BLK), New Delhi http://www.blkashyap.com
15. Consolidated Construction Consortium Ltd (CCCL), Chennai http://www.ccclindia.com/
16. Essar Group, Mumbai https://www.essar.com/
...and so on.
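If you also want the DataFrame from the original attempt, the same matched heading/link pairs can be collected into rows, which keeps names and websites aligned (a sketch built on the selector above; the output filename is arbitrary):

import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.constructionplacements.com/top-construction-companies-in-india-2020/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

rows = []
for h4 in soup.find_all(lambda t: t.name == 'h4' and re.search(r'^\d+\s*\.', t.text)):
    # pair each numbered company heading with the first link that follows it
    rows.append({'Company Title': h4.text, 'Website': h4.find_next('a')['href']})

df = pd.DataFrame(rows)
df.to_csv('companies.csv', index=False)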

Web Scraping a page with multiple tables

I am trying to web scrape the second table from this website:
https://fbref.com/en/comps/9/stats/Premier-League-Stats
However, I have only ever managed to extract the information from the first table when searching by the table tag. Could anyone explain why I cannot access the second table, or show me how to do it?
import requests
from bs4 import BeautifulSoup

url = "https://fbref.com/en/comps/9/stats/Premier-League-Stats"
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')
tables = soup.find_all("table")
player_table = tables[0]
Something along these lines should do it
tables = soup.find_all("table") # returns a list of tables
second_table = tables[1]
The table is inside HTML comments <!-- ... -->.
To get the table from comments, you can use this example:
import requests
from bs4 import BeautifulSoup, Comment

url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

table = BeautifulSoup(soup.select_one('#all_stats_standard').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

# print some information from the table to screen:
for tr in table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    print('{:<30}{:<20}{:<10}'.format(tds[0], tds[3], tds[5]))
Prints:
Patrick van Aanholt Crystal Palace 1990
Max Aarons Norwich City 2000
Tammy Abraham Chelsea 1997
Che Adams Southampton 1996
Adrián Liverpool 1987
Sergio Agüero Manchester City 1988
Albian Ajeti West Ham 1997
Nathan Aké Bournemouth 1995
Marc Albrighton Leicester City 1989
Toby Alderweireld Tottenham 1989
...and so on.
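As an alternative sketch: since the remaining tables are only hidden inside HTML comments, stripping the comment markers lets pandas.read_html parse every table on the page in one go (this assumes the site still wraps the extra tables in <!-- ... -->):

import requests
import pandas as pd

url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats'
html = requests.get(url).text

# unhide the commented-out tables by removing the comment markers
html = html.replace('<!--', '').replace('-->', '')

tables = pd.read_html(html)  # a list of DataFrames, one per table
print(len(tables))
print(tables[1].head())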

How to use Beautiful Soup find all to scrape only a list that is a part of the body

I am having trouble scraping this Wikipedia list of the neighborhoods of Los Angeles using Beautiful Soup. I am getting all the content of the body and not just the neighborhood list like I would like to. I have seen a lot about how to scrape a table, but I got stuck on how to apply the table logic in this case.
This is the code I have been using:
import requests
import pandas as pd
from bs4 import BeautifulSoup

address = 'Los Angeles, United States'
url = "https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles"
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
neighborhoodList = []

# append the data into the list
for row in soup.find_all("div", class_="mw-body")[0].findAll("li"):
    neighborhoodList.append(row.text.replace(', LA', ''))

df_neighborhood = pd.DataFrame({"Neighborhood": neighborhoodList})
If you review the page source, the neighborhood entries are within divs that have the class "div-col", and each link has a "title" attribute.
Also, the replace on the text during the append doesn't appear to be needed.
The following code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

address = 'Los Angeles, United States'
url = "https://en.wikipedia.org/wiki/List_of_districts_and_neighborhoods_of_Los_Angeles"
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

neighborhoodList = []
# append the data into the list
for row in soup.find_all("div", class_="div-col"):
    for item in row.select("a"):
        if item.has_attr('title'):
            neighborhoodList.append(item.text)

df_neighborhood = pd.DataFrame({"Neighborhood": neighborhoodList})

print(f'First 10 Rows:')
print(df_neighborhood.head(n=10))
print(f'\nLast 10 Rows:')
print(df_neighborhood.tail(n=10))
Results:
First 10 Rows:
             Neighborhood
0        Angelino Heights
1                  Arleta
2       Arlington Heights
3           Arts District
4         Atwater Village
5           Baldwin Hills
6  Baldwin Hills/Crenshaw
7         Baldwin Village
8           Baldwin Vista
9        Beachwood Canyon

Last 10 Rows:
           Neighborhood
186    Westwood Village
187     Whitley Heights
188  Wholesale District
189          Wilmington
190     Wilshire Center
191       Wilshire Park
192      Windsor Square
193            Winnetka
194      Woodland Hills
195      Yucca Corridor
