I have been trying to figure this out for a few hours and it is doing my head in. Every method I try fails to produce the correct value.
import requests
from bs4 import BeautifulSoup

# Fetch the product page and parse the raw response body as HTML.
response = requests.get('https://www.off---white.com/en/GB/products/omia065s188000160100')
soup = BeautifulSoup(response.content, 'html.parser')
I want to extract the following values from the webpage (https://www.off---white.com/en/GB/products/omia065s188000160100)
Name = LOW 3.0 SNEAKER
Price = £ 415
img_url = https://cdn.off---white.com/images/156365/large_OMIA065S188000160100_4.jpg?1498202305
How would I extract these 3 values using Beautiful Soup?
import requests
from bs4 import BeautifulSoup

# Scrape product name, price and image URL from an Off-White product page.
url = 'https://www.off---white.com/en/GB/products/omia065s188000160100'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

# Product name: concatenate the text of every <span class="prod-title">.
prod_name = ''.join(
    span.get_text() for span in soup.find_all('span', {'class': 'prod-title'})
)

# Product price: concatenate the text of every <div class="price">.
prod_price = ''.join(
    div.get_text() for div in soup.find_all('div', {'class': 'price'})
)

# Product image: the src attribute of <img id="image-0">.  Guard against the
# element being absent so prod_img is always defined (the original left it
# unbound — a NameError at print time — when nothing matched).
img = soup.find('img', {'id': 'image-0'})
prod_img = img.attrs['src'] if img is not None else None

print(prod_name)
print(prod_price)
print(prod_img)
Related
This is the link url
# Booking.com search-results URL.  The original paste truncated the literal
# mid-string (no closing quote → SyntaxError); completed here with the same
# query string used by the paginated request later in the post.
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0'
# NOTE(review): `doc` is assumed to be the BeautifulSoup document parsed from
# `url` elsewhere in the post — confirm; it is not defined in this excerpt.
Hotel_name = doc.find_all("div", {'class': "fcab3ed991 a23c043802"})
this gives me the result of all hotel names on page number 1, but how can I get the hotel names from all of the pages?
I've tried this
import requests
from bs4 import BeautifulSoup

# Booking.com serves an anti-bot page (without the result markup) to clients
# that send no browser-like User-Agent — the cause of the empty output.
HEADERS = {'user-agent': 'Mozilla/5.0'}

# Initialize the page number
page_number = 0
while True:
    # Increment the page number; each page holds 15 rows, selected via `offset`.
    page_number += 1
    # Make the GET request to the URL
    url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
    response = requests.get(url, headers=HEADERS)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the hotel elements; stop once a page comes back empty.
    hotels = soup.find_all('div', {'class': "fcab3ed991 a23c043802"})
    if not hotels:
        break
    for hotel in hotels:
        # The original filter {' data-testid="title'} was a one-element set
        # literal, not an attribute match; filter on the attribute properly.
        title = hotel.find('div', {'data-testid': 'title'})
        if title is not None:
            print(f"{title.text}")
but it gives me an empty list as an output.
Avoid selecting elements by classes that look highly dynamic and use the HTML structure instead. Check the number of total results and use it in range() to iterate over the results.
Example
import requests, re
from bs4 import BeautifulSoup

data = []

# First request: only used to read the total result count, which sits in the
# <div> immediately preceding the pagination element.  Pass the parser
# explicitly — omitting it emits a warning and can vary between machines.
soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
    headers={'user-agent':'some agent'}
    ).text,
    'html.parser'
)
num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

# Page through the result set 25 rows at a time via the `offset` parameter.
for i in range(0, int(num_results / 25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
        headers={'user-agent':'some agent'}
        ).text,
        'html.parser'
    )
    # One title per property card.
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])

data  # notebook-style echo of the collected titles
I'm trying to scrape the names and ratings of all the players from this website:
https://www.fifaindex.com/players/?gender=0&league=1&order=desc
But I only get the information for the first player on the page.
The code I'm using:
from bs4 import BeautifulSoup
import requests

url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Iterate one <tr> per player (player rows carry a data-playerid attribute)
# rather than the single wrapper div: Tag.find() inside the wrapper returns
# only the FIRST match, which is why only one player was printed.
results = soup.find_all('tr', {'data-playerid': True})
for result in results:
    # NOTE(review): "rating r3" looks tied to one rating band — rows in a
    # different band may not match this class; confirm against live markup.
    rating = result.find("span", class_="badge badge-dark rating r3").text
    name = result.find("a", class_="link-player")
    info = [rating, name]
    print(info)
The HTML parsed is attached in the picture
I tinkered around a little bit and I think I got a version that does what you want
from bs4 import BeautifulSoup
import requests

# Fetch the player listing and walk every table row; rows that lack a
# data-playerid attribute are not player rows, so skip them.
page = requests.get(
    "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
)
soup = BeautifulSoup(page.content, "html.parser")

for row in soup.find_all("tr"):
    try:
        row["data-playerid"]
    except KeyError:
        continue
    rating = row.find("span", class_="badge badge-dark rating r3").text
    player = row.find("a", class_="link-player")
    print([rating, player])
Getting all table rows that have a data-playerid attribute will fix it:
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests

url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"

# Only <tr> elements carrying a data-playerid attribute are player rows.
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
for row in soup.find_all('tr', {'data-playerid': True}):
    badge = row.find("span", class_="badge badge-dark rating r3")
    player_link = row.find("a", class_="link-player")
    print([badge.text, player_link])
I am trying to extract the date from the Yahoo Finance site, but when the text is produced only the time appears, without the date.
# --- Yahoo Finance key-statistics scrape (excerpt) ---
import requests
import re
from bs4 import BeautifulSoup
# Accumulates scraped figures keyed by label (e.g. "CMP").
technicals = {}
ticker = "INFY.NS"
# NOTE(review): the body of this try is pasted without indentation, and the
# matching except/finally lies beyond this excerpt ("the code runs further on").
try:
url = "https://finance.yahoo.com/quote/" + ticker + "/key-statistics?p=" + ticker
r = requests.get(url)
# soup = BeautifulSoup(open("Yahoo Stats.html"), 'lxml')
soup = BeautifulSoup(r.text, 'lxml')
# Current market price, matched via Yahoo's generated utility-class string —
# brittle: this selector breaks whenever the site is restyled.
t = soup.find('span', {"class": "Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)"})
technicals["CMP"] = t.text
# d = soup.find('span', {"class": "C($tertiaryColor) D(b) Fz(12px) Fw(n) Mstart(0)--mobpsm Mt(6px)--mobpsm"})
# "quote-market-notice" holds the "At close: ..." text; in the static HTML it
# appears to carry only the time — the date seems to be filled in client-side.
d = soup.find('div', attrs={'id': 'quote-market-notice'})
print(d.text)
The code runs further on.
Can somebody help get the date and time
As I checked the page source the data is not located in the span tag. It seems that it is loaded dynamically.
<span data-reactid="36">At close: 3:30PM IST</span>
I recommend you to use Selenium library instead to get the required data.
import requests
from bs4 import BeautifulSoup

r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")

# Container div that wraps the playlist's track list.
result = soup.find("div", {"class": "s_c"})

# `result.class` is a SyntaxError — `class` is a Python keyword.  A Tag's
# HTML attributes are read by subscription or .get(); this prints ['s_c'].
print(result.get('class'))
From the above code, I am able to scrape this data
https://www.pastiebin.com/5f08080b8db82
Now I would like to scrape only the title of the songs and then make a list out of them like the below:
Meri Aashiqui
Genda Phool
Any suggestions are much appreciated!
Try this :
import requests
from bs4 import BeautifulSoup

r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})

# Each track's details sit in a div.track_npqitemdetail; the first <span>
# inside it is the song title.
name_list = [
    detail.find('span').text
    for detail in result.find_all('div', class_='track_npqitemdetail')
]
print(name_list)
This code will return all of the song names in the name_list list.
I need to access the following website: http://mothoq.com/store/22
scroll down until I see the phone icon.
click on it, and scrape the phone number.
I have successfully connected to the website, and able to scrape all data needed, except of the phone number.
I have tried to use
soup.find_all('p',attrs={"align":"center"})
my code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
soup = BeautifulSoup(r.text, "html5lib")

# The original nested two loops (over the #subtitle div's children and over
# every store_connect_details paragraph) yet re-ran the same page-wide
# searches on each pass; a single pass over the page is sufficient.
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class": "store_connect_details"})

# All centered paragraphs — the attempted (unsuccessful) phone-number match.
storePhone = soup.find_all('p', attrs={"align": "center"})

# Guard each icon lookup: find() returns None when the anchor is absent, and
# None['href'] would raise TypeError.
twitter = soup.find('a', attrs={"class": "connect_icon_twitter"})
storeTwitter = twitter['href'] if twitter is not None else None
facebook = soup.find('a', attrs={"class": "connect_icon_facebook"})
storeFacebook = facebook['href'] if facebook is not None else None
linkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"})
storeLinkedin = linkedin['href'] if linkedin is not None else None

print(storePhone)
Thanks!
You should search for hidden div with id="store-telephone-form" and take second
<p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")

results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class": "store_connect_details"})

# Pre-seed so the print below cannot raise NameError when parsing fails
# (the original's bare `except: pass` left storePhone undefined in that case).
storePhone = storeTwitter = storeFacebook = storeLinkedin = None
try:
    # The phone number sits in a hidden div; its second <p> holds the digits.
    storePhone = soup.find('div', attrs={"id": "store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"}).get('href')
except (AttributeError, IndexError):
    # A missing element (find() → None → AttributeError) or missing <p>
    # (IndexError); leave the affected fields as None rather than swallowing
    # every possible error.
    pass

print(storePhone)