I am trying to parse a webpage with Python and Beautiful Soup. I need the data from this element:
<div class="p-offer__price-new">199,99 ₽</div>
I tried this code:
soup = BeautifulSoup(data, 'html.parser')
res = soup.findAll("div", {"class": "p-offer__price-new"})
print(res)
But the result is an empty list: []
How can I get this data?
Example of URL: https://edadeal.ru/moskva/offers/d71b75ff-bfee-4731-95ad-52a24ddea72e?from=%2F
The price block is rendered by JavaScript, so it is not present in the raw HTML; loading the page in a real browser via Selenium first works:
import bs4
from selenium import webdriver

# launch Chrome so the page's JavaScript actually runs
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('https://edadeal.ru/moskva/offers/d71b75ff-bfee-4731-95ad-52a24ddea72e?from=%2F')

# hand the rendered HTML over to Beautiful Soup
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')
res = soup.findAll("div", {"class": "p-offer__price-new"})
print(res[0].text)
driver.close()
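If res still comes back empty, the page may not have finished rendering by the time page_source is read. A minimal sketch of an explicit wait, assuming a recent Selenium 4 (which can locate chromedriver on its own):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://edadeal.ru/moskva/offers/d71b75ff-bfee-4731-95ad-52a24ddea72e?from=%2F')

# wait up to 10 seconds for the price div to appear in the DOM
price = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.p-offer__price-new'))
)
print(price.text)
driver.quit()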
Can anyone tell me where the problem is? I am new to Python and I want to get all the links from this page. Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

response = requests.get('https://www.industrystock.com/en/companies/Agriculture')
soup = BeautifulSoup(response.text, 'lxml')

link_list = []
page1 = soup.find_all('a', class_='btn awe-info gotoJS iconColor_white')
for i in page1:
    link = i.get('href')
    link_list.append(link)
The links to the company profiles are stored in the data-href attribute:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.industrystock.com/en/companies/Agriculture")
soup = BeautifulSoup(r.content, "lxml")

page1 = soup.find_all("a", class_="btn awe-info gotoJS iconColor_white")
for i in page1:
    print(i["data-href"])
Prints:
https://www.industrystock.com/en/company/profile/ARCA-Internet-Services-Ltd./370071
https://www.industrystock.com/en/company/profile/Забайкальская-аграрная-Ассоциация-образовательных-и-научных-учреждений/256182
https://www.industrystock.com/en/company/profile/...VÁŠ-INTERIÉR-s.r.o./534809
https://www.industrystock.com/en/company/profile/1-WITOS-s.r.o./529071
https://www.industrystock.com/en/company/profile/1.-TOUŠEŇSKÁ-s.r.o./544981
https://www.industrystock.com/en/company/profile/1.HEFAISTOS-s.r.o./541263
https://www.industrystock.com/en/company/profile/1.HRADECKÁ-ZEMĚDĚLSKÁ-a.s./548267
https://www.industrystock.com/en/company/profile/1.MAXIMA-INTERNATIONAL-s.r.o./530049
https://www.industrystock.com/en/company/profile/1.MIROSLAVSKÁ-STROJÍRNA-spol.-s-r.o./544781
https://www.industrystock.com/en/company/profile/1.VASTO-spol.-s-r.o./535985
https://www.industrystock.com/en/company/profile/1C-PRO-s.r.o./534831
https://www.industrystock.com/en/company/profile/1CSC-a.s./528169
https://www.industrystock.com/en/company/profile/1P-CONTROL/549995
https://www.industrystock.com/en/company/profile/2-ES-spol.-s-r.o./547849
https://www.industrystock.com/en/company/profile/2-G-SERVIS-spol.-s-r.o./528391
https://www.industrystock.com/en/company/profile/2-JCP-a.s./537151
https://www.industrystock.com/en/company/profile/2-THETA-ASE-s.r.o./545079
https://www.industrystock.com/en/company/profile/2LMAKERS-s.r.o./542127
https://www.industrystock.com/en/company/profile/2M-SERVIS-s.r.o./550923
https://www.industrystock.com/en/company/profile/2M-STATIC-s.r.o./549935
https://www.industrystock.com/en/company/profile/2M-STROJE-s.r.o./539885
https://www.industrystock.com/en/company/profile/2TMOTORS-s.r.o./543869
https://www.industrystock.com/en/company/profile/2VV-s.r.o./538993
https://www.industrystock.com/en/company/profile/2xSERVIS-s.r.o./528321
https://www.industrystock.com/en/company/profile/3-PLUS-1-SERVICE-s.r.o./535103
https://www.industrystock.com/en/company/profile/3-TOOLING-s.r.o./540599
https://www.industrystock.com/en/company/profile/3B-SOCIÁLNÍ-FIRMA-s.r.o./535127
https://www.industrystock.com/en/company/profile/3D-KOVÁRNA-s.r.o./549765
https://www.industrystock.com/en/company/profile/3D-TECH-spol.-s-r.o./548047
https://www.industrystock.com/en/company/profile/3DNC-SYSTEMS-s.r.o./549379
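Since the question already imports pandas, here is a minimal sketch for persisting the collected links to a CSV file (the column and file names are just examples; page1 is the result set from the snippet above):
import pandas as pd

links = [a["data-href"] for a in page1]
pd.DataFrame({"profile_url": links}).to_csv("agriculture_links.csv", index=False)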
Try this:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.industrystock.com/en/companies/Agriculture')
soup = BeautifulSoup(response.text, 'lxml')

link_list = []
page1 = soup.find_all('a', {"class": 'btn awe-info gotoJS iconColor_white'})
for i in page1:
    link = i.get('data-href')  # the profile URL is in data-href, not href
    link_list.append(link)
And I would also recommend using html.parser if you are not scraping XML.
I am trying to create a list of all football teams/links from any one of a number of tables within the base URL: https://fbref.com/en/comps/10/stats/Championship-Stats
I would then use the link from the href to scrape each individual team's data. The href is embedded within the th tag, as below:
<th scope="row" class="left " data-stat="squad"><a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley</a></th>
The following code gives me a list of the 'th' tags (each containing an 'a' tag):
import requests
from bs4 import BeautifulSoup

page = "https://fbref.com/en/comps/10/Championship-Stats"
pageTree = requests.get(page)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
Teams = pageSoup.find_all("th", {"class": "left"})
Output (for each th with class 'left'):
<th class="left" data-stat="squad" scope="row"><a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley</a></th>,
I have tried the guidance from a previous Stack Overflow question (Extract links after th in beautifulsoup).
However, the following code based on that thread produces an error:
AttributeError: 'NoneType' object has no attribute 'find_parent'
def import_TeamList():
    BASE_URL = "https://fbref.com/en/comps/10/Championship-Stats"
    r = requests.get(BASE_URL)
    soup = BeautifulSoup(r.text, 'lxml')
    team_list = []
    team_tr = soup.find('a', {'data-stat': 'squad'}).find_parent('tr')
    for tr in team_tr.find_next_siblings('tr'):
        if tr.find('a').text != 'squad':
            break
        team_list.append(BASE_URL + tr.find('a')['href'])
    return team_list
Here is a version using CSS selectors, which I find simpler than most other methods.
import requests
from bs4 import BeautifulSoup

url = 'https://fbref.com/en/comps/10/stats/Championship-Stats'
data = requests.get(url).text
soup = BeautifulSoup(data, 'html.parser')

links = soup.select('th a')  # every anchor inside a table header cell
urls = [link['href'] for link in links]
print(urls)
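The hrefs are site-relative (e.g. /en/squads/...), so if you want to request each team page afterwards, you likely need absolute URLs. A small sketch using urllib.parse.urljoin on the urls list from above:
from urllib.parse import urljoin

team_urls = [urljoin('https://fbref.com', u) for u in urls]
print(team_urls)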
Is this what you're looking for?
import requests
from bs4 import BeautifulSoup as BS
from lxml import etree

with requests.Session() as session:
    r = session.get('https://fbref.com/en/comps/10/stats/Championship-Stats')
    r.raise_for_status()

dom = etree.HTML(str(BS(r.text, 'lxml')))
for a in dom.xpath('//th[@class="left"]/a'):
    print(a.attrib['href'])
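Since //th[@class="left"] also matches other left-aligned header cells, you could narrow the XPath to the squad column using the data-stat attribute shown in the question's markup (a sketch reusing the dom object from above):
for a in dom.xpath('//th[@data-stat="squad"]/a'):
    print(a.attrib['href'])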
import requests
from bs4 import BeautifulSoup

r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})
print(result)
From the above code, I am able to scrape this data
https://www.pastiebin.com/5f08080b8db82
Now I would like to scrape only the titles of the songs and then make a list out of them, like below:
Meri Aashiqui
Genda Phool
Any suggestions are much appreciated!
Try this:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})

# each track's details sit in a div with class 'track_npqitemdetail'
div = result.find_all('div', class_='track_npqitemdetail')
name_list = []
for x in div:
    span = x.find('span').text  # the first span holds the song title
    name_list.append(span)
print(name_list)
This code will collect every song name in the name_list list.
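For reference, the same extraction also fits in a single list comprehension (same tags and classes as above):
name_list = [x.find('span').text for x in result.find_all('div', class_='track_npqitemdetail')]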
https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king
I want to get the TOMATOMETER and AUDIENCE SCORE from that website, but I got an empty list from this code:
soup = BeautifulSoup(html, 'html.parser')
notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
You can use a :last-child selector for the span inside the parent class. This needs BeautifulSoup 4.7.1+, whose Soup Sieve backend supports these CSS selectors.
import requests
from bs4 import BeautifulSoup as bs

res = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king')
soup = bs(res.content, 'lxml')
ratings = [item.text.strip() for item in soup.select('h1.mop-ratings-wrap__score span:last-child')]
print(ratings)
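If you need to tell the two numbers apart, a small sketch, assuming the tomatometer heading precedes the audience score in the page order:
labels = ['tomatometer', 'audience_score']
print(dict(zip(labels, ratings)))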
Your code works well:
>>> import requests
>>> from bs4 import BeautifulSoup
>>> html = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king').text
>>> soup = BeautifulSoup(html, 'html.parser')
>>> notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
>>> notices
[<span class="mop-ratings-wrap__percentage">93%</span>]
How did you get the html variable?
Again I am having trouble scraping hrefs with BeautifulSoup. I have a list of pages that I am scraping, and I get the data fine, but I can't seem to get the hrefs, even with approaches that work in my other scripts.
Here is the code, and my data will be below that:
import requests
from bs4 import BeautifulSoup

with open('states_names.csv', 'r') as reader:
    states = [state.strip().replace(' ', '-') for state in reader]

url = 'https://www.hauntedplaces.org/state/'
for state in states:
    page = requests.get(url + state)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.findAll('div', class_='description')
    # When I try to add .get('href') I get a traceback error. Am I trying to scrape the href too early?
    h_page = soup.findAll('h3')
<h3>Gaines Ridge Dinner Club</h3>
<h3>Purifoy-Lipscomb House</h3>
<h3>Kate Shepard House Bed and Breakfast</h3>
<h3>Cedarhurst Mansion</h3>
<h3>Crybaby Bridge</h3>
<h3>Gaineswood Plantation</h3>
<h3>Mountain View Hospital</h3>
This works perfectly:
from bs4 import BeautifulSoup
import requests

url = 'https://www.hauntedplaces.org/state/Alabama'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')
for link in soup.select('div.description a'):
    print(link['href'])
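To plug this into your state loop, a minimal sketch, assuming states was built from states_names.csv exactly as in the question:
import requests
from bs4 import BeautifulSoup

base = 'https://www.hauntedplaces.org/state/'
for state in states:
    r = requests.get(base + state)
    soup = BeautifulSoup(r.text, 'lxml')
    for link in soup.select('div.description a'):
        print(link['href'])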
Try this:
# 'page' is the response from your requests.get(...) call
soup = BeautifulSoup(page.content, 'html.parser')
list0 = []
possible_links = soup.find_all('a')
for link in possible_links:
    if link.has_attr('href'):
        print(link.attrs['href'])
        list0.append(link.attrs['href'])
print(list0)
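As a follow-up, BeautifulSoup can do the has_attr filtering for you: passing href=True to find_all returns only anchors that actually carry an href:
links = [a['href'] for a in soup.find_all('a', href=True)]
print(links)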