Extracting a label name from a web page using BeautifulSoup (bsObj) in Python - python

I want to get the name of the link target (the store name) from a web page.
This is what I have done so far:
# Search URL for the product code 'N3580-5092'.
check ='https://www.zap.co.il/search.aspx?keyword='+'N3580-5092'
# Fetch the search URL.
r = requests.get(check)
# Fetch again, this time from the URL recorded on the first response.
html = requests.get(r.url)
# Parse the raw response bytes using the 'xml' parser.
bsObj = BeautifulSoup(html.content,'xml')
# Look up a div.BuyButtonsTxt nested inside a div.StoresLines.
storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt')
the result is :
<div class="BuyButtonsTxt">
ב-<a aria-label="לקנייה ב-פיסי אונליין Dell Inspiron 15 3580
N3580-5092" href="/fs.aspx?pid=666473435&sog=c-pclaptop" id=""
target="_blank">פיסי אונליין</a>
</div>
I want only the text of the link (the content of the <a> element): "פיסי אונליין"
How can I do it?

I had to change bsObj = BeautifulSoup(html.content,'xml') to bsObj = BeautifulSoup(html.content,'html.parser'), as the 'xml' wouldn't find the tag for me
from bs4 import BeautifulSoup
import requests
# Search results page for the product code.
check = 'https://www.zap.co.il/search.aspx?keyword=' + 'N3580-5092'
# A single request is enough: requests follows redirects by default, so this
# response already holds the final page. Re-fetching r.url only downloaded
# the same page a second time.
html = requests.get(check)
# 'html.parser' is used here because the 'xml' parser did not find the tag.
bsObj = BeautifulSoup(html.content, 'html.parser')
storeName = bsObj.select_one('div.StoresLines div.BuyButtonsTxt')
# The store name is the text of the <a> inside the matched div.
text = storeName.find('a').text
Output:
'פיסי אונליין'

Related

bs4: splitting text with same class - python

I am web scraping for the first time, and ran into a problem: some classes have the same name.
This is the code:
# Product page used for testing.
testlink = 'https://www.ah.nl/producten/product/wi387906/wasa-volkoren'
r = requests.get(testlink)
soup = BeautifulSoup(r.content, 'html.parser')
# Collect every <dd> carrying this class; several elements share the same class name.
products = (soup.findAll('dd', class_='product-info-definition-list_value__kspp6'))
And this is the output
[<dd class="product-info-definition-list_value__kspp6">13 g</dd>, <dd class="product-info-definition-list_value__kspp6">20</dd>, <dd class="product-info-definition-list_value__kspp6">Rogge, Glutenbevattende Granen</dd>, <dd class="product-info-definition-list_value__kspp6">Sesamzaad, Melk</dd>]
I need to get the 3rd element (Rogge, Glutenbevattende Granen)... I am using this link to test, and eventually want to scrape multiple pages of the website. Does anyone have any tips?
Thank you!
You can select all of the dd tags with the class value product-info-definition-list_value__kspp6 and then use list slicing:
import requests
from bs4 import BeautifulSoup
# Listing URL template; {page} is filled in for every results page.
listing_template = 'https://www.ah.nl/producten/pasta-rijst-en-wereldkeuken?page={page}'
card_link_selector = 'div[class="product-card-portrait_content__2xN-b"] a'
value_selector = 'dd[class="product-info-definition-list_value__kspp6"]'

for page_number in range(1, 11):
    listing_response = requests.get(listing_template.format(page=page_number))
    listing_soup = BeautifulSoup(listing_response.content, 'html.parser')
    # Follow every product link found on the listing page.
    for anchor in listing_soup.select(card_link_selector):
        product_url = 'https://www.ah.nl' + anchor.get('href')
        product_response = requests.get(product_url)
        product_soup = BeautifulSoup(product_response.content, 'html.parser')
        values = [node.get_text() for node in product_soup.select(value_selector)]
        # Keep only the middle of the list: drop the first two and last two values.
        print(values[2:-2])

How can I extract only the string I want using bs4?

This is the HTML structure:
<a class="snb_s11 nclicks(lmn_pol.mnl,,1)">BlueHouse <span class="blind">selected</span></a>
And below is my source code to get only BlueHouse:
middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'})
When I run that code to get only BlueHouse, the result also includes "selected".
But I want only BlueHouse. How can I do that?
Below is my full code:
def crwaling_data_bluehouse(self):
    """Load the news.naver.com list page in Chrome and record three values.

    The main category, middle category and title are read from the parsed
    page and handed to this object's set_main_category,
    set_middle_category and set_title methods.
    """
    # setting web driver to get object
    chrome_driver = webdriver.Chrome('D:/바탕 화면/인턴/python/crawling_software/crwaler/news_crwaling/chromedriver.exe')
    url = 'https://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264'
    chrome_driver.get(url)
    # Parse the page source as rendered by the browser.
    html = chrome_driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    #get main category
    main_category = soup.find('a',{'class':'nclicks(LNB.pol)'}).find('span',{'class':'tx'}).get_text()
    self.set_main_category(main_category)
    #get middle category
    # NOTE(review): get_text() is called first, so find_next() on the next
    # line is invoked on its result rather than on the tag -- confirm this
    # is intended.
    middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'}).get_text()
    middle_category = middle_category.find_next(text = True)
    self.set_middle_category(middle_category)
    #get title
    # The value stored is the href of the first <a> inside ul.type06_headline.
    title = soup.find('ul',{'class':'type06_headline'}).find('a')['href']
    self.set_title(title)
You can use find_next() which will only return the first match:
from bs4 import BeautifulSoup
# Sample markup: the category link whose visible text is followed by a hidden
# "selected" marker. The <a> wrapper is required -- without it the lookup
# below returns None and the find_next() call fails.
txt = """<a class="snb_s11 nclicks(lmn_pol.mnl,,1)">BlueHouse <span class="blind">selected</span></a>"""
soup = BeautifulSoup(txt, 'html.parser')
middle_category = soup.find('a', {'class': 'snb_s11 nclicks(lmn_pol.mnl,,1)'})
# The first text node after the opening <a> tag is the category name itself.
print(middle_category.find_next(text=True))
Output:
BlueHouse
Edit: don't call get_text(). Instead of
middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'}).get_text()
Use middle_category = soup.find('a', {'class': 'snb_s11 nclicks(lmn_pol.mnl,,1)'}).find_next(text=True)

Certain content not loading when scraping a site with Beautiful Soup

I'm trying to scrape the ratings off recipes on NYT Cooking but having issues getting the content I need. When I look at the source on the NYT page, I see the following:
<div class="ratings-rating">
<span class="ratings-header ratings-content">194 ratings</span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content four-star-rating avg-rating">
The content I'm trying to pull out is 194 ratings and four-star-rating. However, when I pull in the page source via Beautiful Soup I only see this:
<div class="ratings-rating">
<span class="ratings-header ratings-content"><%= header %></span>
<div class="ratings-stars-wrap">
<div class="ratings-stars ratings-content <%= ratingClass %> <%= state %>">
The code I'm using is:
url = 'https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill'
# Request the recipe page with custom headers and a timeout of 15
# (get and headers are defined outside this snippet; presumably requests.get).
r = get(url, headers = headers, timeout=15)
# Parse the response text with 'html.parser' (soup is presumably an alias of BeautifulSoup).
page_soup = soup(r.text,'html.parser')
Any thoughts why that information isn't pulling through?
Try using below code
import requests
import lxml
from lxml import html
import re
url = "https://cooking.nytimes.com/recipes/1019706-spiced-roasted-cauliflower-with-feta-and-garlic?action=click&module=Recirculation%20Band%20Recipe%20Card&region=More%20recipes%20from%20Alison%20Roman&pgType=recipedetails&rank=1"
r = requests.get(url)
tree = html.fromstring(r.content)
# The values are assigned in an inline <script>; its position in <body> is
# hard-coded here and has to be adjusted if the page layout changes.
t = tree.xpath('/html/body/script[14]')[0]


def _bootstrap_value(script_text, name):
    """Return the raw text assigned to ``bootstrap.recipe.<name>``.

    The value is everything between ``"bootstrap.recipe.<name> = "`` and the
    next ``;``. Returns None when the assignment is not present.
    """
    # Dots are escaped so they match literally instead of "any character".
    pattern = r"bootstrap\.recipe\." + re.escape(name) + r" = ([^;]*);"
    found = re.search(pattern, script_text)
    return found.group(1) if found else None


# look for value for bootstrap.recipe.avg_rating
rating = _bootstrap_value(t.text, "avg_rating")
print(rating)
# look for value for bootstrap.recipe.num_ratings
star = _bootstrap_value(t.text, "num_ratings")
print(star)
It is much easier to use attribute=value selectors to grab the values from the span with class ratings-metadata:
import requests
from bs4 import BeautifulSoup
recipe_url = 'https://cooking.nytimes.com/recipes/1020049-lemony-chicken-soup-with-fennel-and-dill'
response = requests.get(recipe_url)
soup = BeautifulSoup(response.content, 'lxml')
# Each value is read from the first element carrying the matching itemprop attribute.
rating_node = soup.select_one('[itemprop=ratingValue]')
rating = rating_node.text
count_node = soup.select_one('[itemprop=ratingCount]')
ratingCount = count_node.text
print(rating, ratingCount)

Data missing on requests.get() Python 2

I want to webscrape the IAA Consensus price on https://www.settrade.com/AnalystConsensus/C04_10_stock_saa_p1.jsp?txtSymbol=PTT&ssoPageId=9&selectPage=10
In Google chrome inspect elements, I can use <h3> through beautifulsoup to get the data. But from the print page.content I get
...
<h3 class="colorGreen"></h3>
...
Where it should be <h3 class="colorGreen">62.00</h3>
Here's my code
import requests
from bs4 import BeautifulSoup
def findPrice(Quote):
    """Fetch the settrade AnalystConsensus page for ``Quote`` and look up its <h3> tags.

    Quote is concatenated into the txtSymbol query parameter of the URL.
    """
    link = "http://www.settrade.com/AnalystConsensus/C04_10_stock_saa_p1.jsp?txtSymbol="+Quote+"&ssoPageId=9&selectPage=10"
    page = requests.get(link)
    soup = BeautifulSoup(page.content,'html.parser')
    # Dump the raw response body for inspection.
    print page.content
    # NOTE(review): findAll collects every <h3>, so `.string` below is read
    # from that collection rather than from a single tag -- confirm.
    target = soup.findAll('h3')
    return target.string
findPrice('PTT')
I guess, the server is checking for a LstQtLst cookie and generates the HTML with the "Consensus Target Price" filled in.
import requests
from bs4 import BeautifulSoup
def find_price(quote):
    """Return the string of the first <h3> on the settrade consensus page.

    ``quote`` is used both as the txtSymbol query parameter and as the value
    of the LstQtLst cookie sent with the request.
    """
    endpoint = 'http://www.settrade.com/AnalystConsensus/C04_10_stock_saa_p1.jsp'
    query = '?txtSymbol={}&ssoPageId=9&selectPage=10'.format(quote)
    response = requests.get(endpoint + query, cookies={'LstQtLst': quote})
    soup = BeautifulSoup(response.text, 'html.parser')
    first_heading = soup.find('h3')
    return first_heading.string
>>> find_price('PTT')
62.00

Python /bs4: trying to print temperature/city from a local website

I'm trying to get and print the current temperature and city name from a local website, but without success.
All I need is to read and print the city (Londrina), the temperature (23.1°C) and, if possible, the title in ca-cond-firs ("Temperatura em declínio") - this last one changes as the temperature goes up or down...
This is the relevant HTML section of the site:
THIS IS THE HTML (the part that matters):
#<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
THIS IS THE CODE I DID SO FAR:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
# Download the page and parse it with the lxml parser.
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# Search for an <a> tag, passing the literal string 'id=23185109' as the second argument.
id = soup.find('a', 'id=23185109')
print(id)
any help?
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
page_markup = requests.get(URL).text
soup = BeautifulSoup(page_markup, 'html.parser')  # parse page as html
# All tables with the class name cidadeTempo.
forecast_tables = soup.find_all('table', {'class': 'cidadeTempo'})
row_template = "City :{} \t Max_temp: {} \t Min_temp: {}"
for table in forecast_tables:
    name = table.find('h3').text  # name of the city
    hottest = table.find('span', {'class': 'tempMax'}).text  # max temperature
    coldest = table.find('span', {'class': 'tempMin'}).text  # min temperature
    print(row_template.format(name, hottest, coldest))
The code below gets the temperature details shown on the right side of the page, as you require.
# Grab the div with class name ca-content-wrapper.
result_table = soup.find('div', {'class':'ca-content-wrapper'})
# In your case no other div exists with the class name ca-content-wrapper,
# so it can be used directly without iterating. You can use an if condition
# to control which city's temperature is printed and which is not.
print(result_table.text)
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm
I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
# attrs is a dict mapping attribute name to the wanted value. The previous
# {'class', 'ca-cond-firs'} was a set literal (comma instead of colon), which
# does not express "class equals ca-cond-firs".
rows = soup.findAll('li', {'class': 'ca-cond-firs'})
# print() with a single argument behaves the same on Python 2 and Python 3.
print(rows)
You should try out the CSS3 selectors in BS4, I personally find it a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that match the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status from the title attribute of each li.ca-cond-firs > img
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# The three lists are normally the same length; zip walks them in lockstep
# and simply stops at the shortest one instead of indexing past the end.
for city, temp, status in zip(cities, temps, tempStatus):
    print("City: {}, Temperature: {}, Status: {}.".format(city, temp, status))
Here you go. You can customize that wind thing depending on icon name.
#!/usr/bin/env python
# -*- encoding: utf8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
import requests
def get_weather_data():
    """Scrape the Simepar index page and return a list with one dict per city block.

    Each dict has the keys "city", "temp" and "conditions"; "conditions" is a
    one-element list holding a dict with "wind", "humidity" and "raind".
    """
    page_url = 'http://www.simepar.br/site/index.shtml'
    markup = requests.get(page_url).text
    soup = BeautifulSoup(markup, 'html.parser')
    wrapper = soup.find('div', {"class": "ca-content-wrapper"})

    records = []
    # Every div.ca-bg inside the wrapper is treated as one city.
    for block in wrapper.findAll("div", {"class": "ca-bg"}):
        city_name = block.find("div", {"class": "ca-cidade"}).text
        temperature = block.find("li", {"class": "ca-cond-firs"}).text
        extras = block.findAll("li", {"class": "ca-cond"})
        # The wind entry is its text plus the direction derived from the icon path.
        wind_text = extras[0].text + " " + what_wind(extras[0].find("img")["src"])
        details = {
            "wind": wind_text,
            "humidity": extras[1].text,
            "raind": extras[2].text,
        }
        records.append({
            "city": city_name,
            "temp": temperature,
            "conditions": [details],
        })
    return records
def what_wind(img):
    """Translate a wind-icon path into a readable wind direction.

    Args:
        img: the ``src`` of the wind icon, e.g.
            "/site/imagens/icones_condicoes/vento/NE.png". The file name
            without its extension is the direction code.

    Returns:
        The description for the direction code, or "" when the code is not
        in the table below (so callers can still concatenate the result).
    """
    # The earlier ``if img.find("NE"):`` tested the *index* returned by
    # str.find: -1 (truthy) when the text is absent and 0 (falsy) when it is
    # at the start, so almost every icon was reported as "From North East".
    directions = {
        "NE": "From North East",
        "O": "From West",
        "N": "From North",
        # you can add other icons here
    }
    # Keep only the file name, then drop the extension: ".../NE.png" -> "NE".
    icon_code = img.rsplit("/", 1)[-1].split(".", 1)[0]
    return directions.get(icon_code, "")
print get_weather_data()
And that is all weather data from that website.

Categories

Resources