BeautifulSoup not finding the tag - Python

I'm working on a project using BeautifulSoup.
from bs4 import BeautifulSoup as soup
from requests import get
url = "https://www.yelp.com/search?find_desc=&find_loc=New+York%2C+NY&ns=1"
clnt = get(url)
page = soup(clnt.text, "html.parser")
container = page.findAll("div",{"class":"lemon--div__373c0__1mboc container__373c0__ZB8u4 hoverable__373c0__3CcYQ margin-t3__373c0__1l90z margin-b3__373c0__q1DuY padding-t3__373c0__1gw9E padding-r3__373c0__57InZ padding-b3__373c0__342DA padding-l3__373c0__1scQ0 border--top__373c0__3gXLy border--right__373c0__1n3Iv border--bottom__373c0__3qNtD border--left__373c0__d1B7K border-color--default__373c0__3-ifU"})
container = container[1]
url2= "https://www.yelp.com"+container.a["href"]
clnt2 = get(url2)
page2 = soup(clnt2.text, 'html.parser')
info = page2.find("div",{"class":"lemon--div__373c0__1mboc island__373c0__3fs6U u-padding-t1 u-padding-r1 u-padding-b1 u-padding-l1 border--top__373c0__19Owr border--right__373c0__22AHO border--bottom__373c0__uPbXS border--left__373c0__1SjJs border-color--default__373c0__2oFDT background-color--white__373c0__GVEnp"})
contact = info.div  # example contact variable: the first child div of info
In this info variable I am getting the div which has all the contact details, and I want to grab the contact number from this div.
When I print info, it shows that the contact number is present along with the other details, but as I traverse through the divs to get to the contact number, I can't find it.
I have also tried getting all the child divs, and even searching with the class of the div itself, but I wasn't able to get it.
The first url is: https://www.yelp.com/search?find_desc=&find_loc=New+York%2C+NY&ns=1
The second url (url2) is: https://www.yelp.com/biz/levain-bakery-new-york, which has the contact details.
Any solutions?

You can get the contact number with the help of the class name, but I seriously doubt it will work for any given page, as the class names seem to be dynamically generated. Still, you could give it a try.
from bs4 import BeautifulSoup as soup
from requests import get
url = "https://www.yelp.com/search?find_desc=&find_loc=New+York%2C+NY&ns=1"
clnt = get(url)
page = soup(clnt.text, "html.parser")
container = page.findAll("div",{"class":"lemon--div__373c0__1mboc container__373c0__ZB8u4 hoverable__373c0__3CcYQ margin-t3__373c0__1l90z margin-b3__373c0__q1DuY padding-t3__373c0__1gw9E padding-r3__373c0__57InZ padding-b3__373c0__342DA padding-l3__373c0__1scQ0 border--top__373c0__3gXLy border--right__373c0__1n3Iv border--bottom__373c0__3qNtD border--left__373c0__d1B7K border-color--default__373c0__3-ifU"})
container = container[1]
url2 = "https://www.yelp.com" + container.a["href"]
clnt2 = get(url2)
page2 = soup(clnt2.text, 'html.parser')
info = page2.find("div",{"class":"lemon--div__373c0__1mboc island__373c0__3fs6U u-padding-t1 u-padding-r1 u-padding-b1 u-padding-l1 border--top__373c0__19Owr border--right__373c0__22AHO border--bottom__373c0__uPbXS border--left__373c0__1SjJs border-color--default__373c0__2oFDT background-color--white__373c0__GVEnp"})
ContactNumber = info.find("p",{"class":"lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--left__373c0__2pnx_"})
print(ContactNumber.text)
Output:
(917) 464-3769
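Since those hashed class names are likely to change, a class-free fallback is worth a try: search the page text for a US phone pattern with a regex. This is only a sketch, assuming the number is rendered as plain text, as in the output above:
import re
from bs4 import BeautifulSoup as soup
from requests import get

url2 = "https://www.yelp.com/biz/levain-bakery-new-york"
page2 = soup(get(url2).text, "html.parser")
# matches the "(NNN) NNN-NNNN" format shown in the output above
phone = re.search(r"\(\d{3}\)\s*\d{3}-\d{4}", page2.get_text(" "))
if phone:
    print(phone.group())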

The following works for me:
from bs4 import BeautifulSoup as soup
from requests import get
url = "https://www.yelp.com/search?find_desc=&find_loc=New+York%2C+NY&ns=1"
clnt = get(url)
page = soup(clnt.text, "html.parser")
container = page.findAll("div",{"class":"lemon--div__373c0__1mboc container__373c0__ZB8u4 hoverable__373c0__3CcYQ margin-t3__373c0__1l90z margin-b3__373c0__q1DuY padding-t3__373c0__1gw9E padding-r3__373c0__57InZ padding-b3__373c0__342DA padding-l3__373c0__1scQ0 border--top__373c0__3gXLy border--right__373c0__1n3Iv border--bottom__373c0__3qNtD border--left__373c0__d1B7K border-color--default__373c0__3-ifU"})
container = container[1]
url2 = "https://www.yelp.com" + container.a["href"]
clnt2 = get(url2)
page2 = soup(clnt2.text, 'html.parser')
info = page2.find("div",{"class":"lemon--div__373c0__1mboc island__373c0__3fs6U u-padding-t1 u-padding-r1 u-padding-b1 u-padding-l1 border--top__373c0__19Owr border--right__373c0__22AHO border--bottom__373c0__uPbXS border--left__373c0__1SjJs border-color--default__373c0__2oFDT background-color--white__373c0__GVEnp"})
p_tags_of_info = info.find_all("p")
print(p_tags_of_info[2].text)
I just extracted all the <p> tags from the info variable, and then selected the text of the third <p> tag.
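Indexing by position breaks if Yelp reorders the paragraphs; a hedged variation is to pick the <p> by its content instead (this assumes the number is the tag's direct text, as it is here):
import re
# reusing `info` from the script above
phone_p = info.find("p", string=re.compile(r"\(\d{3}\)\s*\d{3}-\d{4}"))
if phone_p is not None:
    print(phone_p.text)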

Try this. I tested it with the second URL, and it returns the contact number.
import requests
from bs4 import BeautifulSoup

url = 'https://www.yelp.com/biz/levain-bakery-new-york'
page = requests.get(url)
soup1 = BeautifulSoup(page.content, "lxml")
info = soup1.find_all("div", {
    "class": "lemon--div__373c0__1mboc island-section__373c0__3vKXy border--top__373c0__19Owr border-color--default__373c0__2oFDT"})
contact_number = info[1].find("p", attrs={
    'class': 'lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--normal__373c0__K_MKN text-align--left__373c0__2pnx_'}).text
print(contact_number)
Please accept the answer if it's useful; it might help others too.

Related

How to get just links of articles in list using BeautifulSoup

Hey guys, so I got as far as being able to add the <a> elements to a list. The problem is I just want the href link to be added to the links_with_text list, not the entire <a> element. What am I doing wrong?
from bs4 import BeautifulSoup
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='hnmain')
articles = results.find_all(class_="title")
links_with_text = []
for article in articles:
    link = article.find('a', href=True)
    links_with_text.append(link)
print('\n'.join(map(str, links_with_text)))
This prints exactly how I want the list to print, but I just want the href from each <a> element, not the whole element. Thank you.
To get all the links from https://news.ycombinator.com, you can use the CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
links_with_text = []
for a in soup.select('a.storylink'): # <-- find all <a> with class="storylink"
    links_with_text.append(a['href'])  # <-- note the ['href']
print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html
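If you also want each article's title next to its link, a small variation of the same loop (reusing soup from above) pairs the anchor text with the href:
for a in soup.select('a.storylink'):
    print(a.get_text(strip=True), '->', a['href'])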

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page. Here is my code so far:
from bs4 import BeautifulSoup
import requests
import re

# function: get a random page
def get_random():
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url

#========================
# finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile(r'/\w+:').search(href):
                return True
    return False
#validlink()===========

# the first site
a1 = get_random()
#print("the first site is: " + a1)

# looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class': 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")
#=============================

# first article done; find the body:
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
    print(url)  # prints every href; still need to filter with validlink()
#======================
I know I'm doing this last part wrong. I'm new to Python, so I have no idea how to go about it. What I need is to pull all of the links from the page the random URL takes me to, along with its title.
Then I need to pull the Wikipedia links off of that page, which is what I am attempting in the last bit of code above.
At that point I want to print all of the links it finds, after they have been tested against my valid-link function at the top.
Forgive me for being new and not understanding this, but please help; I cannot figure it out.
So my question is: I need a snippet of code that pulls all of the links off of the Wikipedia page (note: I still don't know how to do this; the for loop was my best guess based on my own research), tests them against my validlink function, and prints out all of the valid links.
If you want it as a list, create a new list and append() each url that is valid.
The same url can appear many times on a page, so I also check whether the url is already in the list.
valid_urls = []
for link in soup.find_all('a'):  # or: find_all('a', {'href': True})
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)
print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'):  # you don't need `re` to check it
            if not re.compile(r'/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'
r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')
title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []
for link in soup.find_all('a'):  # or: find_all('a', {'href': True})
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)
#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))
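Note that the collected hrefs are relative ('/wiki/...'). If you need absolute links, urljoin from the standard library can resolve them against the page URL (continuing the script above):
from urllib.parse import urljoin

full_urls = [urljoin(r.url, url) for url in valid_urls]
print('\n'.join(full_urls))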

Scraping each element from website with BeautifulSoup

I wrote some code for scraping a real estate website. This is the link:
https://www.nekretnine.rs/stambeni-objekti/stanovi/lista/po-stranici/10/
From this page I can only get the location, size, and price of each apartment, but is it possible to write code that will visit each apartment's own page and scrape values from it? It contains much more info. Check this link:
https://www.nekretnine.rs/stambeni-objekti/stanovi/arena-bulevar-arsenija-carnojevica-97m-2-lode-energoprojekt/NkvJK0Ou5tV/
I have posted my code below. I noticed that the url changes when I click on a specific listing. For example:
arena-bulevar-arsenija-carnojevica-97m-2-lode-energoprojekt/NkvJK0Ou5tV/
I thought about creating a for loop, but there is no way to know how the url changes, because it has an id at the end:
NkvJK0Ou5tV
This is the code that I have:
from bs4 import BeautifulSoup
import requests
website = "https://www.nekretnine.rs/stambeni-objekti/stanovi/lista/po-stranici/10/"
soup = requests.get(website).text
my_html = BeautifulSoup(soup, 'lxml')
lokacija = my_html.find_all('p', class_='offer-location text-truncate')
ukupna_kvadratura = my_html.find_all('p', class_='offer-price offer-price--invert')
ukupna_cena = my_html.find_all('div', class_='d-flex justify-content-between w-100')
ukupni_opis = my_html.find_all('div', class_='mt-1 mb-1 mt-lg-0 mb-lg-0 d-md-block offer-meta-info offer-adress')
for lok, kvadratura, cena_stana, sumarno in zip(lokacija, ukupna_kvadratura, ukupna_cena, ukupni_opis):
    lok = lok.text.split(',')[0]              # location
    kv = kvadratura.span.text.split(' ')[0]   # size
    jed = kvadratura.span.text.split(' ')[1]  # unit of measure
    cena = cena_stana.span.text               # price
    sumarno = sumarno.text
    datum = sumarno.split('|')[0].strip()
    status = sumarno.split('|')[1].strip()
    opis = sumarno.split('|')[2].strip()
    print(lok, kv, jed, cena, datum, status, opis)
You can get the href from the div with class="placeholder-preview-box ratio-4-3".
From there you can build each listing's URL.
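A minimal sketch of that idea, assuming each preview box wraps an <a href> pointing at the listing (the class name comes from the answer above and may change):
from bs4 import BeautifulSoup
import requests

website = "https://www.nekretnine.rs/stambeni-objekti/stanovi/lista/po-stranici/10/"
my_html = BeautifulSoup(requests.get(website).text, 'lxml')
for box in my_html.find_all('div', class_='placeholder-preview-box ratio-4-3'):
    a = box.find('a', href=True)
    if a:
        print('https://www.nekretnine.rs' + a['href'])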
You can iterate over the links provided by the pagination at the bottom of the page:
from bs4 import BeautifulSoup as soup
import requests
d = soup(requests.get('https://www.nekretnine.rs/stambeni-objekti/stanovi/lista/po-stranici/10/').text, 'html.parser')
def scrape_page(page):
    return [{'title': i.h2.get_text(strip=True),
             'loc': i.p.get_text(strip=True),
             'price': i.find('p', {'class': 'offer-price'}).get_text(strip=True)}
            for i in page.find_all('div', {'class': 'row offer'})]

result = [scrape_page(d)]
while d.find('a', {'class': 'pagination-arrow arrow-right'}):
    d = soup(requests.get(f'https://www.nekretnine.rs{d.find("a", {"class":"pagination-arrow arrow-right"})["href"]}').text, 'html.parser')
    result.append(scrape_page(d))
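To visit each apartment's own page, as the question asks, a hedged extension of the same approach follows the first <a href> inside each 'row offer' div to the detail page. The h1 lookup below is only a placeholder; adjust the selectors to the detail page's actual markup:
from bs4 import BeautifulSoup as soup
import requests

def scrape_detail(url):
    detail = soup(requests.get(url).text, 'html.parser')
    # placeholder extraction: the page heading; replace with the fields you need
    return detail.h1.get_text(strip=True) if detail.h1 else None

listing = soup(requests.get('https://www.nekretnine.rs/stambeni-objekti/stanovi/lista/po-stranici/10/').text, 'html.parser')
for row in listing.find_all('div', {'class': 'row offer'}):
    a = row.find('a', href=True)
    if a:
        detail_url = 'https://www.nekretnine.rs' + a['href']
        print(detail_url, scrape_detail(detail_url))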

How to scrape links without href attribute?

I want to extract the links, but there is no href attribute given. How do I scrape the links from the page?
from bs4 import BeautifulSoup
import requests

for count in range(1, 421):
    r = requests.get('http://iapsm.org/MemberPage/members.php?page=' + str(count) + '&Search=',
                     headers={'User-Agent': 'Googleboat'})
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.find_all('div', class_='Table')
    for link in soup.find_all('tr'):
        c = link.get('a')
        print(c)
I'm not getting any output, and no error either.
To scrape all the details, first search for all the divs whose class is modal-content.
You can try my code below to get all the information about the users.
modals = soup.find_all('div', {'class': 'modal-content'})
user_data = []
for modal in modals:
    uls = modal.find_all('ul', {'class': 'Modal-List'})
    info = {}
    for ul in uls:
        # assuming each ul holds a label <li> followed by a value <li>
        items = ul.find_all('li')
        if len(items) >= 2:
            info[items[0].get_text(strip=True)] = items[1].get_text(strip=True)
    user_data.append(info)
print(user_data)
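More generally, when a page shows no href attributes, the target URL is often stored elsewhere, such as an onclick handler or a data-* attribute. A hedged sketch (reusing soup from the code above) for inspecting those candidates:
for tag in soup.find_all(True):
    for attr, value in tag.attrs.items():
        if attr == 'onclick' or attr.startswith('data-'):
            print(tag.name, attr, value)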

Python /bs4: trying to print temperature/city from a local website

I'm trying to get and print the current weather temperature and city name from a local website, but with no success.
All I need is to read and print the city (Londrina), the temperature (23.1°C) and, if possible, the title in ca-cond-firs ("Temperatura em declínio"); this last one changes as the temperature goes up or down.
This is the HTML section of the site (the part that matters):
<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/vento/L.png"/><br/>10 km/h</li>
<li class="ca-cond"><div class="ur">UR</div><br/>54%</li>
<li class="ca-cond"><img src="/site/imagens/icones_condicoes/chuva.png"/><br/>0.0 mm</li>
This is the code I have so far:
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
id = soup.find('a', 'id=23185109')
print(id)
any help?
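For the exact snippet posted above, a minimal parse (no network call) pulls all three values; testing against the static HTML first may help pin down the right tags before running anything against the live page:
from bs4 import BeautifulSoup

html = '''
<div class="ca-cidade">Londrina</div>
<ul class="ca-condicoes">
<li class="ca-cond-firs"><img src="/site/imagens/icones_condicoes/temperatura/temp_baixa.png" title="Temperatura em declínio"/><br/>23.1°C</li>
</ul>
'''
snippet = BeautifulSoup(html, 'html.parser')
city = snippet.find('div', class_='ca-cidade').text    # Londrina
firs = snippet.find('li', class_='ca-cond-firs')
status = firs.img['title']                             # Temperatura em declínio
temp = firs.get_text(strip=True)                       # 23.1°C
print(city, temp, status)
The answers below do the same against the live page.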
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser') # parse page as html
temp_table = soup.find_all('table', {'class':'cidadeTempo'}) # get detail of table with class name cidadeTempo
for entity in temp_table:
    city_name = entity.find('h3').text                              # name of the city
    city_temp_max = entity.find('span', {'class':'tempMax'}).text   # max temperature
    city_temp_min = entity.find('span', {'class':'tempMin'}).text   # min temperature
    print("City: {} \t Max_temp: {} \t Min_temp: {}".format(city_name, city_temp_max, city_temp_min))
The code below gets the temperature details on the right side of the page, as you require.
result_table = soup.find('div', {'class':'ca-content-wrapper'})
# There is no other div with class ca-content-wrapper on this page, so it can be
# used directly without iterating; add an if condition to control which city's
# temperature to print.
print(result_table.text)
# output will be like :
# Apucarana
# 21.5°C
# 4 km/h
# UR60%
# 0.0 mm
I'm not sure what problems you are running into with your code. In my attempts to use your code, I found that I needed to use the html parser to successfully parse the website. I also used soup.findAll() in order to find elements that matched the desired class. Hopefully the below will lead you to your answer:
from bs4 import BeautifulSoup
import requests

URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'html.parser')
rows = soup.findAll('li', {'class': 'ca-cond-firs'})
print(rows)
You should try out the CSS3 selectors in BS4; I personally find them a lot easier to use than find and find_all.
from bs4 import BeautifulSoup
import requests
URL = 'http://www.simepar.br/site/index.shtml'
rawhtml = requests.get(URL).text
soup = BeautifulSoup(rawhtml, 'lxml')
# soup.select returns the list of all the elements that matches the CSS3 selector
# get the text inside each <a> tag inside div.ca-cidade
cities = [cityTag.text for cityTag in soup.select("div.ca-cidade > a")]
# get the temperature inside each li.ca-cond-firs
temps = [tempTag.text for tempTag in soup.select("li.ca-cond-firs")]
# get the temperature status from each li.ca-cond-firs > img title attribute
tempStatus = [tag["title"] for tag in soup.select("li.ca-cond-firs > img")]
# len(cities) == len(temps) == len(tempStatus) => this is normally true
for i in range(len(cities)):
    print("City: {}, Temperature: {}, Status: {}.".format(cities[i], temps[i], tempStatus[i]))
Here you go. You can customize that wind thing depending on icon name.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests

def get_weather_data():
    URL = 'http://www.simepar.br/site/index.shtml'
    rawhtml = requests.get(URL).text
    soup = BeautifulSoup(rawhtml, 'html.parser')
    cities = soup.find('div', {"class": "ca-content-wrapper"})
    weather_data = []
    for city in cities.findAll("div", {"class": "ca-bg"}):
        name = city.find("div", {"class": "ca-cidade"}).text
        temp = city.find("li", {"class": "ca-cond-firs"}).text
        conditons = city.findAll("li", {"class": "ca-cond"})
        weather_data.append({
            "city": name,
            "temp": temp,
            "conditions": [{
                "wind": conditons[0].text + " " + what_wind(conditons[0].find("img")["src"]),
                "humidity": conditons[1].text,
                "rain": conditons[2].text,
            }]
        })
    return weather_data

def what_wind(img):
    # use `in` rather than str.find(): find() returns 0 (falsy) for a match at
    # the start and -1 (truthy) for no match, which is misleading in a boolean test
    if "NE" in img:  # check the two-letter direction before the one-letter ones
        return "From North East"
    if "O" in img:
        return "From West"
    if "N" in img:
        return "From North"
    # you can add other icons here
    return ""

print(get_weather_data())
And that is all weather data from that website.
