How can I extract only the string I want from a tag using bs4? - python

This is the HTML structure:
<a class="snb_s11 nclicks(lmn_pol.mnl,,1)">BlueHouse <span class="blind">selected</span></a>
Below is the line of my source code that is meant to get only "BlueHouse":
middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'})
When I run that code to get only "BlueHouse", the result also includes "selected".
I only want "BlueHouse". How can I do that?
Below is my full code:
def crwaling_data_bluehouse(self):
# setting web driver to get object
chrome_driver = webdriver.Chrome('D:/바탕 화면/인턴/python/crawling_software/crwaler/news_crwaling/chromedriver.exe')
url = 'https://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264'
chrome_driver.get(url)
html = chrome_driver.page_source
soup = BeautifulSoup(html, 'html.parser')
#get main category
main_category = soup.find('a',{'class':'nclicks(LNB.pol)'}).find('span',{'class':'tx'}).get_text()
self.set_main_category(main_category)
#get middle category
middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'}).get_text()
middle_category = middle_category.find_next(text = True)
self.set_middle_category(middle_category)
#get title
title = soup.find('ul',{'class':'type06_headline'}).find('a')['href']
self.set_title(title)

You can use find_next(text=True), which returns only the first matching text node, here the text that precedes the nested <span>:
from bs4 import BeautifulSoup
# Sample markup: the category link wraps the visible label plus a nested
# <span>. The <a> element has to be present in the sample, otherwise
# soup.find('a', ...) returns None and the find_next call below raises
# AttributeError.
txt = """<a class="snb_s11 nclicks(lmn_pol.mnl,,1)">BlueHouse <span class="blind">selected</span></a>"""
soup = BeautifulSoup(txt, 'html.parser')
# A class value containing spaces is compared against the exact attribute string.
middle_category = soup.find('a', {'class': 'snb_s11 nclicks(lmn_pol.mnl,,1)'})
# find_next(text=True) returns the first text node after the opening tag,
# i.e. the label in front of the nested <span>.
print(middle_category.find_next(text=True))
Output:
BlueHouse
Edit: don't call get_text() first. It returns a plain string, and a string has no find_next() method. Instead of
middle_category = soup.find('a',{'class':'snb_s11 nclicks(lmn_pol.mnl,,1)'}).get_text()
Use middle_category = soup.find('a', {'class': 'snb_s11 nclicks(lmn_pol.mnl,,1)'}).find_next(text=True)

Related

Getting only numbers from BeautifulSoup instead of whole div

I am trying to learn Python by creating a small web-scraping program to make life easier, but I am having trouble getting only the number when using BS4. I was able to get the price when I scraped an individual ad, but I would like to get all the prices from the page.
Here is my code:
from bs4 import BeautifulSoup
import requests
prices = []
url = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
result = requests.get(url)
print (result.status_code)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
print ("CLEARING")
price = soup.findAll("div", class_="price")
prices.append(price)
print (prices)
Here is my output
[<div class="price">
$46,999.00
<div class="dealer-logo">
<div class="dealer-logo-image">
<img src="https://i.ebayimg.com/00/s/NjBYMTIw/z/xMQAAOSwi9ZfoW7r/$_69.PNG"/>
</div>
</div>
</div>
Ideally, I would only want the output to be "46,999.00".
I tried with text=True, although this did not work and I would not get any output from it besides an empty list.
Thank you
You need to get the text portion of tag and then perform some regex processing on it.
import re
def get_price_from_div(div_item):
    """Return the price contained in a tag's text as a float.

    Every character other than a digit or a decimal point is removed from
    ``div_item.text`` (currency symbol, thousands separators, whitespace)
    before the remainder is converted.

    Args:
        div_item: Any object exposing a ``text`` attribute, e.g. a bs4 tag.

    Returns:
        The price as a float, e.g. ``46999.0`` for ``"$46,999.00"``.

    Raises:
        ValueError: If nothing convertible to a float is left after cleaning.
    """
    # Raw string: a dot needs no escaping inside a character class, and
    # '\.' in a plain string literal is an invalid escape sequence.
    str_price = re.sub(r'[^0-9.]', '', div_item.text)
    return float(str_price)
Just call this method in your code after you find the divs
from bs4 import BeautifulSoup
import requests
# Fetch the listing page and convert every price <div> to a float.
url = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
result = requests.get(url)
print(result.status_code)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
print("CLEARING")
# find_all is the current method name; findAll is only a legacy alias.
price = soup.find_all("div", class_="price")
# One float per matched <div>; a div without a number makes
# get_price_from_div raise ValueError.
prices = [get_price_from_div(curr_div) for curr_div in price]
print(prices)
An option without regex is to keep only the tags whose text startswith() a dollar sign ($), and then strip that leading character:
import requests
from bs4 import BeautifulSoup
# Collect every listed price as a string without its leading "$".
URL = 'https://www.kijiji.ca/b-cars-trucks/calgary/new__used/c174l1700199a49?ll=51.044733%2C-114.071883&address=Calgary%2C+AB&radius=50.0'
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
price_tags = soup.find_all("div", class_="price")
prices = []
for tag in price_tags:
    label = tag.get_text(strip=True)
    # Entries that do not start with a dollar sign are skipped.
    if label.startswith('$'):
        prices.append(label[1:])
print(prices)
Output:
['48,888.00', '21,999.00', '44,488.00', '5,500.00', '33,000.00', '14,900.00', '1,750.00', '35,600.00', '1,800.00', '25,888.00', '36,888.00', '32,888.00', '30,888.00', '18,888.00', '21,888.00', '29,888.00', '22,888.00', '30,888.00', '17,888.00', '17,888.00', '16,888.00', '22,888.00', '22,888.00', '34,888.00', '31,888.00', '32,888.00', '30,888.00', '21,888.00', '15,888.00', '21,888.00', '28,888.00', '19,888.00', '18,888.00', '30,995.00', '30,995.00', '30,995.00', '19,888.00', '47,995.00', '21,888.00', '46,995.00', '32,888.00', '29,888.00', '26,888.00', '21,888.00']

Scraping multiple div classes with the same name using Beautiful Soup in Python

I have successfully scraped the image and title of a webpage (URL in code). I want to do this for all movies that are on this page. 'movie_list' contains all of them, but only the first movie's information is scraped.
I tried changing .find to .find_all, but this gives me an error.
url = 'https://5movies.to/movie/'
content = session.get(url, verify=False).content
soup = BeautifulSoup(content, "html.parser")
movie_list = soup.find_all('div', {'class': 'movie-list'})
all_item = []
for allContainers in movie_list:
filmName = allContainers.find('img').get('alt')
filmFoto = allContainers.find('img').get('src')
filmLink = allContainers.find('a').get('href')
print(filmName)
Moving the print statement inside of the for loop made it print out the whole list.
Indeed I replaced the print with all_item.append((filmName, filmFoto, filmLink)) to send it to my frontend.
Thanks to @furas for the information!
# Scrape name, image and link for every movie container on the page.
url = 'https://5movies.to/movie/'
# NOTE(review): verify=False disables TLS certificate verification.
content = session.get(url, verify=False).content
soup = BeautifulSoup(content, "html.parser")
movie_list = soup.find_all('div', {'class': 'movie-list'})
all_item = []
for allContainers in movie_list:
    img = allContainers.find('img')
    filmName = img.get('alt')
    filmFoto = img.get('src')
    filmLink = allContainers.find('a').get('href')
    # Both statements belong inside the loop so they run once per movie;
    # placed after the loop they would only see the last container's values.
    all_item.append((filmName, filmFoto, filmLink))
    print(filmName)

extract a html ID from inside a tag using beautiful soup python

I am trying to extract only an iid code from the HTML so that I can append it to a URL and open the page I need.
I can find the tag I need by specifying the class of the tag. However, I also get 4 other tags in the output. All I want is the iid inside the first tag, "183988596953".
I have tried using this code to select only the iid:
rslt_table = soup.find_all("iid",{"div class": "lvpic pic img left"})
This, however, only seems to return an empty list [].
The output I get when replacing the line of code above with the second-to-last line of the code below is the output with the 4 tags I mentioned:
from bs4 import BeautifulSoup
import requests
import re
urls = ['https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=goldfinger+quad']
#https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=
def find_id(urls):
for url in urls:
session = requests.session()
response = session.get(url)
#soup = BeautifulSoup(response.content, "lxml")
soup = BeautifulSoup(response.content, "html.parser")
rslt_table = soup.find("div", {"class": "lvpic pic img left"})
return(rslt_table)
My search URL is https://www.ebay.co.uk/sch/i.html?_from=R40&_trksid=m570.l1313&_nkw=goldfinger+quad
Full output is
<div class="lvpic pic img left" iid="183988596953">
<div class="lvpicinner full-width picW">
<a class="img imgWr2" href="https://www.ebay.co.uk/itm/GOLDFINGER-1964-Style-A-B-UK-Cinema-High-Quality-Repro-30-x-40-quad-poster/183988596953?hash=item2ad69330d9:g:rYQAAOSwrENdbmEW">
<img alt='GOLDFINGER 1964 Style A & B - UK Cinema High Quality Repro 30"x 40" quad poster' class="img" src="https://i.ebayimg.com/thumbs/images/g/rYQAAOSwrENdbmEW/s-l225.jpg"/>
</a>
</div></div>
Your code updated:
Use attrs to return all the attributes
{'class': ['lvpic', 'pic', 'img', 'left'], 'iid': '183988596953'}
def find_id(urls):
    """Return the ``iid`` attribute of the first listing div on the first URL.

    The function returns during the first loop iteration, so only the first
    entry of ``urls`` is ever fetched; an empty ``urls`` yields ``None``.
    """
    for page_url in urls:
        http = requests.session()
        page = http.get(page_url)
        parsed = BeautifulSoup(page.content, "html.parser")
        first_div = parsed.find("div", {"class": "lvpic pic img left"})
        # .attrs is the tag's attribute dict, e.g. {'class': [...], 'iid': '...'}.
        return first_div.attrs['iid']
iid = find_id(urls)
print(iid)
>>> '183988596953'
If you want all iid:
def find_id(urls):
    """Return the ``iid`` attribute of every matching listing div.

    Args:
        urls: Iterable of search-result page URLs to fetch.

    Returns:
        A list with the ``iid`` value of each
        ``<div class="lvpic pic img left">`` found, across all URLs, in
        page order. Empty if ``urls`` is empty or nothing matches.

    Raises:
        KeyError: If a matched div has no ``iid`` attribute.
    """
    iids = []
    for url in urls:
        session = requests.session()
        response = session.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        # The parsed document is `soup`; the earlier `s.find_all(...)` used an
        # undefined name and raised NameError.
        divs = soup.find_all("div", attrs={'class': 'lvpic pic img left'})
        # Accumulate instead of returning here, so URLs after the first are
        # not silently skipped.
        iids.extend(div.attrs['iid'] for div in divs)
    return iids

Selecting HTML page values using Beautiful Soup

I have been trying to figure this out for a few hours and it is doing my head in. Every method I try fails to return the correct value.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.off---white.com/en/GB/products/omia065s188000160100')
soup = BeautifulSoup(r.content, 'html.parser')
I want to extract the following values from the webpage (https://www.off---white.com/en/GB/products/omia065s188000160100)
Name = LOW 3.0 SNEAKER
Price = £ 415
img_url = https://cdn.off---white.com/images/156365/large_OMIA065S188000160100_4.jpg?1498202305
How would I extract these 3 values using Beautiful Soup?
import requests
from bs4 import BeautifulSoup
# Get prod name
r = requests.get('https://www.off---white.com/en/GB/products/omia065s188000160100')
soup = BeautifulSoup(r.text, 'html.parser')
spans = soup.find_all('span', {'class' : 'prod-title'})
data = [span.get_text() for span in spans]
prod_name = ''.join(data)
# Find prod price
spans = soup.find_all('div', {'class' : 'price'})
data = [span.get_text() for span in spans]
prod_price = ''.join(data)
# Find prod img
# Default to None so the print below cannot hit an unbound name when no
# <img id="image-0"> is present.
prod_img = None
spans = soup.find_all('img', {'id' : 'image-0'})
for meta in spans:
    # If several tags match, the last one wins.
    prod_img = meta.attrs['src']
print(prod_name)
print(prod_price)
print(prod_img)

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22
scroll down till i see the phone icon.
click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data I need, except for the phone number.
I have tried to use
soup.find_all('p',attrs={"align":"center"})
my code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
for storeContact in contacts:
storePhone = soup.find_all('p', attrs={"align":"center"})
storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"})['href']
storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"})['href']
storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"})['href']
print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second
<p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Fetch one store page and pull its contact details out of the markup.
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
# Default every field so the final print cannot hit an unbound name when a
# lookup below fails.
storePhone = storeTwitter = storeFacebook = storeLinkedin = None
try:
    # The phone number is the second <p> inside the telephone-form div.
    storePhone = soup.find('div', attrs={"id":"store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class":"connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class":"connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class":"connect_icon_linkedin"}).get('href')
except (AttributeError, IndexError):
    # AttributeError: an element was not found (find() returned None).
    # IndexError: the telephone div holds fewer than two <p> tags.
    # Missing contact details are tolerated; the affected fields stay None.
    pass
print(storePhone)

Categories

Resources