Trying to scrape a particular part of the whole HTML

Trying to scrape a particular part of the whole HTML - python

I want to scrape out only the highlighted part of this:
I am using the following code:
import requests
from bs4 import BeautifulSoup
def getdata(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text."""
    return requests.get(url).text


# Download the emoji list page and parse it with the built-in HTML parser.
htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')

# Print the complete text of every <td> cell, one cell after another.
for cell in soup.select('td'):
    print(cell.text)
How do I scrape only the required part of the entire <td> tag?
Thanks in advance.

Try this...
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(r.text, "lxml")

# Collect one description per matching table cell.
emojiDescription = []
for tableRow in soup.find_all("tr", attrs={"class": "ugc_emoji_tr"}):
    for tabledata in tableRow.find_all("td"):
        # Only cells that carry an id attribute are considered.
        if tabledata.has_attr("id"):
            # Keep the last line of the cell's text, trimmed on both sides.
            last_line = tabledata.text.strip().split('\n')[-1]
            # Store the result; without this append the list stays empty.
            emojiDescription.append(last_line.lstrip())
print(emojiDescription)

This works for me to get the highlighted text-
import requests
from bs4 import BeautifulSoup
def getdata(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')
# find_all is the current name of the legacy findAll alias.
for item in soup.find_all('td'):
    # Take the last line of the cell's text and trim it, so the indentation
    # that precedes it inside the cell is not printed.
    print(item.text.strip().split('\n')[-1].strip())

Assuming you can use the name to look up the description, you can combine :contains with chained next_sibling
import requests
from bs4 import BeautifulSoup
def getdata(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata(
    "https://www.emojimeanings.net/list-smileys-people-whatsapp")
soup = BeautifulSoup(htmldata, 'html.parser')
# ":-soup-contains" replaces the deprecated ":contains" pseudo-class
# (needs a Soup Sieve release that ships it, 2.1 or later).
name_tag = soup.select_one('b:-soup-contains("Smiling Face With Open Mouth")')
# Step two siblings past the <b> tag and print that node's stripped text.
print(name_tag.next_sibling.next_sibling.strip())
If not, you can use positional matching with nth-of-type
# Positional lookup: the <b> tag inside the second <tr>, then two siblings on.
second_row_name = soup.select_one('tr:nth-of-type(2) b')
print(second_row_name.next_sibling.next_sibling.strip())

Related

Is there a way to extract only the string of a paragraph when webscraping in python?

We want to know if there is a way to extract only the string of a paragraph when web scraping in python?
The problem lies in our definition of 'price'
Anastacia <3
Code:
import requests
from bs4 import BeautifulSoup
def scraper(url):
    """Print every price element found in an already-fetched page.

    ``url`` is the response object returned by ``requests.get``; its
    ``content`` is parsed and each matching element is printed as-is.
    """
    # Bare attribute reads: the values are looked up and discarded.
    url.status_code
    url.headers
    parsed = BeautifulSoup(url.content, "html.parser")
    for price in parsed.find_all(class_="product-price font-weight-bold mb-0"):
        print(price)
        print()


url = requests.get("https://www.bevco.dk/spiritus/")
scraper(url)

You only have to modify the variable price. Extract the text from between the tags and for esthetic purposes, strip the extracted string.
import requests
from bs4 import BeautifulSoup
def scraper(url):
    """Print the text of every price element in an already-fetched page.

    Args:
        url: The ``requests`` response for the page (despite the name, this
            is the response object, not a URL string).
    """
    soup = BeautifulSoup(url.content, "html.parser")
    for price in soup.find_all(class_="product-price font-weight-bold mb-0"):
        # get_text() drops the surrounding tags; strip() trims the
        # whitespace left around the price.
        print(price.get_text().strip())
        print()


# Keep the response in its own name instead of rebinding the URL string.
response = requests.get("https://www.bevco.dk/spiritus/")
scraper(response)

import requests
from bs4 import BeautifulSoup
def getdata(url):
    """Fetch ``url`` with an HTTP GET and return the response body as text."""
    r = requests.get(url)
    return r.text


htmldata = getdata("https://www.bevco.dk/spiritus")
soup = BeautifulSoup(htmldata, 'html.parser')
# Print the text of each <p> element that carries the price classes.
for data in soup.find_all("p", {'class': 'product-price font-weight-bold mb-0'}):
    print(data.get_text())

how to get all 'href' from page in company profile box

Can anyone tell me where the problem is? I am new to Python and I want to get all the links from this page. Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the listing page. The bare names below are expression statements
# that only echo a value in an interactive session.
re = requests.get('https://www.industrystock.com/en/companies/Agriculture')
re
soup = BeautifulSoup(re.text, 'lxml')
page1 = soup.find_all('a', class_='btn awe-info gotoJS iconColor_white')
page1
# Gather the href of every matched anchor (None when the attribute is absent).
link_list = [anchor.get('href') for anchor in page1]

The links to the company profiles are stored in the data-href attribute:
import requests
from bs4 import BeautifulSoup
# Download the listing page and parse the raw bytes with lxml.
r = requests.get("https://www.industrystock.com/en/companies/Agriculture")
soup = BeautifulSoup(r.content, "lxml")
page1 = soup.find_all("a", class_="btn awe-info gotoJS iconColor_white")
# Print the data-href attribute of every matched anchor, one per line.
for profile_button in page1:
    print(profile_button["data-href"])
Prints:
https://www.industrystock.com/en/company/profile/ARCA-Internet-Services-Ltd./370071
https://www.industrystock.com/en/company/profile/Забайкальская-аграрная-Ассоциация-образовательных-и-научных-учреждений/256182
https://www.industrystock.com/en/company/profile/...VÁŠ-INTERIÉR-s.r.o./534809
https://www.industrystock.com/en/company/profile/1-WITOS-s.r.o./529071
https://www.industrystock.com/en/company/profile/1.-TOUŠEŇSKÁ-s.r.o./544981
https://www.industrystock.com/en/company/profile/1.HEFAISTOS-s.r.o./541263
https://www.industrystock.com/en/company/profile/1.HRADECKÁ-ZEMĚDĚLSKÁ-a.s./548267
https://www.industrystock.com/en/company/profile/1.MAXIMA-INTERNATIONAL-s.r.o./530049
https://www.industrystock.com/en/company/profile/1.MIROSLAVSKÁ-STROJÍRNA-spol.-s-r.o./544781
https://www.industrystock.com/en/company/profile/1.VASTO-spol.-s-r.o./535985
https://www.industrystock.com/en/company/profile/1C-PRO-s.r.o./534831
https://www.industrystock.com/en/company/profile/1CSC-a.s./528169
https://www.industrystock.com/en/company/profile/1P-CONTROL/549995
https://www.industrystock.com/en/company/profile/2-ES-spol.-s-r.o./547849
https://www.industrystock.com/en/company/profile/2-G-SERVIS-spol.-s-r.o./528391
https://www.industrystock.com/en/company/profile/2-JCP-a.s./537151
https://www.industrystock.com/en/company/profile/2-THETA-ASE-s.r.o./545079
https://www.industrystock.com/en/company/profile/2LMAKERS-s.r.o./542127
https://www.industrystock.com/en/company/profile/2M-SERVIS-s.r.o./550923
https://www.industrystock.com/en/company/profile/2M-STATIC-s.r.o./549935
https://www.industrystock.com/en/company/profile/2M-STROJE-s.r.o./539885
https://www.industrystock.com/en/company/profile/2TMOTORS-s.r.o./543869
https://www.industrystock.com/en/company/profile/2VV-s.r.o./538993
https://www.industrystock.com/en/company/profile/2xSERVIS-s.r.o./528321
https://www.industrystock.com/en/company/profile/3-PLUS-1-SERVICE-s.r.o./535103
https://www.industrystock.com/en/company/profile/3-TOOLING-s.r.o./540599
https://www.industrystock.com/en/company/profile/3B-SOCIÁLNÍ-FIRMA-s.r.o./535127
https://www.industrystock.com/en/company/profile/3D-KOVÁRNA-s.r.o./549765
https://www.industrystock.com/en/company/profile/3D-TECH-spol.-s-r.o./548047
https://www.industrystock.com/en/company/profile/3DNC-SYSTEMS-s.r.o./549379

Try this:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.industrystock.com/en/companies/Agriculture')
soup = BeautifulSoup(response.text, 'lxml')
page1 = soup.find_all('a', {"class": 'btn awe-info gotoJS iconColor_white'})

link_list = []
for i in page1:
    # Indexing with i['href'] raises KeyError when the attribute is missing.
    # .get() returns None instead, and data-href is tried first.
    link = i.get('data-href') or i.get('href')
    if link:
        link_list.append(link)
And I would also recommend using html.parser if you are not scraping XML.

How to return only hotel description from booking?

How to print only description from specific hotel from booking.com
Ex:
I use this code, but it returns None!
import requests
from scrapy.selector import Selector
url = 'https://www.booking.com/hotel/eg/mediterranean-azur.html?'
response2 = requests.get(url)
if response2.ok:
    # Selector(text=...) takes a str, so pass the decoded body, not bytes.
    selector = Selector(text=response2.text)
    # XPath addresses attributes with "@"; "#class" is not valid XPath.
    print(selector.xpath("//div[@class='hp__hotel-name']").get())
Any kind of help please?

Try this:
import requests
from bs4 import BeautifulSoup
url = "https://www.booking.com/hotel/eg/mediterranean-azur.uk.html?"
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")
# find() returns None when no element matches, so check before reading .text.
description = soup.find("div", {"id": "property_description_content"})
answ = description.text if description is not None else None
print(answ)

Beautifulsoup findAll returns an empty list

I'm trying to scrape a webpage using beautifulsoup, but findAll() returns an empty list. This is my code:
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
# Fetch the listing page and hand its raw bytes to the html5lib parser.
page = requests.get(URL)
bsObj = BeautifulSoup(page.content, 'html5lib')
# Search for <a> tags whose class attribute is "lazy-loaded " (the trailing
# space is part of the value being matched).
anchor_filter = {'class': "lazy-loaded "}
recordList = bsObj.findAll('a', attrs=anchor_filter)
print(recordList)
What am I doing wrong?

You need to find the img tags with the class lazy-loaded:
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
# Name the parser explicitly so the result does not depend on which HTML
# parsers happen to be installed.
bsObj = BeautifulSoup(r.content, 'html.parser')
# Collect data-src from every lazy-loaded image, skipping images that lack
# the attribute instead of raising KeyError.
recordList = [
    img['data-src']
    for img in bsObj.find_all('img', class_="lazy-loaded")
    if img.has_attr('data-src')
]
print(recordList)
Output:
['https://media.elcinema.com/blank_photos/75x75.jpg', 'https://media.elcinema.com/uploads/_75x75_2fe90cb32f2759181f71eb2a9b29f0735f87ac88150a6a8fd3734300f8714369.jpg', 'https://media.elcinema.com/uploads/_75x75_3d90d1ee22c5f455bc4556073eab69cd218446d6134dc0f2694782ee39ccb5bf.jpg', 'https://media.elcinema.com/uploads/_75x75_81f30061ed82645e9ee688642275d76a23ee329344c5ac25c42f22afa35432ff.jpg', 'https://media.elcinema.com/blank_photos/75x75.jpg',.......]

According to the question, it looks like you need to find all the a tags that contain an img tag with the class lazy-loaded. Use the code below to get them:
Code:
import requests
from bs4 import BeautifulSoup
URL = "https://elcinema.com/en/index/work/country/eg?page=1"
r = requests.get(URL)
bsObj = BeautifulSoup(r.content, 'html.parser')
# Keep only the anchors that contain an <img> with the lazy-loaded class.
# find_all is the current name of the legacy findAll alias.
outputdata = [
    record
    for record in bsObj.find_all('a')
    if record.find("img", {"class": "lazy-loaded"})
]
print(len(outputdata))
print(outputdata)
Output:
Let me know if you have any questions :)

How can I scrape Songs Title from this request that I have collected using python

import requests
from bs4 import BeautifulSoup
r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
# First <div> whose class is "s_c" (None when the page has no such element).
result = soup.find("div", {"class": "s_c"})
# "result.class" is a syntax error because class is a keyword, so print
# the element itself.
print(result)
From the above code, I am able to scrape this data
https://www.pastiebin.com/5f08080b8db82
Now I would like to scrape only the title of the songs and then make a list out of them like the below:
Meri Aashiqui
Genda Phool
Any suggestions are much appreciated!

Try this :
import requests
from bs4 import BeautifulSoup
r = requests.get("https://gaana.com/playlist/gaana-dj-hindi-top-50-1")
soup = BeautifulSoup(r.text, "html.parser")
result = soup.find("div", {"class": "s_c"})

name_list = []
# find() returns None when nothing matches, so guard both lookups rather
# than failing with AttributeError.
if result is not None:
    for x in result.find_all('div', class_='track_npqitemdetail'):
        span = x.find('span')
        if span is not None:
            name_list.append(span.text)
print(name_list)
This code collects all the song names in the name_list list.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Trying to scrape a particular part of the whole HTML - python

Related

Is there a way to extract only the string of a paragraph when webscraping in python?

how to get all 'href' from page in company profile box

How to return only hotel description from booking?

Beautifulsoup findAll returns an empty list

How can I scrape Songs Title from this request that I have collected using python

Categories

Resources