I am trying to extract text from websites using BeautifulSoup, but I am willing to explore other options. Currently I am trying to use something like this:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Fetch the RFP-notices page and collect every text node in the document.
boston_url = 'https://www.mass.gov/service-details/request-for-proposal-rfp-notices'
hdr = {'User-Agent': 'Mozilla/5.0'}
# A browser-like User-Agent keeps the server from rejecting the request.
response = urlopen(Request(boston_url, headers=hdr))
markup = response.read().decode('utf-8')
pageText = BeautifulSoup(markup, "html.parser")
# find_all(text=True) returns every NavigableString in the tree.
body = pageText.find_all(text=True)
The goal is to figure out how to extract the text in the red box. You can see the output I get from the CMD photo below. It is very messy and I'm not sure how to find body paragraphs of text from that. I could loop over the output and look for certain words, but I need to do this for multiple sites and I won't know what's in the body paragraph.
It's probably simpler than you make it. Let's try to simplify it:
import requests
from bs4 import BeautifulSoup as bs

# Single GET with a minimal User-Agent, then parse with lxml.
boston_url = 'https://www.mass.gov/service-details/request-for-proposal-rfp-notices'
hdr = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(boston_url, headers=hdr)
soup = bs(page.text, 'lxml')
# The first <p> inside the rich-text container is the intro paragraph.
soup.select('main main div.ma__rich-text>p')[0].text
Output:
'PERAC has not reviewed the RFP notices or other related materials posted on this page for compliance with M.G.L. Chapter 32, section 23B. The publication of these notices should not be interpreted as an indication that PERAC has made a determination as to that compliance.'
You can use the bs.find('p', text=re.compile('PERAC')) to extract that paragraph:
from bs4 import BeautifulSoup
import requests
import re

# Full desktop User-Agent so the site is less likely to block the request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/83.0.4103.61 Safari/537.36'
}
boston_url = (
    'https://www.mass.gov/service-details/request-for-proposal-rfp-notices'
)
resp = requests.get(boston_url, headers=headers)
# Name the parser explicitly: omitting it emits GuessedAtParserWarning and
# lets bs4 pick whichever parser is installed, which can yield different
# trees on different machines.
bs = BeautifulSoup(resp.text, 'html.parser')
# Find the <p> whose text contains "PERAC" (the disclaimer paragraph).
bs.find('p', text=re.compile('PERAC'))
Related
BeautifulSoup doesn’t find any tag on this page. Does anyone know what the problem can be?
I can find elements on the page with selenium, but since I have a list of pages, I don’t want to use selenium.
import requests
from bs4 import BeautifulSoup

url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
# Plain GET without headers — this is the request that comes back empty.
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')
# Look for every <h1> on the page.
soup.find_all('h1')
You can get the info on that page by adding headers to your requests, mimicking what you can see in Dev tools - Network tab main request to that url. Here is one way to get all links from that page:
import requests
from bs4 import BeautifulSoup as bs

# Mimic the browser's main document request (copied from Dev tools - Network).
headers = {
    'Cookie': 'sso_checked=1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
response = requests.get(url, headers=headers)
soup = bs(response.text, 'html.parser')
# Gather the href of every anchor on the page.
links = [anchor.get('href') for anchor in soup.select('a')]
print(links)
Result printed in terminal:
['/news', 'https://dzen.ru/news', 'https://dzen.ru/news/region/moscow', 'https://dzen.ru/news/rubric/mobilizatsiya', 'https://dzen.ru/news/rubric/personal_feed', 'https://dzen.ru/news/rubric/politics', 'https://dzen.ru/news/rubric/society', 'https://dzen.ru/news/rubric/business', 'https://dzen.ru/news/rubric/world', 'https://dzen.ru/news/rubric/sport', 'https://dzen.ru/news/rubric/incident', 'https://dzen.ru/news/rubric/culture', 'https://dzen.ru/news/rubric/computers', 'https://dzen.ru/news/rubric/science', 'https://dzen.ru/news/rubric/auto', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://mosregtoday.ru/soc/v-podmoskove-zapustili-chat-bot-po-voprosam-chastichnoj-mobilizacii/?utm_source=yxnews&utm_medium=desktop', ...]
So I'm trying to extract some data from a website by web scraping using Python, but some of the div tags are not expanding to show the data that I want.
This is my code.
import requests
from bs4 import BeautifulSoup as soup

# Fetch the UQ program-requirements page and dump the parsed HTML.
uq_url = "https://my.uq.edu.au/programs-courses/requirements/program/2451/2021"
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
}  # NOTE: the original snippet was missing this closing brace -> SyntaxError
web_r = requests.get(uq_url, headers=headers)
web_soup = soup(web_r.text, 'html.parser')
print(web_soup.prettify())
This is what the code will scrape but it won't extract any of the data in the div with id="app". It's supposed to have a lot of data in there like the second picture. Any help would be appreciated.
All of that content is present within a script tag, as shown in your image. You can regex out the appropriate JavaScript object, then handle the unquoted keys with hjson in order to convert it to a usable (JSON-like) structure. Then extract whatever you want:
import requests, re, hjson
from bs4 import BeautifulSoup as bs  # there is some data as embedded html you may wish to parse later from json

# The page embeds its data as a JS object assigned to window.AppData;
# capture that object literal and let hjson cope with the unquoted keys.
r = requests.get('https://my.uq.edu.au/programs-courses/requirements/program/2451/2021', headers = {'User-Agent':'Mozilla/5.0'})
data = hjson.loads(re.search(r'window\.AppData = ([\s\S]+?);\n' , r.text).group(1))
# hjson.dumpsJSON(data['programRequirements'])
core_courses = data['programRequirements']['payload']['components'][1]['payload']['body'][0]['body']
# Print only the entries that actually carry a curriculum reference.
refs = (course['curriculumReference'] for course in core_courses
        if 'curriculumReference' in course)
for ref in refs:
    print(ref)
This is the URL where I'm trying to extract the shipping price:
# Amazon product URL, including the search-result tracking parameters.
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
My code is:
# HEADERS, proxyDict and url are defined earlier in the script (not shown).
resp = requests.get(url, headers=HEADERS, proxies=proxyDict)
soup = BeautifulSoup(resp.content, 'html.parser')
# Exact string visible both in Inspect Element and in the printed soup.
needle = "$93.63"
# Complete sentences were tried as well, e.g.
# "$93.63 Shipping & Import Fees Deposit to India"
# "$93.63 Shipping & Import Fees Deposit to India"
print(soup.find_all(text=needle))
# soup.find_all(text=re.compile(needle)) was also tried — same empty result.
But this always returns an empty list.
I can see the required text in inspect element as well as downloaded soup that I printed on the console.
However when I do the same thing with the actual product price($27.99), soup.find_all() works as expected.
So far I haven't been able to figure out the problem here. Sorry for any silly mistakes.
Search the field, not the values.
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
HEADERS = ({'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
            'Accept-Language': 'en-US, en;q=0.5'})
r = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(r.content, 'html.parser')
# Guard against the element being absent (e.g. Amazon serving a captcha
# page, or a layout change): .find() returns None in that case, and calling
# .contents on None would raise AttributeError.
price_span = soup.find("span", {"id": "priceblock_ourprice"})
value = price_span.contents if price_span is not None else None
print(value)
from bs4 import BeautifulSoup as bs
import requests

url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
# Fetch, parse with lxml, and keep the pretty-printed document text.
html = requests.get(url).content
soup = bs(html, 'lxml').prettify()
print(soup)
I've recently started looking into purchasing some land, and I'm writing a little app to help me organize details in Jira/Confluence to help me keep track of who I've talked to and what I talked to them about in regards to each parcel of land individually.
So, I wrote this little scraper for landwatch(dot)com:
[url is just a listing on the website]
from bs4 import BeautifulSoup
import requests
def get_property_data(url):
    """Scrape a landwatch.com listing page.

    Returns [title, detail, price], where detail is the non-empty
    description paragraphs re-wrapped in <p>...</p> HTML (for Confluence).
    NOTE: the CSS class names (b442a, d19de, _260f0) are generated by the
    site's build and may change without notice.
    """
    headers = ({'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
    response = requests.get(url, headers=headers)  # Maybe request Url with read more already gone
    soup = BeautifulSoup(response.text, 'html5lib')
    title = soup.find_all(class_='b442a')[0].text
    details = soup.find_all('p', class_='d19de')
    price = soup.find_all('div', class_='_260f0')[0].text
    # Iterate tags directly (no range(len(...)) index bookkeeping) and build
    # the HTML with join instead of quadratic string concatenation.
    deets = [tag.text for tag in details if tag.text != '']
    detail = ''.join('<p>' + text + '</p>' for text in deets)
    return [title, detail, price]
Everything works great except that the class d19de has a ton of values hidden behind the Read More button.
While Googling away at this, I discovered How to Scrape reviews with read more from Webpages using BeautifulSoup, however I either don't understand what they're doing well enough to implement it, or this just doesn't work anymore:
import requests ; from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("http://www.mouthshut.com/product-reviews/Lakeside-Chalet-Mumbai-reviews-925017044").text, "html.parser")
# Each review title links to its own page, where the full (un-truncated)
# review text lives — follow every link and print the review paragraphs.
for title in soup.select("a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews_]"):
    href = title.get('href')
    if not href:
        continue
    review_page = BeautifulSoup(requests.get(href).text, "html.parser")
    for paragraph in review_page.select("div.user-review p.lnhgt"):
        print(paragraph.text)
Any thoughts on how to bypass that Read More button? I'm really hoping to do this in BeautifulSoup, and not selenium.
Here's an example URL for testing: https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403
That data is present within a script tag. Here is an example of extracting that content, parsing with json, and outputting land description info as a list:
from bs4 import BeautifulSoup
import requests, json

url = 'https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403'
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
response = requests.get(url, headers=headers)  # Maybe request Url with read more already gone
soup = BeautifulSoup(response.text, 'html5lib')
# The listing data ships as JSON-LD inside a <script type="application/ld+json"> tag.
ld_script = soup.select_one('[type="application/ld+json"]')
all_data = json.loads(ld_script.string)
# Description paragraphs are separated by double carriage returns.
details = all_data['description'].split('\r\r')
You may wish to examine what else is in that script tag:
# Dump the full JSON-LD payload to see what else is available.
import pprint
pprint.pprint(all_data)
I was practicing the crawler by using Python.
My target is to find the test date on GRE website.
Here is what I've done now.
import urllib2
from bs4 import BeautifulSoup
from urllib2 import urlopen, Request
gre_url = 'https://ereg.ets.org/ereg/public/testcenter/availability/seats?testId=30&testName=GRE+General+Test&location=Taipei+City%2C+Taiwan&latitude=25.0329636&longitude=121.56542680000007&testStartDate=April-01-2017&testEndDate=May-31-2017¤tTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL='
data = urllib2.urlopen(gre_url).read()
soup = BeautifulSoup(data, "html.parser")
print soup.select('div.panel-heading.accordion-heading') # return []
However, it seems that it can't extract the element div.panel-heading.accordion-heading from data.
How do I fix it?
You need to do it in multiple steps visiting subsequent URLs before making the final get requests to check the availability. Here is something that works for me using requests.Session():
import json
import requests
from bs4 import BeautifulSoup

start_url = "https://www.ets.org/gre/revised_general/register/centers_dates/"
workflow_url = "https://ereg.ets.org/ereg/public/workflowmanager/schlWorkflow?_p=GRI"
seats_url = "https://ereg.ets.org/ereg/public/testcenter/availability/seats"

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Hit the two setup pages first so the session accumulates the cookies
    # the availability endpoint requires.
    session.get(start_url)
    session.get(workflow_url)
    response = session.get("https://ereg.ets.org/ereg/public/testcenter/availability/seats?testId=30&testName=GRE+General+Test&location=New+York%2C+NY%2C+United+States&latitude=40.7127837&longitude=-74.00594130000002&testStartDate=March-27-2017&testEndDate=April-30-2017¤tTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL=")
    soup = BeautifulSoup(response.content, "html.parser")
    # The availability data comes back as JSON embedded in a hidden input.
    result = json.loads(soup.select_one('#findSeatResponse')['value'])
    for slot in result['sortedDates']:
        print(slot['displayDate'])
Of course, change the last URL to the desired one.