Scrape multi page URL using Beautifulsoup - python

How can I scrape a multi-page URL using BeautifulSoup?
I'm trying to scrape a paginated web page but I'm stuck; any ideas would be helpful.
import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract
# Fetch a single page of the listing and parse it; no pagination is handled here.
# URL of the web page you want to extract
# NOTE(review): "d-3998960-p=1" looks like the page index and "T01_ps=100" like the page size — confirm against the site
url = "https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p=1&selectedItem=viewAllAwardedContracts.do&T01_ps=100"
# initialize a session
session = requests.Session()
# set the User-agent as a regular browser
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# get the HTML content
html = session.get(url).content
# parse HTML using beautiful soup (`bs` is the alias given to BeautifulSoup in the imports)
soup = bs(html, "html.parser")

You can use a while loop to go through all the pages
# Walk through every page of the listing by following the "Next" button
# until a page no longer has one.
import urllib.parse

import requests
from bs4 import BeautifulSoup

url = "https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p=1&selectedItem=viewAllAwardedContracts.do&T01_ps=100"
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
while url:
    html = session.get(url).content
    soup = BeautifulSoup(html, "html.parser")
    ####### EXTRACT AND SAVE THE DATA YOU WANT #######
    # The selector only matches a "Next" button that actually carries an href.
    nxtBtn = soup.select_one('button.SearchBtn[title="Next"][href]')
    if nxtBtn:
        # Resolve the href against the current page URL in case it is relative.
        url = urllib.parse.urljoin(url, nxtBtn.get('href'))
        print('going to next page', url)
    else:
        # No next button: clearing url makes the while condition false.
        url = None
nxtBtn gets the tag for the button that takes the user to the next page; as long as that can be found, the href is extracted from the tag to update url and continue to the next iteration - once it can't be found, url is set to None so that the while condition is no longer upheld and the loop terminates.

Related

BeautifulSoup doesn’t find tags

BeautifulSoup doesn’t find any tag on this page. Does anyone know what the problem can be?
I can find elements on the page with selenium, but since I have a list of pages, I don’t want to use selenium.
import requests
from bs4 import BeautifulSoup
# Story page being scraped
url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
# Plain GET: no custom headers or cookies are sent
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
# Collect every <h1> tag; the returned list is neither stored nor printed here
soup.find_all('h1')
You can get the info on that page by adding headers to your requests, mimicking what you can see in Dev tools - Network tab main request to that url. Here is one way to get all links from that page:
import requests
from bs4 import BeautifulSoup as bs

# Request the story page with an explicit cookie and user agent, then print
# the href attribute of every anchor found in the response.
url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
headers = dict([
    ('Cookie', 'sso_checked=1'),
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'),
])
response = requests.get(url, headers=headers)
soup = bs(response.text, 'html.parser')
anchors = soup.select('a')
links = [anchor.get('href') for anchor in anchors]
print(links)
Result printed in terminal:
['/news', 'https://dzen.ru/news', 'https://dzen.ru/news/region/moscow', 'https://dzen.ru/news/rubric/mobilizatsiya', 'https://dzen.ru/news/rubric/personal_feed', 'https://dzen.ru/news/rubric/politics', 'https://dzen.ru/news/rubric/society', 'https://dzen.ru/news/rubric/business', 'https://dzen.ru/news/rubric/world', 'https://dzen.ru/news/rubric/sport', 'https://dzen.ru/news/rubric/incident', 'https://dzen.ru/news/rubric/culture', 'https://dzen.ru/news/rubric/computers', 'https://dzen.ru/news/rubric/science', 'https://dzen.ru/news/rubric/auto', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://mosregtoday.ru/soc/v-podmoskove-zapustili-chat-bot-po-voprosam-chastichnoj-mobilizacii/?utm_source=yxnews&utm_medium=desktop', ...]

Why is beautifulSoup unable to find text passed in text parameter?

This is the URL where I'm trying to extract the shipping price:
# Product page from which the shipping price should be extracted
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
My code is:
# HEADERS and proxyDict are not defined in this snippet; presumably the asker set them up elsewhere.
r = requests.get(url,headers=HEADERS,proxies=proxyDict)
soup = BeautifulSoup(r.content,'html.parser')
# Exact text being searched for in the parsed page
needle="$93.63"
#I also tried complete sentences
#"$93.63 Shipping & Import Fees Deposit to India"
#"$93.63 Shipping & Import Fees Deposit to India"
# NOTE(review): a plain string passed as text= appears to be compared against whole text nodes, not substrings — confirm in the bs4 docs
print(soup.find_all(text=needle))
#I also tried print(soup.find_all(text=re.compile(needle)))
# NOTE(review): in a regular expression an unescaped "$" is the end-of-string anchor, so the pattern would need re.escape(needle)
But this always returns an empty list.
I can see the required text in inspect element as well as downloaded soup that I printed on the console.
However when I do the same thing with the actual product price($27.99), soup.find_all() works as expected.
So far I haven't been able to figure out the problem here. Sorry for any silly mistakes.
Search the field, not the values.
import requests
from bs4 import BeautifulSoup

# Look the price up through the id of the element that holds it instead of
# searching the page for a literal price string.
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}
r = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(r.content, 'html.parser')
price_tag = soup.find("span", {"id": "priceblock_ourprice"})
value = None
if price_tag is None:
    # find() returns None when nothing matches; say so instead of failing
    # with AttributeError on `.contents`.
    print("priceblock_ourprice not found in the response")
else:
    value = price_tag.contents
    print(value)
import requests
from bs4 import BeautifulSoup as bs

# Download the page without custom headers and print the lxml-parsed markup
# in pretty-printed form.
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
page = requests.get(url)
parsed = bs(page.content, 'lxml')
soup = parsed.prettify()
print(soup)

Python 3 BeautifulSoup Scraping Content After "Read More" Text

I've recently started looking into purchasing some land, and I'm writing a little app to help me organize details in Jira/Confluence to help me keep track of who I've talked to and what I talked to them about in regards to each parcel of land individually.
So, I wrote this little scraper for landwatch(dot)com:
[url is just a listing on the website]
from bs4 import BeautifulSoup
import requests
def get_property_data(url):
    """Scrape the title, description paragraphs and price of a listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list ``[title, detail, price]``. ``title`` is the text of the first
        element with class ``b442a``, ``price`` the text of the first
        ``div`` with class ``_260f0``, and ``detail`` every non-empty
        ``p.d19de`` paragraph wrapped in ``<p>...</p>`` and concatenated.

    Raises:
        IndexError: If the page has no title or no price element.
    """
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    response = requests.get(url, headers=headers)  # Maybe request Url with read more already gone
    soup = BeautifulSoup(response.text, 'html5lib')
    title = soup.find_all(class_='b442a')[0].text
    details = soup.find_all('p', class_='d19de')
    price = soup.find_all('div', class_='_260f0')[0].text
    # Empty paragraphs are dropped; the rest are joined in document order.
    detail = ''.join(
        '<p>' + paragraph.text + '</p>'
        for paragraph in details
        if paragraph.text != ''
    )
    return [title, detail, price]
Everything works great except that the class d19de has a ton of values hidden behind the Read More button.
While Googling away at this, I discovered How to Scrape reviews with read more from Webpages using BeautifulSoup, however I either don't understand what they're doing well enough to implement it, or this just doesn't work anymore:
import requests
from bs4 import BeautifulSoup

# Follow each review link found on the listing page and print the paragraphs
# of the review page it leads to.
soup = BeautifulSoup(requests.get("http://www.mouthshut.com/product-reviews/Lakeside-Chalet-Mumbai-reviews-925017044").text, "html.parser")
for title in soup.select("a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews_]"):
    items = title.get('href')
    if items:
        # Only anchors that carry an href are fetched.
        broth = BeautifulSoup(requests.get(items).text, "html.parser")
        for item in broth.select("div.user-review p.lnhgt"):
            print(item.text)
Any thoughts on how to bypass that Read More button? I'm really hoping to do this in BeautifulSoup, and not selenium.
Here's an example URL for testing: https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403
That data is present within a script tag. Here is an example of extracting that content, parsing with json, and outputting land description info as a list:
import json

import requests
from bs4 import BeautifulSoup

# Read the listing description from the JSON-LD script tag embedded in the
# page and split it into a list.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}
url = 'https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html5lib')
ld_json_tag = soup.select_one('[type="application/ld+json"]')
all_data = json.loads(ld_json_tag.string)
# The description is split on pairs of carriage returns.
description = all_data['description']
details = description.split('\r\r')
You may wish to examine what else is in that script tag:
from pprint import pprint
# Pretty-print the whole parsed JSON object; all_data must already hold the result of json.loads(...)
pprint(all_data)

printing number from html tag in python

Hi I have been trying to get the time data from this website: https://clockofeidolon.com (hours, minutes, seconds) and tried to use beautifulsoup to print contents of 'span class="big' tags since the time information is kept there and I have come up with this:
from bs4 import BeautifulSoup
from requests import Session
# One session carrying a browser-like user agent
session = Session()
session.headers['user-agent'] = (
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
'66.0.3359.181 Safari/537.36'
)
url = 'https://clockofeidolon.com'
response = session.get(url=url)
data = response.text
soup = BeautifulSoup(data, "html.parser")
# NOTE(review): find_all() treats a string argument as a tag name, so this markup fragment matches no tag and the list stays empty; a tag name plus a class filter (e.g. find_all('span', class_=...)) is the usual form
spans = soup.find_all('<span class="big')
print([span.text for span in spans])
But the output only shows "[]" and nothing else. How would I go about printing the number in each of the 3 tags?
As mentioned, this can be achieved with Selenium. Once you have the correct geckodriver installed, the following should get you on the right track:
from bs4 import BeautifulSoup
from selenium import webdriver

# Load the page through a Selenium-driven Firefox, then print the text of
# every element with class "big-hour" found in the resulting page source.
driver = webdriver.Firefox()
try:
    driver.get('https://clockofeidolon.com')
    html = driver.page_source
finally:
    # Always close the browser, even when loading the page fails.
    driver.quit()
soup = BeautifulSoup(html, 'lxml')
spans = soup.find_all(class_='big-hour')
for span in spans:
    print(span.text)

Python crawler can't find element

I am practicing writing a crawler in Python.
My goal is to find the available test dates on the GRE website.
Here is what I have so far.
# Python 2 code: it uses the urllib2 module and the print statement.
import urllib2
from bs4 import BeautifulSoup
from urllib2 import urlopen, Request
# Seat-availability search URL with all query parameters inlined
gre_url = 'https://ereg.ets.org/ereg/public/testcenter/availability/seats?testId=30&testName=GRE+General+Test&location=Taipei+City%2C+Taiwan&latitude=25.0329636&longitude=121.56542680000007&testStartDate=April-01-2017&testEndDate=May-31-2017&currentTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL='
# A single direct request to the availability URL, with no prior requests
data = urllib2.urlopen(gre_url).read()
soup = BeautifulSoup(data, "html.parser")
print soup.select('div.panel-heading.accordion-heading') # return []
However, it seems that it can't extract the element div.panel-heading.accordion-heading from data.
How do I fix it?
You need to do it in multiple steps visiting subsequent URLs before making the final get requests to check the availability. Here is something that works for me using requests.Session():
import json

import requests
from bs4 import BeautifulSoup

start_url = "https://www.ets.org/gre/revised_general/register/centers_dates/"
workflow_url = "https://ereg.ets.org/ereg/public/workflowmanager/schlWorkflow?_p=GRI"
seats_url = "https://ereg.ets.org/ereg/public/testcenter/availability/seats"
# Query string of the availability search; change it to the desired location and dates.
seats_query = "testId=30&testName=GRE+General+Test&location=New+York%2C+NY%2C+United+States&latitude=40.7127837&longitude=-74.00594130000002&testStartDate=March-27-2017&testEndDate=April-30-2017&currentTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL="

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    # Visit the preceding pages in the same session before asking for availability.
    session.get(start_url)
    session.get(workflow_url)
    response = session.get(seats_url + "?" + seats_query)
    soup = BeautifulSoup(response.content, "html.parser")
    # The element with id "findSeatResponse" carries JSON in its "value" attribute.
    result = json.loads(soup.select_one('#findSeatResponse')['value'])
    for date in result['sortedDates']:
        print(date['displayDate'])
Of course, change the last URL to the desired one.

Categories

Resources