I am practicing web crawling with Python. My goal is to find the available test dates on the GRE website. Here is what I have so far:
import urllib2
from bs4 import BeautifulSoup

gre_url = 'https://ereg.ets.org/ereg/public/testcenter/availability/seats?testId=30&testName=GRE+General+Test&location=Taipei+City%2C+Taiwan&latitude=25.0329636&longitude=121.56542680000007&testStartDate=April-01-2017&testEndDate=May-31-2017&currentTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL='
data = urllib2.urlopen(gre_url).read()
soup = BeautifulSoup(data, "html.parser")
print soup.select('div.panel-heading.accordion-heading')  # returns []
However, select() returns an empty list instead of the div.panel-heading.accordion-heading elements. How do I fix this?
You need to do it in multiple steps, visiting the intermediate URLs before making the final GET request that checks the availability. Here is something that works for me using requests.Session():
import json
import requests
from bs4 import BeautifulSoup
start_url = "https://www.ets.org/gre/revised_general/register/centers_dates/"
workflow_url = "https://ereg.ets.org/ereg/public/workflowmanager/schlWorkflow?_p=GRI"
seats_url = "https://ereg.ets.org/ereg/public/testcenter/availability/seats"
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}

    # visit the intermediate pages first so the session picks up the necessary cookies
    session.get(start_url)
    session.get(workflow_url)

    response = session.get("https://ereg.ets.org/ereg/public/testcenter/availability/seats?testId=30&testName=GRE+General+Test&location=New+York%2C+NY%2C+United+States&latitude=40.7127837&longitude=-74.00594130000002&testStartDate=March-27-2017&testEndDate=April-30-2017&currentTestCenterCount=0&sourceTestCenterCount=0&adminCode=&rescheduleFlow=false&isWorkflow=true&oldTestId=30&oldTestTime=&oldTestCenterId=&isUserLoggedIn=true&oldTestTitle=&oldTestCenter=&oldTestType=&oldTestDate=&oldTestTimeInfo=&peviewTestSummaryURL=%2Fresch%2Ftestpreview%2Fpreviewtestsummary&rescheduleURL=")
    soup = BeautifulSoup(response.content, "html.parser")

    # the availability data is embedded as JSON in a hidden input field
    result = json.loads(soup.select_one('#findSeatResponse')['value'])
    for date in result['sortedDates']:
        print(date['displayDate'])
Of course, change the last URL to the desired one.
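Inside the with block, rather than hand-editing that long query string, you could also let requests build it from a params dict. A minimal sketch; the parameter names are copied from the URL above, and the values shown are just examples:

params = {
    'testId': '30',
    'testName': 'GRE General Test',
    'location': 'Taipei City, Taiwan',
    'latitude': '25.0329636',
    'longitude': '121.56542680000007',
    'testStartDate': 'April-01-2017',
    'testEndDate': 'May-31-2017',
    'currentTestCenterCount': '0',
    'sourceTestCenterCount': '0',
    'rescheduleFlow': 'false',
    'isWorkflow': 'true',
    'oldTestId': '30',
    'isUserLoggedIn': 'true',
}
# requests URL-encodes the values, so spaces and commas are handled for you
response = session.get(seats_url, params=params)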
How to scrape a multi-page URL using BeautifulSoup
I'm trying to scrape a webpage but I'm stuck; any ideas would be helpful.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
# URL of the web page you want to extract
url = "https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p=1&selectedItem=viewAllAwardedContracts.do&T01_ps=100"
# initialize a session
session = requests.Session()
# set the User-agent as a regular browser
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# get the HTML content
html = session.get(url).content
# parse HTML using beautiful soup
soup = bs(html, "html.parser")
You can use a while loop to go through all the pages:
# import urllib.parse
url = "https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p=1&selectedItem=viewAllAwardedContracts.do&T01_ps=100"
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"

while url:
    html = session.get(url).content
    soup = BeautifulSoup(html, "html.parser")

    ####### EXTRACT AND SAVE THE DATA YOU WANT #######

    # find the link to the next page, if any
    nxtBtn = soup.select_one('button.SearchBtn[title="Next"][href]')
    if nxtBtn:
        url = urllib.parse.urljoin(url, nxtBtn.get('href'))
        print('going to next page', url)
    else:
        url = None
nxtBtn gets the tag for the button that takes the user to the next page. As long as that button can be found, its href is extracted and joined onto the current url, and the loop continues with the next page. Once it can't be found, url is set to None, so the while condition fails and the loop terminates. A sketch of what the extraction step might look like follows.
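For the extraction step itself, here is a minimal sketch, assuming the awarded contracts are rendered as rows of a plain HTML table (the selector and column layout are assumptions, not verified against the site):

# Sketch of the "EXTRACT AND SAVE" step; adjust the selector to the page's markup
rows = []
for tr in soup.select('table tr'):
    cells = [td.get_text(strip=True) for td in tr.select('td')]
    if cells:  # header rows contain <th> rather than <td>, so they come back empty
        rows.append(cells)
print('collected', len(rows), 'rows from this page')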
BeautifulSoup doesn't find any tags on this page. Does anyone know what the problem could be? I can find elements on the page with Selenium, but since I have a list of pages, I don't want to use Selenium.
import requests
from bs4 import BeautifulSoup
url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
print(soup.find_all('h1'))  # returns []
You can get the info on that page by adding headers to your request, mimicking the main request to that URL as shown in the Dev tools Network tab. Here is one way to get all the links from that page:
import requests
from bs4 import BeautifulSoup as bs
headers = {
    'Cookie': 'sso_checked=1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
url = 'https://dzen.ru/news/story/VMoskovskoj_oblasti_zapushhen_chat-bot_ochastichnoj_mobilizacii--b093f9a22a32ed6731e4a4ca50545831?lang=ru&from=reg_portal&fan=1&stid=fOB6O7PV5zeCUlGyzvOO&t=1664886434&persistent_id=233765704&story=90139eae-79df-5de1-9124-0d830e4d59a5&issue_tld=ru'
r = requests.get(url, headers=headers)
soup = bs(r.text, 'html.parser')
links = [a.get('href') for a in soup.select('a')]
print(links)
Result printed in terminal:
['/news', 'https://dzen.ru/news', 'https://dzen.ru/news/region/moscow', 'https://dzen.ru/news/rubric/mobilizatsiya', 'https://dzen.ru/news/rubric/personal_feed', 'https://dzen.ru/news/rubric/politics', 'https://dzen.ru/news/rubric/society', 'https://dzen.ru/news/rubric/business', 'https://dzen.ru/news/rubric/world', 'https://dzen.ru/news/rubric/sport', 'https://dzen.ru/news/rubric/incident', 'https://dzen.ru/news/rubric/culture', 'https://dzen.ru/news/rubric/computers', 'https://dzen.ru/news/rubric/science', 'https://dzen.ru/news/rubric/auto', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://www.mosobl.kp.ru/online/news/4948743/?utm_source=yxnews&utm_medium=desktop', 'https://mosregtoday.ru/soc/v-podmoskove-zapustili-chat-bot-po-voprosam-chastichnoj-mobilizacii/?utm_source=yxnews&utm_medium=desktop', ...]
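With those same headers in place, the h1 lookup from the question should no longer come back empty, since the server now sends the full article markup; a quick check, assuming nothing else about the page changed:

# with the headers set, the original lookup should return the headline
print(soup.find_all('h1'))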
This is the URL where I'm trying to extract the shipping price:
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
My code is:
r = requests.get(url, headers=HEADERS, proxies=proxyDict)
soup = BeautifulSoup(r.content, 'html.parser')
needle="$93.63"
#I also tried complete sentences
#"$93.63 Shipping & Import Fees Deposit to India"
#"$93.63 Shipping & Import Fees Deposit to India"
print(soup.find_all(text=needle))
#I also tried print(soup.find_all(text=re.compile(needle)))
But this always returns an empty list. I can see the required text in inspect element, as well as in the downloaded soup that I printed to the console. However, when I do the same thing with the actual product price ($27.99), soup.find_all() works as expected. So far I haven't been able to figure out the problem. Sorry for any silly mistakes.
Search the field, not the values.
import requests
from bs4 import BeautifulSoup
url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'
}
r = requests.get(url, headers=HEADERS)
soup = BeautifulSoup(r.content, 'html.parser')
value = soup.find("span", {"id": "priceblock_ourprice"}).contents
print(value)
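Note that find() returns None when no element with that id is present (Amazon changes its markup frequently), so it may be safer to guard before touching .contents; a minimal sketch:

# find() returns None when the element is absent, so guard before using it
price_tag = soup.find("span", {"id": "priceblock_ourprice"})
if price_tag:
    print(price_tag.get_text(strip=True))
else:
    print("price element not found; the server may have sent different markup")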
You can also print the prettified HTML that requests actually receives and check whether the shipping price is present in it at all:
from bs4 import BeautifulSoup as bs
import requests

url = "https://www.amazon.com/AmazonBasics-Ultra-Soft-Micromink-Sherpa-Blanket/dp/B0843ZJGNP/ref=sr_1_1_sspa?dchild=1&keywords=amazonbasics&pd_rd_r=5cb1aaf8-d692-4abf-9131-ebd533ad5763&pd_rd_w=8Uw69&pd_rd_wg=kTKEB&pf_rd_p=9349ffb9-3aaa-476f-8532-6a4a5c3da3e7&pf_rd_r=PYFBYA98FS6B8BR7TGJD&qid=1623412994&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzM0xaSFIzVzFTUUpMJmVuY3J5cHRlZElkPUEwNzk3MjgzM1NQRlFQQkc4VFJGWSZlbmNyeXB0ZWRBZElkPUEwNzU1NzM0M0VMQ1hTNDJFTzYxQyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
soup = bs(requests.get(url).content, 'lxml').prettify()
print(soup)
I've recently started looking into purchasing some land, and I'm writing a little app to help me organize details in Jira/Confluence to help me keep track of who I've talked to and what I talked to them about in regards to each parcel of land individually.
So, I wrote this little scraper for landwatch.com:
[url is just a listing on the website]
from bs4 import BeautifulSoup
import requests
def get_property_data(url):
    headers = {'User-Agent':
               'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    response = requests.get(url, headers=headers)  # Maybe request Url with read more already gone
    soup = BeautifulSoup(response.text, 'html5lib')
    title = soup.find_all(class_='b442a')[0].text
    details = soup.find_all('p', class_='d19de')
    price = soup.find_all('div', class_='_260f0')[0].text
    deets = []
    for i in range(len(details)):
        if details[i].text != '':
            deets.append(details[i].text)
    detail = ''
    for i in deets:
        detail += '<p>' + i + '</p>'
    return [title, detail, price]
Everything works great, except that the class d19de has a ton of values hidden behind the Read More button. While Googling away at this, I discovered How to Scrape reviews with read more from Webpages using BeautifulSoup; however, either I don't understand what they're doing well enough to implement it, or it just doesn't work anymore:
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("http://www.mouthshut.com/product-reviews/Lakeside-Chalet-Mumbai-reviews-925017044").text, "html.parser")
for title in soup.select("a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews_]"):
    items = title.get('href')
    if items:
        broth = BeautifulSoup(requests.get(items).text, "html.parser")
        for item in broth.select("div.user-review p.lnhgt"):
            print(item.text)
Any thoughts on how to bypass that Read More button? I'm really hoping to do this in BeautifulSoup, and not selenium.
Here's an example URL for testing: https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403
That data is present within a script tag. Here is an example of extracting that content, parsing it with the json module, and outputting the land description info as a list:
from bs4 import BeautifulSoup
import requests, json
url = 'https://www.landwatch.com/huerfano-county-colorado-recreational-property-for-sale/pid/410454403'
headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html5lib')

# the listing data, including the full description, is embedded as JSON-LD
all_data = json.loads(soup.select_one('[type="application/ld+json"]').string)
details = all_data['description'].split('\r\r')
You may wish to examine what else is in that script tag:
from pprint import pprint
pprint(all_data)
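Other standard schema.org fields are commonly present in such JSON-LD blobs; a hedged sketch using .get() so that missing keys don't raise (which keys this particular listing exposes is an assumption):

# .get() returns None rather than raising when a key is absent
for key in ('@type', 'name', 'url'):
    print(key, '->', all_data.get(key))
# and the description paragraphs recovered above
for paragraph in details:
    print(paragraph)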
For a project on text analytics, I am trying to scrape some reviews. I am using Python and Beautiful Soup to do the job. I am not getting any errors, but I am not getting any data either. I am sure I am making a mistake in specifying the div tags. The following is the code I used:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.zomato.com/brewbot")
soup = BeautifulSoup(r.content, "html.parser")
links = soup.find_all("div")
k_data = soup.find_all({"class": "rev-text"})
for item in k_data:
    print item.text
I have changed "class":"rev-text" to "tabindex":"0" and "class":"rev.text", included "itemprop":"description", and tried other combinations; nothing seems to work. Can someone help?
Reviews are loaded dynamically via a POST request to the social_load_more.php endpoint. Simulate that in your code, get the HTML with the reviews from the JSON response, and parse it with BeautifulSoup. Complete working code:
import requests
from bs4 import BeautifulSoup
with requests.Session() as session:
    session.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"}

    r = session.get("https://www.zomato.com/brewbot")
    soup = BeautifulSoup(r.content, "html.parser")

    # the restaurant's entity id is exposed as an attribute on the body tag
    itemid = soup.body["itemid"]

    # get reviews
    r = session.post("https://www.zomato.com/php/social_load_more.php", data={
        "entity_id": itemid,
        "profile_action": "reviews-top",
        "page": "0",
        "limit": "5"
    })
    reviews = r.json()["html"]

    soup = BeautifulSoup(reviews, "html.parser")
    k_data = soup.select("div.rev-text")
    for item in k_data:
        print(item.get_text())
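To fetch more than the first five reviews, you could page through the same endpoint inside the with block by incrementing the page parameter. A sketch under the assumption that the endpoint keeps returning an "html" payload until the reviews run out (that "page" is a zero-based page index is also an assumption):

    # Sketch: keep requesting pages until no more reviews come back
    page = 0
    while True:
        r = session.post("https://www.zomato.com/php/social_load_more.php", data={
            "entity_id": itemid,
            "profile_action": "reviews-top",
            "page": str(page),
            "limit": "5"
        })
        chunk = BeautifulSoup(r.json()["html"], "html.parser").select("div.rev-text")
        if not chunk:
            break  # assumed: an empty selection means no more reviews
        for item in chunk:
            print(item.get_text())
        page += 1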