Incorporating pagination scraping into my script - python

url = "https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313&_nkw=electronics"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
names = soup.find_all("div", class_="s-item__title")
prices = soup.find_all("span", class_="s-item__price")
shippings = soup.find_all("span", class_="s-item__shipping s-item__logisticsCost"
for name,price,shipping in zip(names,prices,shippings):
print(name.text, price.text, shipping.text)
Right now, this script works perfectly: it prints everything it needs to print.
But... I want to be able to go to the next page and scrape everything from there as well.
The class for the next-page link is "pagination__next icon-link".
I'm not sure how to go about it.

Just iterate over the pages using the pagination URL query parameter:
base_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=electronics&_pgn='
for i in range(1, pages_count + 1):
    url = base_url + f'{i}'
    response = requests.get(url)
    # your parsing code...
Because of how the site renders its result pages, for correct parsing per category I advise checking the pagination object on each request: read the last page number and substitute it into the request.
Take the last available page number from the current page:
ol = soup.find("ol", class_="pagination__items")
lis = ol.find_all("li")
print(f"Last available page number on the current page: {lis[-1].text}")

To collect the information from all pages, you can use a while loop that dynamically paginates through them.
The while loop runs until a stop condition is met; in our case the condition is the absence of a next page, which is detected with the ".pagination__next" CSS selector.
There is also a URL parameter responsible for pagination, _pgn, which is incremented by 1 to select the next page:
if soup.select_one(".pagination__next"):
    params['_pgn'] += 1
else:
    break
Full code:
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
}

params = {
    "_nkw": "electronics",  # search query
    "_pgn": 1               # page number
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title": title,
            "price": price,
            "link": link
        })

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Nintendo DSi XL Japan Import Console & USB Charger - Pick Your Color TESTED",
"price": "$69.99",
"link": "https://www.ebay.com/itm/165773301243?hash=item2698dbd5fb:g:HFcAAOSwTdNhnqy~&amdata=enc%3AAQAHAAAA4MXRmWPDY6vBlTlYLy%2BEQPsi1HJM%2BFzt2TWJ%2BjCbK6Q2mreLV7ZpKmZOvU%2FMGqxY2oQZ91aPaHW%2FS%2BRCUW3zUKWDIDoN2ITF3ooZptkWCkd8x%2FIOIaR7t2rSYDHwQEFUD7N6wdnY%2Bh6SpljeSkCPkoKi%2FDCpU0YLOO3mpuLVjgO8GQYKhrlXG59BDDw8IyaayjRVdWyjh534fuIRToSqFrki97dJMVXE0LNE%2BtPmJN96WbYIlqmo4%2B278nkNigJHI8djvwHMmqYUBQhQLN2ScD%2FLnApPlMJXirqegMet0DZQ%7Ctkp%3ABk9SR7K0tsSSYQ"
},
{
"title": "Anbernic RG351P White, Samsung 64 GB SD Card AmberElec & Case",
"price": "$89.99",
"link": "https://www.ebay.com/itm/144690079314?hash=item21b0336652:g:8qwAAOSw93ZjO6n~&amdata=enc%3AAQAHAAAAoNGQWvtymUdp2cEYaKyfTAzWm0oZvBODZsm2oeHl3s%2F6jF9k3nAIpsQkpiZBFI657Cg53X9zAgExAxQAfmev0Bgh7%2FjEtC5FU8O5%2FfoQ3tp8XYtyKdoRy%2FwdebmsGKD%2FIKvW1lWzCNN%2FpSAUDLrPgPN9%2Fs8igeU7jqAT4NFn3FU7W4%2BoFV%2B2gNOj8nhxYlm3HZ6vm21T4P3IAA4KXJZhW2E%3D%7Ctkp%3ABk9SR7K0tsSSYQ"
},
{
"title": "New ListingWhite wii console ONLY Tested Working",
"price": "$24.99",
"link": "https://www.ebay.com/itm/385243730852?hash=item59b250d3a4:g:t3YAAOSwZBBjctqi&amdata=enc%3AAQAHAAAAoH9I%2BSQlJpKebgObGE7Idppe2cewzEiV0SdZ6pEu0sVpIJK5%2F3q15ygTFAdPRElY232LwDKIMXjkIwag1FUN76geBg2vCnPfd3x8BAHzXn%2B1u5zF9cBITLCuawKTYnfUeCYMavO4cBmpnsrvUOSokvnTacfB078MF95%2FH1sUQH%2BfIjDtPzFoFTJrTtKLINRlXZ9edD%2BVW%2FB2TLYZ%2FHMAHkE%3D%7Ctkp%3ABk9SR7K0tsSSYQ"
},
# ...
]
As an alternative, you can use Ebay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages:
from serpapi import EbaySearch
from urllib.parse import (parse_qsl, urlsplit)
import os, json

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": "electronics",            # search query
}

search = EbaySearch(params)  # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "link": link
        })

    page_num += 1
    print(page_num)

    next_page_query_dict = dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query))
    current_page = results["serpapi_pagination"]["current"]  # 1,2,3...

    # looks for the next page data (_pgn):
    if "next" in results.get("pagination", {}):
        # if current_page = 20 and next_page_query_dict["_pgn"] = 20: break
        if int(current_page) == int(next_page_query_dict["_pgn"]):
            break

        # update next page data
        search.params_dict.update(next_page_query_dict)
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$169.00",
"extracted": 169.0
},
"link": "https://www.ebay.com/itm/113356737439?hash=item1a64968b9f:g:4qoAAOSwQypdKgT6&amdata=enc%3AAQAHAAAA4N8GJRRCbG8WIU7%2BzjrvsRMMmKaTEnA0l7Nz9nOWUUSin3gZ5Ho41Fc4A2%2FFLtlLzbb5UuTtU5s3Qo7Ky%2FWB%2FTEuDKBhFldxMZUzVoixZXII6T1CTtgG5YFJWs0Zj8QldjdM9PwBFuiLNJbsRzG38k7v1rJdg4QGzVUOauPxH0kiANtefqiBhnYHWZ0RfMqwh4S%2BbQ59JYQWSZjAefL61WYyNwkfSdrfcq%2BW2B7b%2BR8QEfynka5CE6g7YPpoWWp4Bk3IOvd4CZxAzTpgvOPoMMKPy0VCW1gPJDG4R2CsfDEv%7Ctkp%3ABk9SR56IpsWSYQ"
},
{
"price": {
"raw": "$239.00",
"extracted": 239.0
},
"link": "https://www.ebay.com/itm/115600879000?hash=item1aea596d98:g:F3YAAOSwsXxjbuYn&amdata=enc%3AAQAHAAAA4LuAhrdA4ahkT85Gf15%2FtEH9GBe%2B0qlDZfEt4p9O0YPmJZVPyq%2Fkuz%2FV86SF3%2B7SYY%2BlK04XQtCyS3NGyNi03GurFWx2dYwoKFUj2G7YsLw%2BalUKmdiv5bC3jJaRTnXuBOJGPXQxw2IwTHcvZ%2Fu8T7tEnYF5ih3HGMg69vCVZdVHqRa%2FYehvk14wVwj3OwBTVrNM8dq7keGeoLKUdYDHCMAH6Y4je4mTR6PX4pWFS6S7lJ8Zrk5YhyHQInwWYXwkclgaWadC4%2BLwOzUjcKepXl5mDnxUXe6pPcccYL3u8g4O%7Ctkp%3ABk9SR56IpsWSYQ"
},
# ...
]

Related

Get Web data with images for HTML table

I am trying to extract the article body, with its images, from this link so that I can build an HTML table from the extracted article body. I have tried using BeautifulSoup:
import re
import requests
from bs4 import BeautifulSoup

t_link = 'https://www.cnbc.com/2022/01/03/5-ways-to-reset-your-retirement-savings-and-save-more-money-in-2022.html'
page = requests.get(t_link)
soup_page = BeautifulSoup(page.content, 'html.parser')
html_article = soup_page.find_all("div", {"class": re.compile('ArticleBody-articleBody.?')})
for article_body in html_article:
    print(article_body)
But unfortunately article_body doesn't contain any images, because <div class="InlineImage-wrapper"> isn't picked up this way.
So, how can I get the article data together with the article images, so that I can build an HTML table?
I didn't quite understand your goal, so this is probably not the answer you want.
In the HTML source of that page, everything is inside a script tag at the bottom, which holds the content of the page in JSON format.
If you simply use grep and jq (a great JSON cli utility), you can run
curl -kL "https://www.cnbc.com/2022/01/03/5-ways-to-reset-your-retirement-savings-and-save-more-money-in-2022.html" | \
grep -Po '"body":.+"body".' | \
grep -Po '{"content":\[.+"body".' | \
jq '[.content[]|select(.tagName|contains("image"))]'
to get all the info about the images:
[
  {
    "tagName": "image",
    "attributes": {
      "id": "106967852",
      "type": "image",
      "creatorOverwrite": "PM Images",
      "headline": "Retirement Savings",
      "url": "https://image.cnbcfm.com/api/v1/image/106967852-1635524865061-GettyImages-1072593728.jpg?v=1635525026",
      "datePublished": "2021-10-29T16:30:26+0000",
      "copyrightHolder": "PM Images",
      "width": "2233",
      "height": "1343"
    },
    "data": {
      "__typename": "image"
    },
    "children": [],
    "__typename": "bodyContent"
  },
  {
    "tagName": "image",
    "attributes": {
      "id": "106323101",
      "type": "image",
      "creatorOverwrite": "JGI/Jamie Grill",
      "headline": "GP: 401k money jar on desk of businesswoman",
      "url": "https://image.cnbcfm.com/api/v1/image/106323101-1578344280328gettyimages-672157227.jpeg?v=1641216437",
      "datePublished": "2020-01-06T20:58:19+0000",
      "copyrightHolder": "JGI/Jamie Grill",
      "width": "5120",
      "height": "3418"
    },
    "data": {
      "__typename": "image"
    },
    "children": [],
    "__typename": "bodyContent"
  }
]
If you need only the URLs, run
curl -kL "https://www.cnbc.com/2022/01/03/5-ways-to-reset-your-retirement-savings-and-save-more-money-in-2022.html" | \
grep -Po '"body":.+"body".' | \
grep -Po '{"content":\[.+"body".' | \
jq -r '[.content[]|select(.tagName|contains("image"))]|.[].attributes.url'
to get
https://image.cnbcfm.com/api/v1/image/106967852-1635524865061-GettyImages-1072593728.jpg?v=1635525026
https://image.cnbcfm.com/api/v1/image/106323101-1578344280328gettyimages-672157227.jpeg?v=1641216437
Everything you want is in the source HTML, but you need to jump through a couple of hoops to get that data.
I'm providing the following:
article body
two (2) images that go with the article body, plus a URL to the header video
Here's how:
import json
import re
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
}

with requests.Session() as s:
    s.headers.update(headers)
    url = "https://www.cnbc.com/2022/01/03/5-ways-to-reset-your-retirement-savings-and-save-more-money-in-2022.html"

    script = [
        s.text for s in
        BeautifulSoup(s.get(url).text, "lxml").find_all("script")
        if "window.__s_data" in s.text
    ][0]
    payload = json.loads(
        re.match(r"window\.__s_data=(.*);\swindow\.__c_data=", script).group(1)
    )

    article_data = (
        payload
        ["page"]
        ["page"]
        ["layout"][3]
        ["columns"][0]
        ["modules"][2]
        ["data"]
    )

    print(article_data["articleBodyText"])

    for item in article_data["body"]["content"]:
        if "url" in item["attributes"].keys():
            print(item["attributes"]["url"])
This should print:
The entire article body (Redacted for brevity)
The new year offers opportunities for many Americans in their careers and financial lives. The "Great Reshuffle" is expected to continue as employees leave jobs and take new ones at a rapid clip. At the same time, many workers have made a vow to save more this year, yet many admit they don't know how they'll stick to that goal. One piece of advice: Keep it simple.
[...]
The above-mentioned URLs to the assets:
https://www.cnbc.com/video/2022/01/03/how-to-choose-the-best-retirement-strategy-for-2022.html
https://image.cnbcfm.com/api/v1/image/106967852-1635524865061-GettyImages-1072593728.jpg?v=1635525026
https://image.cnbcfm.com/api/v1/image/106323101-1578344280328gettyimages-672157227.jpeg?v=1641216437
EDIT:
If you want to download the images, use this:
import json
import os
import re
from pathlib import Path
from shutil import copyfileobj
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:104.0) Gecko/20100101 Firefox/104.0",
}

url = "https://www.cnbc.com/2022/01/03/5-ways-to-reset-your-retirement-savings-and-save-more-money-in-2022.html"


def download_images(image_source: str, directory: str) -> None:
    """Download images from a given source and save them to a given directory."""
    os.makedirs(directory, exist_ok=True)
    save_dir = Path(directory)
    if re.match(r".*\.jp[e-g]", image_source):
        file_name = save_dir / image_source.split("/")[-1].split("?")[0]
        with s.get(image_source, stream=True) as img, open(file_name, "wb") as output:
            copyfileobj(img.raw, output)


with requests.Session() as s:
    s.headers.update(headers)

    script = [
        s.text for s in
        BeautifulSoup(s.get(url).text, "lxml").find_all("script")
        if "window.__s_data" in s.text
    ][0]
    payload = json.loads(
        re.match(r"window\.__s_data=(.*);\swindow\.__c_data=", script).group(1)
    )

    article_data = (
        payload
        ["page"]
        ["page"]
        ["layout"][3]
        ["columns"][0]
        ["modules"][2]
        ["data"]
    )

    print(article_data["articleBodyText"])

    for item in article_data["body"]["content"]:
        if "url" in item["attributes"].keys():
            url = item["attributes"]["url"]
            print(url)
            download_images(url, "images")

How to scrape data from sciencedirect

I want to scrape all the data from ScienceDirect for a given keyword.
I know that ScienceDirect is built with Ajax, so the data can't be extracted directly from the URL of the search-results page.
Looking at the page I want to scrape, I found the JSON data among the numerous requests in the Network tab, so in my view I should be able to get the JSON via the URL of that request. But I get garbled output and an error message. Here is my code:
import requests as res
import json
from bs4 import BeautifulSoup

keyword = "digital game"
url = 'https://www.sciencedirect.com/search/api?'
payload = {
    'tak': keyword,
    't': 'ZNS1ixW4GGlMjTKbRHccgZ2dHuMVHqLqNBwYzIZayNb8FZvZFnVnLBYUCU%2FfHTxZMgwoaQmcp%2Foemth5%2FnqtM%2BGQW3NGOv%2FI0ng6yDADzynQO66j9EPEGT0aClusSwPFvKdDbfVcomCzYflUlyb3MA%3D%3D',
    'hostname': 'www.sciencedirect.com'
}
r = res.get(url, params=payload)
print(r.content)  # get garbled
r = r.json()
print(r)  # get error msg
The first print gives garbled output (not the JSON data I expect), and the .json() call raises an error.
Try setting HTTP headers on the request, such as User-Agent, to mimic a standard web browser. The query will then return search results in JSON format.
import requests

keyword = "digital game"
url = 'https://www.sciencedirect.com/search/api?'
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'application/json'
}
payload = {
    'tak': keyword,
    't': 'ZNS1ixW4GGlMjTKbRHccgZ2dHuMVHqLqNBwYzIZayNb8FZvZFnVnLBYUCU%2FfHTxZMgwoaQmcp%2Foemth5%2FnqtM%2BGQW3NGOv%2FI0ng6yDADzynQO66j9EPEGT0aClusSwPFvKdDbfVcomCzYflUlyb3MA%3D%3D',
    'hostname': 'www.sciencedirect.com'
}
r = requests.get(url, headers=headers, params=payload)

# need to check if the response output is JSON
if "json" in r.headers.get("Content-Type"):
    data = r.json()
else:
    print(r.status_code)
    data = r.text

print(data)
Output:
{'searchResults': [{'abstTypes': ['author', 'author-highlights'], 'authors': [{'order': 1, 'name': 'Juliana Tay'},
..., 'resultsCount': 961}}
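To work with that dict afterwards, here is a small hedged sketch (the field names and nesting are inferred from the excerpt above and may differ; "title" in particular is an assumption):

# iterate the returned results and print a compact summary;
# key names are inferred from the output excerpt and may need adjusting
print("total results:", data.get("resultsCount"))
for result in data.get("searchResults", []):
    authors = [a["name"] for a in result.get("authors", [])]
    print(result.get("title"), "-", ", ".join(authors))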
I had the same problem. The point is that sciencedirect.com uses Cloudflare, which blocks access for scraping bots. I tried different approaches like cloudscraper, cfscrape, etc. Unsuccessful! Then I made a small parser based on Selenium, which lets me take metadata from publications and put it into my own JSON file with the following schema:
schema = {
    "doi_number": {
        "metadata": {
            "pub_type": "Review article" | "Research article" | "Short communication" | "Conference abstract" | "Case report",
            "open_access": True | False,
            "title": "title_name",
            "journal": "journal_name",
            "date": "publishing_date",
            "volume": str,
            "issue": str,
            "pages": str,
            "authors": [
                "author1",
                "author2",
                "author3"
            ]
        }
    }
}
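A minimal sketch of that Selenium approach might look like the following (assumptions: Chrome with Selenium 4, and illustrative CSS selectors that will likely need adjusting to ScienceDirect's current markup):

# hedged sketch: the selectors below are illustrative assumptions,
# not verified against the current ScienceDirect markup
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.implicitly_wait(10)  # give the JS-rendered results time to appear
driver.get("https://www.sciencedirect.com/search?qs=digital%20game")

records = []
for result in driver.find_elements(By.CSS_SELECTOR, ".result-item-content"):
    link = result.find_element(By.CSS_SELECTOR, "h2 a")
    records.append({
        "title": link.text,
        "url": link.get_attribute("href"),
    })

driver.quit()
print(records)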
If you have any questions or ideas, feel free to contact me.

Not all containers loading using beautiful soup

I am trying to dump a website (the link is given below in the code) and not all containers are loading. In my case, the price container is not being dumped: the content inside the class "I6yQz" is missing. How do I solve this?
My code:
import requests
from bs4 import BeautifulSoup

url = "https://gomechanic.in/gurgaon/car-battery-replacement/maruti-suzuki-versa/petrol"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
I need the output to look something like this:
data = {'CityName': 'Gurgaon', 'CarName': 'Versa-Petrol', 'serviceName': 'Excide (55 Months Warranty)', 'Price': '4299', 'ServicesOffered': ['Free pickup & drop', 'Free Installation', 'Old Battery Price Included', 'Available at Doorstep']}
I have also found the API that has all the information: https://gomechanic.app/api/v2/oauth/customer/get-services-details-by-category?car_id=249&city_id=1&category_id=-4&user_car_id=null (visible under the name 'get-services-details-by-category' in the browser's network inspector). The only problem is that I have to pass carId and cityId instead of carName and cityName, and I don't know which carId maps to which carName.
As the comment pointed out, this website dynamically loads some objects, such as prices, via JavaScript.
When you connect to the page, you can see a request being made in the background.
What you have to do is figure out how to replicate this request in your Python code:
import requests
headers = {
# this website uses authorization for all requests
'Authorization': 'Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOiJiNGJjM2NhZjVkMWVhOTlkYzk2YjQzM2NjYzQzMDI0ZTAyM2I0MGM2YjQ5ZjExN2JjMDk5OGY2MWU3ZDI1ZjM2MTU1YWU5ZDIxNjE2ZTc5NSIsInNjb3BlcyI6W10sInN1YiI6IjE2MzM5MzQwNjY5NCIsImV4cCI6MTYzNjUyNjA2Ny4wLCJhdWQiOiIzIiwibmJmIjoxNjMzOTM0MDY3LjAsImlhdCI6MTYzMzkzNDA2Ny4wfQ.QQI_iFpNgONAIp4bfoUbGDtnnYiiViEVsPQEK3ouYLjeyhMkEKyRclazuJ9i-ExQyqukFuqiAn4dw7drGUhRykJY6U67iSnbni0aXzzF9ZTEZrvMmqItHXjrdrxzYCqoKJAf2CYY-4hkO-NXIrTHZEnk-N_jhv30LHuK9A5I1qK8pajt4XIkC7grAn3gaMe3c6rX6Ko-AMZ801TVdACD4qIHb4o73a3vodEMvh4wjIcxRGUBGq4HBgAKxKLCcWaNz-z7XjvYrWhNJNB_iRjZ1YBN97Xk4CWxC0B4sSgA2dVsBWaKGW4ck8wvrHQyFRfFpPHux-6sCMqCC-e4okOhku3AasqPKwvUuJK4oov9tav4YsjfFevKkdsCZ1KmTehtvadoUXAHQcij0UqgMtzNPO-wKYoXwLc8yZGi_mfamAIX0izFOlFiuL26X8XUMP5HkuypUqDa3MLg91f-8oTMWfUjVYYsnjw7lwxKSl7KRKWWhuHwL6iDUjfB23qjEuq2h9JBVkoG71XpA9SrJbunWARYpQ48mc0LlYCXCbGkYIh9pOZba7JGMh7E15YyRla8qhU9pEkgWVYjzgYJaNkhrSNBaIdY56i_qlnTBpC00sqOnHRNVpYMb4gF3PPKalUMMJjbSqzEE2BNTFO5dGxGcz2cKP0smoVi_SK3XcKgPXc',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) QtWebEngine/5.15.2 Chrome/87.0.4280.144 Safari/537.36',
}
url = 'https://gomechanic.in/api/v1/priceList?city=gurgaon&brand=maruti-suzuki&service=car-battery-replacement'
response = requests.get(url, headers=headers)
print(response.json())
Which will result in:
{
  "success": true,
  "data": [
    {
      "id": 1,
      "name": "800 Petrol",
      "price": 3400,
      "savings": "25%"
    },
    {
      "id": 2,
      "name": "800 CNG",
      "price": 3400,
      "savings": "25%"
    },
    {
      "id": 3,
      "name": "Alto Petrol",
      "price": 3400,
      "savings": "25%"
    },
    {
      "id": 4,
      "name": "Alto CNG",
      "price": 3400,
      "savings": "25%"
    },
    {
      "id": 5,
      "name": "Alto 800 Petrol",
      "price": 3400,
      "savings": "25%"
    },
    {
      "id": 6,
      "name": "Alto 800 CNG",
      "price": 3400,
      "savings": "25%"
    }
  ]
}
This whole process is called reverse engineering and for a more in-depth introduction you can see my tutorial blog here: https://scrapecrow.com/reverse-engineering-intro.html
As for the parameters used in these backend API requests, they are most likely in the initial HTML document's initial-state JSON object. If you view the page source and Ctrl+F a parameter name like city_id, you can see it hidden deep in some JSON. You can either extract that whole JSON and parse it, or use a regular expression like re.findall('"city_id":(\d+)', html)[0] to grab just that one value.
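For example, a rough sketch of the regex approach (the exact key names inside GoMechanic's initial-state JSON, such as "city_id" and "car_id", are assumptions and may differ):

import re
import requests

# fetch the normal HTML page and pull the backend IDs out of the embedded
# initial-state JSON; the key names used here are assumptions
html = requests.get(
    "https://gomechanic.in/gurgaon/car-battery-replacement/maruti-suzuki-versa/petrol",
    headers={"User-Agent": "Mozilla/5.0"},
).text

city_id = re.findall(r'"city_id":\s*(\d+)', html)[0]
car_id = re.findall(r'"car_id":\s*(\d+)', html)[0]
print(city_id, car_id)  # these can then be substituted into the API request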

Calling back-end API of CNBC in python

As a followup to this question, how can I locate the XHR request which is used to retrieve the data from the back-end API on CNBC News in order to be able to scrape this CNBC search query?
The end goal is to have a doc with: headline, date, full article and url.
I have found this: https://api.sail-personalize.com/v1/personalize/initialize?pageviews=1&isMobile=0&query=coronavirus&qsearchterm=coronavirus
which tells me I don't have access. Is there a way to access the information anyway?
Actually, my previous answer already addressed your question about locating the XHR request. The endpoint to replicate is https://api.queryly.com/cnbc/json.aspx, queried as follows:
import requests

params = {
    "queryly_key": "31a35d40a9a64ab3",
    "query": "coronavirus",
    "endindex": "0",
    "batchsize": "100",
    "callback": "",
    "showfaceted": "true",
    "timezoneoffset": "-120",
    "facetedfields": "formats",
    "facetedkey": "formats|",
    "facetedvalue": "!Press Release|",
    "needtoptickers": "1",
    "additionalindexes": "4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"
}

goal = ["cn:title", "_pubDate", "cn:liveURL", "description"]


def main(url):
    with requests.Session() as req:
        for page, item in enumerate(range(0, 1100, 100)):
            print(f"Extracting Page# {page + 1}")
            params["endindex"] = item
            r = req.get(url, params=params).json()
            for loop in r['results']:
                print([loop[x] for x in goal])


main("https://api.queryly.com/cnbc/json.aspx")
Pandas DataFrame version:
import requests
import pandas as pd

params = {
    "queryly_key": "31a35d40a9a64ab3",
    "query": "coronavirus",
    "endindex": "0",
    "batchsize": "100",
    "callback": "",
    "showfaceted": "true",
    "timezoneoffset": "-120",
    "facetedfields": "formats",
    "facetedkey": "formats|",
    "facetedvalue": "!Press Release|",
    "needtoptickers": "1",
    "additionalindexes": "4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28"
}

goal = ["cn:title", "_pubDate", "cn:liveURL", "description"]


def main(url):
    with requests.Session() as req:
        allin = []
        for page, item in enumerate(range(0, 1100, 100)):
            print(f"Extracting Page# {page + 1}")
            params["endindex"] = item
            r = req.get(url, params=params).json()
            for loop in r['results']:
                allin.append([loop[x] for x in goal])
        new = pd.DataFrame(
            allin, columns=["Title", "Date", "Url", "Description"])
        new.to_csv("data.csv", index=False)


main("https://api.queryly.com/cnbc/json.aspx")
Output: the results are saved to data.csv.

Extracting JSON from HTML using BeautifulSoup python

While I was practicing some web scraping on a webpage (cookies required as parameters), I ran into problems scraping out the JSON data embedded in the HTML. The following is what I did:
import requests
from bs4 import BeautifulSoup as soup
import json
my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
cookies = {
"Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
"Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
"x5sec":"7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d", }
ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text) # True
page_soup = soup(ret.text, 'html.parser')
data = page_soup.findAll('script', {'type':'application/ld+json'})
The output is as follows:
[
  <script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "BreadcrumbList",
    "itemListElement": [
      {
        "item": {
          "name": "Home",
          "@id": "https://www.lazada.sg/"
        },
        "@type": "ListItem",
        "position": "1"
      }
    ]
  }</script>,
  <script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "ItemList",
    "itemListElement": [
      {
        "offers": {
          "priceCurrency": "SGD",
          "@type": "Offer",
          "price": "71.00",
          "availability": "https://schema.org/InStock"
        },
        "image": "https://sg-test-11.slatic.net/p/670a73a9613c36b2bb01555ab4092ba2.jpg",
        "@type": "Product",
        "name": "Switch: Super Mario Party [Available in Stock! Immediate Shipping]",
        "url": "https://www.lazada.sg/products/switch-super-mario-party-available-in-stock-immediate-shipping-i278269540-s429667097.html?search=1"
      },
      ...
I tried to follow an existing thread, Extract json from html in python beautifulsoup, but found myself stuck, probably due to the different JSON formatting in the HTML soup. The part I scrape out contains all the different products on that page; is there a way to further scrape out each product's details (e.g. title, price, rating, etc.) and count the number of products present? Thanks!
You can load the JSON with json.loads and loop over it to parse out what you need. All the product info for those containers is listed in one script tag, so you can just grab that.
import requests
from bs4 import BeautifulSoup as soup
import json
import pandas as pd
my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
cookies = {
"Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
"Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
"x5sec":"7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d", }
ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text) # True
page_soup = soup(ret.text, 'lxml')
data = page_soup.select("[type='application/ld+json']")[1]
oJson = json.loads(data.text)["itemListElement"]
numProducts = len(oJson)
results = []
for product in oJson:
    results.append([product['name'], product['offers']['price'], product['offers']['availability'].replace('https://schema.org/', '')])  # etc......
df = pd.DataFrame(results)
print(df)
