Unable to scrape status of product - Python

I want to scrape the price and availability status of a product page. I can scrape the price, but not the status, and I couldn't find it in the page's JSON either.
Here is the link: https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true
from requests import get
from bs4 import BeautifulSoup

url = 'https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true'
resp = get(url)
soup = BeautifulSoup(resp.text, 'lxml')
# print(soup.prettify())
price = soup.find('div', class_='product-price')
status = soup.find('div', class_='avl-status buy-box__shipping-item')
print(status.text)

You can use the JSON-LD microdata embedded inside the page to obtain availability (price, images, description, ...).
For example:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
data = json.loads(soup.select_one('script[type="application/ld+json"]').contents[0])
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
print('Price : ', data['offers']['price'])
print('Availability: ', data['offers']['availability'])
Prints:
Price : 17.13
Availability: http://schema.org/InStock
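If you just need a boolean flag, the schema.org availability URL can be reduced to a simple check (a small sketch based on the value printed above):
in_stock = data['offers']['availability'].endswith('InStock')
print('In stock:', in_stock)  # True, since availability is http://schema.org/InStock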
EDIT: You can inspect all of the product data that is embedded within the page:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.zoro.com/baldwin-filters-filter-service-kit-thermo-king-bk6092/i/G1609513/"
# url = 'https://www.zoro.com/jonard-tools-diagonal-cutting-plier-8-l-jic-2488/i/G2736212/?recommended=true'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
data = json.loads(soup.select_one('div.hidden[data-state]')['data-state'])
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
_, product_data = data['product']['productDetailsData'].popitem()
print(json.dumps(product_data, indent=4))
print()
print('isExpeditable = ', product_data['isExpeditable'])
When the key isExpeditable is set to False, it seems to mean drop shipping (I think). When I tested it with a product that is in stock, it prints True.
The output:
{
    "packageQty": 1,
    "isMotorCompliant": false,
    "zoroNo": "G1609513",
    "brand": "Baldwin Filters",
    "salesStatus": "TP",
    "orderChannel": "Default",
    "description": "Filter Service Kit, For Vehicle Type - Filter Kits Thermo King, Includes Lube Spin-On, Fuel, Water Separator Element, Fuel Spin-On",
    "restrictedStates": [],
    "title": "Filter Service Kit",
    "categoryPaths": [
        [
            {
                "name": "Automotive Filters",
                "slug": "automotive-filters",
                "code": "7540"
            },
            {
                "name": "Filter Service Kits",
                "slug": "filter-service-kits",
                "code": "10660"
            }
        ]
    ],
    "restrictedSaleItemCode": "",
    "slug": "baldwin-filters-filter-service-kit-thermo-king-bk6092",
    "energyGuideLabelFileName": "",
    "variants": null,
    "isForcedOutOfStock": false,
    "lightingFactLabelFileName": "",
    "isExpeditable": false,
    "erpId": "2770121",
    "californiaProp65Message": null,
    "isHazmat": false,
    "leadTime": 8,
    "mfrNo": "BK6092",
    "attributes": [
        {
            "name": "For Vehicle Type - Filter Kits",
            "value": "Thermo King"
        },
        {
            "name": "Item",
            "value": "Filter Service Kit"
        },
        {
            "name": "For Use With",
            "value": "Thermo King"
        },
        {
            "name": "Includes",
            "value": "Lube Spin-On, Fuel, Water Separator Element, Fuel Spin-On"
        },
        {
            "name": "Country of Origin (subject to change)",
            "value": "United States"
        }
    ],
    "originalPrice": null,
    "isCircleECompliant": false,
    "lowLeadComplianceLevel": "",
    "priceUnit": "EA",
    "isDropShipDirect": false,
    "minRetailQty": 1,
    "price": 118.29,
    "media": [
        {
            "name": "Z1qr7ymcpEx_.JPG",
            "type": "image/jpeg"
        }
    ]
}
isExpeditable = False
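Building on that observation, here is a small sketch that turns the flags into a readable status (a heuristic only, assuming the keys behave as described above; all keys come from the data-state payload shown):
def stock_status(product_data):
    # isForcedOutOfStock marks items explicitly out of stock;
    # isExpeditable appears to distinguish stocked items from drop-shipped ones
    if product_data['isForcedOutOfStock']:
        return 'Out of stock'
    if product_data['isExpeditable']:
        return 'In stock'
    return 'Ships from supplier, lead time {} days'.format(product_data['leadTime'])

print(stock_status(product_data))  # 'Ships from supplier, lead time 8 days' for this product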


How can I scrape the "Time" and other data in the advanced details section using Beautiful Soup

Here is the URL that I want to scrape the data from: 'https://www.blockchain.com/explorer/transactions/btc/43eebdc59c6c5ce948ccd9cf514b6c2ece9f1289f136c2e3d9d69dcd29304142'
I tried to scrape it using Beautiful Soup because it doesn't have to open a browser like Selenium does. So I tried to extract the data from the outer section
('section', {'class': 'sc-f9148dd7-2 irWxzm'})
[screenshot: the irWxzm section]
and then tried to dig a little deeper to the targeted div tag, but I don't understand why, after extracting the data from ('section', {'class': 'sc-f9148dd7-2 irWxzm'}), the data seems to stop before the advanced details section, and I can't dive deeper to the desired div tag shown in the first picture.
Here is the code that I wrote:
import requests
import bs4
from bs4 import BeautifulSoup
url = 'https://www.blockchain.com/explorer/transactions/btc/43eebdc59c6c5ce948ccd9cf514b6c2ece9f1289f136c2e3d9d69dcd29304142'
res = requests.get(url)
format = '%Y-%m-%d %H:%M'
soup = BeautifulSoup(res.content, 'html.parser')
soup2 = soup.find('section', {'class': 'sc-f9148dd7-2 irWxzm'})
print(soup2)
I tried a lot, but I can't find any tag under 'sc-f9148dd7-2 irWxzm' except class="sc-c907597a-0 MqlNG" and class="sc-c907597a-3 ctQMfW", as shown in the second picture.
Could you help me find a way to get the data in the advanced details section, please?
[screenshot: desired data]
Thank you very much in advance.
The page loads the data from an external URL via JavaScript. To load the data you can use the next example:
from datetime import datetime
import requests
api_url = "https://www.blockchain.com/explorer/api/transaction?asset=btc&id=43eebdc59c6c5ce948ccd9cf514b6c2ece9f1289f136c2e3d9d69dcd29304142"
data = requests.get(api_url).json()
print(data)
Prints:
{
    "ticker": "btc",
    "transaction": {
        "txid": "43eebdc59c6c5ce948ccd9cf514b6c2ece9f1289f136c2e3d9d69dcd29304142",
        "size": 381,
        "version": 1,
        "locktime": 0,
        "fee": 170170,
        "inputs": [
            {
                "coinbase": False,
                "txid": "667f2825db6b03e349b5e4be7b4c4c5be266c242a6aaa0218480572ffc5a7b37",
                "output": 0,
                "sigscript": "47304402204ba063dca925f759777ed8818027c421cb4052ecf2e3b980c814bc528c73638e02206a3d58ec92d0be9915c14d6c4cef40a01d301286c90d82c1bcf166db0e94c3bb012103951bbeb5b73e530b6849fca68e470118f4b379ad9126015caf1355dc2a9e8480",
                "sequence": 4294967295,
                "pkscript": "76a9149c8ab044348d826b9ae88d698d575a45a6e8fc6988ac",
                "value": 207730,
                "address": "1FGiZB7K757EUixGcyeyME6Jp8qQZEiUUk",
                "witness": [],
            },
            {
                "coinbase": False,
                "txid": "3c2dc36fd0bebc46062362aff0c4f307d1c99900c5f358fdd37b436a15d37a5f",
                "output": 0,
                "sigscript": "4730440220322e489e971b2c651224c2e03bea408df8c67a0a1c18ddfd20e940d90a8e61990220707ba2431bde31500ebe6a2b3c4a7974b87c4b9ee33849e1453c0831318bed14012103951bbeb5b73e530b6849fca68e470118f4b379ad9126015caf1355dc2a9e8480",
                "sequence": 4294967295,
                "pkscript": "76a9149c8ab044348d826b9ae88d698d575a45a6e8fc6988ac",
                "value": 231716,
                "address": "1FGiZB7K757EUixGcyeyME6Jp8qQZEiUUk",
                "witness": [],
            },
        ],
        "outputs": [
            {
                "address": "1FGiZB7K757EUixGcyeyME6Jp8qQZEiUUk",
                "pkscript": "76a9149c8ab044348d826b9ae88d698d575a45a6e8fc6988ac",
                "value": 269276,
                "spent": True,
                "spender": {
                    "txid": "c7ed715e9f73b2792957af94d3143750525a29f6a62fd6f68d470e56e4bbef7b",
                    "input": 0,
                },
            },
            {
                "address": None,
                "pkscript": "6a208627c703aeac41df8acad1c643d9ee9c2370f9cace1af05a0ac41219116b5e0b",
                "value": 0,
                "spent": False,
                "spender": None,
            },
        ],
        "block": {"height": 756449, "position": 2},
        "deleted": False,
        "time": 1664582470,
        "rbf": False,
        "weight": 1524,
    },
    "rate": 16526.38,
    "latestBlock": 769853,
    "id": "43eebdc59c6c5ce948ccd9cf514b6c2ece9f1289f136c2e3d9d69dcd29304142",
    "description": False,
    "fiat": "USD",
    "labels": {},
}
To get the time:
print(datetime.fromtimestamp(data["transaction"]["time"]))
Prints:
2022-10-01 02:01:10
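The same dictionary also carries the other fields from the advanced details section visible in the output above, so they can be read directly:
tx = data["transaction"]
print("Size  :", tx["size"])    # 381
print("Fee   :", tx["fee"])     # 170170 (in satoshis)
print("Weight:", tx["weight"])  # 1524
print("RBF   :", tx["rbf"])     # False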

Python - Print url and name of page only

I have the following code:
import requests
from bs4 import BeautifulSoup

res = requests.get("http://www.ucdenver.edu/pages/ucdwelcomepage.aspx")
soup = BeautifulSoup(res.content, 'html5lib')
scripts = soup.select('script', {"type": "application/ld+json"})
scripts = [script for script in scripts]  # for each script in the scripts found
print(scripts)
for script in scripts:
    script.get(res)
    print(script)
and from this code I got the following result. I want to get into the department array to capture two elements (there are multiple departments in "department"):
{
    "@context": "https://schema.org/",
    "@type": "Organization",
    "url": "https://www.ucdenver.edu",
    "logo": "https://www.ucdenver.edu/images/default-source/global-theme-images/cu_logo.png",
    "name": "University of Colorado Denver",
    "alternateName": "CU Denver",
    "telephone": "1+ 303-315-5969",
    "address": {
        "@type": "PostalAddress",
        "streetAddress": "1201 Larimer Street",
        "addressLocality": "Denver",
        "addressRegion": "CO",
        "postalCode": "80204",
        "addressCountry": "US"
    },
    "department": [{
        "name": "Center for Undergraduate Exploration and Advising",
        "email": "mailto:CUEA@ucdenver.edu",
        "telephone": "1+ 303-315-1940",
        "url": "https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising",
        "address": [{
            "@type": "PostalAddress",
            "streetAddress": "1201 Larimer Street #1113",
            "addressLocality": "Denver",
            "addressRegion": "CO",
            "postalCode": "80204",
            "addressCountry": "US"
        }]
    },
From the object I only want to capture "name" and "url".
This is my first time playing with web scraping, and I'm not too sure how to get into "department": [{ to then capture the two elements I want.
Once you get back the JSON output you've shown as a Python dict and stored it in a variable called data, for example, you can do:
result = []
for department in data["department"]:
    result.append({"name": department["name"], "url": department["url"]})
print(result)  # [{"name": "Center for Undergraduate Exploration and Advising", "url": "https://www.ucdenver.edu/center-for-undergraduate-exploration-and-advising"}, {"name": "another name", "url": "another url"}, ...]
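If you prefer, the same loop can be written as a list comprehension (equivalent, just more compact):
result = [{"name": d["name"], "url": d["url"]} for d in data["department"]]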
This worked for me:
from bs4 import BeautifulSoup
import requests
import json

res = requests.get("http://www.ucdenver.edu/pages/ucdwelcomepage.aspx")
soup = BeautifulSoup(res.content, 'html5lib')
scripts = soup.find_all(attrs={"type": "application/ld+json"})
for s in scripts:
    content = s.contents[0]  # get the text of the script node
    j = json.loads(content)  # parse it as JSON into a Python data structure
    for dept in j["department"]:
        print(">>>", dept["name"], dept["url"])
You first extract the text of the script node. Then convert that text using the json package to a Python data structure. Then you can iterate through the data using a for-loop.

How to scrape information from a website that needs scrolling down every time

I have a website where I need to scroll down every time to show the information.
My code is below, but I cannot get anything:
# -*- coding: UTF-8 -*-
import csv
from parsel import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd

records = []
driver = webdriver.Chrome('/Users/stevenqi/Downloads/chromedriver')
driver.get('https://www.yaskawa.eu.com/services/robotics-support')
contacts = driver.find_elements_by_xpath('//*[@class="icon-close"]')
for contact in contacts:
    contact.click()
    email = driver.find_element_by_xpath('//*[contains(@href, "mailto:")]').get_attribute("href")
    website = driver.find_element_by_xpath('//*[contains(@href, "http")]').get_attribute("href")
    Tel =
    records.append((website, email, Tel))
I have used another Python library, BeautifulSoup, for extracting data from the given website; hope this will be helpful. Using the library you can parse the HTML content and extract data on the basis of tags. The following is a code snippet:
import json
from bs4 import BeautifulSoup
from selenium import webdriver

records = []
driver = webdriver.Chrome("your-chrome-drive-path")
driver.get('https://www.yaskawa.eu.com/services/robotics-support')

def partner_name(soup):
    extracted_heading = soup.find("h3")
    _partner_name = str(extracted_heading.text.strip()).replace(extracted_heading.find('i').text, '')
    return _partner_name

def department_name(soup):
    button_text = soup.find("button", class_="accordion__toggle")
    return button_text.text.strip()

def extract_department_info(soup):
    contacts = soup.find('div', class_='contact')
    contact_info = {
        "phone": "",
        "email": "",
        "website": ""
    }
    if contacts:
        href_links = contacts.find_all('a')
        for href_link in href_links:
            if href_link.get('href').startswith('tel'):
                contact_info["phone"] = href_link.text.strip()
            elif href_link.get('href').startswith('mailto'):
                contact_info["email"] = href_link.text.strip()
            else:
                icon_home = href_link.find('i', class_='icon-home')
                if icon_home:
                    contact_info["website"] = href_link.text.strip()
    return contact_info

def extract_contact_info(soup):
    accordion_soup = soup.find('div', class_="accordion")
    contact_info = []
    if accordion_soup:
        departments_contact = accordion_soup.find_all("div", recursive=False)
        for department_contact in departments_contact:
            department = department_name(department_contact)
            extract_info = extract_department_info(department_contact)
            extract_info["department"] = department
            contact_info.append(extract_info)
    return contact_info

soup = BeautifulSoup(driver.page_source, 'html.parser')
partners_card = soup.find_all('div', class_='card-partner')
partners_info = []
for partner in partners_card:
    partners_info.append({"partner": partner_name(partner), "contact": extract_contact_info(partner)})
print(json.dumps(partners_info, indent=4))
Following is the output of the shared code:
[
    {
        "partner": "Yaskawa Europe GmbH ",
        "contact": [
            {
                "phone": "+49 1805 76 26 83",
                "email": "tcs@yaskawa.eu",
                "website": "",
                "department": "Robotics Service Hotline"
            },
            {
                "phone": "+49 8166 90 2005",
                "email": "YEUR-Repair-Sales@yaskawa.eu",
                "website": "",
                "department": "Robotics Repairs"
            },
            {
                "phone": "+49 8166 90 2000",
                "email": "spare-parts-sales@yaskawa.eu",
                "website": "",
                "department": "Robotics Spare Parts"
            },
            {
                "phone": "+49 8166 90 0",
                "email": "tcs-sales@yaskawa.eu",
                "website": "",
                "department": "Robotics Sales for Services"
            },
            {
                "phone": "+49 8166-90-0",
                "email": "robotics@yaskawa.eu",
                "website": "",
                "department": "Robotics Sales"
            }
        ]
    },
    {
        "partner": "Yaskawa Europe GmbH ",
        "contact": [
            {
                "phone": "+49 6196 569 300",
                "email": "",
                "website": "https://www.yaskawa.eu.com/lp/contacts",
                "department": "Contact"
            },
            {
                "phone": "+ 49 6196 569 500",
                "email": "support@yaskawa.eu.com",
                "website": "",
                "department": "Drives Motion Support"
            },
            {
                "phone": "+49 9132 744 1150",
                "email": "support@yaskawa.eu.com",
                "website": "",
                "department": "Controls Support (VIPA)"
            }
        ]
    },
    # Reduced number of lines just to show raw output
    {
        "partner": "ZyTECH Innovative Solutions ",
        "contact": [
            {
                "phone": "+598-2-901 3311",
                "email": "info@zytech.com.uy",
                "website": "www.zytech.com.uy",
                "department": "Contact"
            }
        ]
    }
]
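Since the original code was collecting records for a CSV, the nested partners_info structure built above can also be flattened into rows (a sketch; the file name and column names are my own choice):
import csv

with open('contacts.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['partner', 'department', 'phone', 'email', 'website'])
    for p in partners_info:
        for c in p['contact']:
            writer.writerow([p['partner'].strip(), c['department'], c['phone'], c['email'], c['website']])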
bs4 can extract data based on HTML tags, classes, attributes, and so on. Hope this will help you.

Web scraping in python returns "None"

I'm trying to scrape something from a site using Python, for example the view count on this video (the URL below), but it always returns "None". What am I doing wrong? Here is the code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.youtube.com/watch?v=1OfK8UmLMl0&ab_channel=HitraNtheUnnecessaryProgrammer'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
views = soup.body.find(class_='view-count style-scope ytd-video-view-count-renderer')
print(views)
Thanks!
(BTW, when I try the code shown in the video, it works fine.)
The page is loaded dynamically, and requests doesn't execute JavaScript. However, the data is available in JSON format inside the page source, so you can use the re/json modules to extract it.
For example, to get the view count:
import re
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.youtube.com/watch?v=1OfK8UmLMl0&ab_channel=HitraNtheUnnecessaryProgrammer"
soup = BeautifulSoup(requests.get(url).text, "html.parser")
# We locate the JSON data using a regular-expression pattern
data = re.search(r"var ytInitialData = ({.*?});", str(soup)).group(1)
data = json.loads(data)
print(
data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][0][
"videoPrimaryInfoRenderer"
]["viewCount"]["videoViewCountRenderer"]["viewCount"]["simpleText"]
)
Output:
124 views
The variable data contains all the data as a Python dictionary (dict); to print it all you can use:
print(json.dumps(data, indent=4))
Output (truncated):
{
    "responseContext": {
        "serviceTrackingParams": [
            {
                "service": "CSI",
                "params": [
                    {
                        "key": "c",
                        "value": "WEB"
                    },
                    {
                        "key": "cver",
                        "value": "2.20210701.07.00"
                    },
                    {
                        "key": "yt_li",
                        "value": "0"
                    },
                    {
                        "key": "GetWatchNext_rid",
                        "value": "0x1d62a299beac9e1f"
                    }
                ]
            },
            {
                "service": "GFEEDBACK",
                "params": [
                    {
                        "key": "logged_in",
                        "value": "0"
                    },
                    {
                        "key": "e",
                        "value": "24037443,24058293,24058128,24003103,24042870,23882685,24023960,23944779,24027649,24046896,24059898,24049577,23983296,23966208,24056265,23891346,1714258,24049575,24045412,24003105,23999405,24051884,23891344,23986022,24049573,24056839,24053866,24058240,23744176,23998056,24010336,24037586,23934970,23974595,23735348,23857950,24036947,24051353,24038425,23990875,24052245,24063702,24058380,23983813,24058812,24026834,23996830,23946420,24001373,24049820,24030040,24062848,23968386,24027689,24004644,23804281,24049569,23973490,24044110,23884386,24012512,24044124,24059521,23918597,24007246,24049567,24022729,24037794"
                    }
                ]
            },
            {
                "service": "GUIDED_HELP",
                "params": [
                    {
                        "key": "logged_in",
                        "value": "0"
                    }
                ]
            },
            {
                "service": "ECATCHER",
                "params": [
                    {
                        "key": "client.version",
                        "value": "2.20210701"
                    },
                    {
                        "key": "client.name",
                        "value": "WEB"
                    }
                ]
            }
        ],
        "mainAppWebResponseContext": {
            "loggedOut": true
        },
        "webResponseContextExtensionData": {
            "ytConfigData": {
                "visitorData": "CgtoanprT1pPbmtWTSjYk46HBg%3D%3D",
                "rootVisualElementType": 3832
            },
I usually try to view the API requests (from the Network tab in dev tools) when a site is dynamically loaded. I was successful with sites such as Udemy, Skillshare and a few others, but not with YouTube. So in such a case I would use the official YouTube Data API, which is quite easy to use and has plenty of code samples on GitHub. With it you just request your data and get a JSON response, which you can convert to a dictionary with response.json(). Another option would be Selenium, which is not a solution I like; it's pretty resource- and time-consuming. Requesting from an API is faster than scraping or any other solution on earth. When something doesn't provide an API, you need scraping.
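For example, a minimal sketch of the Data API route (assuming you have created your own API key in the Google Cloud console; videos.list with part=statistics is the official v3 endpoint for view counts):
import requests

API_KEY = "YOUR_API_KEY"  # assumption: your own key from the Google Cloud console
video_id = "1OfK8UmLMl0"  # the video from the question

response = requests.get(
    "https://www.googleapis.com/youtube/v3/videos",
    params={"part": "statistics", "id": video_id, "key": API_KEY},
)
data = response.json()  # plain dict, no HTML parsing needed
print(data["items"][0]["statistics"]["viewCount"])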

Python: JSONDecodeError: Expecting property name enclosed in double quotes

I need some expert help on how to debug this and force double quotes where they are missing, so that I can get a successful response. It is throwing the error here: df = pd.io.json.json_normalize(rt.json(), record_path='offers'). The JSON looks fine when I stage it into a CSV, so I'm not sure where the issue is.
My code:
import requests
import csv
import json
import pandas as pd
from pandas.io.json import json_normalize

url = "https://authentication.skiapi.com/access_token"
payload = {
    "client_id": "00c7fcf******",
    "client_secret": "7676cd5a********",
    "grant_type": "client_credentials"
}
headers = {"Content-Type": "application/json"}
response = requests.post(url, json=payload, headers=headers)
r = response.json()
access_token = r.get('access_token')
print(response.json()['access_token'])

uri = "https://private-anon-73f9ac5d87-slinksmerchantapi.apiary-mock.com/v4/publisher/12633/offers?country=US"
headers = {'Authorization': access_token, 'Content-Type': "application/json"}
rt = requests.get(uri, headers=headers)
df = pd.io.json.json_normalize(rt.json(), record_path='offers')  # problem is here
# print(df)
df.to_csv(r"C:\\Users\ral\Downloads\\offers.csv", index=False)
My sample JSON:
{
    "has_more": true
    "offers": [{
        "coupon_code": null
        "terms": null
        "description": "40% Off Comforters & Other Cool Bedding Stuff Online at BoxLunch! Stock up on select blankets and bedding online only for a limited time -- See site for details. Valid 3/3-3/5"
        "offer_starts": "2017-03-03 08:00:00"
        "title": "40% Off Comforters & Other Cool Bedding Stuff Online at BoxLunch!"
        "url": "http://www.boxlunch.com/home/bedroom/?soffer=152034"
        "merchant_details": {
            "domain": "boxlunchgifts.com"
            "verticals": []
            "country_code": null
            "id": 393756
            "metadata": {}
            "favourite": false
            "partner_type": null
            "merchant_id": 383288
            "advertiser_id": 123456
            "name": "BoxLunch"
            "countries": []
            "domains": [
                "boxlunchgifts.com"
                "boxlunch.com"
            ]
        }
        "offer_type": "sale"
        "id": 152034
        "offer_ends": "2017-03-05 08:00:00"
    }]
    "last_val": 152034
    "next_val": 152032
    "num_returned": 1
}
The JSON is not fine: as per the comments, commas are missing. Also, if you paste it into Python as a literal, true should be True and null should be None.
Additionally, you are using a deprecated interface to json_normalize.
What does json.dumps(rt.json(), indent=2) return? Valid JSON?
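A quick way to run that check (a sketch, assuming rt is the response object from your code):
import json

try:
    print(json.dumps(rt.json(), indent=2))  # valid JSON round-trips cleanly
except ValueError as e:  # requests raises a ValueError subclass on bad JSON
    print("Response is not valid JSON:", e)
    print(rt.text[:500])  # inspect the raw body instead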
Patched JSON as a Python dict:
data = {  # renamed from "json" so it does not shadow the json module
    "has_more": True,
    "offers": [{
        "coupon_code": None,
        "terms": None,
        "description": "40% Off Comforters & Other Cool Bedding Stuff Online at BoxLunch! Stock up on select blankets and bedding online only for a limited time -- See site for details. Valid 3/3-3/5",
        "offer_starts": "2017-03-03 08:00:00",
        "title": "40% Off Comforters & Other Cool Bedding Stuff Online at BoxLunch!",
        "url": "http://www.boxlunch.com/home/bedroom/?soffer=152034",
        "merchant_details": {
            "domain": "boxlunchgifts.com",
            "verticals": [],
            "country_code": None,
            "id": 393756,
            "metadata": {},
            "favourite": False,
            "partner_type": None,
            "merchant_id": 383288,
            "advertiser_id": 123456,
            "name": "BoxLunch",
            "countries": [],
            "domains": [
                "boxlunchgifts.com",
                "boxlunch.com"
            ]
        },
        "offer_type": "sale",
        "id": 152034,
        "offer_ends": "2017-03-05 08:00:00"
    }],
    "last_val": 152034,
    "next_val": 152032,
    "num_returned": 1
}
df = pd.json_normalize(data, record_path="offers")
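Note that json_normalize flattens the nested merchant_details dict into dot-separated columns, which you can confirm with:
print(df.columns.tolist())  # includes e.g. 'merchant_details.domain', 'merchant_details.id'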
