Python Scrape specific JS data - python
I'm having some trouble extracting the following data from a page:
I have highlighted the json I would like to obtain from the page.
I have also pasted the javascript section it is in below:
<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>
How can I obtain this quickly and efficiently? I have tried using bs4 (BeautifulSoup), but I always get None returned. Could someone please show me how this can be done? :)
Thanks!
The content of this script tag looks like JSON data, so use the json module to convert it to a Python dictionary (i.e. data) and then index into it to get what you want:
data["#conf-select-attr-173"]["Magento_ConfigurableProduct/js/configurable/select/action"]["config"]
html = '''<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>'''
from bs4 import BeautifulSoup
import json
# Parse the HTML fragment and take the raw text of the first <script> tag.
soup = BeautifulSoup(html, 'html.parser')
text = soup.find('script').string
# strict=False lets the decoder accept raw control characters inside JSON
# strings. The pasted payload has a literal line break inside the
# "chooseText" value, which the default strict mode rejects with
# "Invalid control character".
data = json.loads(text, strict=False)
# Drill down through the two wrapper keys to the "config" object.
config = data["#conf-select-attr-173"]["Magento_ConfigurableProduct/js/configurable/select/action"]["config"]
print(config)
Alternatively, you can take the 4th line from this text, remove the leading "config": and the , at the end, and again use json to convert it to a Python dictionary:
html = '''<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>'''
from bs4 import BeautifulSoup
import json
# Parse the HTML fragment and take the raw text of the first <script> tag.
soup = BeautifulSoup(html, 'html.parser')
text = soup.find('script').string

# Find the "config" key by name rather than by line number: in the payload
# the value wraps across several physical lines, so a single line of the
# text holds only a truncated, unparseable fragment of it.
marker = '"config":'
start = text.index(marker) + len(marker)
# raw_decode() does not skip leading whitespace, so advance to the first
# character of the value itself.
while text[start].isspace():
    start += 1

# raw_decode() parses exactly one JSON value starting at `start` and ignores
# whatever follows it (the trailing comma and the "selected" entry).
# strict=False accepts the raw line break inside the "chooseText" string.
config, _ = json.JSONDecoder(strict=False).raw_decode(text, start)
print(config)
Related
Execute js function in HTML page scraped by python to get json data
I have a website with products https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01 When I inspect the html page I see they have all info in json format in script tag under window.INITIAL_DATA = JSON.parse('{"pa...') I tried to scrape the html with requests and get the json string with regex, however my code somehow change the json structure and I cannot load it with json.loads() response = requests.get('https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01', headers=headers) soup = BeautifulSoup(response.text, 'html.parser') regex = "JSON.parse\(.*;" match = re.search(regex, str(soup)) json_string = match.group(0).replace("JSON.parse(", "")[1:-3] json_data = json.loads(json_string) it ends with json error because there are multiple weird spaces and " which does json library in python cannot handle json.decoder.JSONDecodeError: Expecting ',' delimiter: line 1 column 22173 (char 22172) Is there a way how to get the json data or even better how to execute the window.INITIAL_DATA function directly in html response in python?
Try: import re import js2py import requests url = "https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01" html_doc = requests.get(url).text data = re.search(r"window\.INITIAL_DATA = (.*)", html_doc) data = js2py.eval_js(data.group(1)) print(data) Prints: { "currentCountry": { "englishName": "Sweden", "localName": "Sverige", "twoLetterCode": "SE", }, "currentCurrency": "SEK", "currentLanguage": "sv-SE", "currentLanguageRevision": "43", "currentLanguageTwoLetterName": "sv", "dynamicData": [ { "data": {}, "type": "NordicNest.ContentApi.DynamicData.MenuApiModel,NordicNest.ContentApi", }, { "type": "NordicNest.Core.Contentful.Model.SiteLayout.Footer,NordicNest.Core" }, ...
Parsing with Python Help me
How can I pull all the coordinate values (46.10457, 67.211815, 36.130162, 67.135758) from the entire text? <script> data = milestonesMap.getEmptyData(); data.points.push({ properties: { balloonContentHeader: "CORDON", }, geometry: { type: "Point", coordinates: [46.10457, 67.211815] } }); data.points.push({ properties: { balloonContentHeader: "CORDON", }, geometry: { type: "Point", coordinates: [36.130162, 67.135758] } }); from bs4 import BeautifulSoup import requests url = 'https://xn--90adear.xn--p1ai/r/21/milestones' page = requests.get(url) print(page.status_code) filteredNews = [] allNews = [] soup = BeautifulSoup(page.text, "html.parser") print(soup)
Use a regex: coord = re.findall("coordinates: \[([0-9., ]*),([0-9., ]*)\]", soup) output [('46.10457', ' 67.211815'), ('36.130162', ' 67.135758')] or just re.findall("coordinates: \[([0-9., ]*)\]", soup) to get both long, lat of each point as one string instead of a tuple
Sending GET request to URL with fragment returns the content of the main page
I am trying to web-scrape this webpage but I always end up getting the "main" page (same URL but without "#face-a-face" at the end). It's the same problem as this guy encountered, see this forum. He got an answer but I am not able to generalize it and apply it to the website I want to scrape. import requests from bs4 import BeautifulSoup url_main = "https://www.lequipe.fr/Football/match-direct/ligue-1/2020-2021/ol-dijon-live/477168" url_target = url_main + "#face-a-face" soup_main = BeautifulSoup(requests.get(url_main, verify=False).content, "html.parser") soup_target = BeautifulSoup(requests.get(url_target, verify=False).content, "html.parser") print(soup_main == soup_target) returns True. I would like to get different contents, but this is not the case here. For example, I would like to extract all the "confrontations depuis 2011" in the target webpage. How can I get the final content of this webpage with a GET request (or in another way)? Thanks!
All the data comes from a highly nested JSON file. You can get that file and extract the information you need. Here's how: import json import requests endpoint = "https://iphdata.lequipe.fr/iPhoneDatas/EFR/STD/ALL/V2/Football/Prelive/68/477168.json" team_data = requests.get(endpoint).json() specifics = team_data["items"][1]["objet"]["matches"][0]["specifics"] print(json.dumps(specifics, indent=2)) This should get you a dictionary: { "__type": "specifics_sport_collectif", "vainqueur": "domicile", "score": { "__type": "score", "exterieur": "1", "domicile": "4" }, "exterieur": { "__type": "effectif_sport_collectif", "equipe": { "__type": "equipe", "id": "202", "url_image": "https://medias.lequipe.fr/logo-football/202/{width}{declinaison}", "nom": "Dijon", "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub202.html" } }, "domicile": { "__type": "effectif_sport_collectif", "equipe": { "__type": "equipe", "id": "22", "url_image": "https://medias.lequipe.fr/logo-football/22/{width}{declinaison}", "nom": "Lyon", "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub22.html" } }, "is_final": false, "prolongation": false, "vainqueur_final": "domicile", "is_qualifier": false } And if you, for example, just want the score, add these lines: just_the_score = specifics["score"] print(just_the_score) To get this: {'__type': 'score', 'exterieur': '1', 'domicile': '4'}
How to use Beautiful Soup to extract string in <script> tag email?
In a given .html page, I have a script tag like so: <script> atomic({ "playlist": [{ "id": "123456", "email": "ename#email.com", "token": "92426029ccf14bca5e495a419868af30" }] } }).$mount('#app'); </script> How can I use Beautiful Soup to extract the email address?
You can locate the <script> tag using soup.find('script'), and then use the built-in re module to extract the email after calling .string: soup = BeautifulSoup(script, "html.parser") script_tag = soup.find("script") print( re.search(r'"email": "(.*?)"', script_tag.string).group(1) ) Output: ename#email.com
You could do it this way. Select the <script> tag first and then extract its text using .string. Note that get_text() will not work for <script>. To get the internal JSON string, do some string manipulations, like removing tabs, newlines, spaces etc., and strip off unwanted data. Convert the JSON string to a Python object using the json module and extract the info you need. Here is how it is done. import json import re from bs4 import BeautifulSoup s = """ <script> atomic({ "playlist": [{ "id": "123456", "email": "ename#email.com", "token": "92426029ccf14bca5e495a419868af30" }] }).$mount('#app'); </script>""" soup = BeautifulSoup(s, 'lxml') t = soup.find('script') x = t.string.strip() x = re.sub(r"[\n\t\s]*", "", x) #Removing the newlines, spaces and tabs from string # Stripping off unwanted characters from string to get the internal JSON string x = x.lstrip('atomic(') x = x.rstrip(').$mount(\'#app\');') json_str = json.loads(x) for i,v in json_str['playlist'][0].items(): print(f"{i}: {v}") id: 123456 email: ename#email.com token: 92426029ccf14bca5e495a419868af30
Extracting JSON from HTML using BeautifulSoup python
While I was practicing some web-scraping on a webpage (param cookies required), I found myself having problems to scrape out JSON data embedded in the HTML. The following was what I did: import requests from bs4 import BeautifulSoup as soup import json my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1' cookies = { "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412", "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525", "x5sec":"7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d", } ret = requests.get(my_url, cookies=cookies) print("New Super Mario Bros" in ret.text) # True page_soup = soup(ret.text, 'html.parser') data = page_soup.findAll('script', {'type':'application/ld+json'}) The output is as follows: [ <script type="application/ld+json">{ "#context": "https://schema.org", "#type": "BreadcrumbList", "itemListElement": [ { "item": { "name": "Home", "#id": "https://www.lazada.sg/" }, "#type": "ListItem", "position": "1" } ] }</script>, <script type="application/ld+json">{ "#context": "https://schema.org", "#type": "ItemList", "itemListElement": [ { "offers": { "priceCurrency": "SGD", "#type": "Offer", "price": "71.00", "availability": "https://schema.org/InStock" }, "image": "https://sg-test-11.slatic.net/p/670a73a9613c36b2bb01555ab4092ba2.jpg", "#type": "Product", "name": "Switch: Super Mario Party [Available in Stock! Immediate Shipping]", "url": "https://www.lazada.sg/products/switch-super-mario-party-available-in-stock-immediate-shipping-i278269540-s429667097.html?search=1" }, ... I tried to follow an existing thread Extract json from html in python beautifulsoup but found myself stuck, probably due to the different JSON formatting in the HTML soup. 
The part which I scrape out contains all the different products in that page, is there a way where I further scrape out each product's details (eg. Title, price, rating, etc) and count the number of products present? Thanks!
You can loop parsing out from the json after loading with json.loads. All the product info for those containers is listed in one script tag so you can just grab that. import requests from bs4 import BeautifulSoup as soup import json import pandas as pd my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1' cookies = { "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412", "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525", "x5sec":"7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d", } ret = requests.get(my_url, cookies=cookies) print("New Super Mario Bros" in ret.text) # True page_soup = soup(ret.text, 'lxml') data = page_soup.select("[type='application/ld+json']")[1] oJson = json.loads(data.text)["itemListElement"] numProducts = len(oJson) results = [] for product in oJson: results.append([product['name'], product['offers']['price'], product['offers']['availability'].replace('https://schema.org/', '')]) # etc...... df = pd.DataFrame(results) print(df)