Extracting JSON from HTML using BeautifulSoup python - python

While practicing web scraping on a webpage (the cookies parameter is required), I ran into problems extracting the JSON data embedded in the HTML. This is what I did:
# Fetch the Lazada search results page and collect its embedded JSON-LD blocks.
import requests
from bs4 import BeautifulSoup as soup
import json

my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
# Cookies sent with the request; the page requires them.
cookies = {
    "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
    "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
    "x5sec": "7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d",
}
ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text)  # True
page_soup = soup(ret.text, 'html.parser')
# Collect every <script type="application/ld+json"> tag in the page.
data = page_soup.find_all('script', {'type': 'application/ld+json'})
The output is as follows:
[
<script type="application/ld+json">{
"@context": "https://schema.org",
"@type": "BreadcrumbList",
"itemListElement": [
{
"item": {
"name": "Home",
"@id": "https://www.lazada.sg/"
},
"@type": "ListItem",
"position": "1"
}
]
}</script>,
<script type="application/ld+json">{
"@context": "https://schema.org",
"@type": "ItemList",
"itemListElement": [
{
"offers": {
"priceCurrency": "SGD",
"@type": "Offer",
"price": "71.00",
"availability": "https://schema.org/InStock"
},
"image": "https://sg-test-11.slatic.net/p/670a73a9613c36b2bb01555ab4092ba2.jpg",
"@type": "Product",
"name": "Switch: Super Mario Party [Available in Stock! Immediate Shipping]",
"url": "https://www.lazada.sg/products/switch-super-mario-party-available-in-stock-immediate-shipping-i278269540-s429667097.html?search=1"
},
...
I tried to follow an existing thread, "Extract json from html in python beautifulsoup", but got stuck, probably because the JSON in this HTML is formatted differently. The part I scraped contains all the products on that page. Is there a way to further extract each product's details (e.g. title, price, rating) and count the number of products present? Thanks!

After loading the data with json.loads, you can loop over it and pull out the fields you need. All the product info for those containers is listed in one script tag, so you can just grab that one.
import requests
from bs4 import BeautifulSoup as soup
import json
import pandas as pd
# Request the search page, sending the cookies along with it.
my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
cookies = {
    "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
    "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
    "x5sec": "7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d",
}
ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text)  # True

# Take the second JSON-LD element on the page and read its "itemListElement" list.
page_soup = soup(ret.text, 'lxml')
ld_json_tags = page_soup.select("[type='application/ld+json']")
data = ld_json_tags[1]
oJson = json.loads(data.text)["itemListElement"]
numProducts = len(oJson)

# One row per product: name, price, and availability with the schema.org prefix removed.
results = [
    [
        product['name'],
        product['offers']['price'],
        product['offers']['availability'].replace('https://schema.org/', ''),
    ]
    for product in oJson
]  # etc......
df = pd.DataFrame(results)
print(df)

Related

Execute js function in HTML page scraped by python to get json data

I have a website with products https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01 When I inspect the html page I see they have all info in json format in script tag under
window.INITIAL_DATA = JSON.parse('{"pa...')
I tried to scrape the HTML with requests and extract the JSON string with a regex; however, my code somehow changes the JSON structure and I cannot load it with json.loads()
# NOTE: this snippet relies on `re`, `json`, `requests`, `BeautifulSoup` and a
# `headers` dict that are defined outside of it.
response = requests.get('https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01', headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Raw string: `\(` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning. The pattern value itself is unchanged.
regex = r"JSON.parse\(.*;"
match = re.search(regex, str(soup))
# Remove the "JSON.parse(" prefix, then slice off the first character and the last three.
json_string = match.group(0).replace("JSON.parse(", "")[1:-3]
json_data = json.loads(json_string)
It ends with a JSON error because the string contains odd whitespace and " characters that Python's json library cannot handle:
json.decoder.JSONDecodeError: Expecting ',' delimiter: line 1 column 22173 (char 22172)
Is there a way to get the JSON data or, even better, to execute the window.INITIAL_DATA assignment directly from the HTML response in Python?
Try:
import re
import js2py
import requests

url = "https://www.svenssons.se/varumarken/swedese/lamino-fatolj-och-fotpall-lackad-bokfarskinn/?variantId=514023-01"
html_doc = requests.get(url).text

# Capture everything that follows `window.INITIAL_DATA = ` on that line.
match = re.search(r"window\.INITIAL_DATA = (.*)", html_doc)
if match is None:
    # Fail with a clear message instead of an AttributeError on `None.group`.
    raise ValueError("window.INITIAL_DATA assignment not found in the page")

# SECURITY: this evaluates JavaScript downloaded from a remote site.
# Only do this for pages you trust.
data = js2py.eval_js(match.group(1))
print(data)
Prints:
{
"currentCountry": {
"englishName": "Sweden",
"localName": "Sverige",
"twoLetterCode": "SE",
},
"currentCurrency": "SEK",
"currentLanguage": "sv-SE",
"currentLanguageRevision": "43",
"currentLanguageTwoLetterName": "sv",
"dynamicData": [
{
"data": {},
"type": "NordicNest.ContentApi.DynamicData.MenuApiModel,NordicNest.ContentApi",
},
{
"type": "NordicNest.Core.Contentful.Model.SiteLayout.Footer,NordicNest.Core"
},
...

Sending GET request to URL with fragment returns the content of the main page

I am trying to scrape this webpage, but I always end up getting the "main" page (same URL but without "#face-a-face" at the end). It's the same problem this user encountered (see this forum). He got an answer, but I am not able to generalize it and apply it to the website I want to scrape.
import requests
from bs4 import BeautifulSoup

url_main = "https://www.lequipe.fr/Football/match-direct/ligue-1/2020-2021/ol-dijon-live/477168"
url_target = f"{url_main}#face-a-face"

# Download both URLs (certificate verification is turned off for both requests).
response_main = requests.get(url_main, verify=False)
soup_main = BeautifulSoup(response_main.content, "html.parser")
response_target = requests.get(url_target, verify=False)
soup_target = BeautifulSoup(response_target.content, "html.parser")

# Compare the two parsed documents.
print(soup_main == soup_target)
returns True. I would like to get different contents, this is not the case here.
For example, I would like to extract all the "confrontations depuis 2011" in the target webpage. How can I get the final content of this webpage with a GET request (or with another way) ? Thanks !
All the data comes from a highly nested JSON file.
You can get that file and extract the information you need.
Here's how:
import json
import requests

endpoint = "https://iphdata.lequipe.fr/iPhoneDatas/EFR/STD/ALL/V2/Football/Prelive/68/477168.json"
team_data = requests.get(endpoint).json()

# Walk down the nested structure one level at a time.
second_item = team_data["items"][1]
first_match = second_item["objet"]["matches"][0]
specifics = first_match["specifics"]
print(json.dumps(specifics, indent=2))
This should get you a dictionary:
{
"__type": "specifics_sport_collectif",
"vainqueur": "domicile",
"score": {
"__type": "score",
"exterieur": "1",
"domicile": "4"
},
"exterieur": {
"__type": "effectif_sport_collectif",
"equipe": {
"__type": "equipe",
"id": "202",
"url_image": "https://medias.lequipe.fr/logo-football/202/{width}{declinaison}",
"nom": "Dijon",
"url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub202.html"
}
},
"domicile": {
"__type": "effectif_sport_collectif",
"equipe": {
"__type": "equipe",
"id": "22",
"url_image": "https://medias.lequipe.fr/logo-football/22/{width}{declinaison}",
"nom": "Lyon",
"url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub22.html"
}
},
"is_final": false,
"prolongation": false,
"vainqueur_final": "domicile",
"is_qualifier": false
}
And if you, for example, just want the score, add these lines:
# `specifics` is the dictionary extracted by the snippet above.
just_the_score = specifics["score"]
print(just_the_score)
To get this:
{'__type': 'score', 'exterieur': '1', 'domicile': '4'}

Python Scrape specific JS data

I'm having some trouble extracting the following data from a page:
I have highlighted the json I would like to obtain from the page.
I have also pasted the javascript section it is in below:
<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>
How can I obtain this quickly and efficiently - I have tried using Bs4 but I always get an object value of None returned. Please could someone show me how this can be done :)
Thanks!
This script looks like JSON data, so use the json module to convert it to a Python dictionary (i.e. data) and get what you want:
data["#conf-select-attr-173"]["Magento_ConfigurableProduct/js/configurable/select/action"]["config"]
html = '''<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>'''
from bs4 import BeautifulSoup
import json

soup = BeautifulSoup(html, 'html.parser')

# A real page contains many <script> tags, so taking the first one is fragile.
# Look only at the Magento init scripts and pick the one that mentions the
# key we need.
selector_key = "#conf-select-attr-173"
data = None
for script_tag in soup.find_all('script', type='text/x-magento-init'):
    text = script_tag.string
    #print(text)
    if text and selector_key in text:
        data = json.loads(text)
        break
if data is None:
    raise ValueError('no text/x-magento-init script containing "#conf-select-attr-173" was found')

config = data[selector_key]["Magento_ConfigurableProduct/js/configurable/select/action"]["config"]
print(config)
Alternatively, you can take the line at index 4 of this text, remove "config": from its start and the , at its end, and again use json to convert it to a Python dictionary:
html = '''<script type="text/x-magento-init">
{
"#conf-select-attr-173": {
"Magento_ConfigurableProduct/js/configurable/select/action": {
"config": {"attributes":{"173":{"id":"173","code":"Size","label":"Size","options":[{"id":"342","label":"Footwear-38","products":["104984"]},{"id":"345","label":"Footwear-39","products":["104985"]},{"id":"347","label":"Footwear-39.5","products":["104986"]},{"id":"349","label":"Footwear-40","products":["104987"]},{"id":"351","label":"Footwear-40.5","products":["104988"]},{"id":"354","label":"Footwear-41.5","products":["104989"]},{"id":"355","label":"Footwear-42","products":["104990"]},{"id":"357","label":"Footwear-42.5","products":["104991"]},{"id":"360","label":"Footwear-43.5","products":["104992"]},{"id":"361","label":"Footwear-44","products":["104993"]},{"id":"363","label":"Footwear-44.5","products":["104994"]},{"id":"364","label":"Footwear-45","products":["104995"]},{"id":"367","label":"Footwear-46","products":["104996"]},{"id":"369","label":"Footwear-46.5","products":["104997"]}],"position":"0"}},"template":"<%- data.price %>\u00a0 \u20ac","currencyFormat":"%s\u00a0 \u20ac","optionPrices":{"104984":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104985":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104986":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104987":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104988":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104989":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104990":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104991":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},
"104992":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104993":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104994":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104995":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104996":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]},"104997":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9},"tierPrices":[]}},"priceFormat":{"pattern":"%s\u00a0 \u20ac","precision":2,"requiredPrecision":2,"decimalSymbol":",","groupSymbol":".","groupLength":3,"integerRequired":1},"prices":{"oldPrice":{"amount":129.9},"basePrice":{"amount":109.15966286555},"finalPrice":{"amount":129.9}},"productId":"104998","chooseText":"Choose an 
Option...","images":[],"index":{"104984":{"173":"342"},"104985":{"173":"345"},"104986":{"173":"347"},"104987":{"173":"349"},"104988":{"173":"351"},"104989":{"173":"354"},"104990":{"173":"355"},"104991":{"173":"357"},"104992":{"173":"360"},"104993":{"173":"361"},"104994":{"173":"363"},"104995":{"173":"364"},"104996":{"173":"367"},"104997":{"173":"369"}},"sku":{"default":"1201A429-300","104984":"1201A429-300-Footwear-38","104985":"1201A429-300-Footwear-39","104986":"1201A429-300-Footwear-39.5","104987":"1201A429-300-Footwear-40","104988":"1201A429-300-Footwear-40.5","104989":"1201A429-300-Footwear-41.5","104990":"1201A429-300-Footwear-42","104991":"1201A429-300-Footwear-42.5","104992":"1201A429-300-Footwear-43.5","104993":"1201A429-300-Footwear-44","104994":"1201A429-300-Footwear-44.5","104995":"1201A429-300-Footwear-45","104996":"1201A429-300-Footwear-46","104997":"1201A429-300-Footwear-46.5"},"stock":{"104984":{"is_salable":true,"qty":1},"104985":{"is_salable":true,"qty":1},"104986":{"is_salable":true,"qty":0},"104987":{"is_salable":true,"qty":1},"104988":{"is_salable":true,"qty":1},"104989":{"is_salable":true,"qty":2},"104990":{"is_salable":true,"qty":0},"104991":{"is_salable":true,"qty":0},"104992":{"is_salable":true,"qty":3},"104993":{"is_salable":true,"qty":2},"104994":{"is_salable":true,"qty":1},"104995":{"is_salable":true,"qty":0},"104996":{"is_salable":true,"qty":0},"104997":{"is_salable":true,"qty":0}}},
"selected": ""
}
}
}
</script>'''
from bs4 import BeautifulSoup
import json

soup = BeautifulSoup(html, 'html.parser')
text = soup.find('script').string
lines = text.split('\n')

# Take the line at index 4, drop the `"config": ` key and cut off the last
# character (the trailing `,`) so that only the JSON value remains.
line4 = lines[4].strip().replace('"config": ', '')[:-1]

config = json.loads(line4)
print(config)

How can I scrape the content of this specific website (cineatlas)?

I am trying to scrape the content of this particular website : https://www.cineatlas.com/
I tried scraping the date part as shown in the print screen :
I used this basic beautifulsoup code
from bs4 import BeautifulSoup
# NOTE(review): `response` is not defined in this snippet — presumably the
# result of requesting the page; confirm.
soup = BeautifulSoup(response.text,'html.parser')
type(soup)
# Look up the first <ul> element whose class is "slidee".
time = soup.find('ul',class_='slidee')
This is what I get instead of the list of elements
<ul class="slidee">
<!-- adding dates -->
</ul>
The site creates HTML elements dynamically from the Javascript content. You can get the JS content by using re for example:
import re
import json
import requests
from ast import literal_eval

url = 'https://www.cineatlas.com/'
html_data = requests.get(url).text

# Patterns: the object literal assigned to `movieData`, and /* ... */ comments.
movie_data_pattern = re.compile(r'movieData = ({.*?}), movieDataByReleaseDate', flags=re.DOTALL)
block_comment_pattern = re.compile(r'\s*/\*.*?\*/\s*')

movieData = movie_data_pattern.findall(html_data)[0]
movieData = block_comment_pattern.sub('', movieData)  # remove comments
movieData = literal_eval(movieData)  # movieData now holds the information about the current movies

print(json.dumps(movieData, indent=4))  # print data to the screen
Prints:
{
"2019-08-06": [
{
"url": "fast--furious--hobbs--shaw",
"image-portrait": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603443098_891497ecc8b16b3a662ad8b036820ed1_500x735.jpg",
"image-landscape": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603421049_7c233477779f25725bf22aeaacba469a_700x259.jpg",
"title": "FAST & FURIOUS : HOBBS & SHAW",
"releaseDate": "2019-08-07",
"endpoint": "ST00000392",
"duration": "120 mins",
"rating": "Classification TOUT",
"director": "",
"actors": "",
"times": [
{
"time": "7:00pm",
"bookingLink": "https://ticketing.eu.veezi.com/purchase/8388?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
"attributes": [
{
"_id": "5d468c20f67cc430833a5a2b",
"shortName": "VF",
"description": "Version Fran\u00e7aise"
},
{
"_id": "5d468c20f67cc430833a5a2a",
"shortName": "3D",
"description": "3D"
}
]
},
{
"time": "9:50pm",
"bookingLink": "https://ticketing.eu.veezi.com/purchase/8389?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
... and so on.
lis = time.findChildren()
This returns a list of child nodes

Scraping a web page with Scrapy doesn't return the page content

I'm trying to scrape a web page with Scrapy. I noticed it doesn't work: when I parsed the web page through my IPython shell, it returned this:
'دانلود کتاب و کتاب صوتی با طاقچه\n // more info: http://angulartics.github.io/\n (function (i, s, o, g, r, a, m) {\n i[\'GoogleAnalyticsObject\'] = r; i[r] = i[r] || function () {\n (i[r].q = i[r].q || []).push(arguments)\n }, i[r].l = 1 * new Date(); a = s.createElement(o),\n m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)\n })(window, document, \'script\', \'//www.google-analytics.com/analytics.js\', \'ga\');\n ga(\'create\', \'UA-57199074-1\', { \'cookieDomain\': location.hostname == \'localhost\' ? \'none\' : \'auto\' });\n ga(\'require\', \'ec\');\n Taaghche works best with JavaScript enabled{ "#context": "http://schema.org", "#type": "WebSite", "url": "https://taaghche.ir/", "name": "طاقچه", "alternateName": "نزدیکترین کتاب فروشی شهر", "potentialAction": { "#type": "SearchAction", "target": "https://taaghche.ir/search?term={search_term_string}", "query-input": "required name=search_term_string" } }{ "#context": "http://schema.org", "#type": "Organization", "url": "https://taaghche.ir", "logo": "https://taaghche.ir/assets/images/taaghchebrand.png", "contactPoint": [{ "#type": "ContactPoint", "telephone": "+۹۸-۲۱-۸۸۱۴۹۸۱۶", "contacttype": "customer support", "areaServed": "IR" }] }'
It looks more like a JSON response. How can I scrape through it? By the way, my scraper looks like this:
class Taaghche(scrapy.Spider):
    """Spider that searches taaghche.ir for each title listed in books.csv."""

    name = 'taaghche'

    def start_requests(self):
        """Yield one search request per entry in the ``title`` column of books.csv."""
        link = 'https://taaghche.ir/search?term='
        data = pd.read_csv('books.csv')
        # Build every search URL first, encoding spaces as %20.
        urls = [link + title.replace(" ", "%20") for title in data.title]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        """Follow the first extracted ``book-link`` href on a search result page."""
        # XPath attribute tests are written with `@` (`@class`, `@href`).
        booklinks = response.xpath('//a[@class="book-link"][1]/@href').extract_first()
        if booklinks is None:
            # No matching anchor in this response; there is nothing to follow.
            self.logger.warning("No book link found on %s", response.url)
            return
        yield response.follow(url=booklinks, callback=self.parse_page)

    def parse_page(self, response):
        ...
The website content is not rendered on the server side; it is rendered by JavaScript.
In this case you need to use one of the following:
Selenium (integrate Selenium with Scrapy)
Check the request URLs in the browser's network tab. There might be an API URL that you can get the data from directly.
There might be other possible solutions.

Categories

Resources