I am trying to scrape the content of this particular website: https://www.cineatlas.com/
I tried scraping the date part as shown in the screenshot:
I used this basic BeautifulSoup code:
import requests
from bs4 import BeautifulSoup

response = requests.get('https://www.cineatlas.com/')
soup = BeautifulSoup(response.text, 'html.parser')
time = soup.find('ul', class_='slidee')
This is what I get instead of the list of elements:
<ul class="slidee">
<!-- adding dates -->
</ul>
The site creates the HTML elements dynamically from JavaScript data embedded in the page. You can extract that JS content with re, for example:
import re
import json
import requests
from ast import literal_eval
url = 'https://www.cineatlas.com/'
html_data = requests.get(url).text
movieData = re.findall(r'movieData = ({.*?}), movieDataByReleaseDate', html_data, flags=re.DOTALL)[0]
movieData = re.sub(r'\s*/\*.*?\*/\s*', '', movieData) # remove comments
movieData = literal_eval(movieData)  # movieData now holds the information about the current movies
print(json.dumps(movieData, indent=4)) # print data to the screen
Prints:
{
    "2019-08-06": [
        {
            "url": "fast--furious--hobbs--shaw",
            "image-portrait": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603443098_891497ecc8b16b3a662ad8b036820ed1_500x735.jpg",
            "image-landscape": "https://d10u9ygjms7run.cloudfront.net/dd2qd1xaf4pceqxvb41s1xpzs0/1562603421049_7c233477779f25725bf22aeaacba469a_700x259.jpg",
            "title": "FAST & FURIOUS : HOBBS & SHAW",
            "releaseDate": "2019-08-07",
            "endpoint": "ST00000392",
            "duration": "120 mins",
            "rating": "Classification TOUT",
            "director": "",
            "actors": "",
            "times": [
                {
                    "time": "7:00pm",
                    "bookingLink": "https://ticketing.eu.veezi.com/purchase/8388?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
                    "attributes": [
                        {
                            "_id": "5d468c20f67cc430833a5a2b",
                            "shortName": "VF",
                            "description": "Version Fran\u00e7aise"
                        },
                        {
                            "_id": "5d468c20f67cc430833a5a2a",
                            "shortName": "3D",
                            "description": "3D"
                        }
                    ]
                },
                {
                    "time": "9:50pm",
                    "bookingLink": "https://ticketing.eu.veezi.com/purchase/8389?siteToken=b4ehk19v6cqkjfwdsyctqra72m",
... and so on.
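From there, iterating over movieData is ordinary dictionary work. A minimal sketch (based only on the structure printed above) that lists each date's titles and showtimes:

for date, movies in movieData.items():
    for movie in movies:
        # each movie dict carries a 'times' list, as shown in the output above
        showtimes = ', '.join(t['time'] for t in movie.get('times', []))
        print(date, movie['title'], showtimes)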
lis = time.findChildren()
This returns a list of the tag's child nodes; here it comes back empty, because the <ul> is only populated by JavaScript at runtime.
import requests
from bs4 import BeautifulSoup
import json

data = {
    0: {
        0: "title",
        1: "dates",
        2: "city/state",
        3: "country"
    },
    1: {
        0: "event",
        1: "reps",
        2: "prize"
    },
    2: {
        0: "results"
    }
}

url = "https://mms.kcbs.us/members/evr_search.php?org_id=KCBA"
response = requests.get(url).text
soup = BeautifulSoup(response, features='lxml')

all_data = []
for element in soup.find_all('div', class_="row"):
    event = {}
    for i, col in enumerate(element.find_all('div', class_='col-md-4')):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Here's a link to the website: https://mms.kcbs.us/members/evr_search.php?org_id=KCBA
I'm unsure why nothing gets added to the list and dictionaries.
The data you see is loaded from an external URL via JavaScript. To simulate the Ajax request you can use the following example:
import json
import requests
from bs4 import BeautifulSoup

api_url = "https://mms.kcbs.us/members/evr_search_ol_json.php"

params = {
    "otype": "TEXT",
    "evr_map_type": "2",
    "org_id": "KCBA",
    "evr_begin": "6/16/2022",
    "evr_end": "7/16/2022",
    "evr_address": "",
    "evr_radius": "50",
    "evr_type": "269",
    "evr_openings": "0",
    "evr_region": "",
    "evr_region_type": "1",
    "evr_judge": "0",
    "evr_keyword": "",
    "evr_rep_name": "",
}

soup = BeautifulSoup(
    requests.get(api_url, params=params).content, "html.parser"
)

data = {
    0: {0: "title", 1: "dates", 2: "city/state", 3: "country"},
    1: {0: "event", 1: "reps", 2: "prize"},
    2: {0: "results"},
}

all_data = []
for element in soup.find_all("div", class_="row"):
    event = {}
    for i, col in enumerate(element.find_all("div", class_="col-md-4")):
        for j, item in enumerate(col.strings):
            event[data[i][j]] = item
    all_data.append(event)

print(json.dumps(all_data, indent=4))
Prints:
[
    {
        "title": "Frisco BBQ Challenge",
        "dates": "6/16/2022 - 6/18/2022",
        "city/state": "Frisco, CO 80443",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: BUNNY TUTTLE, RICH TUTTLE, MICHAEL WINTER",
        "prize": "Prize Money: $13,050.00",
        "results": "Results Not In"
    },
    {
        "title": "York County BBQ Festival",
        "dates": "6/17/2022 - 6/18/2022",
        "city/state": "Delta, PA 17314",
        "country": "UNITED STATES",
        "event": "STATE CHAMPIONSHIP",
        "reps": "Reps: ANGELA MCKEE, ROBERT MCKEE, LOUISE WEIDNER",
        "prize": "Prize Money: $5,500.00",
        "results": "Results Not In"
    },
...and so on.
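Because all_data is a plain list of dictionaries, it also drops straight into pandas if you prefer tabular output (a small sketch, assuming pandas is installed):

import pandas as pd

df = pd.DataFrame(all_data)
print(df[['title', 'dates', 'city/state', 'prize']])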
I have parsed XML from a website and found that it has two branches (children). How do I separate the two branches into two lists of dictionaries? Here's my code so far:
import pandas as pd
import xml.etree.ElementTree as ET
import requests

url = "http://cs.stir.ac.uk/~soh/BD2spring2022/assignmentdata.php"
params = {'data': 'spurpyr'}
response = requests.get(url, params=params)
tree = response.content

# extract the root element as a separate variable, and display the root tag
root = ET.fromstring(tree)
print(root.tag)

# get the attributes of root
root_attr = root.attrib
print(root_attr)

# find the children of root
for child in root:
    print(child.tag, child.attrib)

# extract the two children of the root element into two separate variables, and display their tags as well
child_dict = []
for child in root:
    child_dict.append(child.tag)
tweets_branch = child_dict[0]
cities_branch = child_dict[1]

# the elements in the entire tree
[elem.tag for elem in root.iter()]

# specify both the encoding and decoding of the document you are displaying as the string
print(ET.tostring(root, encoding='utf8').decode('utf8'))
Using the BeautifulSoup module, you can parse the tweets and cities into lists of dictionaries with this example:
import requests
from bs4 import BeautifulSoup

url = "http://cs.stir.ac.uk/~soh/BD2spring2022/assignmentdata.php"
params = {"data": "spurpyr"}

soup = BeautifulSoup(requests.get(url, params=params).content, "xml")

tweets = []
for t in soup.select("tweets > tweet"):
    tweets.append({"id": t["id"], **{x.name: x.text for x in t.find_all()}})

cities = []
for c in soup.select("cities > city"):
    cities.append({"id": c["id"], **{x.name: x.text for x in c.find_all()}})

print(tweets)
print(cities)
Prints:
[
    {
        "id": "16620625 5686",
        "Name": "Kenyon Conley",
        "Phone": "0327 103 9485",
        "Email": "malesuada@lobortisClassaptent.edu",
        "Location": "45.5333, -73.2833",
        "GenderID": "male",
        "Tweet": "#FollowFriday #DanielleMorrill - She's with #Seattle20 and #Twilio. Also fun to talk to. #entrepreneur",
        "City": "Saint-Basile-le-Grand",
        "Country": "Canada",
        "Age": "34",
    },
    {
        "id": "16310427-5502",
        "Name": "Griffin Norton",
        "Phone": "0306 178 7917",
        "Email": "in.dolor.Fusce@necmalesuadaut.ca",
        "Location": "52.0000, 84.9833",
        "GenderID": "male",
        "Tweet": "!!!Veryy Bored!!! ~~Craving Million's Of MilkShakes~~",
        "City": "Belokurikha",
        "Country": "Russia",
        "Age": "33",
    },
    ...
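If you would rather stay with xml.etree.ElementTree from the original code, the same two lists can be built directly from the branches. A sketch, assuming the root's two children are the <tweets> and <cities> elements and that each entry carries its id as an attribute, as the output above suggests:

tweets_el, cities_el = list(root)  # the two branches of the root

tweets = [{"id": t.get("id"), **{e.tag: e.text for e in t}} for t in tweets_el]
cities = [{"id": c.get("id"), **{e.tag: e.text for e in c}} for c in cities_el]

print(tweets[0])
print(cities[0])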
I am trying to web-scrape this webpage, but I always end up getting the "main" page (same URL but without "#face-a-face" at the end). It's the same problem another user encountered, see this forum. He got an answer, but I am not able to generalize and apply it to the website I want to scrape.
import requests
from bs4 import BeautifulSoup
url_main = "https://www.lequipe.fr/Football/match-direct/ligue-1/2020-2021/ol-dijon-live/477168"
url_target = url_main + "#face-a-face"
soup_main = BeautifulSoup(requests.get(url_main, verify=False).content, "html.parser")
soup_target = BeautifulSoup(requests.get(url_target, verify=False).content, "html.parser")
print(soup_main == soup_target)
returns True. I would like to get different contents for the two URLs, which is not the case here.
For example, I would like to extract all the "confrontations depuis 2011" (head-to-head meetings since 2011) from the target webpage. How can I get the final content of this webpage with a GET request (or in another way)? Thanks!
All the data comes from a highly nested JSON file.
You can get that file and extract the information you need.
Here's how:
import json
import requests
endpoint = "https://iphdata.lequipe.fr/iPhoneDatas/EFR/STD/ALL/V2/Football/Prelive/68/477168.json"
team_data = requests.get(endpoint).json()
specifics = team_data["items"][1]["objet"]["matches"][0]["specifics"]
print(json.dumps(specifics, indent=2))
This should get you a dictionary:
{
  "__type": "specifics_sport_collectif",
  "vainqueur": "domicile",
  "score": {
    "__type": "score",
    "exterieur": "1",
    "domicile": "4"
  },
  "exterieur": {
    "__type": "effectif_sport_collectif",
    "equipe": {
      "__type": "equipe",
      "id": "202",
      "url_image": "https://medias.lequipe.fr/logo-football/202/{width}{declinaison}",
      "nom": "Dijon",
      "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub202.html"
    }
  },
  "domicile": {
    "__type": "effectif_sport_collectif",
    "equipe": {
      "__type": "equipe",
      "id": "22",
      "url_image": "https://medias.lequipe.fr/logo-football/22/{width}{declinaison}",
      "nom": "Lyon",
      "url_fiche": "https://www.lequipe.fr/Football/FootballFicheClub22.html"
    }
  },
  "is_final": false,
  "prolongation": false,
  "vainqueur_final": "domicile",
  "is_qualifier": false
}
And if you, for example, just want the score, add this line:
just_the_score = specifics["score"]
print(just_the_score)
To get this:
{'__type': 'score', 'exterieur': '1', 'domicile': '4'}
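The surrounding keys follow the same pattern, so, for example, the team names and a one-line score summary can be pulled from the same dictionary (a sketch based on the structure printed above):

home = specifics["domicile"]["equipe"]["nom"]    # 'Lyon'
away = specifics["exterieur"]["equipe"]["nom"]   # 'Dijon'
score = specifics["score"]
print(f"{home} {score['domicile']} - {score['exterieur']} {away}")  # Lyon 4 - 1 Dijon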
If you do View Page Source on the link below:
https://www.zoopla.co.uk/for-sale/details/53818653?search_identifier=7e57533214fc2402ba53dd6c14b624f8
Line 89 has the tag <script> with information under it up to line 164. I am trying to extract this with Beautiful Soup but am unable to.
I can successfully extract other tags like "h2"/"div" etc. using the below:
From line 1,028 of the page source.
for item_name in soup.findAll('h2', {'class': 'ui-property-summary__address'}):
    ad = item_name.get_text(strip=True)
Can you please advise how I can extract the <script> tag from line 89?
Thanks
This example will locate the <script> tag and parse some data from it:
import re
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.zoopla.co.uk/for-sale/details/53818653?search_identifier=7e57533214fc2402ba53dd6c14b624f8'
# locate the tag
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
script = soup.select_one('script:contains("ZPG.trackData.taxonomy")')
# parse some data from script
data1 = re.findall(r'ZPG\.trackData\.ecommerce = ({.*?});', script.text, flags=re.S)[0]
data1 = json.loads(re.sub(r'([^"\s]+):\s', r'"\1": ', data1))
data2 = re.findall(r'ZPG\.trackData\.taxonomy = ({.*?});', script.text, flags=re.S)[0]
data2 = json.loads(re.sub(r'([^"\s]+):\s', r'"\1": ', data2))
# print the data
print(json.dumps(data1, indent=4))
print(json.dumps(data2, indent=4))
Prints:
{
    "detail": {
        "products": [
            {
                "brand": "Walton and Allen Estate Agents Ltd",
                "category": "for-sale/resi/agent/pre-owned/gb",
                "id": 53818653,
                "name": "FS_Contact",
                "price": 1,
                "quantity": 1,
                "variant": "standard"
            }
        ]
    }
}
{
    "signed_in_status": "signed out",
    "acorn": 44,
    "acorn_type": 44,
    "area_name": "Aspley, Nottingham",
    "beds_max": 3,
    "beds_min": 3,
    "branch_id": "43168",
    "branch_logo_url": "https://st.zoocdn.com/zoopla_static_agent_logo_(586192).png",
    "branch_name": "Walton & Allen Estate Agents",
    "brand_name": "Walton and Allen Estate Agents Ltd",
    "chain_free": false,
    "company_id": "21619",
    "country_code": "gb",
    "county_area_name": "Nottingham",
    "currency_code": "GBP",
    "display_address": "Melbourne Road, Aspley, Nottingham NG8",
    "furnished_state": "",
    "group_id": "",
    "has_epc": false,
    "has_floorplan": true,
    "incode": "5HN",
    "is_retirement_home": false,
    "is_shared_ownership": false,
    "listing_condition": "pre-owned",
    "listing_id": 53818653,
    "listing_status": "for_sale",
    "listings_category": "residential",
    "location": "Aspley",
    "member_type": "agent",
    "num_baths": 1,
    "num_beds": 3,
    "num_images": 15,
    "num_recepts": 1,
    "outcode": "NG8",
    "post_town_name": "Nottingham",
    "postal_area": "NG",
    "price": 150000,
    "price_actual": 150000,
    "price_max": 150000,
    "price_min": 150000,
    "price_qualifier": "guide_price",
    "property_highlight": "",
    "property_type": "semi_detached",
    "region_name": "East Midlands",
    "section": "for-sale",
    "size_sq_feet": "",
    "tenure": "",
    "zindex": "129806"
}
Find all the <script> tags, then search them for the one that contains ZPG.trackData.ecommerce.
ecommerce = None
for item in soup.findAll('script'):
    # item.string is None for tags with a src or multiple children, so guard against it
    if item.string and 'ZPG.trackData.ecommerce' in item.string:
        ecommerce = item.string
        break
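From there you can reuse the regex/json.loads approach from the first snippet to turn that string into a dictionary. A sketch, assuming re and json are imported as above:

if ecommerce:
    raw = re.findall(r'ZPG\.trackData\.ecommerce = ({.*?});', ecommerce, flags=re.S)[0]
    parsed = json.loads(re.sub(r'([^"\s]+):\s', r'"\1": ', raw))
    print(parsed['detail']['products'][0]['brand'])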
While I was practicing some web scraping on a webpage (param cookies required), I found myself having problems scraping out the JSON data embedded in the HTML. The following is what I did:
import requests
from bs4 import BeautifulSoup as soup
import json

my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
cookies = {
    "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
    "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
    "x5sec": "7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d",
}

ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text)  # True

page_soup = soup(ret.text, 'html.parser')
data = page_soup.findAll('script', {'type': 'application/ld+json'})
The output is as follows:
[
<script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "BreadcrumbList",
    "itemListElement": [
        {
            "item": {
                "name": "Home",
                "@id": "https://www.lazada.sg/"
            },
            "@type": "ListItem",
            "position": "1"
        }
    ]
}</script>,
<script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "ItemList",
    "itemListElement": [
        {
            "offers": {
                "priceCurrency": "SGD",
                "@type": "Offer",
                "price": "71.00",
                "availability": "https://schema.org/InStock"
            },
            "image": "https://sg-test-11.slatic.net/p/670a73a9613c36b2bb01555ab4092ba2.jpg",
            "@type": "Product",
            "name": "Switch: Super Mario Party [Available in Stock! Immediate Shipping]",
            "url": "https://www.lazada.sg/products/switch-super-mario-party-available-in-stock-immediate-shipping-i278269540-s429667097.html?search=1"
        },
...
I tried to follow an existing thread, Extract json from html in python beautifulsoup, but found myself stuck, probably due to the different JSON formatting in the HTML soup. The part I scraped out contains all the different products on that page; is there a way to further scrape out each product's details (e.g. title, price, rating, etc.) and count the number of products present? Thanks!
You can loop over the parsed JSON after loading it with json.loads. All the product info for those containers is listed in one script tag, so you can just grab that.
import requests
from bs4 import BeautifulSoup as soup
import json
import pandas as pd

my_url = 'https://www.lazada.sg/catalog/?spm=a2o42.home.search.1.488d46b5mJGzEu&q=switch%20games&_keyori=ss&from=search_history&sugg=switch%20games_0_1'
cookies = {
    "Hm_lpvt_7cd4710f721b473263eed1f0840391b4": "1548175412",
    "Hm_lvt_7cd4710f721b473263eed1f0840391b4": "1548140525",
    "x5sec": "7b22617365727665722d6c617a6164613b32223a223832333339343739626466613939303562613535386138333266383365326132434c4b516e65494645495474764a322b706f6d6f6941453d227d",
}

ret = requests.get(my_url, cookies=cookies)
print("New Super Mario Bros" in ret.text)  # True

page_soup = soup(ret.text, 'lxml')
data = page_soup.select("[type='application/ld+json']")[1]
oJson = json.loads(data.text)["itemListElement"]
numProducts = len(oJson)

results = []
for product in oJson:
    results.append([
        product['name'],
        product['offers']['price'],
        product['offers']['availability'].replace('https://schema.org/', '')
    ])  # etc......

df = pd.DataFrame(results)
print(df)
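If you want named column headers and the product count in the output, here is a small extension of the snippet above (the column labels are my own, not anything taken from the page):

df = pd.DataFrame(results, columns=['name', 'price', 'availability'])
print(f'{numProducts} products found')
print(df.head())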