I am trying to find a list inside of JSON data with RegEx. Here is my code:
import requests
from bs4 import BeautifulSoup
import re
import json
# Download the hotel page and parse the HTML.
source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text
soup = BeautifulSoup(source, 'lxml')
# Capture the object literal assigned to pageManifest inside a <script> tag.
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
dictData = pattern.search(script.text).group(1)
# From here on jsonData is a Python object (dict), no longer JSON text.
jsonData = json.loads(dictData)
pattern2 = re.compile(r'^\"[0-9]*\":{\"data\":{\"locations\":(.*)},')
# NOTE(review): str(jsonData) is the dict's Python repr: it starts with "{",
# normally quotes strings with single quotes and puts a space after ":".
# pattern2 is anchored with ^ and expects compact double-quoted JSON, so
# search() returns None and .group(1) raises AttributeError.
data_list = pattern2.search(str(jsonData)).group(1)
print(data_list)
With this regular expression pattern2 = re.compile(r'^\"[0-9]*\":{\"data\":{\"locations\":(.*)},') I want to find the value (list) of locations, but I get an error AttributeError: 'NoneType' object has no attribute 'group'.
The part of JSON data that I want to find looks like this:
"3960485871": {
"data": {
"locations": [
{
"detail": {
"hotel": {
"aliases": [
{
"id": 1099146,
"locale": "de",
"score": 390000,
"text": "hotel coronado"
},
{
"id": 1261196,
"locale": "es",
"score": 260000,
"text": "hotel coronado"
},
{
"id": 261321,
"locale": null,
"score": 112500,
"text": "coronado hotel z\u00fcrich"
}
],
"details": {
"numRooms": 40
}
},
"priceRange": {
"maximum": 212,
"minimum": 133
}
},
"formerName": null,
"locationId": 228146,
"neighborhoods": [],
"parents": [
{
"locationId": 188113,
"name": "Z\u00fcrich",
"placeType": "MUNICIPALITY"
},
{
"locationId": 188111,
"name": "Kanton Z\u00fcrich",
"placeType": "CANTON"
},
{
"locationId": 188045,
"name": "Schweiz",
"placeType": "COUNTRY"
},
{
"locationId": 4,
"name": "Europa",
"placeType": "CONTINENT"
},
{
"locationId": 1,
"name": "Welt",
"placeType": null
}
]
}
]
}
},
Try this (input data reduced as it was too big)
jsonData = {
"3960485871": {
"data": {
"locations": [
{
"detail": {},
"formerName": None,
"locationId": 228146,
"neighborhoods": [],
"parents": []
}
]
}
},
}
def find_recursive(data, type_):
    """Return the first object of type ``type_`` found in nested containers.

    The search is depth-first: ``data`` itself is checked first, then the
    items of lists/tuples and the values of dicts, in iteration order.
    Dict keys are not inspected.

    Args:
        data: Object to inspect; lists, tuples and dicts are descended into.
        type_: Type, or tuple of types, as accepted by ``isinstance``.

    Returns:
        The first matching object, or ``None`` if nothing matches.  ``None``
        doubles as the "not found" marker, so a match that is itself
        ``None`` cannot be told apart from a miss.
    """
    # If we found what we are looking for, return it.
    if isinstance(data, type_):
        return data

    # Otherwise pick the children to look through recursively.
    if isinstance(data, (list, tuple)):
        children = data
    elif isinstance(data, dict):
        children = data.values()
    else:
        # Add other containers to search recursively (e.g. set) above.
        return None

    for child in children:
        found = find_recursive(child, type_)
        if found is not None:
            return found

    # Nothing matched anywhere below ``data``.
    return None
find_recursive(jsonData, list)
Usage: find_recursive(DATA, TYPE) where DATA is the nested containers and TYPE the python type you want to find. It does accept lists and dicts as nested containers but it could be extended to others (such as set) just by iterating over their items and returning them if they are not None. You can use a single if for multiple types like I did with list and tuple in case they behave the same.
Related
I have had trouble appending id's to a separate list as I parse through the JSON I receive from Spotify's "Users Saved Tracks" endpoint.
The JSON received looks like this:
{
"href": "https://api.spotify.com/v1/me/tracks?offset=0&limit=20",
"items": [
{
"added_at": "2021-11-16T13:56:51Z",
"track": {
"album": {
"album_type": "single",
"artists": [
{
"external_urls": {
"spotify": "https://open.spotify.com/artist/3iKDeO8yaOiWz7vkeljunk"
},
"href": "https://api.spotify.com/v1/artists/3iKDeO8yaOiWz7vkeljunk",
"id": "3iKDeO8yaOiWz7vkeljunk",
"name": "Heavenward",
"type": "artist",
"uri": "spotify:artist:3iKDeO8yaOiWz7vkeljunk"
}
],
"available_markets": [
],
"disc_number": 1,
"duration_ms": 224838,
"explicit": false,
"external_ids": {
"isrc": "QZK6P2040977"
},
"external_urls": {
"spotify": "https://open.spotify.com/track/6mJ1nbmQOm6iNClo71K5O6"
},
"href": "https://api.spotify.com/v1/tracks/6mJ1nbmQOm6iNClo71K5O6",
"id": "6mJ1nbmQOm6iNClo71K5O6",
"is_local": false,
"name": "Hole",
"popularity": 33,
"preview_url": "https://p.scdn.co/mp3-preview/c425dc91bdb19f1cddf2b35df08e30a03290c3c0?cid=8c9ee97b95854163a250399fda32d350",
"track_number": 1,
"type": "track",
"uri": "spotify:track:6mJ1nbmQOm6iNClo71K5O6"
}
}
Right now my code that I am using to parse looks like this:
def getLikedTrackIds(session):
# Question code: meant to collect the id of every saved track returned by
# the endpoint; returns None when makeGetRequest returns None.
url = 'https://api.spotify.com/v1/me/tracks'
payload = makeGetRequest(session, url)
if payload == None:
return None
liked_tracks_ids = []
for track in payload['items']:
# NOTE: in the sample response track['track'] is an object (dict), and
# iterating a dict yields its keys, so `attribute` is a key name such as
# 'id', not the value stored under that key.
for attribute in track['track']:
if (attribute == 'id'):
app.logger.info(f"\n\nTrack ID: {attribute}")
# This appends the literal key 'id'; the value is track['track']['id'].
liked_tracks_ids.append(attribute)
return liked_tracks_ids
My liked_tracks_ids list ends up filled with the string "id", once for each song:
[ "id", "id", "id", "id"....]
Can anyone provide insight as to what I am doing wrong?
Already commented under the question but your code can be simplified by getting rid of the loop:
def getLikedTrackIds(session):
    """Collect the ids of the tracks returned by the saved-tracks endpoint.

    Args:
        session: Passed through unchanged to ``makeGetRequest``.

    Returns:
        A list with the ``id`` of every item's ``track`` object that has a
        truthy id, or ``None`` when ``makeGetRequest`` returns ``None``.
    """
    url = 'https://api.spotify.com/v1/me/tracks'
    payload = makeGetRequest(session, url)
    if payload is None:
        return None

    liked_tracks_ids = []
    for track in payload['items']:
        # Look the value up by key instead of iterating over the dict's keys.
        liked_id = track['track'].get('id')
        if liked_id:
            app.logger.info(f"\n\nTrack ID: {liked_id}")
            liked_tracks_ids.append(liked_id)
    return liked_tracks_ids
This is the first time I'm working with JSON, and I'm trying to pull url out of the JSON below.
{
"name": "The_New11d112a_Company_Name",
"sections": [
{
"name": "Products",
"payload": [
{
"id": 1,
"name": "TERi Geriatric Patient Skills Trainer,
"type": "string"
}
]
},
{
"name": "Contact Info",
"payload": [
{
"id": 1,
"name": "contacts",
"url": "https://www.a3bs.com/catheterization-kits-8000892-3011958-3b-scientific,p_1057_31043.html",
"contacts": [
{
"name": "User",
"email": "Company Email",
"phone": "Company PhoneNumber"
}
],
"type": "contact"
}
]
}
],
"tags": [
"Male",
"Airway"
],
"_id": "0e4cd5c6-4d2f-48b9-acf2-5aa75ade36e1"
}
I have been able to access description and _id via
data = json.loads(line)
if 'xpath' in data:
xpath = data["_id"]
description = data["sections"][0]["payload"][0]["description"]
However, I can't seem to figure out a way to access url. One other issue I have is there could be other items in sections, which makes indexing into Contact Info a non starter.
Hope this helps:
import json
with open("test.json", "r") as f:
    json_out = json.load(f)

# Walk every payload entry of every section and print each key whose name
# contains "url", together with its value.
for section in json_out["sections"]:
    for entry in section["payload"]:
        for key in entry:
            if "url" in key:
                print(key, '->', entry[key])
I think your JSON is malformed (for example, the first "name" value is missing its closing quote); it should look like this:
{
"name": "The_New11d112a_Company_Name",
"sections": [
{
"name": "Products",
"payload": [
{
"id": 1,
"name": "TERi Geriatric Patient Skills Trainer",
"type": "string"
}
]
},
{
"name": "Contact Info",
"payload": [
{
"id": 1,
"name": "contacts",
"url": "https://www.a3bs.com/catheterization-kits-8000892-3011958-3b-scientific,p_1057_31043.html",
"contacts": [
{
"name": "User",
"email": "Company Email",
"phone": "Company PhoneNumber"
}
],
"type": "contact"
}
]
}
],
"tags": [
"Male",
"Airway"
],
"_id": "0e4cd5c6-4d2f-48b9-acf2-5aa75ade36e1"
}
You can check it on http://json.parser.online.fr/.
And if you want to get the value of the url.
import json
# Use a context manager so the file handle is closed once parsing is done.
with open('yourJSONfile.json') as f:
    j = json.load(f)
# In the sample document, sections[1] is "Contact Info" and its first payload
# entry carries the url.
print(j['sections'][1]['payload'][0]['url'])
I think it's worth writing a short function that collects the url(s); the caller can then decide whether to use the first url in the returned list, or to skip processing if no url is available in the data.
The function should look like this:
def extract_urls(data):
    """Return every ``url`` found in the payload entries of ``data``.

    Args:
        data: Parsed JSON document (a dict) whose optional ``sections`` list
            holds dicts with an optional ``payload`` list of dicts.

    Returns:
        A list of the ``url`` values in document order; empty when there are
        no sections, no payloads, or no payload entry with a ``url`` key.
    """
    urls = []
    # ``or []`` also covers keys that are present but set to None.
    for section in data.get('sections') or []:
        for entry in section.get('payload') or []:
            if 'url' in entry:
                urls.append(entry['url'])
    return urls
This should print out the URL
import json
# open json file to read
with open('test.json', 'r') as f:
    # json.load parses directly from the open file object
    data = json.load(f)

# after observing format of JSON data, the location of the URL key
# is determined and the data variable is indexed to extract the value
print(data['sections'][1]['payload'][0]['url'])
The exact location of the 'url' key:
1st (position) of the array which is the value of the key 'sections'
Inside the array value, there is a dict, and the key 'payload' contains an array
In the 0th (position) of the array is a dict with a key 'url'
While testing my solution, I noticed that the json provided is flawed, after fixing the json flaws(3), I ended up with this.
{
"name": "The_New11d112a_Company_Name",
"sections": [
{
"name": "Products",
"payload": [
{
"id": 1,
"name": "TERi Geriatric Patient Skills Trainer",
"type": "string"
}
]
},
{
"name": "Contact Info",
"payload": [
{
"id": 1,
"name": "contacts",
"url": "https://www.a3bs.com/catheterization-kits-8000892-3011958-3b-scientific,p_1057_31043.html",
"contacts": [
{
"name": "User",
"email": "Company Email",
"phone": "Company PhoneNumber"
}
],
"type": "contact"
}
]
}
],
"tags": [
"Male",
"Airway"
],
"_id": "0e4cd5c6-4d2f-48b9-acf2-5aa75ade36e1"}
After switching to the corrected JSON provided by Vincent55,
I wrote working code with exception handling, under certain assumptions.
Working Code:
## Assuming that the target data is always under sections[i].payload
from json import loads

# Close the file as soon as its contents have been read.
with open("data.json") as f:
    line = f.read()
data = loads(line)["sections"]

for x in data:
    try:
        # With assumption that there is only one payload
        url = x["payload"][0]["url"]
    except (KeyError, IndexError):
        # No payload key, an empty payload list, or no url: nothing to print.
        continue
    if url:
        print(url)
I am new to Python and I am having trouble collecting data from this JSON file. I tried a list comprehension, but it is not working so far. How should I structure this list? I need to collect the tagName element of each skill.
This is what I tried:
def getUserSkills(handleList): #List of Strings
# Question code: requests the skills document for each handle and prints the
# list built from it.
for handles in handleList:
response1 = requests.get("http://api.topcoder.com/v3/members/" + handles + "/skills")
data = response1.json()
# NOTE: in the sample response "skills" is an object keyed by skill id, so
# this comprehension iterates over the key strings and skill['tagName']
# ends up indexing a str rather than a skill object.
skillList = [skill['tagName'] for skill in data['result']['content']['skills']]
print(skillList)
Json File:
"id":"-462bfb3:16a2448d765:4ed3",
"result":{
"success":true,
"status":200,
"metadata":null,
"content":{
"userId":21932422,
"userHandle":"saarixx",
"handleLower":"saarixx",
"skills":{
"130":{
"tagName":"Brute Force",
"hidden":false,
"score":88.0,
"sources":[
"CHALLENGE"
]
},
"259":{
"tagName":"JSON",
"hidden":false,
"score":5.0,
"sources":[
"CHALLENGE"
]
},
Iterate through the values of the dictionary given by dct['result']['content']['skills'] and capture value['tagName'] from each one.
dct = {
"id": "-462bfb3:16a2448d765:4ed3",
"result": {
"success": True,
"status": 200,
"metadata": None,
"content": {
"userId": 21932422,
"userHandle": "saarixx",
"handleLower": "saarixx",
"skills": {
"130": {
"tagName": "Brute Force",
"hidden": False,
"score": 88.0,
"sources": [
"CHALLENGE"
]
},
"259": {
"tagName": "JSON",
"hidden": False,
"score": 5.0,
"sources": [
"CHALLENGE"
]
}
}
}
}
}
# Only the skill objects are needed, so iterate .values() instead of
# unpacking a key that is never used.
skillList = [skill['tagName'] for skill in dct['result']['content']['skills'].values()]
print(skillList)
#['Brute Force', 'JSON']
import requests
def getUserSkills(handleList):  # List of Strings
    """Print and return the skill tag names of the given member handles.

    Args:
        handleList: Iterable of member handle strings.

    Returns:
        A flat list with the ``tagName`` of every skill of every handle, in
        request order.  The same list is printed before returning.
    """
    skillList = []
    for handle in handleList:
        response = requests.get("http://api.topcoder.com/v3/members/" + handle + "/skills")
        # "skills" maps skill ids to skill objects, so read the values;
        # iterating the mapping itself would only yield the id strings.
        skills = response.json()['result']['content']['skills']
        skillList.extend(skill['tagName'] for skill in skills.values())
    print(skillList)
    return skillList
I am trying to extract/parse the values from specifics in a JSON file that I did a post request.
Here is the JSON file. I am trying to get the values of the key "AN". I want to extract values such as "shannoncampbell_znyq1", "katiekapprelmac", etc., but only for entries where the value
in the second row is not zero. For example, since the second-row value (the key for this row is T7) of katiekapprelmac is not zero, my code should print it (katiekapprelmac should be the output). However, it does not.
JSON File:
{
"id": "jsonrpc",
"jsonrpc": "2.0",
"result": {
"result": [
{
"AccountId": 697429,
"Flags": [
"AutoDeployed"
],
"PartnerId": 287562,
"Settings": [
{
"AN": "shannoncampbell_znyq1"
},
{
"T7": "0"
}
]
},
{
"AccountId": 725177,
"Flags": null,
"PartnerId": 287562,
"Settings": [
{
"AN": "katiekapprelmac"
},
{
"T7": "5"
}
]
},
{
"AccountId": 689130,
"Flags": [
"AutoDeployed"
],
"PartnerId": 287562,
"Settings": [
{
"AN": "sara-pc_wpv7h"
},
{
"T7": "0"
}
]
},
{
"AccountId": 697531,
"Flags": null,
"PartnerId": 287562,
"Settings": [
{
"AN": "kaelaweeksmac"
},
{
"T7": "0"
}
]
},
{
"AccountId": 615877,
"Flags": null,
"PartnerId": 249098,
"Settings": [
{
"AN": "elenimacbookpro"
},
{
"T7": "0"
}
]
},
{
"AccountId": 700661,
"Flags": null,
"PartnerId": 287562,
"Settings": [
{
"AN": "sethnickersonmac"
},
{
"T7": "0"
}
]
},
Here is my python code:
response2 = requests.request("POST", url, data=payload2, headers=headers)
j = json.loads(response2.text)
def find_all(item, level):
# Question code: recursively prints every non-dict value reachable through
# nested dicts. `level` is incremented on each recursive call but never read.
if isinstance(item, dict):
for k in item:
(find_all(item[k], level+1))
else:
print(item)
def find_only(item, level):
# Question code: recurses through nested dicts but has no return statement
# on any path, so every call evaluates to None.
if isinstance(item, dict):
for k in item:
(find_only(item[k], level+1))
for each in j['result']['result']:
if (find_only(each['Settings'][1], 0)) != json.loads("0"):
find_all(each['Settings'][0], 0)
Instead, I get all the keys in the output. I get the following:
shannoncampbell_znyq1
katiekapprelmac
sara-pc_wpv7h
kaelaweeksmac
elenimacbookpro
sethnickersonmac
Rather than just katiekapprelmac
Please help. Thanks
In the code:
for each in j['result']['result']:
if (find_only(each['Settings'][1], 0)) != json.loads("0"):
find_all(each['Settings'][0], 0)
Your condition is always True, because find_only() never returns anything (so it evaluates to None, which is never equal to 0).
I don't see why you need level and so many recursive functions; the result is easy to extract from the data you posted. Please see the code below.
response2 = requests.request("POST", url, data=payload2, headers=headers)
j = json.loads(response2.text)
for each in j['result']['result']:
    # Skip entries whose T7 is zero, whether it is encoded as "0" or as 0.
    if each['Settings'][1]['T7'] not in ("0", 0):
        print(each['Settings'][0]['AN'])
If your response data is little complex then please post for exact solution.
If you have multiple key name then please look at below code:
response2 = requests.request("POST", url, data=payload2, headers=headers)
j = json.loads(response2.text)
def find_all(item):
    """Return the value stored under the first key of ``item``.

    Args:
        item: Object to inspect, e.g. a single-key dict such as
            ``{"AN": "name"}``.

    Returns:
        The value of the first key in iteration order, or ``None`` when
        ``item`` is not a dict or is an empty dict.
    """
    if isinstance(item, dict):
        # Only the first entry is wanted; the default covers an empty dict.
        return next(iter(item.values()), None)
    # To pass non-dict items through as well, return ``item`` here instead.
    return None
def find_only(item):
    """Return the value stored under the first key of ``item``.

    Args:
        item: Object to inspect, e.g. a single-key dict such as
            ``{"T7": "0"}``.

    Returns:
        The value of the first key in iteration order, or ``None`` when
        ``item`` is not a dict or is an empty dict.
    """
    if isinstance(item, dict):
        # Only the first entry is wanted; the default covers an empty dict.
        return next(iter(item.values()), None)
    return None
for each in j['result']['result']:
if (find_only(each['Settings'][1])) != str(json.loads("0")):
print(find_all(each['Settings'][0]))
jsonpath-ng can help you with this.
from jsonpath_ng.ext import parse
# "$..Settings" selects every "Settings" value at any depth of `data`.
found = parse("$..Settings").find(data)
for match in found:
    # In the sample data each Settings list is [{"AN": name}, {"T7": count}].
    name_entry, count_entry = match.value[0], match.value[1]
    if ''.join(count_entry.values()) != '0':
        print(name_entry['AN'])
I need to get the id, name and faction id of each deputy in this JSON:
{
"id": "75785",
"title": "(за основу)",
"asozdUrl": null,
"datetime": "2011-12-21T12:20:26+0400",
"votes": [
{
"deputy": {
"id": "99111772",
"name": "Абалаков Александр Николаевич",
"faction": {
"id": "72100004",
"title": "КПРФ"
}
},
"result": "accept"
},
{
"deputy": {
"id": "99100491",
"name": "Абдулатипов Рамазан Гаджимурадович",
"faction": {
"id": "72100024",
"title": "ЕР"
}
},
"result": "none"
}
.......,` etc
My code is looks like that:
urlData = "https://raw.githubusercontent.com/data-dumaGovRu/vote/master/poll/2011-12-21/75785.json"
response = urllib.request.urlopen(urlData)
content = response.read()
data = json.loads(content.decode("utf8"))
for i in data:
#print(data["name"])
`
I don't know what to do with that #print line. How should I write it?
You can access the list containing the deputies with data['votes']. Iterating through the list, you can access the keys you're interested in as you would with dict key lookups. Nested dicts imply you have to walk through the keys starting from the root to your point of interest:
for d in data['votes']:
    # Each vote holds a nested "deputy" object; its "faction" is nested again.
    deputy = d['deputy']
    print(deputy['id'], deputy['name'], deputy['faction']['id'])