I'm trying to get some items out of JSON content. However, the structure of that JSON content is foreign to me, so I can't fetch the value of the property key out of it.
I've tried so far with:
import json
import requests
from bs4 import BeautifulSoup

link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'

def fetch_content(link):
    content = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(content.text, "lxml")
    item = soup.select_one("script#hdpApolloPreloadedData").text
    print(json.loads(item)['apiCache'])

if __name__ == '__main__':
    fetch_content(link)
The result I get running the above script is:
{"VariantQuery{\"zpid\":43835884}":{"property":{"zpid":43835884,"streetAddress":"5958 SW 4th St",
I can't process this further because of that weird key in front.
Expected output:
{"zpid":43835884,"streetAddress":"5958 SW 4th St", ----
How can I get the value of that property?
You can get the zpid and address out of that mangled JSON with (note that item already holds the script text, so it goes straight into json.loads):
json.loads(json.loads(item)['apiCache'])['VariantQuery{"zpid":43835884}']['property']['zpid']
Out[1889]: 43835884
json.loads(json.loads(item)['apiCache'])['VariantQuery{"zpid":43835884}']['property']['streetAddress']
Out[1890]: '5958 SW 4th St'
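If you don't want to hardcode the mangled key, here is a small sketch (assuming there is exactly one such key per page) that discovers it by prefix instead:

cache = json.loads(json.loads(item)['apiCache'])
# find the cache key for the property query without hardcoding the zpid
variant_key = next(k for k in cache if k.startswith('VariantQuery'))
print(cache[variant_key]['property']['streetAddress'])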
I noticed you can always get the zpid like this:
link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'
content = requests.get(link,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(content.text,"lxml")
item = soup.select_one("script#hdpApolloPreloadedData").text
print(json.loads(item)['zpid'])
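Since that top-level zpid is always there, you could also build the mangled key from it instead of scanning for it. A minimal sketch, assuming the key format stays 'VariantQuery{"zpid":<id>}':

blob = json.loads(item)
cache = json.loads(blob['apiCache'])
key = 'VariantQuery{{"zpid":{}}}'.format(blob['zpid'])  # reconstruct the mangled key
print(cache[key]['property']['streetAddress'])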
Just modify your function as shown below. I also added another function, process_fetched_content(), to give you some more freedom. You can simply run it, and it handles situations where there are multiple keys that start with 'VariantQuery{"zpid":'. The final output is a dict whose keys are the zpids and whose values are what you are looking for.
If you have a lot of zpid values, this lets you accumulate them all together and then process them. The benefit is that the list of keys is then the list of zpids you have.
Here's how you could use this code.
results = process_fetched_content(raw_dictionary = fetch_content(link, verbose=False))
print(results)
output:
{'43835884': {'zpid': 43835884, 'streetAddress': '5958 SW 4th St', 'zipcode': '33144', 'city': 'Miami', 'state': 'FL', 'latitude': 25.76661, 'longitude': -80.292801, 'price': 340000, 'dateSold': 1576875600000, 'bathrooms': 2, 'bedrooms': 3, 'livingArea': 1757, 'yearBuilt': 1973, 'lotSize': 4331, 'homeType': 'SINGLE_FAMILY', 'homeStatus': 'RECENTLY_SOLD', 'photoCount': 19, 'imageLink': 'https://photos.zillowstatic.com/p_g/IS7yxihwtuqmlq1000000000.jpg', 'daysOnZillow': 0, 'isFeatured': False, 'shouldHighlight': False, 'brokerId': 0, 'zestimate': 341336, 'rentZestimate': 2200, 'listing_sub_type': {}, 'priceReduction': '', 'isUnmappable': False, 'rentalPetsFlags': 128, 'mediumImageLink': 'https://photos.zillowstatic.com/p_c/IS7yxihwtuqmlq1000000000.jpg', 'isPreforeclosureAuction': False, 'homeStatusForHDP': 'RECENTLY_SOLD', 'priceForHDP': 340000, 'festimate': 341336, 'isListingOwnedByCurrentSignedInAgent': False, 'isListingClaimedByCurrentSignedInUser': False, 'hiResImageLink': 'https://photos.zillowstatic.com/p_f/IS7yxihwtuqmlq1000000000.jpg', 'watchImageLink': 'https://photos.zillowstatic.com/p_j/IS7yxihwtuqmlq1000000000.jpg', 'tvImageLink': 'https://photos.zillowstatic.com/p_m/IS7yxihwtuqmlq1000000000.jpg', 'tvCollectionImageLink': 'https://photos.zillowstatic.com/p_l/IS7yxihwtuqmlq1000000000.jpg', 'tvHighResImageLink': 'https://photos.zillowstatic.com/p_n/IS7yxihwtuqmlq1000000000.jpg', 'zillowHasRightsToImages': True, 'desktopWebHdpImageLink': 'https://photos.zillowstatic.com/p_h/IS7yxihwtuqmlq1000000000.jpg', 'isNonOwnerOccupied': False, 'hideZestimate': False, 'isPremierBuilder': False, 'isZillowOwned': False, 'currency': 'USD', 'country': 'USA', 'taxAssessedValue': 224131, 'streetAddressOnly': '5958 SW 4th St', 'unit': ' '}}
Code
import json
import requests
from bs4 import BeautifulSoup

link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'

def fetch_content(link, verbose=False):
    content = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(content.text, "lxml")
    item = soup.select_one("script#hdpApolloPreloadedData").text
    d = json.loads(item)['apiCache']
    d = json.loads(d)  # apiCache is itself a JSON string, so it is decoded twice
    if verbose:
        print(d)
    return d

def process_fetched_content(raw_dictionary=None):
    if raw_dictionary is not None:
        keys = [k for k in raw_dictionary.keys() if k.startswith('VariantQuery{"zpid":')]
        results = dict((k.split(':')[-1].replace('}', ''), raw_dictionary.get(k).get('property', None)) for k in keys)
        return results
    else:
        return None
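If you're scraping several listings, here is a minimal usage sketch (the extra URLs are placeholders; it assumes every page exposes the same hdpApolloPreloadedData script tag) that merges the per-link results into one dict keyed by zpid:

links = [
    'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/',
    # add more listing URLs here
]
all_results = {}
for single_link in links:
    all_results.update(process_fetched_content(raw_dictionary=fetch_content(single_link)))
print(list(all_results))  # the list of zpids you have collected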
I need help extracting information from a nested dictionary inside a list. Here's the code to get the data:
import requests
import json
import time

all_urls = []
for x in range(5000, 5010):
    url = f'https://api.jikan.moe/v4/anime/{x}/full'
    all_urls.append(url)

all_responses = []
for page_url in all_urls:
    response = requests.get(page_url)
    all_responses.append(response)
    time.sleep(1)
print(all_responses)

data = []
for i in all_responses:
    json_data = json.loads(i.text)
    data.append(json_data)
print(data)
The sample of the extracted data is as follows:
[{'status': 404,
'type': 'BadResponseException',
'message': 'Resource does not exist',
'error': '404 on https://myanimelist.net/anime/5000/'},
{'status': 404,
'type': 'BadResponseException',
'message': 'Resource does not exist',
'error': '404 on https://myanimelist.net/anime/5001/'},
{'data': {'mal_id': 5002,
'url': 'https://myanimelist.net/anime/5002/Bari_Bari_Densetsu',
'images': {'jpg': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.jpg',
'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.jpg',
'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.jpg'},
'webp': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.webp',
'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.webp',
'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.webp'}},
'trailer': {'youtube_id': None,
'url': None,
'embed_url': None,
'images': {'image_url': None,
'small_image_url': None,
'medium_image_url': None,
'large_image_url': None,
'maximum_image_url': None}},
'title': 'Bari Bari Densetsu',
'title_english': None,
'title_japanese': 'バリバリ伝説',
'title_synonyms': ['Baribari Densetsu',
......
I need to extract the title from the list of data. Any help is appreciated! Also, any recommendation on better/simpler/cleaner code for extracting the JSON data from an API is greatly appreciated!
Firstly, no need to create multiple lists. You can do everything in one loop:
import requests
import json
import time

data = []
for x in range(5000, 5010):
    page_url = f'https://api.jikan.moe/v4/anime/{x}/full'
    response = requests.get(page_url)
    json_data = json.loads(response.text)
    data.append(json_data)
    time.sleep(1)  # keep the delay from your original code; the API rate-limits requests
print(data)
Second, to address your problem: as your sample output shows, successful responses nest everything under a 'data' key, while the 404 responses don't have one. You have two options. You can use dict.get:
for dic in data:
    title = dic.get('data', {}).get('title', 'no title')
Or use the try/except pattern:
for dic in data:
    try:
        title = dic['data']['title']
    except KeyError:
        # deal with the case where the dict has no title (e.g. a 404 response)
        pass
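Putting the two together: since the 404 responses have no 'data' key at all, a short sketch that keeps only the titles that actually exist:

# skip the 404 error dicts and pull the title out of each successful response
titles = [d['data']['title'] for d in data if 'data' in d]
print(titles)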
I'm trying to make a test project that scrapes info from a specific site, but with no success.
I followed some tutorials I found, and even a post on Stack Overflow. After all this I'm stuck!
Help me out: I'm a new Python programmer and I can't get my project moving.
More info: this is a lottery website that I was trying to scrape so I could do some analysis to pick a lucky number.
I have followed these tutorials:
https://towardsdatascience.com/how-to-collect-data-from-any-website-cb8fad9e9ec5
https://beautiful-soup-4.readthedocs.io/en/latest/
Using BeautifulSoup in order to find all "ul" and "li" elements
All of you have my gratitude!
from bs4 import BeautifulSoup as bs
import requests
import html5lib
#import urllib3 # another attempt to make the request to the url ------failed

url = 'https://loterias.caixa.gov.br/Paginas/Mega-Sena.aspx'

# another try to take the results from the <ul>, but I get no qualified results == None
def parse_ul(elem):  # https://stackoverflow.com/questions/50338108/using-beautifulsoup-in-order-to-find-all-ul-and-li-elements
    result = {}
    for sub in elem.find_all('li', recursive=False):
        if sub.li is None:
            continue
        data = {k: v for k, v in sub.attrs.items()}
        if sub.ul is not None:
            # recurse down
            data['children'] = parse_ul(sub.ul)
        result[sub.li.get_text(strip=True)] = data
    return result

page = requests.get(url)  # taking info from the website
print(page.encoding)  # == UTF-8
soup = bs(page.content, features="lxml")  # takes all info from the url and organizes it == BeautifulSoup
numbers = soup.find(id='ulDezenas')  # search the content for this specific id// another try: soup.find('ul', {'class': ''})
result = parse_ul(soup)  # try to parse the info, but none is found EVEN WITH THE ORIGINAL ONE
print(numbers)  # the result is below:
'''<ul class="numbers megasena" id="ulDezenas">
<li ng-repeat="dezena in resultado.listaDezenas ">{{dezena.length > 2 ? dezena.slice(1) : dezena}}</li>
</ul>'''
print(result)  # == "{}" nothing found
#with open('''D:\Documents\python\_abretesesame.txt''', 'wb') as fd:
#    for chunk in page.iter_content(chunk_size=128):
#        fd.write(chunk)
# =======printing the document (HTML) to a file; still no success in getting the numbers
The main issue is that the content is rendered dynamically by JavaScript, but you can get the information via another URL:
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
This will give you the following JSON:
{'tipoJogo': 'MEGA_SENA', 'numero': 2468, 'nomeMunicipioUFSorteio': 'SÃO PAULO, SP', 'dataApuracao': '02/04/2022', 'valorArrecadado': 158184963.0, 'valorEstimadoProximoConcurso': 3000000.0, 'valorAcumuladoProximoConcurso': 0.0, 'valorAcumuladoConcursoEspecial': 36771176.89, 'valorAcumuladoConcurso_0_5': 33463457.98, 'acumulado': False, 'indicadorConcursoEspecial': 1, 'dezenasSorteadasOrdemSorteio': ['022', '041', '053', '042', '035', '057'], 'listaResultadoEquipeEsportiva': None, 'numeroJogo': 2, 'nomeTimeCoracaoMesSorte': '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', 'tipoPublicacao': 3, 'observacao': '', 'localSorteio': 'ESPAÇO DA SORTE', 'dataProximoConcurso': '06/04/2022', 'numeroConcursoAnterior': 2467, 'numeroConcursoProximo': 2469, 'valorTotalPremioFaixaUm': 0.0, 'numeroConcursoFinal_0_5': 2470, 'listaDezenas': ['022', '035', '041', '042', '053', '057'], 'listaDezenasSegundoSorteio': None, 'listaMunicipioUFGanhadores': [{'posicao': 1, 'ganhadores': 1, 'municipio': 'SANTOS', 'uf': 'SP', 'nomeFatansiaUL': '', 'serie': ''}], 'listaRateioPremio': [{'faixa': 1, 'numeroDeGanhadores': 1, 'valorPremio': 122627171.8, 'descricaoFaixa': '6 acertos'}, {'faixa': 2, 'numeroDeGanhadores': 267, 'valorPremio': 34158.18, 'descricaoFaixa': '5 acertos'}, {'faixa': 3, 'numeroDeGanhadores': 20734, 'valorPremio': 628.38, 'descricaoFaixa': '4 acertos'}], 'id': None, 'ultimoConcurso': True, 'exibirDetalhamentoPorCidade': True, 'premiacaoContingencia': None}
Simply extract listaDezenas (or dezenasSorteadasOrdemSorteio if you want the numbers in draw order) and process it with a list comprehension that strips the leading zero:
[n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']]
Result will be:
['22', '35', '41', '42', '53', '57']
Example
import requests
jsonData = requests.get('https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/').json()
print([n if len(n) < 2 else n[1:] for n in jsonData['listaDezenas']])
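For the "analysis to get a lucky number" part, here is a sketch that tallies how often each number was drawn over recent contests. Note the per-contest URL (the base URL plus the contest number) is an assumption; verify it against the API before relying on it:

from collections import Counter
import requests

base = 'https://servicebus2.caixa.gov.br/portaldeloterias/api/megasena/'
latest = requests.get(base).json()

counts = Counter()
for concurso in range(latest['numero'] - 9, latest['numero'] + 1):
    # assumed endpoint pattern: base URL + contest number
    draw = requests.get(base + str(concurso)).json()
    counts.update(n if len(n) < 2 else n[1:] for n in draw['listaDezenas'])

print(counts.most_common(6))  # the six most frequently drawn numbers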
I am searching a string of text which contains dictionaries that look like so:
soup_string = """{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"},
{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}"""
I am looking to return the string which follows each key in the 'dictionaries'.
I decided an appropriate method is to use regular expressions. I can return the times and costs using:
Costs = re.findall(r"\£[0-9]\.[0-9][0-9]", soup_string)
times = re.findall(r'\"(time)\"\:\"(.{14,16})\"\,', soup_string)
Essentially I would like to be able to look for each key in the dictionary, and search for a specific string then return the value.
The end goal is to create a dictionary with the 'Cost', 'Availability' and 'time'.
Full code:
import requests
from bs4 import BeautifulSoup
import json
postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode="+ postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="
client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)
soup = BeautifulSoup(r2.text, 'html.parser')
soup_string = str(soup)
soup_dicts = json.loads('[' + soup_string + ']')
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
Given that you have multiple dictionaries, I'm not exactly sure what you're trying to obtain, but from my understanding this should help:
import json
soup_string = ''' ... ''' # As it is in the question
soup_dicts = json.loads('[' + soup_string + ']')
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:soup_dict[k] for k in keep_keys} for soup_dict in soup_dicts]
It treats your string of dictionaries as a list of JSON dictionaries, and uses the json module to parse it. Then it filters out everything except the key/value pairs you need. The result is a list of the filtered dictionaries.
Output (i.e. value of filtered):
[
{'cost': '£2.00', 'availability': 'full', 'time': '12:00pm - 1:00pm'},
{'cost': '£2.00', 'availability': 'available', 'time': '12:30pm - 1:30pm'}
]
EDIT:
In response to you providing your code, I can see that you're calling str on the results from BeautifulSoup. Rather than doing that, you can just process the client.get() results directly:
import json
import requests
postcode = "L4 0TH"
ASDA_url = "https://groceries.asda.com/api/user/checkpostcode?postcode="+ postcode + "&requestorigin=gi"
ASDA_url2 = "https://groceries.asda.com/api/slot/view?startdate=12%2F06%2F2019&deliveryoption=homedelivery&requestorigin=gi&_="
client = requests.Session()
r = client.get(ASDA_url)
r2 = client.get(ASDA_url2)
dicts = r2.json()['slotHeader'][0]['slots']
keep_keys = ('cost', 'availability', 'time')
filtered = [{k:d[k] for k in keep_keys} for d in dicts]
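The example above only reads the first slotHeader entry. If the response carries several (an assumption, since the question only shows the slot dicts themselves), you could flatten across all of them:

slots = [slot for header in r2.json()['slotHeader'] for slot in header['slots']]
filtered = [{k: s[k] for k in keep_keys} for s in slots]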
First you need to put your data into a list and wrap it in a dictionary under a single key, data (see my example below). Then use json to parse it into a dictionary of dictionaries. Then extract cost, availability, and time from each dictionary in a loop.
import json
soup_string = """{"data": [{"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:00+01:00/13:00+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"full","slotDiscountedByDP":"false","slotId":"1hr-12-13-20190613","time":"12:00pm - 1:00pm","rawSlotPrice":"","slotDiscounted":"false"}, {"loadType":"","shiftId":"ROVR-DUMMY-SHIFTID","carbonFriendly":"no","cost":"£2.00","initialSlotPrice":"","timeSlotISO":"2019-06-13T12:30+01:00/13:30+01:00","isSameDayPremium":"false","stopId":"10446315588190612134701380","availability":"available","slotDiscountedByDP":"false","slotId":"1hr-12:30-13:30-20190613","time":"12:30pm - 1:30pm","rawSlotPrice":"","slotDiscounted":"false"}]}"""
d = json.loads(soup_string)
result = []
for data in d['data']:
    tmp = {}
    tmp['Cost'] = data['cost']
    tmp['Availability'] = data['availability']
    tmp['Time'] = data['time']
    result.append(tmp)
result
Output:
[{'Cost': '£2.00', 'Availability': 'full', 'Time': '12:00pm - 1:00pm'},
{'Cost': '£2.00', 'Availability': 'available', 'Time': '12:30pm - 1:30pm'}]
I am learning Python and coding. I am trying a web scraping example. I download currency exchange data from a website and I want to compute the average exchange rate for each currency over a 50-day period. The problem is the following: I get results from the first function, which should be in the form of a dictionary, and I then want to pass these dictionaries to another function as an argument and average their values. I am unable to pass the dict values to the other function correctly.
My code is as follows:
import os
import webbrowser
import requests as rq
import sys
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

def saveData(path, date):
    session = rq.session()
    url = 'https://www.bnm.md/en/official_exchange_rates?get_xml=1&date=' + date
    datastore = session.get(url)
    with open(path, 'wb') as f:
        f.write(datastore.content)
    data = ET.fromstring(datastore.content)
    '''
    elements = {}
    for element in data.iter():
        if element.tag in ('Name', 'Value'):
            elements[element.tag] = element.text
            print 'elements:', elements
    # Here I want to combine all those dictionaries in a variable so that I can pass it as an argument to another function
    return elements
    '''
    # I replaced the triple-quoted code above with the code below
    elements = {}
    for tag, text in data.items():
        if tag in ('Name', 'Value'):
            elements.setdefault(tag, [])
            elements[tag].append(text)
    return elements

def computeAverage(elements):
    # I want to pass the results of saveData(), which are in dictionary form, to this function, but I am unable to solve this issue
    print elements

def main():
    dates = ['20.04.2016', '21.04.2016', '22.04.2016']
    paths = []
    for date in dates:
        path = '/home/robbin/Desktop/webscrape/{}.xml'.format(date)
        paths.append(path)
    data3 = {}
    for path, date in zip(paths, dates):
        data2 = saveData(path, date)
        print 'data2: ', data2
        for k, v in data2.items():
            data3.setdefault(k, [])
            data3[k].append(v)
        print 'data3: ', data3
    computeAverage(data3)

if __name__ == '__main__':
    main()
Also, I am getting the results from the saveData() function as dictionaries like this, where each dictionary wrongly carries a value over from the previous item:
elements: {'Name': 'Euro'}
elements: {'Name': 'Euro', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '4.9988'}
This is what I tried to get instead (but failed):
elements: {'Name': 'Euro', 'Value': '22.4023'}
elements: {'Name': 'US Dollar', 'Value': '19.7707'}
elements: {'Name': 'Russian Ruble', 'Value': '0.3014'}
elements: {'Name': 'Romanian Leu', 'Value': '4.9988'}
Update:
elements = []
for element in data.iter():
    if element.tag in ('Name', 'Value'):
        elements.append(element.text)
        # print 'elements: ', elements
return elements
and in main() I do:
for path, date in zip(paths, dates):
    data = saveData(path, date)
    # print 'data from main: ', data
    computeAverage(data)
and the output of "print 'data from main: ', data" looks like this:
['Euro', '22.4023', 'US Dollar', '19.7707', 'Russian Ruble', '0.3014', 'Romanian Leu', '4.9988',.........'Special Drawing Rights', '27.8688']
['Euro', '22.4408', 'US Dollar', '19.7421', 'Russian Ruble', '0.3007', 'Romanian Leu', '5.0012',.....'Special Drawing Rights', '27.8606']
I am a newbie to coding; if someone could help me with these two problems, I would be really thankful.
First of all, I agree with #Prakhar Verma.
Second, you didn't clearly mention what you want, but I assume you want to merge the data you get from the saveData function and then calculate the averages. So, here is the missing code:
data3 = {}
for path, date in zip(paths, dates):
    data2 = saveData(path, date)
    for k, v in data2.items():
        # you can move this line to just after declaring the data3 dict if the keys returned by saveData are fixed, i.e. Name, Value
        data3.setdefault(k, [])
        data3[k].append(v)
computeAverage(data3)
Update to saveData function:
elements = {}
for tag, text in data.items():
    if tag in ('Name', 'Value'):
        elements.setdefault(tag, [])
        elements[tag].append(text)
===================================================
Update 2:
def saveData(path, date):
    #session = rq.session()
    url = 'https://www.bnm.md/en/official_exchange_rates?get_xml=1&date=' + date
    datastore = rq.get(url)
    with open(path, 'wb') as f:
        f.write(datastore.content)
    data = ET.fromstring(datastore.content)
    # iterate over the XML elements themselves rather than the root's attributes
    elements = {}
    for element in data.iter():
        tag = element.tag
        text = element.text
        if tag in ('Name', 'Value'):
            elements.setdefault(tag, [])
            elements[tag].append(text)
    return elements

def main():
    dates = ['20.03.2016', '21.03.2016', '22.03.2016']
    paths = []
    for date in dates:
        # please edit this path for your machine
        path = '{}.xml'.format(date)
        paths.append(path)
    data3 = {}
    for path, date in zip(paths, dates):
        data2 = saveData(path, date)
        for k, v in data2.items():
            data3.setdefault(k, [])
            data3[k].append(v)
    computeAverage(data3)
The saveData function returns data, but you were not saving it in any variable. What you need to do is save the data returned from the saveData function and then pass it as a parameter to the computeAverage function.
Please go through the basics of coding and follow a programming tutorial. :)
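Neither update fills in computeAverage itself. A minimal sketch, assuming data3 ends up as {'Name': [[...], ...], 'Value': [[...], ...]} with one inner list per date as built above, and that the Name and Value lists stay parallel:

def computeAverage(elements):
    # flatten the per-date lists into parallel name/value sequences
    names = [n for day in elements['Name'] for n in day]
    values = [float(v) for day in elements['Value'] for v in day]
    rates = {}
    for name, value in zip(names, values):
        rates.setdefault(name, []).append(value)
    for name, vals in rates.items():
        print('%s: %.4f' % (name, sum(vals) / len(vals)))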
I have a custom data file formatted like this:
{
    data = {
        friends = {
            max = 0 0,
            min = 0 0,
        },
        family = {
            cars = {
                van = "honda",
                car = "ford",
                bike = "trek",
            },
            presets = {
                location = "italy",
                size = 10,
                travelers = False,
            },
            version = 1,
        },
    },
}
I want to collect the blocks of data, meaning the strings between each set of {}, while maintaining the hierarchy. This data is not in a typical JSON format, so that is not a possible solution.
My idea was to create a class object like so:
class Block:
    def __init__(self, header, children):
        self.header = header
        self.children = children
I would then loop through the data line by line, 'somehow' collecting the necessary data, so my resulting output would look something like this:
Block("data = {}", [
Block("friends = {max = 0 0,\n min = 0 0,}", []),
Block("family = {version = 1}", [...])
])
In short, I'm looking for help with ways to serialize this into useful data that I can then easily manipulate. My approach is to break it into objects using the {} as dividers.
If anyone has suggestions on better ways to approach this, I'm open to ideas. Thank you again.
So far I've only implemented these basic snippets of code:
class Block:
    def __init__(self, content, children):
        self.content = content
        self.children = children

def GetBlock(strArr=[]):
    print len(strArr)
    # blocks = []
    blockStart = "{"
    blockEnd = "}"

with open(filepath, 'r') as file:
    data = file.readlines()
    blocks = GetBlock(strArr=data)
You can create a to_block function that takes the lines from your file as an iterator and recursively creates a nested dictionary from those. (Of course you could also use a custom Block class, but I don't really see the benefit in doing so.)
def to_block(lines):
    block = {}
    for line in lines:
        if line.strip().endswith(("}", "},")):
            break
        key, value = map(str.strip, line.split(" = "))
        if value.endswith("{"):
            value = to_block(lines)
        block[key] = value
    return block
When calling it, you have to strip the first line, though. Also, evaluating the "leafs" to e.g. numbers or strings is left as an exercise to the reader (see the sketch after the file-reading example below).
>>> to_block(iter(data.splitlines()[1:]))
{'data': {'family': {'version': '1,',
'cars': {'bike': '"trek",', 'car': '"ford",', 'van': '"honda",'},
'presets': {'travelers': 'False,', 'size': '10,', 'location': '"italy",'}},
'friends': {'max': '0 0,', 'min': '0 0,'}}}
Or when reading from a file:
with open("data.txt") as f:
next(f) # skip first line
res = to_block(f)
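And here is a sketch of that "exercise left to the reader": converting the raw leaf strings (which still carry trailing commas) into numbers, booleans, and plain strings via ast.literal_eval, with a fallback for values like '0 0' that aren't Python literals:

import ast

def convert(value):
    if isinstance(value, dict):  # recurse into nested blocks
        return {k: convert(v) for k, v in value.items()}
    value = value.rstrip(',')
    try:
        return ast.literal_eval(value)  # handles 10, True, "italy"
    except (ValueError, SyntaxError):
        return value  # e.g. '0 0' stays a plain string

res = convert(res)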
Alternatively, you can do some preprocessing to transform that string into a JSON(-ish) string and then use json.loads. However, I would not go all the way here but instead just wrap the values in "" (and replace the original " with ' before that); otherwise there is too much risk of accidentally turning a string with spaces into a list or similar. You can sort those out once you've created the JSON data.
>>> data = data.replace('"', "'")
>>> data = re.sub(r'= (.+),$', r'= "\1",', data, flags=re.M)
>>> data = re.sub(r'^\s*(\w+) = ', r'"\1": ', data, flags=re.M)
>>> data = re.sub(r',$\s*}', r'}', data, flags=re.M)
>>> json.loads(data)
{'data': {'family': {'version': '1',
'presets': {'size': '10', 'travelers': 'False', 'location': "'italy'"},
'cars': {'bike': "'trek'", 'van': "'honda'", 'car': "'ford'"}},
'friends': {'max': '0 0', 'min': '0 0'}}}
You can also do it with ast or json, with the help of regex substitutions.
import re
a = """{
data = {
friends = {
max = 0 0,
min = 0 0,
},
family = {
cars = {
van = "honda",
car = "ford",
bike = "trek",
},
presets = {
location = "italy",
size = 10,
travelers = False,
},
version = 1,
},
},
}"""
#with ast
a = re.sub("(\w+)\s*=\s*", '"\\1":', a)
a = re.sub(":\s*((?:\d+)(?: \d+)+)", lambda x:':[' + x.group(1).replace(" ", ",") + "]", a)
import ast
print ast.literal_eval(a)
#{'data': {'friends': {'max': [0, 0], 'min': [0, 0]}, 'family': {'cars': {'car': 'ford', 'bike': 'trek', 'van': 'honda'}, 'presets': {'travelers': False, 'location': 'italy', 'size': 10}, 'version': 1}}}
#with json
import json
a = re.sub(",(\s*\})", "\\1", a)
a = a.replace(":True", ":true").replace(":False", ":false").replace(":None", ":null")
print json.loads(a)
#{u'data': {u'friends': {u'max': [0, 0], u'min': [0, 0]}, u'family': {u'cars': {u'car': u'ford', u'bike': u'trek', u'van': u'honda'}, u'presets': {u'travelers': False, u'location': u'italy', u'size': 10}, u'version': 1}}}