Parsing XML using json raises ValueError - python

I'm trying to parse an XML file using xml ElementTree and json:
from xml.etree import ElementTree as et
import json
def parse_file(file_name):
    """Parse an NPC quest XML file into a nested dictionary.

    Every attribute value of every stage element is decoded with
    ``json.loads``, so each attribute must contain valid JSON text. A bare
    string such as ``other_npc_name`` is not valid JSON and makes
    ``json.loads`` raise ``ValueError``.

    Args:
        file_name: Path of the XML file to read.

    Returns:
        A dict mapping each NPC's ``name`` attribute to a list of
        ``{quest_name: stages}`` dicts, where ``stages`` is a list of
        ``[index, next_stage, choice, npc_condition]`` entries.
    """
    tree = et.ElementTree()
    npcs = {}
    # tree.parse() returns the root element; iterating it yields each <npc>.
    for npc in tree.parse(file_name):
        quests = []
        for quest in npc:
            quest_name = quest.attrib['name']
            stages = []
            for i, stage in enumerate(quest):
                next_stage, choice, npc_condition = None, None, None
                for key, val in stage.attrib.items():
                    # Decoded unconditionally, whatever the attribute name is.
                    val = json.loads(val)
                    if key == 'choices':
                        choice = val
                    elif key == 'next_stage':
                        next_stage = val
                    elif key == 'ncp_condition':
                        npc_condition = {stage.attrib['npc_name']: val}
                stages.append([i, next_stage, choice, npc_condition])
            quests.append({quest_name: stages})
        npcs[npc.attrib['name']] = quests
    return npcs
The XML file:
<?xml version="1.0" encoding="utf-8"?>
<npcs>
<npc name="NPC NAME">
<quest0 name="Quest Name here">
<stage0 choices='{"Option1":1, "Option1":2}'>
<text>text1</text>
</stage0>
<stage1 next_stage="[3,4]">
<text>text2</text>
</stage1>
<stage3 npc_name="other_npc_name" ncp_condition='{"some_condition":false}' next_stage="[3, 4]">
<text>text3</text>
</stage3>
</quest0>
</npc>
</npcs>
But I'm having trouble with this bit:
<stage3 npc_name="other_npc_name" ncp_condition='{"some_condition":false}' next_stage="[3, 4]">
Traceback:
Traceback (most recent call last):
File "C:/.../test2.py", line 28, in <module>
parse_file('quests.xml')
File "C:/.../test2.py", line 15, in parse_file
val = json.loads(val)
File "C:\Python27\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "C:\Python27\lib\json\decoder.py", line 366, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python27\lib\json\decoder.py", line 384, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
It raises this error in the line val = json.loads(val) when key="npc_name" and val="other_npc_name".
What's wrong with that? It didn't raise any error when name="some string", but it does when npc_name="some string".
I noticed that if I change "other_npc_name" to '"other_npc_name"' it doesn't complain, but this seems a bit hackish to me.

JSON is a way to store data structures - thus it can only decode said data structures.
When you try to get JSON to decode something like this:
other_npc_name
JSON can't match this to any valid data type. However, if this is wrapped in quotation marks:
"other_npc_name"
JSON recognizes this as a String (as per the JSON spec, that is how a string is defined).
And this is what is happening in your script:
import json
print json.loads("other_npc_name") #throws error
print json.loads('"other_npc_name"') #returns "other_npc_name" as a Unicode string
Thus, it may seem 'hackish' to wrap the string this way, however, this is really the only way for JSON to decode it.
One potential suggestion is that if the npc_name attribute in XML is always a string, then pull it out as a string instead of trying to decode it as a JSON object.

Related

Converting string to raw string for json processing [Python]

I have the following code snippet:
input = "You can check it out here. https://www.youtube.com/watch?v=Ay1gCPAUnxo&t=83s I'll send $20 in bitclout to the first 50 people that follow instructions at end of the video. This is revolutionary. Let's hope it works! <3Building it. What's up y'all"
def createJsonText(input):
    """Wrap *input* in a JSON object string, parse it and append it to tone.json.

    The text is spliced into the JSON source without any escaping, so the
    resulting string is only valid JSON when *input* contains no characters
    that need escaping inside a JSON string (for example a double quote).

    Args:
        input: The text to store under the ``"text"`` key.

    Raises:
        json.JSONDecodeError: If the assembled string is not valid JSON.
    """
    input = r'{}'.format(input)
    x = r'{ "text":"' + input + r'"}'
    print(x)
    # parse x as json
    y = json.loads(x)
    # The context manager closes the file even if the write fails.
    with open("tone.json", "a") as f:
        f.write(str(y))
When I execute the aforementioned code I get the following error:
File "hashtag-analyzer.py", line X, in readJson
createJsonText(input) File "hashtag-analyzer.py", line Y, in createJsonText
y = json.loads(x) File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/json/init.py",
line 354, in loads
return _default_decoder.decode(s) File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/json/decoder.py",
line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end()) File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/json/decoder.py",
line 355, in raw_decode
obj, end = self.scan_once(s, idx) json.decoder.JSONDecodeError: Expecting ',' delimiter: line 1 column 4194 (char 4193)
How to resolve this?
Expected output is a json file with name, "tone.json" and the following data inside:
{
"text": "You can check it out here. https://www.youtube.com/watch?v=Ay1gCPAUnxo&t=83s I'll send $20 in bitclout to the first 50 people that follow instructions at end of the video. This is revolutionary. Let's hope it works! <3Building it. What's up y'all"
}
You're going in the wrong direction here if you want to create JSON. You want dumps, not loads:
def createJsonText(txt):
    """Write ``{"text": txt}`` to tone.json as a single JSON document.

    The file is opened in write mode, so any previous content is replaced.

    Args:
        txt: The text to store under the ``"text"`` key.
    """
    x = {'text': txt}
    y = json.dumps(x)
    # Use a context manager so the file is flushed and closed deterministically.
    with open('tone.json', 'w') as f:
        f.write(y)
Your code had mode='a' for append, but a set of separate JSON lines is NOT a valid JSON file. If you want it to be a valid JSON file, you need the whole file to be one document.
Update
Alternatively:
def createJsonText(txt):
    """Serialize ``{"text": txt}`` straight into tone.json with ``json.dump``.

    Args:
        txt: The text to store under the ``"text"`` key.
    """
    # Use a context manager so the file is flushed and closed deterministically.
    with open('tone.json', 'w') as f:
        json.dump({'text': txt}, f)

json.loads error with configParser value in Python3

I am trying to read a list from a config.ini file using JSON in Python, but when I use single quotes (') for the string values in my list I get an error. Surprisingly, I don't get it when I use double quotes (").
Python Code :
from configparser import ConfigParser
import json
# Create the parser and read config.ini
config = ConfigParser()
config.read('config.ini')
# value1 is written with double-quoted strings, which json.loads accepts
thisworks = json.loads(config.get('VALUES','value1'))
# value2 is written with single-quoted strings, which is not valid JSON,
# so this call raises json.decoder.JSONDecodeError
thisnotwork = json.loads(config.get('VALUES','value2'))
config.ini file :
[VALUES]
value1 = ["tes1", "test2", "test3"]
value2 = ['tes1', 'test2', 'test3']
Assigning the variable "thisnotwork" raises this error:
Traceback (most recent call last):
File "U:\Desktop\Nouveau dossier (2)\test.py", line 11, in <module>
thisnotwork = json.loads(config.get('VALUES','value2'))
File "C:\Python37\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\Python37\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python37\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 2 (char 1)
[Finished in 0.258s]
This can be annoying because json.dumps() returns 'string' and not "string". If someone has a solution for this, it would be really helpful.
JSON Specification requires double quotes to be used for string values.
I tried json.dumps(['foo', 'bar']) and it outputs double quotes as expected.
Maybe you can change the single quotes in the config.ini list to double quotes, like below:
value2= ['test1', 'test2', 'test3']
f'''"value2" : {str(value2).replace("'", '"')},\n'''
with the output of
value2= ["test1", "test2", "test3"]

unable to parse json content with error `expecting property name enclosed`

I am fetching from database a text that is in a json format. Basically dumping an api content from a website to a db with TEXT field.
I can see the json content has no issues/errors, but when I do ..
try:
get_all = db.query("SELECT id, name, api_content FROM _books")
_result = get_all.fetchall()
except Exception as e:
print("[e::line-163] ", e)
try:
for r in _result:
api_content = r[2]
j = json.loads(api_content)
print('names, ', j['names'])
I get this error .
Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
The api_content has no issues, it is a well-formed json content.
I don't think you have a valid JSON in api_content. Judging by what the error is about, your properties (keys) are enclosed in single quotes but need to be enclosed in double quotes to be a valid JSON.
Here is what is happening:
>>> import json
>>> json.loads('{"key": "value"}') # <- VALID JSON
{'key': 'value'}
>>> json.loads("{'key': 'value'}") # <- INVALID JSON
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/json/decoder.py", line 355, in raw_decode
obj, end = self.scan_once(s, idx)
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
In other words, you should revise the way you dump the API contents into the database. It looks like you are just writing string representations of Python dictionary objects instead of using json.dumps().
If you cannot change that, try using ast.literal_eval() to safely eval the api content:
from ast import literal_eval
# Each row's third column holds the stored API content as text.
for r in _result:
    api_content = r[2]
    # literal_eval only evaluates Python literals, so a dict written with
    # single-quoted keys is parsed without executing arbitrary code.
    j = literal_eval(api_content)
    print('names, ', j['names'])

reading large JSON file in Python (raw_decode)

I am trying to read in a large JSON file (data.json) in Python. Because the JSON file has multiple JSON objects, and multiple dictionaries will be created in Python (the number of dictionaries is unknown), I used decoder.raw_decode() and a generator.
The following is the code:
import json
import pprint
import io
import pprint
def parse():
    """Generator intended to yield each JSON object stored in data.json.

    NOTE: ``raw_decode()`` is handed the open file object rather than text.
    It only accepts a string, so the first call raises ``TypeError``, which
    the ``ValueError`` handler below does not catch.
    """
    with open('data.json',encoding='utf-8') as jfile:
        try:
            while True:
                decoder = json.JSONDecoder()
                obj, idx = decoder.raw_decode(jfile)
                yield obj
        except ValueError as e:
            print(e)
        else:
            print("aha")
def main():
    """Drain the generator returned by parse(), printing each item.

    NOTE: ``str`` objects have no ``readlines()`` method, so the print call
    raises ``AttributeError`` as soon as parse() yields a value.
    """
    imputd=parse()
    if imputd:
        while True:
            try:
                print(str(next(imputd)).readlines())
            except StopIteration as e:
                # The generator is exhausted; report it and stop looping.
                print(e)
                break
I get the error:
Traceback (most recent call last):
File "H:\Document\Python\j10.py", line 57, in <module>
main()
File "H:\Document\Python\j10.py", line 36, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j10.py", line 21, in parse
obj, idx = decoder.raw_decode(jfile)
File "C:\Python34\lib\json\decoder.py", line 360, in raw_decode
obj, end = self.scan_once(s, idx)
TypeError: first argument must be a string, not _io.TextIOWrapper
I edited code based on Martijn's answer:
import json
import io
file=open('data.json.txt')
def readin():
    """Return the next chunk (at most 2000 characters) of the module-level ``file``."""
    return file.read(2000)
def parse():
    """Yield every JSON value found in the text supplied by ``readin``.

    Text is pulled chunk by chunk until ``readin`` returns an empty string.
    After each chunk, as many complete JSON values as possible are decoded
    from the front of the buffer; an incomplete tail is kept until more text
    arrives.

    Yields:
        Each decoded JSON value, in the order it appears in the input.
    """
    decoder = json.JSONDecoder()
    buffer = ''
    for chunk in iter(readin, ''):
        buffer += chunk
        while True:
            # raw_decode() needs the string to begin with the JSON value
            # itself, so drop the whitespace/newlines that separate values;
            # otherwise decoding stalls after the first value.
            buffer = buffer.lstrip()
            if not buffer:
                break
            try:
                result, index = decoder.raw_decode(buffer)
            except ValueError:
                # Not enough data to decode, read more
                break
            yield result
            buffer = buffer[index:]
def main():
    """Drain the generator returned by parse(), printing each item.

    NOTE: ``str`` objects have no ``readlines()`` method, so the print call
    raises ``AttributeError`` as soon as parse() yields a value.
    """
    imputd=parse()
    if imputd:
        while True:
            try:
                print(str(next(imputd)).readlines())
            except StopIteration as e:
                # The generator is exhausted; report it and stop looping.
                print(e)
                break
and I get an UnicodeError:
Traceback (most recent call last):
File "H:\Document\Python\j11.py", line 35, in <module>
main()
File "H:\Document\Python\j11.py", line 30, in main
print(str(next(imputd)).readlines())
File "H:\Document\Python\j11.py", line 14, in parse
for chunk in iter(readin, ''):
File "H:\Document\Python\j11.py", line 8, in readin
return file.read(2000)
File "C:\Python34\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 4217: character maps to <undefined>
You are passing in the file object, but decoder.raw_decode() only takes text data. You need to do the reading yourself:
obj, idx = decoder.raw_decode(jfile.read())
You are then yielding Python objects created from the JSON data, so your .readlines() call in your main() function loop will also fail.
You are not using raw_decode() correctly, however. You are yourself responsible for feeding it chunks of text, it'll not read that text from the file for you. If you wanted to handle the file in chunks, and there are no clear delimiters between the JSON entries, you'll be forced to read the file in blocks:
decoder = json.JSONDecoder()
buffer = ''
for chunk in iter(partial(jfile.read, buffersize), ''):
buffer += chunk
while buffer:
try:
result, index = decoder.raw_decode(buffer)
yield result
buffer = buffer[index:]
except ValueError:
# Not enough data to decode, read more
break
This will still yield completely decoded objects; if your file is one long JSON object (like one top-level list or dictionary) then this'll not yield the contents of that object one by one; it'll still read the whole object before yielding.

How to output a live JSON feed in Python 3?

I am using Python 3 to access a live JSON feed from http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson. This is the code:
try:
# For Py 3.0+
from urllib.request import urlopen
except ImportError:
# For Py 2
from urllib2 import urlopen
import json
def printResults(data):
    """Decode *data* as JSON and print the feed title, if one is present.

    Args:
        data: JSON text whose top-level object has a ``"metadata"`` member.
    """
    # Use the json module to load the string data into a dictionary
    theJSON = json.loads(data) #pass JSON data into a dictionary
    # now we can access the contents of the JSON like any other Python object
    if "title" in theJSON["metadata"]:
        print (theJSON["metadata"]["title"])
def main():
    """Fetch the USGS earthquake feed and print its title via printResults()."""
    # JSON feed of earthquake activity larger than 2.5 in the past 25 hours
    urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"
    #open url and read contents
    webUrl = urlopen(urlData)
    print (webUrl.getcode())
    if (webUrl.getcode() == 200):
        data = webUrl.read()
        #print results
        printResults(data)
    else:
        print ("Received an error from server " + str(webUrl.getcode()))


if __name__ == "__main__":
    main()
I get the following output:
Traceback (most recent call last):
File "<string>", line 420, in run_nodebug
File "C:\Users\modar\Desktop\jsondata_finished.py", line 56, in <module>
File "C:\Users\modar\Desktop\jsondata_finished.py", line 50, in main
else:
File "C:\Users\modar\jsondata_finished.py", line 13, in printResults
if "title" in theJSON["metadata"]:
File "C:\Python33\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Python33\lib\json\decoder.py", line 352, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
TypeError: can't use a string pattern on a bytes-like object
How can I fix this? An explanation as to what went wrong would also be great. Thanks in advance.
With the requests library, linked to in my comment above, your code becomes:
quake_data = requests.get('http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson').json()
print(quake_data['metadata']['title'])
I do hope it helps...

Categories

Resources