I'm writing JSON to a file using DataFrame.to_json() with the indent option:
df.to_json(path_or_buf=file_json, orient="records", lines=True, indent=2)
The important part here is indent=2; without it everything works.
How do I then read this file back using pandas.read_json()?
I'm trying the code below, but it expects the file to contain one JSON object per line, so the indentation messes things up:
df = pd.read_json(file_json, lines=True)
I didn't find any options in read_json to make it handle the indentation.
How else could I read this file created by to_json, possibly avoiding writing my own reader?
The combination of lines=True, orient='records', and indent=2 doesn't actually produce valid JSON.
lines=True is meant to create line-delimited JSON, but indent=2 adds extra line breaks; you can't have your delimiter be line breaks AND have extra line breaks!
If you use just orient='records' and indent=2, then it does produce valid JSON.
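For example, a minimal round trip without lines=True (a quick sketch; the file name here is made up):

import pandas as pd

file_json = "records.json"  # hypothetical path
df = pd.DataFrame([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
df.to_json(path_or_buf=file_json, orient="records", indent=2)  # writes a valid, indented JSON array
df2 = pd.read_json(file_json, orient="records")  # reads the array back; no lines=True needed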
Here is the current read_json(lines=True) code from the pandas source:
def _combine_lines(self, lines) -> str:
    """
    Combines a list of JSON objects into one JSON object.
    """
    return (
        f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
    )
You can see that it expects to read the file line by line, which isn't possible when indent has been used.
The other answer is good, but it turned out to require reading the entire file into memory. I ended up writing the simple lazy parser that I include below. It requires removing the lines=True argument from df.to_json.
Usage is as follows:
for obj, pos, length in lazy_read_json('file.json'):
    print(obj['field'])  # access the json object
It also yields pos, the start position of the object in the file, and length, the length of the object in the file; this gives me some extra functionality, like being able to index objects and load them into memory on demand.
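For instance, a sketch of that on-demand loading (this assumes the pos and length values line up with character offsets in the file, which holds for the files I produce):

import json

# build the index once, without keeping the objects
index = [(pos, length) for _obj, pos, length in lazy_read_json('file.json')]

with open('file.json') as fh:
    pos, length = index[2]  # pick the third object
    fh.seek(pos)
    obj = json.loads(fh.read(length))  # load just that object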
The parser is below:
import json

def lazy_read_json(filename: str):
    """
    :return: generator yielding (json_obj, pos, length)

    >>> test_objs = [{'a': 11, 'b': 22, 'c': {'abc': 'z', 'zzz': {}}}, \
                     {'a': 31, 'b': 42, 'c': [{'abc': 'z', 'zzz': {}}]}, \
                     {'a': 55, 'b': 66, 'c': [{'abc': 'z'}, {'z': 3}, {'y': 3}]}, \
                     {'a': 71, 'b': 62, 'c': 63}]
    >>> json_str = json.dumps(test_objs, indent=4, sort_keys=True)
    >>> _create_file("/tmp/test.json", [json_str])
    >>> g = lazy_read_json("/tmp/test.json")
    >>> next(g)
    ({'a': 11, 'b': 22, 'c': {'abc': 'z', 'zzz': {}}}, 120, 116)
    >>> next(g)
    ({'a': 31, 'b': 42, 'c': [{'abc': 'z', 'zzz': {}}]}, 274, 152)
    >>> next(g)
    ({'a': 55, 'b': 66, 'c': [{'abc': 'z'}, {'z': 3}, {'y': 3}]}, 505, 229)
    >>> next(g)
    ({'a': 71, 'b': 62, 'c': 63}, 567, 62)
    >>> next(g)
    Traceback (most recent call last):
    ...
    StopIteration
    """
    with open(filename) as fh:
        state = 0
        json_str = ''
        cb_depth = 0  # curly brace depth
        line = fh.readline()
        while line:
            if line[-1] == "\n":
                line = line[:-1]
            line_strip = line.strip()
            if state == 0 and line == '[':
                state = 1
                pos = fh.tell()
            elif state == 1 and line_strip == '{':
                state = 2
                json_str += line + "\n"
            elif state == 2:
                if len(line_strip) > 0 and line_strip[-1] == '{':  # count nested objects
                    cb_depth += 1
                json_str += line + "\n"
                if cb_depth == 0 and (line_strip == '},' or line_strip == '}'):
                    # end of parsing an object
                    if json_str[-2:] == ",\n":
                        json_str = json_str[:-2]  # remove trailing comma
                    state = 1
                    obj = json.loads(json_str)
                    yield obj, pos, len(json_str)
                    pos = fh.tell()
                    json_str = ""
                elif line_strip == '}' or line_strip == '},':
                    cb_depth -= 1
            line = fh.readline()
# this function is for the doctest
def _create_file(filename, lines):
    # because doctest can't input newline characters :(
    with open(filename, "w") as f:
        for line in lines:
            f.write(line)
            f.write("\n")
I have a dict of dicts, but a given entry might not exist. For example, I have the following dict where the entry for c is missing:
g = {
    'a': {'w': 14, 'x': 7, 'y': 9},
    'b': {'w': 9, 'c': 6},  # <- c is not in dict
    'w': {'a': 14, 'b': 9, 'y': 2},
    'x': {'a': 7, 'y': 10, 'z': 15},
    'y': {'a': 9, 'w': 2, 'x': 10, 'z': 11},
    'z': {'b': 6, 'x': 15, 'y': 11}
}
My current code
import heapq

start = 'a'
end = 'z'
queue, seen = [(0, start, [])], set()
while True:
    (distance, vertex, path) = heapq.heappop(queue)
    if vertex not in seen:
        path = path + [vertex]
        seen.add(vertex)
        if vertex == end:
            print(distance, path)
            break  # new line, based on solutions below
        # new line
        if vertex not in graph:  # new line
            continue  # new line
        for (next_v, d) in graph[vertex].items():
            heapq.heappush(queue, (distance + d, next_v, path))
Right now I am getting the error:
for (next_v, d) in graph[vertex].items():
KeyError: 'c'
EDIT 1
If the key is not found in the dict, skip ahead.
EDIT 2
Even with the newly added code I get an error, this time:
(distance, vertex, path) = heapq.heappop(queue)
IndexError: index out of range
Here is the data file I use
https://s3-eu-west-1.amazonaws.com/citymapper-assets/citymapper-coding-test-graph.dat
Here is the file format:
<number of nodes>
<OSM id of node>
...
<OSM id of node>
<number of edges>
<from node OSM id> <to node OSM id> <length in meters>
...
<from node OSM id> <to node OSM id> <length in meters>
And here is the code to create the graph:

from itertools import islice, groupby
from operator import itemgetter

with open(filename, 'r') as reader:
    num_nodes = int(reader.readline())
    edges = []
    for line in islice(reader, num_nodes + 1, None):
        values = line.split()
        values[2] = int(values[2])
        edges.append(tuple(values))

graph = {k: dict(x[1:] for x in grp) for k, grp in groupby(sorted(edges), itemgetter(0))}
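To see what that comprehension builds, here it is on a tiny made-up edge list (the ids are invented):

from itertools import groupby
from operator import itemgetter

edges = [('1', '2', 300), ('1', '3', 120), ('2', '3', 5)]  # hypothetical edges
graph = {k: dict(x[1:] for x in grp) for k, grp in groupby(sorted(edges), itemgetter(0))}
print(graph)  # {'1': {'2': 300, '3': 120}, '2': {'3': 5}}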
Change start and end to:
start = '876500321'
end = '1524235806'
Any help/advice is highly appreciated.
Thanks
Before accessing graph[vertex], make sure it is in the dict:
if vertex not in graph:
    continue

for (next_v, d) in graph[vertex].items():
    heapq.heappush(queue, (distance + d, next_v, path))
You can check whether the vertex is in the graph before executing that final for loop:
if vertex in graph:
    for (next_v, d) in graph[vertex].items():
        heapq.heappush(queue, (distance + d, next_v, path))
You could use .get with an empty {} as the default in case the key is not there, so that the .items() call won't break:
for (next_v, d) in graph.get(vertex, {}).items():
    heapq.heappush(queue, (distance + d, next_v, path))
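Putting that fix together with the example dict g from the question, here is a minimal runnable sketch; the while queue guard also avoids the IndexError from EDIT 2 when no path exists:

import heapq

g = {
    'a': {'w': 14, 'x': 7, 'y': 9},
    'b': {'w': 9, 'c': 6},
    'w': {'a': 14, 'b': 9, 'y': 2},
    'x': {'a': 7, 'y': 10, 'z': 15},
    'y': {'a': 9, 'w': 2, 'x': 10, 'z': 11},
    'z': {'b': 6, 'x': 15, 'y': 11},
}

def shortest_path(graph, start, end):
    queue, seen = [(0, start, [])], set()
    while queue:  # stop when the queue empties instead of popping blindly
        distance, vertex, path = heapq.heappop(queue)
        if vertex in seen:
            continue
        path = path + [vertex]
        seen.add(vertex)
        if vertex == end:
            return distance, path
        for next_v, d in graph.get(vertex, {}).items():
            heapq.heappush(queue, (distance + d, next_v, path))
    return None  # no path exists

print(shortest_path(g, 'a', 'z'))  # (20, ['a', 'y', 'z'])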
dict = {'A': 71.07884,
        'B': 110,
        'C': 103.14484,
        'D': 115.08864,
        'E': 129.11552,
        'F': 147.1766,
        'G': 57.05196,
        'H': 137.1412
        }
def search_replace(search, replacement, searchstring):
    p = re.compile(search)
    searchstring = p.sub(replacement, searchstring)
    return searchstring

def main():
    with open(sys.argv[1]) as filetoread:
        lines = filetoread.readlines()
    file = ""
    for i in range(len(lines)):
        file += lines[i]
    file = search_replace('(?<=[BC])', ' ', file)
    letterlist = re.split(r'\s+', file)  # raw string avoids the invalid escape warning
    for j in range(len(letterlist)):
        print(letterlist[j])

if __name__ == '__main__':
    import sys
    import re
    main()
My program opens a file and splits the text into groups of letters, breaking after each B or C.
The file looks like:
ABHHFBFEACEGDGDACBGHFEDDCAFEBHGFEBCFHHHGBAHGBCAFEEAABCHHGFEEEAEAGHHCF
Now I want to sum each group using the values from the dict.
For example:
AB = 181.07884
HHFB = 531.4590000000001
And so on.
I don't know how to start. Thanks a lot for all your answers.
You already did most of the work! All you're missing is the sum for each substring.
Since substrings can occur more than once, I'll do the summation only once and store the value for each substring encountered in a dict (and I renamed your letter-to-value dict above to mydict, to avoid shadowing the built-in dict):
snippets = {}
for snippet in letterlist:
    if snippet not in snippets:
        value = 0
        for s in snippet:
            value += mydict.get(s)
        snippets[snippet] = value

print(snippets)
That gives me an output of
{
'AB': 181.07884,
'HHFB': 531.4590000000001,
'FEAC': 450.5158,
'EGDGDAC': 647.6204,
'B': 110,
'GHFEDDC': 803.8074,
'AFEB': 457.37096,
'HGFEB': 580.4852800000001,
'C': 103.14484,
'FHHHGB': 725.6521600000001,
'AHGB': 375.272,
'AFEEAAB': 728.64416,
'HHGFEEEAEAGHHC': 1571.6099199999999,
'F': 147.1766}
Try to simplify things...
Given you already have a string s and a dictionary d:
ctr = 0
temp = ''
for letter in s:
    ctr += d[letter]
    temp += letter
    if letter in 'BC':
        print(temp, ctr)
        ctr = 0
        temp = ''
In the case you supplied where:
s = "ABHHFBFEACEGDGDACBGHFEDDCAFEBHGFEBCFHHHGBAHGBCAFEEAABCHHGFEEEAEAGHHCF"
d = {'A': 71.07884,
     'B': 110,
     'C': 103.14484,
     'D': 115.08864,
     'E': 129.11552,
     'F': 147.1766,
     'G': 57.05196,
     'H': 137.1412
     }
You get the results (printed to terminal):
('AB', 181.07884)
('HHFB', 531.4590000000001)
('FEAC', 450.5158)
('EGDGDAC', 647.6204)
('B', 110)
('GHFEDDC', 803.8074)
('AFEB', 457.37096)
('HGFEB', 580.4852800000001)
('C', 103.14484)
('FHHHGB', 725.6521600000001)
('AHGB', 375.272)
('C', 103.14484)
('AFEEAAB', 728.64416)
('C', 103.14484)
('HHGFEEEAEAGHHC', 1571.6099199999999)
Open your file and read each character, then look the character up in the dictionary and add its value to your total.
sum_ = 0
letters = "letters_file"
opened = open(letters, "r")
for row in opened:
    for char in row.strip():  # strip the newline so it isn't looked up
        sum_ += your_dictionary[char]  # the values are floats, so don't truncate them with int()
print(sum_)
You can use re.split with itertools.zip_longest in a dict comprehension:
import re
from itertools import zip_longest

i = iter(re.split('([BC])', s))
{w: sum(d[c] for c in w) for p in zip_longest(i, i, fillvalue='') for w in (''.join(p),)}
This returns:
{'AB': 181.07884, 'HHFB': 531.4590000000001, 'FEAC': 450.5158, 'EGDGDAC': 647.6204, 'B': 110, 'GHFEDDC': 803.8074, 'AFEB': 457.37096, 'HGFEB': 580.4852800000001, 'C': 103.14484, 'FHHHGB': 725.6521600000001, 'AHGB': 375.272, 'AFEEAAB': 728.64416, 'HHGFEEEAEAGHHC': 1571.6099199999999, 'F': 147.1766}
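To see how the iterator pairing works, here are the intermediate values on a shortened, made-up input:

import re
from itertools import zip_longest

s = "ABHHFBF"
parts = re.split('([BC])', s)  # ['A', 'B', 'HHF', 'B', 'F']; the ([BC]) group keeps the delimiters
i = iter(parts)
print(list(zip_longest(i, i, fillvalue='')))  # [('A', 'B'), ('HHF', 'B'), ('F', '')]

Each pair is then joined back into a word like 'AB' or 'HHFB' before summing.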
I have a code that is mimicking a REST API call (see below).
For every key in each item from the generator, it needs to run a REST call. So in my example, a record could be
{"a": 2, "b": 36, "c": 77}
I need to run a REST call for every key (a, b, and c) individually, then output the results (which just negates the number):
{"a": 2, "a_neg": -2, "b": 36, "b_neg": -36, "c": 77, "c_neg": -77}
Right now my current code works for one key, but with multiple keys it repeats the items (so I'm getting triple the results for 3 keys).
There is also some funky race condition. I guess I could keep only the last record, but I'm not good with threads and I'm concerned about thread safety and other advanced stuff.
Here is an example output:
{'a': 89, 'a_neg': -89, 'b': 69, 'c': 38}
{'a': 89, 'a_neg': -89, 'b': 69, 'c': 38, 'c_neg': -38}
{'a': 89, 'a_neg': -89, 'b': 69, 'b_neg': -69, 'c': 38, 'c_neg': -38}
{'a': 90, 'a_neg': -90, 'b': 43, 'c': 16}
{'a': 90, 'a_neg': -90, 'b': 43, 'c': 16, 'c_neg': -16}
{'a': 90, 'a_neg': -90, 'b': 43, 'b_neg': -43, 'c': 16, 'c_neg': -16}
{'a': 91, 'a_neg': -91, 'b': 49, 'b_neg': -49, 'c': 77, 'c_neg': -77}
{'a': 91, 'a_neg': -91, 'b': 49, 'b_neg': -49, 'c': 77, 'c_neg': -77}
{'a': 91, 'a_neg': -91, 'b': 49, 'b_neg': -49, 'c': 77, 'c_neg': -77}
Finally here is my source code (you can run it yourself):
#!/usr/bin/env python
from concurrent.futures import ThreadPoolExecutor
from time import sleep
from pprint import pprint
import random

def records():
    # simulates records generator
    for i in range(100):
        yield {"a": i, "b": random.randint(0, 100), "c": random.randint(0, 100)}

def stream(records):
    threads = 8
    pool = ThreadPoolExecutor(threads)

    def rest_api_lookup(record_dict):
        # simulates REST call :)
        sleep(0.1)
        key = record_dict["key"]
        record = record_dict["record"]
        record[key + "_neg"] = -record[key]
        return record

    def thread(records):
        chunk = []
        for record in records:
            for key in record:
                chunk.append(pool.submit(rest_api_lookup, {"record": record, "key": key}))
            if len(chunk) == threads:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

    def unchunk(chunk_gen):
        """Flattens a generator of Future chunks into a generator of Future results."""
        for chunk in chunk_gen:
            for f in chunk:
                yield f.result()  # get result from Future

    # Now iterate over all results in same order as records
    for result in unchunk(thread(records)):
        # yield result
        pprint(result)

stream(records())
The 1st issue here is that you're looping over the keys of a record that grows:
for key in list(record): # make a copy of the keys!
I think the 2nd issue here is that you have 3 keys and 8 threads: len(chunk) will be 3, 6, 9, ... while threads is 8, so the following condition is never reached:
if len(chunk) == threads:  # try len(chunk) >= threads
    yield chunk
    chunk = []
The last issue is that you yield incomplete records before all threads are finished. Here is a possible fix:
def unchunk(chunk_gen):
    """Flattens a generator of Future chunks into a generator of Future results."""
    for chunk in chunk_gen:
        old_res = None
        for f in chunk:
            res = f.result()  # get result from Future
            if old_res and res is not old_res:
                yield old_res
            old_res = res
        if old_res:
            yield old_res
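An alternative I'd consider (just a sketch, not the fix above): submit one task per record that loops over the keys itself. Each Future then returns a finished record, the chunking logic disappears, and pool.map keeps the output order aligned with the input; keys within one record run sequentially while records run in parallel:

from concurrent.futures import ThreadPoolExecutor
from time import sleep

def rest_api_lookup(record):
    # one simulated REST call per key, all inside a single task
    for key in list(record):  # copy the keys, since the record grows
        sleep(0.1)
        record[key + "_neg"] = -record[key]
    return record

def stream(records, threads=8):
    with ThreadPoolExecutor(threads) as pool:
        for result in pool.map(rest_api_lookup, records):
            print(result)  # every record arrives complete and in order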
I have a file where on each line I have text like this (representing cast of a film):
[{'cast_id': 23, 'character': "Roger 'Verbal' Kint", 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie's Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
I need to convert it into a valid JSON string, converting only the necessary single quotes to double quotes (e.g. the single quotes around the word Verbal must not be converted, and apostrophes in the text should not be converted either).
I am using Python 3.x. I need to find a regular expression which converts only the right single quotes to double quotes, so that the whole text results in a valid JSON string. Any idea?
First of all, the line you gave as an example is not parsable! … 'Edie's Finneran' … contains a syntax error, no matter what.
Assuming that you have control over the input, you could simply use eval() to read in the file. (Although, in that case one would wonder why you can't produce valid JSON in the first place…)
>>> f = open('list.txt', 'r')
>>> s = f.read().strip()
>>> l = eval(s)
>>> import pprint
>>> pprint.pprint(l)
[{'cast_id': 23,
  'character': "Roger 'Verbal' Kint",
  ...
  'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
>>> import json
>>> json.dumps(l)
'[{"cast_id": 23, "character": "Roger \'Verbal\' Kint", "credit_id": "52fe4260c3a36847f8019af7", "gender": 2, "id": 1979, "name": "Kevin Spacey", "order": 5, "profile_path": "/x7wF050iuCASefLLG75s2uDPFUu.jpg"}, {"cast_id": 27, "character": "Edie\'s Finneran", "credit_id": "52fe4260c3a36847f8019b07", "gender": 1, "id": 2179, "name": "Suzy Amis", "order": 6, "profile_path": "/b1pjkncyLuBtMUmqD1MztD2SG80.jpg"}]'
If you don't have control over the input, this is very dangerous, as it opens you up to code injection attacks.
I cannot emphasize enough that the best solution would be to produce valid JSON in the first place.
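If you can touch the producer, the fix is a single json.dump call (a sketch; the cast list here stands in for whatever Python object the producer holds):

import json

cast = [{'cast_id': 23, 'character': "Roger 'Verbal' Kint", 'name': 'Kevin Spacey'}]
with open('cast.json', 'w') as f:
    json.dump(cast, f)  # writes valid JSON: double quotes, inner apostrophes left alone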
If you do not have control over the JSON data, do not eval() it!
I created a simple JSON correction mechanism, as that is more secure:
def correctSingleQuoteJSON(s):
    rstr = ""
    escaped = False
    for c in s:
        if c == "'" and not escaped:
            c = '"'  # replace single with double quote
        elif c == "'" and escaped:
            rstr = rstr[:-1]  # remove escape character before single quotes
        elif c == '"':
            c = '\\' + c  # escape existing double quotes
        escaped = (c == "\\")  # check for an escape character
        rstr += c  # append the corrected json
    return rstr
You can use the function in the following way:
import json
singleQuoteJson = "[{'cast_id': 23, 'character': 'Roger \\'Verbal\\' Kint', 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie\\'s Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]"
correctJson = correctSingleQuoteJSON(singleQuoteJson)
print(json.loads(correctJson))
Here is the code to get the desired output:
import ast

def getJson(filepath):
    fr = open(filepath, 'r')
    lines = []
    for line in fr.readlines():
        line_split = line.split(",")
        set_line_split = []
        for i in line_split:
            i_split = i.split(":")
            i_set_split = []
            for split_i in i_split:
                set_split_i = ""
                rev = ""
                i = 0
                for ch in split_i:
                    if ch in ['\"', '\'']:
                        set_split_i += ch
                        i += 1
                        break
                    else:
                        set_split_i += ch
                        i += 1
                i_rev = (split_i[i:])[::-1]
                state = False
                for ch in i_rev:
                    if ch in ['\"', '\''] and state == False:
                        rev += ch
                        state = True
                    elif ch in ['\"', '\''] and state == True:
                        rev += ch + "\\"
                    else:
                        rev += ch
                i_rev = rev[::-1]
                set_split_i += i_rev
                i_set_split.append(set_split_i)
            set_line_split.append(":".join(i_set_split))
        line_modified = ",".join(set_line_split)
        lines.append(ast.literal_eval(str(line_modified)))
    return lines

lines = getJson('test.txt')
for i in lines:
    print(i)
Apart from eval() (mentioned in user3850's answer), you can use ast.literal_eval.
This has been discussed in the thread: Using python's eval() vs. ast.literal_eval()?
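For example, on a shortened, syntactically valid version of the line (the real line from the question needs its inner quote fixed first):

import ast
import json

line = """[{'cast_id': 23, 'character': "Roger 'Verbal' Kint"}]"""
data = ast.literal_eval(line)  # parses Python literals only, no arbitrary code execution
print(json.dumps(data))  # [{"cast_id": 23, "character": "Roger 'Verbal' Kint"}]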
You can also look at the following discussion threads from the Kaggle competition, which has data similar to the one mentioned by the OP:
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/89313#latest-517927
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/80045#latest-518338
I was trying to build a shortest-path finder using Dijkstra's algorithm, but it doesn't seem to work. I can't figure out what the problem is. Here are the code and the error message.
(I'm working on Python 3.5. https://www.youtube.com/watch?v=LHCVNtxb4ss)
graph = {
    'A': {'B': 10, 'D': 4, 'F': 10},
    'B': {'E': 5, 'J': 10, 'I': 17},
    'C': {'A': 4, 'D': 10, 'E': 16},
    'D': {'F': 12, 'G': 21},
    'E': {'G': 4},
    'F': {'E': 3},
    'G': {'J': 3},
    'H': {'G': 3, 'J': 3},
    'I': {},
    'J': {'I': 8},
}
def dijkstra(graph, start, end):
    D = {}
    P = {}
    for node in graph.keys():
        D[node] = -1
        P[node] = ""
    D[start] = 0
    unseen_nodes = graph.keys()
    while len(unseen_nodes) > 0:
        shortest = None
        node = ' '
        for temp_node in unseen_nodes:
            if shortest == None:
                shortest = D[temp_node]
                node = temp_node
            elif D[temp_node] < shortest:
                shortest = D[temp_node]
                node = temp_node
        unseen_nodes.remove(node)
        for child_node, child_value in graph[node].items():
            if D[child_node] < D[node] + child_value:
                D[child_node] = D[node] + child_value
                P[child_node] = node
    path = []
    node = end
    while not (node == start):
        if path.count(node) == 0:
            path.insert(0, node)
            node = P[node]
        else:
            break
    path.insert(0, start)
    return path
AttributeError: 'dict_keys' object has no attribute 'remove'
In Python 3, dict.keys() returns a dict_keys object (a view of the dictionary), which does not have a remove method; unlike Python 2, where dict.keys() returned a list object.
>>> graph = {'a': []}
>>> keys = graph.keys()
>>> keys
dict_keys(['a'])
>>> keys.remove('a')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'dict_keys' object has no attribute 'remove'
You can use list(...) to get a list of the keys:
>>> keys = list(graph)
>>> keys
['a']
>>> keys.remove('a')
>>> keys
[]
So change
unseen_nodes = graph.keys()
to
unseen_nodes = list(graph)