I have a json message where at the highest level, I have a dictionary with unknown depth and structures, and am looking to traverse it to format it, ending up with a new, formatted dictionary. After using timeit, I found it to be very slow and discovered that recursion in python is not very quick at all. All that being understood, I don't know how to actually transform my recursive function "Foo.format_it" into a loop based one, if possible.
import time
import json
class Foo(object):
def __init__(self):
self.msg_out = {}
self.msg_in = None
self.sample_data = """
{
"data": {
"a": "",
"b": "",
"c": "127.0.0.1",
"d": 80,
"e": {"f": false,"g": false,"h": false,"i": false,"j": false,"k": false},
"l": [ {"ii": 2, "hh": 10, "gg": 200, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": 3, "ee": 0},
{"ii": 5, "hh": 20, "gg": 300, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": -1, "ee": -1},
{"ii": 5, "hh": 30, "gg": -400, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": -1, "ee": -1}],
"m": true,
"n": true,
"o": 1000,
"p": 2000,
"q": "",
"r": 5,
"s": 0,
"t": true,
"u": true,
"v": {"jj": 5, "kk": 0, "ll": 10, "mm": 9, "nn": [ { "aa": 20, "bb": 30 }, { "aa": 20, "bb": 30 } ] }
}
}
"""
def format(self, msg_in):
print msg_in
self.msg_in = json.loads( msg_in )
self.msg_out = {}
self.format_it(self.msg_in, self.msg_out)
import pprint
print pprint.pformat(self.msg_out)
return json.dumps( self.msg_out )
def ff(self, val, out_struct):
if int(val) < 0:
out_struct[u'ff'] = ""
else:
out_struct[u'ff'] = str(val)
def format_it(self, item, out_struct):
if isinstance(item, dict):
for dict_key, dict_val in item.iteritems():
if dict_key in dir(self):
dict_key = getattr(self, dict_key)(dict_val, out_struct)
if dict_key:
if isinstance(dict_val, dict):
out_struct[dict_key] = {}
self.format_it(dict_val, out_struct[dict_key])
elif isinstance(dict_val, list):
out_struct[dict_key] = []
self.format_it(dict_val, out_struct[dict_key])
else:
out_struct[dict_key] = dict_val
elif isinstance(item, list):
for list_val in item:
if isinstance(list_val, dict):
out_struct.append({})
self.format_it(list_val, out_struct[-1])
elif isinstance(list_val, list):
out_struct.append([])
self.format_it(list_val, out_struct[-1])
else:
out_struct.append(list_val)
else:
pass
if __name__ == "__main__":
tic = time.clock()
f = Foo()
f.format(f.sample_data)
print (time.clock()-tic)
Here is the in data and out data per request, in the simplest case, only the key 'ff' needed to be formatted and so -1 became an empty string:
[IN]
{
"data": {
"a": "",
"b": "",
"c": "127.0.0.1",
"d": 80,
"e": {"f": false,"g": false,"h": false,"i": false,"j": false,"k": false},
"l": [ {"ii": 2, "hh": 10, "gg": 200, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": 3, "ee": 0},
{"ii": 5, "hh": 20, "gg": 300, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": -1, "ee": -1},
{"ii": 5, "hh": 30, "gg": -400, "aa": -1, "bb": -1, "ff":-1, "cc": -1, "dd": -1, "ee": -1}],
"m": true,
"n": true,
"o": 1000,
"p": 2000,
"q": "",
"r": 5,
"s": 0,
"t": true,
"u": true,
"v": {"jj": 5, "kk": 0, "ll": 10, "mm": 9, "nn": [ { "aa": 20, "bb": 30 }, { "aa": 20, "bb": 30 } ] }
}
}
[OUT]
{u'data': {u'a': u'',
u'b': u'',
u'c': u'127.0.0.1',
u'd': 80,
u'e': {u'f': False,
u'g': False,
u'h': False,
u'i': False,
u'j': False,
u'k': False},
u'l': [{u'aa': -1,
u'bb': -1,
u'cc': -1,
u'dd': 3,
u'ee': 0,
u'ff': '',
u'gg': 200,
u'hh': 10,
u'ii': 2},
{u'aa': -1,
u'bb': -1,
u'cc': -1,
u'dd': -1,
u'ee': -1,
u'ff': '',
u'gg': 300,
u'hh': 20,
u'ii': 5},
{u'aa': -1,
u'bb': -1,
u'cc': -1,
u'dd': -1,
u'ee': -1,
u'ff': '',
u'gg': -400,
u'hh': 30,
u'ii': 5}],
u'm': True,
u'n': True,
u'o': 1000,
u'p': 2000,
u'q': u'',
u'r': 5,
u's': 0,
u't': True,
u'u': True,
u'v': {u'jj': 5,
u'kk': 0,
u'll': 10,
u'mm': 9,
u'nn': [{u'aa': 20, u'bb': 30}, {u'aa': 20, u'bb': 30}]}}}
The code is a bit pared down and uses tic/toc vs timeit. In using both, the execution of just the recursion seems to be around .0012s (where I even remove the object creation and json load from the time calculation).
Related
What I want to achieve:
>>> from cerberus import Validator
>>> schema = {"x": {"type": "integer", "required": False}, "y": {"type": "integer", "required": False}}
>>> v = Validator(schema)
>>> v.validate({"x": 5})
True
>>> v.validate({"y": 6})
True
>>> v.validate({"x": 5, "y": 6})
True
>>> v.validate({})
False
I have checked all the document but still don't know how to achieve this result. How should I define the schema?
The only viable solution is to use Validator() multiple times.
from cerberus import Validator
def composite_validator(document):
REQUIRED_INTEGER = {"type": 'integer', "required": True}
OPTIONAL_INTEGER = {"type": 'integer', "required": False}
schemas = [
{"x": REQUIRED_INTEGER, "y": OPTIONAL_INTEGER},
{"x": OPTIONAL_INTEGER, "y": REQUIRED_INTEGER},
]
common_schema = {"z1": REQUIRED_INTEGER, "z2": OPTIONAL_INTEGER, "z3": REQUIRED_INTEGER}
for s in schemas:
s.update(common_schema)
validator = Validator()
return any(validator(document, s) for s in schemas)
Test results:
for case in [
{"x": 5, "z1": 0, "z3": -1},
{"y": 6, "z1": 0, "z3": -1},
{"x": 5, "y": 6, "z1": 0, "z3": -1},
{"z1": 0, "z3": -1}]:
print(case)
print(composite_validator(case))
#{'x': 5, 'z1': 0, 'z3': -1}
#True
#{'y': 6, 'z1': 0, 'z3': -1}
#True
#{'x': 5, 'y': 6, 'z1': 0, 'z3': -1}
#True
#{'z1': 0, 'z3': -1}
#False
I have a JSON that need to convert to Excel.
I'm using Python 3.8 with xlsxwriter library.
Below is sample JSON.
{
"companyId": "123456",
"companyName": "Test",
"companyStatus": "ACTIVE",
"document": {
"employee": {
"employeeId": "EM1567",
"employeeLastName": "Test Last",
"employeeFirstName": "Test Fist"
},
"expenseEntry": [
{
"allocation": [
{
"allocationId": "03B249B3598",
"journal": [
{
"journalAccountCode": "888",
"journalPayee": "EMPL",
"journalPayer": "COMP",
"taxGuid": [
"51645A638114E"
]
},
{
"journalAccountCode": "999",
"journalPayee": "EMPL",
"journalPayer": "EMPL",
"taxGuid": [
"8114E51645A63"
]
},
],
"tax": [
{
"taxCode": "TAX123",
"taxSource": "SYST"
},
{
"taxCode": "TAX456",
"taxSource": "SYST"
}
]
}
],
"approvedAmount": 200.0,
"entryDate": "2020-12-10",
"entryId": "ENTRY9988"
}
],
"report": {
"currencyCode": "USD",
"reportCreationDate": "2020-12-10",
"reportId": "ACA849BBB",
"reportName": "Test Report",
"totalApprovedAmount": 200.0
}
},
"id": "c71b7d756f549"
}
And my current code:
https://repl.it/#tonyiscoming/jsontoexcel
I tried with pandas
import pandas as pd
df = pd.json_normalize(data, max_level=5)
df.to_excel('test.xlsx', index=False)
And got the result
I tried with json_excel_converter
from json_excel_converter import Converter
from json_excel_converter.xlsx import Writer
conv = Converter()
conv.convert(data, Writer(file='test.xlsx'))
And got the result
This is my expectation
Would anyone please help me in this case? Thank you so much.
Here is the code what you are looking for. I did this using XlsxWriter package. First I made the template with some cell format stuff. After that, I entered values using according to your JSON.
import xlsxwriter
from itertools import zip_longest
data = [
{
"companyId": "123456",
"companyName": "Test",
"companyStatus": "ACTIVE",
"document": {
"employee": {
"employeeId": "EM1567",
"employeeLastName": "Test Last",
"employeeFirstName": "Test Fist"
},
"expenseEntry": [
{
"allocation": [
{
"allocationId": "03B249B3598",
"journal": [
{
"journalAccountCode": "888",
"journalPayee": "EMPL",
"journalPayer": "COMP",
"taxGuid": [
"51645A638114E"
]
},
{
"journalAccountCode": "999",
"journalPayee": "EMPL",
"journalPayer": "EMPL",
"taxGuid": [
"8114E51645A63"
]
},
],
"tax": [
{
"taxCode": "TAX123",
"taxSource": "SYST"
},
{
"taxCode": "TAX456",
"taxSource": "SYST"
}
]
}
],
"approvedAmount": 200.0,
"entryDate": "2020-12-10",
"entryId": "ENTRY9988"
}
],
"report": {
"currencyCode": "USD",
"reportCreationDate": "2020-12-10",
"reportId": "ACA849BBB",
"reportName": "Test Report",
"totalApprovedAmount": 200.0
}
},
"id": "c71b7d756f549"
}
]
xlsx_file = 'your_file_name_here.xlsx'
# define the excel file
workbook = xlsxwriter.Workbook(xlsx_file)
# create a sheet for our work, defaults to Sheet1.
worksheet = workbook.add_worksheet()
# common merge format
merge_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
# set all column width to 20
worksheet.set_column('A:V', 20)
# column wise template creation (A-V)
worksheet.merge_range(0, 0, 4, 0, 'companyId', merge_format) # A
worksheet.merge_range(0, 1, 4, 1, 'companyName', merge_format) # B
worksheet.merge_range(0, 2, 4, 2, 'companyStatus', merge_format) # C
worksheet.merge_range(0, 3, 0, 20, 'document', merge_format) # C-U
worksheet.merge_range(1, 3, 1, 5, 'employee', merge_format) # D-F
worksheet.merge_range(2, 3, 4, 3, 'employeeId', merge_format) # D
worksheet.merge_range(2, 4, 4, 4, 'employeeLastName', merge_format) # E
worksheet.merge_range(2, 5, 4, 5, 'employeeFirstName', merge_format) # F
worksheet.merge_range(1, 6, 1, 15, 'expenseEntry', merge_format) # G-P
worksheet.merge_range(2, 6, 2, 12, 'allocation', merge_format) # G-M
worksheet.merge_range(3, 6, 4, 6, 'allocationId', merge_format) # G
worksheet.merge_range(3, 7, 3, 10, 'journal', merge_format) # H-K
worksheet.write(4, 7, 'journalAccountCode') # H
worksheet.write(4, 8, 'journalPayee') # I
worksheet.write(4, 9, 'journalPayer') # J
worksheet.write(4, 10, 'taxGuid') # K
worksheet.merge_range(3, 11, 3, 12, 'tax', merge_format) # L-M
worksheet.write(4, 11, 'taxCode') # L
worksheet.write(4, 12, 'taxSource') # M
worksheet.merge_range(2, 13, 4, 13, 'approvedAmount', merge_format) # N
worksheet.merge_range(2, 14, 4, 14, 'entryDate', merge_format) # O
worksheet.merge_range(2, 15, 4, 15, 'entryId', merge_format) # P
worksheet.merge_range(1, 16, 1, 20, 'report', merge_format) # Q-U
worksheet.merge_range(2, 16, 4, 16, 'currencyCode', merge_format) # Q
worksheet.merge_range(2, 17, 4, 17, 'reportCreationDate', merge_format) # R
worksheet.merge_range(2, 18, 4, 18, 'reportId', merge_format) # S
worksheet.merge_range(2, 19, 4, 19, 'reportName', merge_format) # T
worksheet.merge_range(2, 20, 4, 20, 'totalApprovedAmount', merge_format) # U
worksheet.merge_range(0, 21, 4, 21, 'id', merge_format) # V
# inserting data
row = 5
for obj in data:
worksheet.write(row, 0, obj.get('companyId'))
worksheet.write(row, 1, obj.get('companyName'))
worksheet.write(row, 2, obj.get('companyStatus'))
document = obj.get('document', {})
# employee details
employee = document.get('employee', {})
worksheet.write(row, 3, employee.get('employeeId'))
worksheet.write(row, 4, employee.get('employeeLastName'))
worksheet.write(row, 5, employee.get('employeeFirstName'))
# report details
report = document.get('report', {})
worksheet.write(row, 16, report.get('currencyCode'))
worksheet.write(row, 17, report.get('reportCreationDate'))
worksheet.write(row, 18, report.get('reportId'))
worksheet.write(row, 19, report.get('reportName'))
worksheet.write(row, 20, report.get('totalApprovedAmount'))
worksheet.write(row, 21, obj.get('id'))
# expenseEntry details
expense_entries = document.get('expenseEntry', [])
for expense_entry in expense_entries:
worksheet.write(row, 13, expense_entry.get('approvedAmount'))
worksheet.write(row, 14, expense_entry.get('entryDate'))
worksheet.write(row, 15, expense_entry.get('entryId'))
# allocation details
allocations = expense_entry.get('allocation', [])
for allocation in allocations:
worksheet.write(row, 6, allocation.get('allocationId'))
# journal and tax details
journals = allocation.get('journal', [])
taxes = allocation.get('tax', [])
for journal_and_tax in list(zip_longest(journals, taxes)):
journal, tax = journal_and_tax
worksheet.write(row, 7, journal.get('journalAccountCode'))
worksheet.write(row, 8, journal.get('journalPayee'))
worksheet.write(row, 9, journal.get('journalPayer'))
worksheet.write(row, 11, tax.get('taxCode'))
worksheet.write(row, 12, tax.get('taxSource'))
# taxGuid details
tax_guides = journal.get('taxGuid', [])
if not tax_guides:
row = row + 1
continue
for tax_guide in tax_guides:
worksheet.write(row, 10, tax_guide)
row = row + 1
# finally close the created excel file
workbook.close()
One thing, instead of creating a template in the script you can make your own one and save it somewhere else. Then get the copy of that template and just add data using the script. This will give you a chance to make your own base template, otherwise, you have to format your excel using the script, such as border formattings, merge cells, etc.
I used zip_longest python built-in function from itertools to zip journal and tax objects. Just follow Python – Itertools.zip_longest() or Python's zip_longest Function article for examples. If you didn't understand anything from my code, please comment below.
Having empty cells in an Excel Grid is not something really "propper", which is why json_excel_converter beahaves like this.
So, If you want to achieve this, I'm afraid you'll have to develop it all by yourself.
I'm relatively new to Elasticsearch and am having a problem determining why the number of records from a pythondataframe is different than the indexes document count Elasticsearch.
I start by creating an index by running the following: As you can see there are 62932 records.
I'm creating an index in elasticsearch using the following:
Python code
When I check the index in Kibana Management/Index Management there are only 62630 documents. According to Stats window there were 302 deleted count. I don't know what this means.
Below is the output from the STATS window
{
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"stats": {
"uuid": "egOx_6EwTFysBr0WkJyR1Q",
"primaries": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
},
"total": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
}
}
}
why does the doc count differ from the index total? I've exported the data and the number of records matches the doc count. How can I find out why documents were deleted and make sure they are not in the future?
Possible causes:
Deleted documents tie up disk space in the index.
In-memory per-document data structures, such as norms or field data, will still consume RAM for deleted documents.
Search throughput is lower, since each search must check the deleted bitset for every potential hit. More on this below.
Aggregate term statistics, used for query scoring, will still reflect deleted terms and documents. When a merge completes, the term statistics will suddenly jump closer to their true values, changing hit scores. In practice this impact is minor, unless the deleted documents had divergent statistics from the rest of the index.
A deleted document ties up a document ID from the maximum 2.1 B documents for a single shard. If your shard is riding close to that limit (not recommended!) this could matter.
Fuzzy queries can have slightly different results, because they may match ghost terms.
https://www.elastic.co/guide/en/elasticsearch/reference/current//cat-indices.html
https://www.elastic.co/blog/lucenes-handling-of-deleted-documents
I have json output from the Linux fio command, as shown below, that I'd to parse for values like a dictionary, extracting certain values from certain keys. But the nested layer of this json output is clumping the output into huge "values" in the KVP. Any tips for how I can better parse these nested data structures?
{
"disk_util": [
{
"aggr_util": 96.278308,
"in_queue": 247376,
"write_ticks": 185440,
"read_ticks": 61924,
"write_merges": 0,
"read_merges": 0,
"write_ios": 240866,
"read_ios": 18257,
"name": "dm-0",
"util": 97.257058,
"aggr_read_ios": 18465,
"aggr_write_ios": 243642,
"aggr_read_merges": 1,
"aggr_write_merge": 72,
"aggr_read_ticks": 62420,
"aggr_write_ticks": 185796,
"aggr_in_queue": 245504
},
{
"util": 96.278308,
"name": "sda",
"read_ios": 18465,
"write_ios": 243642,
"read_merges": 1,
"write_merges": 72,
"read_ticks": 62420,
"write_ticks": 185796,
"in_queue": 245504
}
],
"jobs": [
{
"latency_window": 0,
"latency_percentile": 100,
"latency_target": 0,
"latency_depth": 64,
"latency_ms": {
">=2000": 0,
"2000": 0,
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
},
"latency_us": {
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
},
"write": {
"iops_samples": 35,
"iops_stddev": 1608.115728,
"iops_mean": 13835.571429,
"iops_max": 16612,
"iops_min": 9754,
"bw_samples": 35,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 243678,
"runtime": 17611,
"iops": 13836.692976,
"bw": 55346,
"io_kbytes": 974712,
"io_bytes": 998105088,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 39016,
"bw_max": 66448,
"bw_agg": 99.994218,
"bw_mean": 55342.8,
"bw_dev": 6432.427333
},
"read": {
"iops_samples": 35,
"iops_stddev": 126.732776,
"iops_mean": 1048.257143,
"iops_max": 1336,
"iops_min": 772,
"bw_samples": 35,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 18466,
"runtime": 17611,
"iops": 1048.549202,
"bw": 4194,
"io_kbytes": 73864,
"io_bytes": 75636736,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 3088,
"bw_max": 5344,
"bw_agg": 99.993188,
"bw_mean": 4193.714286,
"bw_dev": 506.844597
},
"job options": {
"rwmixread": "7",
"rw": "randrw",
"size": "1G",
"iodepth": "64",
"bs": "4k",
"filename": "test",
"name": "test"
},
"elapsed": 18,
"eta": 0,
"error": 0,
"groupid": 0,
"jobname": "test",
"trim": {
"iops_samples": 0,
"iops_stddev": 0,
"iops_mean": 0,
"iops_max": 0,
"iops_min": 0,
"bw_samples": 0,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 0,
"runtime": 0,
"iops": 0,
"bw": 0,
"io_kbytes": 0,
"io_bytes": 0,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 0,
"bw_max": 0,
"bw_agg": 0,
"bw_mean": 0,
"bw_dev": 0
},
"usr_cpu": 11.447391,
"sys_cpu": 74.680597,
"ctx": 28972,
"majf": 0,
"minf": 31,
"iodepth_level": {
">=64": 99.975967,
"32": 0.1,
"16": 0.1,
"8": 0.1,
"4": 0.1,
"2": 0.1,
"1": 0.1
},
"latency_ns": {
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
}
}
],
"global options": {
"gtod_reduce": "1",
"direct": "1",
"ioengine": "libaio",
"randrepeat": "1"
},
"time": "Sat Oct 14 23:18:28 2017",
"timestamp_ms": 1508023108010,
"timestamp": 1508023108,
"fio version": "fio-3.1"
}
I'm importing it from a file really simplistically:
import json
my_file = open('fio.json', 'r')
my_dict = json.load(my_file)
for k, v in my_dict.items():
print("Key: {0}, value: {1}").format(k, v)
But when iterating, it's making all the nested tables and dicts return munged output, like
Key: disk_util, value: [{u'aggr_write_ticks': 185796, u'write_merges': 0, u'write_ticks': 185440, u'write_ios': 240866, u'aggr_write_ios': 243642, u'aggr_read_ticks': 62420, u'read_ios': 18257, u'util': 97.257058, u'read_ticks': 61924, u'aggr_write_merge': 72, u'read_merges': 0, u'aggr_in_queue': 245504, u'aggr_read_ios': 18465, u'aggr_util': 96.278308, u'aggr_read_merges': 1, u'in_queue': 247376, u'name': u'dm-0'}, {u'read_merges': 1, u'name': u'sda', u'write_ios': 243642, u'read_ios': 18465, u'util': 96.278308, u'read_ticks': 62420, u'write_merges': 72, u'in_queue': 245504, u'write_ticks': 185796}]
json.load() maintain json file type.
You seem to have an syntax error.
In the wrong position ).
import json
my_file = open('fio.json', 'r')
my_dict = json.load(my_file)
for index, key in enumerate(my_dict):
print("Key: {0}, value: {1}".format(key, my_dict[key]))
If I execute aggregation query with matched expression:
>>> devices = [1,2,3,4,5,6] # devices ID's
>>> c = View.objects.aggregate(
{"$match": {"d": {"$in": devices},"f": {"$ne": 1}}},
{"$group":{'_id':"uniqueDocs",'count':{"$sum":1}}}
)
I'm getting result:
>>> list(c)
[{u'count': 2874791, u'_id': u'uniqueDocs'}]
But if execute query with expression not matched:
>>> now = datetime.utcnow().replace(tzinfo=tz.gettz('UTC'))
>>> current_hour_start = now.replace(minute=0, second=0, microsecond=0)
>> c = View.objects.aggregate(
{"$match": {"d": {"$in": devices}, "b": {"$gte": current_hour_start}, "f": {"$ne": 1}}},
{"$group": {'_id': "uniqueDocs", 'count': {"$sum": 1}}})
I'm getting empty cursor:
list(c)
[]
How me get zero count?
as:
>>> list(c)
[{u'count': 0, u'_id': u'uniqueDocs'}]
Update:
Example dataset and expected result.
>>> View.objects()
{
_id: ObjectId("578f79b877824688fc0d68ed") }, {
$set: {
"d": 1, /* device ID */
"i": 1899,
"s": 1,
"a": 1,
"m": 0,
"f": 0,
"b": ISODate("2016-07-20T08:35:56.066Z"), /* begin time */
"e": ISODate("2016-07-20T08:35:57.965Z") /* end time */
}
},
{
_id: ObjectId("578f79b877824688fc0d68ee") }, {
$set: {
"d": 2,
"i": 2456,
"s": 1,
"a": 1,
"m": 0,
"f": 0,
"b": ISODate("2016-07-20T08:37:26.066Z"),
"e": ISODate("2016-07-20T08:37:28.965Z")
}
},
{
_id: ObjectId("578f79b877824688fc0d68ef") }, {
$set: {
"d": 1000,/* !!! ignore this document (no matched device ID) */
"i": 2567,
"s": 1,
"a": 1,
"m": 0,
"f": 0,
"b": ISODate("2016-07-20T08:35:56.066Z"),
"e": ISODate("2016-07-20T08:35:57.965Z")
}
}
>>> c = View.objects.aggregate(
{"$match": {"d": {"$in": devices},"f": {"$ne": 1}}},
{"$group":{'_id':"uniqueDocs",'count':{"$sum":1}}}
).next()['count']
2