Deleted documents when using Elasticsearch API from Python - python

I'm relatively new to Elasticsearch and am having a problem determining why the number of records from a pythondataframe is different than the indexes document count Elasticsearch.
I start by creating an index by running the following: As you can see there are 62932 records.
I'm creating an index in elasticsearch using the following:
Python code
When I check the index in Kibana Management/Index Management there are only 62630 documents. According to Stats window there were 302 deleted count. I don't know what this means.
Below is the output from the STATS window
{
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"stats": {
"uuid": "egOx_6EwTFysBr0WkJyR1Q",
"primaries": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
},
"total": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
}
}
}
why does the doc count differ from the index total? I've exported the data and the number of records matches the doc count. How can I find out why documents were deleted and make sure they are not in the future?

Possible causes:
Deleted documents tie up disk space in the index.
In-memory per-document data structures, such as norms or field data, will still consume RAM for deleted documents.
Search throughput is lower, since each search must check the deleted bitset for every potential hit. More on this below.
Aggregate term statistics, used for query scoring, will still reflect deleted terms and documents. When a merge completes, the term statistics will suddenly jump closer to their true values, changing hit scores. In practice this impact is minor, unless the deleted documents had divergent statistics from the rest of the index.
A deleted document ties up a document ID from the maximum 2.1 B documents for a single shard. If your shard is riding close to that limit (not recommended!) this could matter.
Fuzzy queries can have slightly different results, because they may match ghost terms.
https://www.elastic.co/guide/en/elasticsearch/reference/current//cat-indices.html
https://www.elastic.co/blog/lucenes-handling-of-deleted-documents

Related

Join nested list to ID value

I retrieve data from my DB for a Python app and it comes in the following format (as a list, tbl):
[
{
"id": "rec2fiwnTQewTv9HC",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 19,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "xyz",
"Red": 0,
"Green": 255,
"Blue": 0
}
},
{
"id": "rec4y7vhgZVDHrhrQ",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 30,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "abc",
"Red": 0,
"Green": 255,
"Blue": 0
}
}
]
I can retrieve the values in the fields nested list by doing this:
pd.DataFrame([d['fields'] for d in tbl])
I would like to add the id field to each row of the dataframe but I can't figure out how to do this.
Try:
data = [
{
"id": "rec2fiwnTQewTv9HC",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 19,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "xyz",
"Red": 0,
"Green": 255,
"Blue": 0,
},
},
{
"id": "rec4y7vhgZVDHrhrQ",
"createdTime": "2022-06-27T08:25:47.000Z",
"fields": {
"Num": 30,
"latitude": 31.101405,
"longitude": 36.391831,
"State": 2,
"Label": "abc",
"Red": 0,
"Green": 255,
"Blue": 0,
},
},
]
df = pd.DataFrame([{"id": d["id"], **d["fields"]} for d in data])
print(df)
Prints:
id Num latitude longitude State Label Red Green Blue
0 rec2fiwnTQewTv9HC 19 31.101405 36.391831 2 xyz 0 255 0
1 rec4y7vhgZVDHrhrQ 30 31.101405 36.391831 2 abc 0 255 0

End of File Expected

This is the json file I am working with. I am new to json and after doing some basic research, I was able to dump a dictionary that I had in it with some sample data as placeholders. When I try to use the file though it says that the End of file expected json[9,1] and I have no idea how to fix this as most of the results that I have found on this topic go way over my head. Thanks
{
"923390702359048212": [
0,
0,
0
]
}
{
"462291477964259329": [
0,
0,
0
]
}
{
"803390252265242634": [
0,
0,
0
]
}
{
"832041337968263178": [
0,
0,
0
]
}
{
"824114065445486592": [
0,
0,
0
]
}
You cannot have separate objects in your json file. You need to have this as an array.
[{
"923390702359048212": [
0,
0,
0
]
},
{
"462291477964259329": [
0,
0,
0
]
}]
Missing comma between bracket section an add a level of bracket
{
{
"923390702359048212": [
0,
0,
0
]
},
{
"462291477964259329": [
0,
0,
0
]
}
}
Complete all the json like that and it will be okay

How to get specific data from JSON object in Python

I have a dict stored under the variable parsed:
{
"8119300029": {
"store": 4,
"total": 4,
"web": 4
},
"8119300030": {
"store": 2,
"total": 2,
"web": 2
},
"8119300031": {
"store": 0,
"total": 0,
"web": 0
},
"8119300032": {
"store": 1,
"total": 1,
"web": 1
},
"8119300033": {
"store": 0,
"total": 0,
"web": 0
},
"8119300034": {
"store": 2,
"total": 2,
"web": 2
},
"8119300036": {
"store": 0,
"total": 0,
"web": 0
},
"8119300037": {
"store": 0,
"total": 0,
"web": 0
},
"8119300038": {
"store": 2,
"total": 2,
"web": 2
},
"8119300039": {
"store": 3,
"total": 3,
"web": 3
},
"8119300040": {
"store": 3,
"total": 3,
"web": 3
},
"8119300041": {
"store": 0,
"total": 0,
"web": 0
}
}
I am trying to get the "web" value from each JSON entry but can only get the key values.
for x in parsed:
print(x["web"])
I tried doing this ^ but kept getting this error: "string indices must be integers". Can somebody explain why this is wrong?
because your x variable is dict key name
for x in parsed:
print(parsed[x]['web'])
A little information on your parsed data there: this is basically a dictionary of dictionaries. I won't go into too much of the nitty gritty but it would do well to read up a bit on json: https://www.w3schools.com/python/python_json.asp
In your example, for x in parsed is iterating through the keys of the parsed dictionary, e.g. 8119300029, 8119300030, etc. So x is a key (in this case, a string), not a dictionary. The reason you're getting an error about not indexing with an integer is because you're trying to index a string -- for example x[0] would give you the first character 8 of the key 8119300029.
If you need to get each web value, then you need to access that key in the parsed[x] dictionary:
for x in parsed:
print(parsed[x]["web"])
Output:
4
2
0
...

Python - Parse (fio) json output

I have json output from the Linux fio command, as shown below, that I'd to parse for values like a dictionary, extracting certain values from certain keys. But the nested layer of this json output is clumping the output into huge "values" in the KVP. Any tips for how I can better parse these nested data structures?
{
"disk_util": [
{
"aggr_util": 96.278308,
"in_queue": 247376,
"write_ticks": 185440,
"read_ticks": 61924,
"write_merges": 0,
"read_merges": 0,
"write_ios": 240866,
"read_ios": 18257,
"name": "dm-0",
"util": 97.257058,
"aggr_read_ios": 18465,
"aggr_write_ios": 243642,
"aggr_read_merges": 1,
"aggr_write_merge": 72,
"aggr_read_ticks": 62420,
"aggr_write_ticks": 185796,
"aggr_in_queue": 245504
},
{
"util": 96.278308,
"name": "sda",
"read_ios": 18465,
"write_ios": 243642,
"read_merges": 1,
"write_merges": 72,
"read_ticks": 62420,
"write_ticks": 185796,
"in_queue": 245504
}
],
"jobs": [
{
"latency_window": 0,
"latency_percentile": 100,
"latency_target": 0,
"latency_depth": 64,
"latency_ms": {
">=2000": 0,
"2000": 0,
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
},
"latency_us": {
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
},
"write": {
"iops_samples": 35,
"iops_stddev": 1608.115728,
"iops_mean": 13835.571429,
"iops_max": 16612,
"iops_min": 9754,
"bw_samples": 35,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 243678,
"runtime": 17611,
"iops": 13836.692976,
"bw": 55346,
"io_kbytes": 974712,
"io_bytes": 998105088,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 39016,
"bw_max": 66448,
"bw_agg": 99.994218,
"bw_mean": 55342.8,
"bw_dev": 6432.427333
},
"read": {
"iops_samples": 35,
"iops_stddev": 126.732776,
"iops_mean": 1048.257143,
"iops_max": 1336,
"iops_min": 772,
"bw_samples": 35,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 18466,
"runtime": 17611,
"iops": 1048.549202,
"bw": 4194,
"io_kbytes": 73864,
"io_bytes": 75636736,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 3088,
"bw_max": 5344,
"bw_agg": 99.993188,
"bw_mean": 4193.714286,
"bw_dev": 506.844597
},
"job options": {
"rwmixread": "7",
"rw": "randrw",
"size": "1G",
"iodepth": "64",
"bs": "4k",
"filename": "test",
"name": "test"
},
"elapsed": 18,
"eta": 0,
"error": 0,
"groupid": 0,
"jobname": "test",
"trim": {
"iops_samples": 0,
"iops_stddev": 0,
"iops_mean": 0,
"iops_max": 0,
"iops_min": 0,
"bw_samples": 0,
"drop_ios": 0,
"short_ios": 0,
"total_ios": 0,
"runtime": 0,
"iops": 0,
"bw": 0,
"io_kbytes": 0,
"io_bytes": 0,
"slat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"clat_ns": {
"percentile": {
"0.00": 0
},
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"lat_ns": {
"stddev": 0,
"mean": 0,
"max": 0,
"min": 0
},
"bw_min": 0,
"bw_max": 0,
"bw_agg": 0,
"bw_mean": 0,
"bw_dev": 0
},
"usr_cpu": 11.447391,
"sys_cpu": 74.680597,
"ctx": 28972,
"majf": 0,
"minf": 31,
"iodepth_level": {
">=64": 99.975967,
"32": 0.1,
"16": 0.1,
"8": 0.1,
"4": 0.1,
"2": 0.1,
"1": 0.1
},
"latency_ns": {
"1000": 0,
"750": 0,
"2": 0,
"4": 0,
"10": 0,
"20": 0,
"50": 0,
"100": 0,
"250": 0,
"500": 0
}
}
],
"global options": {
"gtod_reduce": "1",
"direct": "1",
"ioengine": "libaio",
"randrepeat": "1"
},
"time": "Sat Oct 14 23:18:28 2017",
"timestamp_ms": 1508023108010,
"timestamp": 1508023108,
"fio version": "fio-3.1"
}
I'm importing it from a file really simplistically:
import json
my_file = open('fio.json', 'r')
my_dict = json.load(my_file)
for k, v in my_dict.items():
print("Key: {0}, value: {1}").format(k, v)
But when iterating, it's making all the nested tables and dicts return munged output, like
Key: disk_util, value: [{u'aggr_write_ticks': 185796, u'write_merges': 0, u'write_ticks': 185440, u'write_ios': 240866, u'aggr_write_ios': 243642, u'aggr_read_ticks': 62420, u'read_ios': 18257, u'util': 97.257058, u'read_ticks': 61924, u'aggr_write_merge': 72, u'read_merges': 0, u'aggr_in_queue': 245504, u'aggr_read_ios': 18465, u'aggr_util': 96.278308, u'aggr_read_merges': 1, u'in_queue': 247376, u'name': u'dm-0'}, {u'read_merges': 1, u'name': u'sda', u'write_ios': 243642, u'read_ios': 18465, u'util': 96.278308, u'read_ticks': 62420, u'write_merges': 72, u'in_queue': 245504, u'write_ticks': 185796}]
json.load() maintain json file type.
You seem to have an syntax error.
In the wrong position ).
import json
my_file = open('fio.json', 'r')
my_dict = json.load(my_file)
for index, key in enumerate(my_dict):
print("Key: {0}, value: {1}".format(key, my_dict[key]))

h5py order of dataset by dataset name

I am creating an h5 files with 5 datasets ['a160'],['a1214']
How can I make it so that the datasets will be sorted by the dataset name..
For example when I do h5dump on my file I get:
HDF5 "jjjj.h5" {
GROUP "/" {
DATASET "a1214" {
DATATYPE H5T_IEEE_F32BE
DATASPACE SIMPLE { ( 1, 19 ) / ( H5S_UNLIMITED, 19 ) }
DATA {
(0,0): 160, 0, 165, 4, 2.29761, 264, 4, 1.74368, 1, 0, 17, 193, 0, 0,
(0,14): 0, 0, 0, 0, 0
}
}
DATASET "a160" {
DATATYPE H5T_IEEE_F32BE
DATASPACE SIMPLE { ( 3, 19 ) / ( H5S_UNLIMITED, 19 ) }
DATA {
(0,0): 263, 0, 262, 7, 4.90241, 201, 34, 0.348432, 1, 0, 29, 11, 0, 0,
(0,14): 0, 0, 0, 0, 0,
}
}
But I want it to be ordered by the dataset name, I need h5dump to output
HDF5 "jjjj.h5" {
GROUP "/" {
DATASET "a160" {
DATATYPE H5T_IEEE_F32BE
DATASPACE SIMPLE { ( 3, 19 ) / ( H5S_UNLIMITED, 19 ) }
DATA {
(0,0): 263, 0, 262, 7, 4.90241, 201, 34, 0.348432, 1, 0, 29, 11, 0, 0,
(0,14): 0, 0, 0, 0, 0,
}
}
DATASET "a1214" {
DATATYPE H5T_IEEE_F32BE
DATASPACE SIMPLE { ( 1, 19 ) / ( H5S_UNLIMITED, 19 ) }
DATA {
(0,0): 160, 0, 165, 4, 2.29761, 264, 4, 1.74368, 1, 0, 17, 193, 0, 0,
(0,14): 0, 0, 0, 0, 0
}
}
}
By default h5dump sorts HDF5 files' groups and attributes by their names in ascending order:
-q Q, --sort_by=Q Sort groups and attributes by index Q
-z Z, --sort_order=Z Sort groups and attributes by order Z
Q - is the sort index type. It can be "creation_order" or "name" (default)
Z - is the sort order type. It can be "descending" or "ascending" (default)
The problem in this case is that "a160" is considered greater than "a1214" because that's how dictionary sorting works ('a12' < 'a16').
There's no change you can make to the internal structure of the HDF5 file to force h5dump to sort these data structures in a different order. However, you could zero-pad your names like so:
a0040
a0160
a1214
and then the standard dictionary sort will output the file the way you want.

Categories

Resources