Get Object inside JSON Object - Python

I am pretty new to JSON and need to get an Object inside a list of JSON Objects.
This is my data structure and code so far:
{
"nhits": 15,
"parameters": {
"dataset": "100073",
"timezone": "UTC",
"q": "timestamp:[2021-02-21T23:00:00Z TO 2021-03-08T22:59:59Z]",
"rows": 10,
"start": 0,
"sort": [
"timestamp"
],
"format": "json",
"facet": [
"timestamp"
]
},
"records": [
{
"datasetid": "100073",
"recordid": "a1252522b7820edd98eb464811953d0f6ba56458",
"fields": {
"week": 10,
"ncumul_conf": 9971,
"current_quarantined": 506,
"timestamp": "2021-03-08T09:30:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9627,
"ndiff_conf": 4,
"current_quarantined_total": 623,
"current_hosp_resident": 13,
"ncumul_deceased": 192,
"current_isolated": 152,
"current_hosp": 19,
"ndiff_released": 10,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 117,
"time": "10:30",
"date": "2021-03-08",
"ndiff_deceased": 0,
"current_icu": 5,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "c1a9f3fd45008ef3c140e446303ab3c2906166e0",
"fields": {
"week": 9,
"ncumul_conf": 9967,
"current_quarantined": 468,
"timestamp": "2021-03-07T11:40:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9617,
"ndiff_conf": 13,
"current_quarantined_total": 646,
"current_hosp_resident": 14,
"ncumul_deceased": 192,
"current_isolated": 158,
"current_hosp": 20,
"ndiff_released": 16,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 178,
"time": "12:40",
"date": "2021-03-07",
"ndiff_deceased": 0,
"current_icu": 5,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "3668aa9ae4f9cf73890ad8c7f13efef7246cc461",
"fields": {
"week": 9,
"ncumul_conf": 9954,
"current_quarantined": 417,
"timestamp": "2021-03-06T11:20:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9601,
"ndiff_conf": 22,
"current_quarantined_total": 602,
"current_hosp_resident": 13,
"ncumul_deceased": 192,
"current_isolated": 161,
"current_hosp": 19,
"ndiff_released": 23,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 185,
"time": "12:20",
"date": "2021-03-06",
"ndiff_deceased": 0,
"current_icu": 5,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "96a2bfde464cb4664ae8b16723960a7141800e56",
"fields": {
"week": 9,
"ncumul_conf": 9932,
"current_quarantined": 345,
"timestamp": "2021-03-05T09:50:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9578,
"ndiff_conf": 25,
"current_quarantined_total": 550,
"current_hosp_resident": 12,
"ncumul_deceased": 192,
"current_isolated": 162,
"current_hosp": 20,
"ndiff_released": 14,
"current_hosp_non_resident": 8,
"current_quarantined_riskareatravel": 205,
"time": "10:50",
"date": "2021-03-05",
"ndiff_deceased": 0,
"current_icu": 6,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "37a9b2c6a896a7dff362b27b671c71b83f467ccd",
"fields": {
"week": 9,
"ncumul_conf": 9907,
"current_quarantined": 253,
"timestamp": "2021-03-04T09:40:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9564,
"ndiff_conf": 27,
"current_quarantined_total": 481,
"current_hosp_resident": 13,
"ncumul_deceased": 192,
"current_isolated": 151,
"current_hosp": 21,
"ndiff_released": 23,
"current_hosp_non_resident": 8,
"current_quarantined_riskareatravel": 228,
"time": "10:40",
"date": "2021-03-04",
"ndiff_deceased": 0,
"current_icu": 6,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "c7933687391ff92436f1a75503648ce9430e0baa",
"fields": {
"week": 9,
"ncumul_conf": 9880,
"current_quarantined": 241,
"timestamp": "2021-03-03T10:50:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9541,
"ndiff_conf": 13,
"current_quarantined_total": 467,
"current_hosp_resident": 15,
"ncumul_deceased": 192,
"current_isolated": 147,
"current_hosp": 23,
"ndiff_released": 15,
"current_hosp_non_resident": 8,
"current_quarantined_riskareatravel": 226,
"time": "11:50",
"date": "2021-03-03",
"ndiff_deceased": 0,
"current_icu": 7,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "dd830a16c7f18e6cc2d5f8b03f5a75437d1331d3",
"fields": {
"week": 9,
"ncumul_conf": 9867,
"current_quarantined": 197,
"timestamp": "2021-03-02T09:40:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9526,
"ndiff_conf": 28,
"current_quarantined_total": 419,
"current_hosp_resident": 15,
"ncumul_deceased": 192,
"current_isolated": 149,
"current_hosp": 22,
"ndiff_released": 27,
"current_hosp_non_resident": 7,
"current_quarantined_riskareatravel": 222,
"time": "10:40",
"date": "2021-03-02",
"ndiff_deceased": 0,
"current_icu": 7,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "4de6410562c2e0329a9395f8e7687ed098f788b6",
"fields": {
"week": 9,
"ncumul_conf": 9839,
"current_quarantined": 159,
"timestamp": "2021-03-01T09:40:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9499,
"ndiff_conf": -14,
"current_quarantined_total": 365,
"current_hosp_resident": 15,
"ncumul_deceased": 192,
"current_isolated": 148,
"current_hosp": 21,
"ndiff_released": -4,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 206,
"time": "10:40",
"date": "2021-03-01",
"ndiff_deceased": 0,
"current_icu": 7,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "90006046ef1f6627c4c742520e37c99c04eb2db3",
"fields": {
"week": 8,
"ncumul_conf": 9853,
"current_quarantined": 167,
"timestamp": "2021-02-28T08:00:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9503,
"ndiff_conf": 13,
"current_quarantined_total": 358,
"current_hosp_resident": 10,
"ncumul_deceased": 192,
"current_isolated": 158,
"current_hosp": 16,
"ndiff_released": 14,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 191,
"time": "09:00",
"date": "2021-02-28",
"ndiff_deceased": 0,
"current_icu": 7,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
},
{
"datasetid": "100073",
"recordid": "41c0f47f811b68f3ca393546e202b1b698e741c1",
"fields": {
"week": 8,
"ncumul_conf": 9840,
"current_quarantined": 177,
"timestamp": "2021-02-27T09:30:00+00:00",
"source": "https://www.gesundheit.bs.ch",
"ncumul_released": 9489,
"ndiff_conf": 21,
"current_quarantined_total": 359,
"current_hosp_resident": 10,
"ncumul_deceased": 192,
"current_isolated": 159,
"current_hosp": 16,
"ndiff_released": 14,
"current_hosp_non_resident": 6,
"current_quarantined_riskareatravel": 182,
"time": "10:30",
"date": "2021-02-27",
"ndiff_deceased": 0,
"current_icu": 7,
"abbreviation_canton_and_fl": "BS"
},
"record_timestamp": "2021-03-08T21:01:15.004000+00:00"
}
],
"facet_groups": [
{
"facets": [
{
"count": 15,
"path": "2021",
"state": "displayed",
"name": "2021"
}
],
"name": "timestamp"
}
]
}
To get the data in the "records" list I use:
import csv
from urllib.request import urlopen
import json

url = 'https://data.bs.ch/api/records/1.0/search/?dataset=100073&q=timestamp%3A%5B2021-02-21T23%3A00%3A00Z+TO+2021-03-08T22%3A59%3A59Z%5D&sort=timestamp&facet=timestamp'
ddict = {}

def getDataFromBS():
    json_url = urlopen(url)
    data = json.loads(json_url.read())
    records = data['records']

getDataFromBS()
My problem now is that I need to get the data inside the "fields" object, but I don't know how to extract it. Can anyone help me? Even a hint will be much appreciated.

You need to return something from your function. The records field is a list of dictionaries. You can iterate over them and pull out the fields object:
def getDataFromBS():
    json_url = urlopen(url)
    data = json.loads(json_url.read())
    records = data['records']
    fields = [r.get('fields') for r in records]
    return fields

You could directly do data['records'][0]['fields'] after data = getDataFromBS()
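A minimal sketch of that approach, assuming getDataFromBS is changed to return the parsed response:

def getDataFromBS():
    json_url = urlopen(url)
    return json.loads(json_url.read())

data = getDataFromBS()
print(data['records'][0]['fields'])          # the whole "fields" dict of the first record
print(data['records'][0]['fields']['date'])  # or one value from it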

Looks like you have the answer already, but here is another alternative if you just want to return one field. You could add a loop and iterate over the others (see the sketch after this snippet):
import json

with open('test.json') as json_file:
    data = json.load(json_file)

print(data['records'][1]['fields'])
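To iterate over the other records instead of indexing just one, a small sketch along the same lines:

with open('test.json') as json_file:
    data = json.load(json_file)

for record in data['records']:
    print(record['fields'])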

Related

Python groupby/convert join table to triple nested dictionary

From a SQL stored procedure that performs a join on 3 tables I get the data below.
data = [
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 10, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 11, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 1, "part_number": "XTY", "ticket_id": 12, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 13, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 14, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 15, "ticket_month": "JUNE"}]
I would like to group the data on so_number and item_id to return a list of dicts like below.
[
{
"so_number ": "ABC",
"po_status": "OPEN",
"line_items": [
{
"item_id": 0,
"part_number": "XTZ",
"tickets": [
{
"ticket_id": 10,
"ticket_month": "JUNE"
},
{
"ticket_id": 11,
"ticket_month": "JUNE"
}
]
},
{
"item_id": 1,
"part_number": "XTY",
"tickets": [
{
"ticket_id": 12,
"ticket_month": "JUNE"
}
]
}
]
},
{
"so_number ": "DEF",
"po_status": "OPEN",
"line_items": [
{
"item_id": 3,
"part_number": "XTU"
"tickets": [
{
"ticket_id": 13,
"ticket_month": "JUNE"
},
{
"ticket_id": 14,
"ticket_month": "JUNE"
},
{
"ticket_id": 15,
"ticket_month": "JUNE"
}
]
}
]
}
]
I wanted to know if there is an efficient way of doing this. I am open to using pandas as well.
I thought about accessing the 3 SQL tables in a loop and building this list of dicts, but that would probably not be best practice or efficient.
Given the nested structure, you could use groupby in loops:
import pandas as pd
import json
data = [
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 10, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 0, "part_number": "XTZ", "ticket_id": 11, "ticket_month": "JUNE"},
{"so_number": "ABC", "po_status": "OPEN", "item_id": 1, "part_number": "XTY", "ticket_id": 12, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 13, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 14, "ticket_month": "JUNE"},
{"so_number": "DEF", "po_status": "OPEN", "item_id": 3, "part_number": "XTU", "ticket_id": 15, "ticket_month": "JUNE"}]
df = pd.DataFrame(data)
res = []
for (so, po), dfg1 in df.groupby(["so_number", "po_status"]):
    d1 = {"so_number": so,
          "po_status": po,
          "line_items": []
          }
    for (iid, pnb), dfg2 in dfg1.groupby(["item_id", "part_number"]):
        d2 = {"item_id": iid,
              "part_number": pnb,
              "tickets": dfg2[["ticket_id", "ticket_month"]].to_dict(orient="records")
              }
        d1["line_items"].append(d2)
    res.append(d1)
print(json.dumps(res, indent=2, default=int))
Output:
[
{
"so_number ": "ABC",
"po_status": "OPEN",
"line_items": [
{
"item_id": 0,
"part_number": "XTZ",
"tickets": [
{
"ticket_id": 10,
"ticket_month": "JUNE"
},
{
"ticket_id": 11,
"ticket_month": "JUNE"
}
]
},
{
"item_id": 1,
"part_number": "XTY",
"tickets": [
{
"ticket_id": 12,
"ticket_month": "JUNE"
}
]
}
]
},
{
"so_number ": "DEF",
"po_status": "OPEN",
"line_items": [
{
"item_id": 3,
"part_number": "XTU",
"tickets": [
{
"ticket_id": 13,
"ticket_month": "JUNE"
},
{
"ticket_id": 14,
"ticket_month": "JUNE"
},
{
"ticket_id": 15,
"ticket_month": "JUNE"
}
]
}
]
}
]
Edit following your comment: you will still have to define the grouping keys. But you can do it only once and keep all other keys at the last level:
res = []
lvl1 = ["so_number", "po_status"]
lvl2 = ["item_id", "part_number"]
for val1, dfg1 in df.groupby(lvl1):
    d1 = dict(zip(lvl1, val1))
    d1["line_items"] = []
    for val2, dfg2 in dfg1.groupby(lvl2):
        d2 = dict(zip(lvl2, val2))
        d2["tickets"] = dfg2.drop(columns=lvl1+lvl2).to_dict(orient="records")
        d1["line_items"].append(d2)
    res.append(d1)
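If you would rather avoid pandas entirely, the same nesting can be built with plain itertools.groupby; a sketch, assuming the rows arrive already sorted by the grouping keys (as in the sample data):

from itertools import groupby
from operator import itemgetter

res = []
for (so, po), grp1 in groupby(data, key=itemgetter("so_number", "po_status")):
    line_items = []
    for (iid, pnb), grp2 in groupby(list(grp1), key=itemgetter("item_id", "part_number")):
        line_items.append({
            "item_id": iid,
            "part_number": pnb,
            "tickets": [{"ticket_id": r["ticket_id"], "ticket_month": r["ticket_month"]} for r in grp2],
        })
    res.append({"so_number": so, "po_status": po, "line_items": line_items})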

drop selective columns pandas dataframe while flattening

I have created a dataframe from a JSON but want to keep only the first 5 columns of the result.
Here is a part of the JSON:
{
"lat": 52.517,
"lon": 13.3889,
"timezone": "Europe/Berlin",
"timezone_offset": 7200,
"current": {
"dt": 1628156947,
"sunrise": 1628134359,
"sunset": 1628189532,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 79,
"wind_gust": 4.92,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
]
},
"hourly": [
{
"dt": 1628154000,
"temp": 295.26,
"feels_like": 295.09,
"pressure": 1009,
"humidity": 60,
"dew_point": 287.14,
"uvi": 4.01,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.6,
"wind_deg": 83,
"wind_gust": 4.76,
"weather": [
{
"id": 500,
"main": "Rain",
"description": "light rain",
"icon": "10d"
}
],
"pop": 0.49,
"rain": {
"1h": 0.52
}
},
{
"dt": 1628157600,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.76,
"wind_deg": 85,
"wind_gust": 4.91,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
],
"pop": 0.55
},
{
"dt": 1628161200,
"temp": 295.58,
"feels_like": 295.42,
"pressure": 1009,
"humidity": 59,
"dew_point": 287.18,
"uvi": 4.9,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 95,
"wind_gust": 4.73,
"weather": [
{
"id": 802,
"main": "Clouds",
"description": "scattered clouds",
"icon": "03d"
}
],
"pop": 0.59
}
]
}
I have flattened the JSON first like this:
df_history = pd.json_normalize(data_history, max_level=1)
That gave me this structure:
lat lon timezone timezone_offset hourly current.dt current.sunrise current.sunset current.temp current.feels_like ... current.humidity current.dew_point current.uvi current.clouds current.visibility current.wind_speed current.wind_deg current.wind_gust current.weather current.rain
0 52.517 13.3889 Europe/Berlin 7200 [{'dt': 1627776000, 'temp': 17.82, 'feels_like... 1627855200 1627874869 1627930649 16.36 16.4 ... 90 14.72 0 0 10000 3.13 254 11.18 [{'id': 500, 'main': 'Rain', 'description': 'l... {'1h': 0.17}
But I want to keep only the columns up to the column "hourly" and then flatten it.
I have tried this but to no avail:
df_history_small = pd.json_normalize(data_history, record_path='hourly',meta=['dt','temp', 'humidity'], errors='ignore')
What am I doing wrong? How can I achieve my goal?
My final goal is to have a dataframe that looks like this:
lat lon timezone timezone_offset timestamp temp feels_like humidity pressure
0 52.517 13.3889 Europe/Berlin 7200 08/01/2021 00:00:00 17.82 17.46 69 1005
Try:
cols = ['lat', 'lon', 'timezone', 'timezone_offset',
'dt', 'temp', 'feels_like', 'humidity']
out = pd.json_normalize(data_history, ['hourly'], meta=cols[:4])[cols]
>>> out
lat lon timezone timezone_offset dt temp feels_like humidity
0 52.517 13.3889 Europe/Berlin 7200 1628154000 295.26 295.09 60
1 52.517 13.3889 Europe/Berlin 7200 1628157600 295.54 295.43 61
2 52.517 13.3889 Europe/Berlin 7200 1628161200 295.58 295.42 59
Feel free to convert dt to timestamp with:
out['timestamp'] = pd.to_datetime(out['dt'], unit='s')
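And if you want the column named timestamp and rendered like the goal above, a possible follow-up (the exact format string is an assumption):

out['timestamp'] = pd.to_datetime(out['dt'], unit='s').dt.strftime('%m/%d/%Y %H:%M:%S')
out = out.drop(columns='dt')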

How to change numbers in jsonl file and save it

I have a jsonl file with the content below.
How do I read the file, change the number after each "label" key to a random 0 or 1, and save the converted file back in Python?
{"idx": 0, "passage": {"questions": [{"idx": 0, "answers": [{"idx": 0, "label": 0}, {"idx": 1, "label": 0}, {"idx": 2, "label": 0}, {"idx": 3, "label": 0}]}, {"idx": 1, "answers": [{"idx": 4, "label": 0}, {"idx": 5, "label": 0}, {"idx": 6, "label": 0}, {"idx": 7, "label": 0}]}, {"idx": 2, "answers": [{"idx": 8, "label": 1}, {"idx": 9, "label": 0}, {"idx": 10, "label": 0}, {"idx": 11, "label": 0}]}, {"idx": 3, "answers": [{"idx": 12, "label": 0}, {"idx": 13, "label": 0}, {"idx": 14, "label": 0}, {"idx": 15, "label": 0}]}, {"idx": 4, "answers": [{"idx": 16, "label": 0}, {"idx": 17, "label": 0}, {"idx": 18, "label": 0}, {"idx": 19, "label": 0}, {"idx": 20, "label": 0}]}, {"idx": 5, "answers": [{"idx": 21, "label": 0}, {"idx": 22, "label": 0}, {"idx": 23, "label": 0}, {"idx": 24, "label": 0}, {"idx": 25, "label": 0}]}, {"idx": 6, "answers": [{"idx": 26, "label": 0}, {"idx": 27, "label": 0}, {"idx": 28, "label": 0}, {"idx": 29, "label": 0}, {"idx": 30, "label": 0}]}, {"idx": 7, "answers": [{"idx": 31, "label": 0}, {"idx": 32, "label": 0}, {"idx": 33, "label": 0}, {"idx": 34, "label": 0}, {"idx": 35, "label": 0}]}, {"idx": 8, "answers": [{"idx": 36, "label": 0}, {"idx": 37, "label": 0}, {"idx": 38, "label": 0}, {"idx": 39, "label": 0}, {"idx": 40, "label": 0}]}, {"idx": 9, "answers": [{"idx": 41, "label": 0}, {"idx": 42, "label": 0}, {"idx": 43, "label": 0}, {"idx": 44, "label": 0}, {"idx": 45, "label": 0}]}]}}
{"idx": 1, "passage": {"questions": [{"idx": 10, "answers": [{"idx": 46, "label": 0}, {"idx": 47, "label": 0}, {"idx": 48, "label": 0}, {"idx": 49, "label": 0}, {"idx": 50, "label": 0}]}, {"idx": 11, "answers": [{"idx": 51, "label": 0}, {"idx": 52, "label": 0}, {"idx": 53, "label": 0}, {"idx": 54, "label": 0}, {"idx": 55, "label": 0}]}]}}
{"idx": 2, "passage": {"questions": [{"idx": 12, "answers": [{"idx": 56, "label": 0}, {"idx": 57, "label": 0}, {"idx": 58, "label": 0}, {"idx": 59, "label": 0}, {"idx": 60, "label": 0}]}, {"idx": 13, "answers": [{"idx": 61, "label": 0}, {"idx": 62, "label": 0}, {"idx": 63, "label": 0}, {"idx": 64, "label": 0}, {"idx": 65, "label": 0}]}, {"idx": 14, "answers": [{"idx": 66, "label": 0}, {"idx": 67, "label": 0}, {"idx": 68, "label": 0}, {"idx": 69, "label": 0}, {"idx": 70, "label": 0}]}, {"idx": 15, "answers": [{"idx": 71, "label": 0}, {"idx": 72, "label": 0}, {"idx": 73, "label": 0}, {"idx": 74, "label": 0}, {"idx": 75, "label": 0}]}, {"idx": 16, "answers": [{"idx": 76, "label": 0}, {"idx": 77, "label": 0}, {"idx": 78, "label": 0}, {"idx": 79, "label": 0}, {"idx": 80, "label": 0}]}, {"idx": 17, "answers": [{"idx": 81, "label": 0}, {"idx": 82, "label": 0}, {"idx": 83, "label": 0}, {"idx": 84, "label": 0}, {"idx": 85, "label": 0}]}, {"idx": 18, "answers": [{"idx": 86, "label": 0}, {"idx": 87, "label": 0}, {"idx": 88, "label": 0}, {"idx": 89, "label": 0}, {"idx": 90, "label": 0}]}, {"idx": 19, "answers": [{"idx": 91, "label": 0}, {"idx": 92, "label": 0}, {"idx": 93, "label": 0}, {"idx": 94, "label": 0}, {"idx": 95, "label": 0}]}]}}
{"idx": 3, "passage": {"questions": [{"idx": 20, "answers": [{"idx": 96, "label": 0}, {"idx": 97, "label": 0}, {"idx": 98, "label": 0}, {"idx": 99, "label": 0}]}, {"idx": 21, "answers": [{"idx": 100, "label": 0}, {"idx": 101, "label": 0}, {"idx": 102, "label": 0}, {"idx": 103, "label": 0}]}, {"idx": 22, "answers": [{"idx": 104, "label": 0}, {"idx": 105, "label": 0}, {"idx": 106, "label": 0}, {"idx": 107, "label": 0}]}, {"idx": 23, "answers": [{"idx": 108, "label": 0}, {"idx": 109, "label": 0}, {"idx": 110, "label": 0}, {"idx": 111, "label": 0}]}, {"idx": 24, "answers": [{"idx": 112, "label": 0}, {"idx": 113, "label": 0}, {"idx": 114, "label": 0}, {"idx": 115, "label": 0}]}, {"idx": 25, "answers": [{"idx": 116, "label": 0}, {"idx": 117, "label": 0}, {"idx": 118, "label": 0}, {"idx": 119, "label": 0}]}, {"idx": 26, "answers": [{"idx": 120, "label": 0}, {"idx": 121, "label": 0}, {"idx": 122, "label": 0}, {"idx": 123, "label": 0}]}, {"idx": 27, "answers": [{"idx": 124, "label": 0}, {"idx": 125, "label": 0}, {"idx": 126, "label": 0}, {"idx": 127, "label": 0}]}]}}
{"idx": 4, "passage": {"questions": [{"idx": 28, "answers": [{"idx": 128, "label": 1}, {"idx": 129, "label": 1}, {"idx": 130, "label": 1}, {"idx": 131, "label": 1}, {"idx": 132, "label": 1}]}, {"idx": 29, "answers": [{"idx": 133, "label": 0}, {"idx": 134, "label": 1}, {"idx": 135, "label": 1}, {"idx": 136, "label": 0}, {"idx": 137, "label": 1}]}, {"idx": 30, "answers": [{"idx": 138, "label": 0}, {"idx": 139, "label": 0}, {"idx": 140, "label": 1}, {"idx": 141, "label": 0}, {"idx": 142, "label": 0}]}, {"idx": 31, "answers": [{"idx": 143, "label": 0}, {"idx": 144, "label": 0}, {"idx": 145, "label": 0}, {"idx": 146, "label": 0}, {"idx": 147, "label": 0}]}, {"idx": 32, "answers": [{"idx": 148, "label": 0}, {"idx": 149, "label": 0}, {"idx": 150, "label": 0}, {"idx": 151, "label": 0}, {"idx": 152, "label": 0}]}, {"idx": 33, "answers": [{"idx": 153, "label": 0}, {"idx": 154, "label": 1}, {"idx": 155, "label": 1}, {"idx": 156, "label": 1}, {"idx": 157, "label": 1}]}, {"idx": 34, "answers": [{"idx": 158, "label": 0}, {"idx": 159, "label": 0}, {"idx": 160, "label": 0}, {"idx": 161, "label": 0}, {"idx": 162, "label": 0}]}, {"idx": 35, "answers": [{"idx": 163, "label": 0}, {"idx": 164, "label": 1}, {"idx": 165, "label": 1}, {"idx": 166, "label": 0}, {"idx": 167, "label": 0}]}, {"idx": 36, "answers": [{"idx": 168, "label": 0}, {"idx": 169, "label": 0}, {"idx": 170, "label": 1}, {"idx": 171, "label": 0}, {"idx": 172, "label": 0}]}, {"idx": 37, "answers": [{"idx": 173, "label": 1}, {"idx": 174, "label": 0}, {"idx": 175, "label": 0}, {"idx": 176, "label": 0}, {"idx": 177, "label": 0}]}, {"idx": 38, "answers": [{"idx": 178, "label": 0}, {"idx": 179, "label": 1}, {"idx": 180, "label": 1}, {"idx": 181, "label": 0}, {"idx": 182, "label": 1}]}, {"idx": 39, "answers": [{"idx": 183, "label": 1}, {"idx": 184, "label": 1}, {"idx": 185, "label": 1}, {"idx": 186, "label": 0}, {"idx": 187, "label": 0}]}, {"idx": 40, "answers": [{"idx": 188, "label": 0}, {"idx": 189, "label": 0}, {"idx": 190, "label": 1}, {"idx": 191, "label": 0}, {"idx": 192, "label": 0}]}, {"idx": 41, "answers": [{"idx": 193, "label": 0}, {"idx": 194, "label": 0}, {"idx": 195, "label": 0}, {"idx": 196, "label": 0}, {"idx": 197, "label": 0}]}]}}
{"idx": 5, "passage": {"questions": [{"idx": 42, "answers": [{"idx": 198, "label": 0}, {"idx": 199, "label": 0}]}, {"idx": 43, "answers": [{"idx": 200, "label": 1}, {"idx": 201, "label": 0}]}, {"idx": 44, "answers": [{"idx": 202, "label": 0}, {"idx": 203, "label": 0}, {"idx": 204, "label": 0}, {"idx": 205, "label": 0}]}, {"idx": 45, "answers": [{"idx": 206, "label": 0}, {"idx": 207, "label": 0}, {"idx": 208, "label": 0}, {"idx": 209, "label": 0}]}, {"idx": 46, "answers": [{"idx": 210, "label": 0}, {"idx": 211, "label": 0}, {"idx": 212, "label": 0}, {"idx": 213, "label": 0}]}, {"idx": 47, "answers": [{"idx": 214, "label": 0}, {"idx": 215, "label": 0}, {"idx": 216, "label": 0}, {"idx": 217, "label": 0}]}, {"idx": 48, "answers": [{"idx": 218, "label": 0}, {"idx": 219, "label": 0}, {"idx": 220, "label": 0}, {"idx": 221, "label": 0}, {"idx": 222, "label": 0}]}, {"idx": 49, "answers": [{"idx": 223, "label": 1}, {"idx": 224, "label": 0}, {"idx": 225, "label": 0}, {"idx": 226, "label": 0}, {"idx": 227, "label": 0}]}, {"idx": 50, "answers": [{"idx": 228, "label": 1}, {"idx": 229, "label": 0}, {"idx": 230, "label": 0}, {"idx": 231, "label": 0}, {"idx": 232, "label": 0}]}, {"idx": 51, "answers": [{"idx": 233, "label": 0}, {"idx": 234, "label": 0}, {"idx": 235, "label": 0}, {"idx": 236, "label": 0}, {"idx": 237, "label": 0}]}, {"idx": 52, "answers": [{"idx": 238, "label": 1}, {"idx": 239, "label": 0}, {"idx": 240, "label": 0}, {"idx": 241, "label": 0}, {"idx": 242, "label": 0}]}, {"idx": 53, "answers": [{"idx": 243, "label": 0}, {"idx": 244, "label": 0}, {"idx": 245, "label": 0}, {"idx": 246, "label": 1}, {"idx": 247, "label": 1}]}, {"idx": 54, "answers": [{"idx": 248, "label": 1}, {"idx": 249, "label": 1}, {"idx": 250, "label": 1}, {"idx": 251, "label": 1}, {"idx": 252, "label": 0}]}, {"idx": 55, "answers": [{"idx": 253, "label": 0}, {"idx": 254, "label": 0}, {"idx": 255, "label": 0}, {"idx": 256, "label": 0}, {"idx": 257, "label": 0}]}]}}
{"idx": 6, "passage": {"questions": [{"idx": 56, "answers": [{"idx": 258, "label": 1}, {"idx": 259, "label": 0}, {"idx": 260, "label": 1}, {"idx": 261, "label": 0}]}, {"idx": 57, "answers": [{"idx": 262, "label": 1}, {"idx": 263, "label": 1}, {"idx": 264, "label": 1}]}, {"idx": 58, "answers": [{"idx": 265, "label": 1}, {"idx": 266, "label": 1}, {"idx": 267, "label": 1}, {"idx": 268, "label": 1}]}, {"idx": 59, "answers": [{"idx": 269, "label": 1}, {"idx": 270, "label": 1}, {"idx": 271, "label": 1}, {"idx": 272, "label": 1}]}, {"idx": 60, "answers": [{"idx": 273, "label": 1}, {"idx": 274, "label": 1}, {"idx": 275, "label": 1}, {"idx": 276, "label": 1}]}, {"idx": 61, "answers": [{"idx": 277, "label": 1}, {"idx": 278, "label": 1}, {"idx": 279, "label": 1}, {"idx": 280, "label": 1}]}]}}
{"idx": 7, "passage": {"questions": [{"idx": 62, "answers": [{"idx": 281, "label": 0}, {"idx": 282, "label": 1}, {"idx": 283, "label": 1}, {"idx": 284, "label": 1}, {"idx": 285, "label": 0}]}, {"idx": 63, "answers": [{"idx": 286, "label": 0}, {"idx": 287, "label": 0}, {"idx": 288, "label": 0}, {"idx": 289, "label": 0}, {"idx": 290, "label": 1}]}, {"idx": 64, "answers": [{"idx": 291, "label": 0}, {"idx": 292, "label": 0}, {"idx": 293, "label": 0}, {"idx": 294, "label": 0}, {"idx": 295, "label": 0}]}, {"idx": 65, "answers": [{"idx": 296, "label": 1}, {"idx": 297, "label": 1}, {"idx": 298, "label": 1}, {"idx": 299, "label": 1}, {"idx": 300, "label": 1}]}, {"idx": 66, "answers": [{"idx": 301, "label": 1}, {"idx": 302, "label": 0}, {"idx": 303, "label": 1}, {"idx": 304, "label": 0}, {"idx": 305, "label": 1}]}, {"idx": 67, "answers": [{"idx": 306, "label": 0}, {"idx": 307, "label": 0}, {"idx": 308, "label": 0}, {"idx": 309, "label": 1}, {"idx": 310, "label": 1}]}, {"idx": 68, "answers": [{"idx": 311, "label": 0}, {"idx": 312, "label": 0}, {"idx": 313, "label": 0}, {"idx": 314, "label": 1}, {"idx": 315, "label": 0}]}, {"idx": 69, "answers": [{"idx": 316, "label": 1}, {"idx": 317, "label": 1}, {"idx": 318, "label": 1}, {"idx": 319, "label": 1}, {"idx": 320, "label": 1}]}]}}
{"idx": 8, "passage": {"questions": [{"idx": 70, "answers": [{"idx": 321, "label": 0}, {"idx": 322, "label": 0}, {"idx": 323, "label": 0}, {"idx": 324, "label": 0}]}, {"idx": 71, "answers": [{"idx": 325, "label": 1}, {"idx": 326, "label": 0}, {"idx": 327, "label": 0}, {"idx": 328, "label": 0}]}, {"idx": 72, "answers": [{"idx": 329, "label": 0}, {"idx": 330, "label": 0}, {"idx": 331, "label": 0}, {"idx": 332, "label": 0}, {"idx": 333, "label": 0}]}, {"idx": 73, "answers": [{"idx": 334, "label": 0}, {"idx": 335, "label": 0}, {"idx": 336, "label": 0}, {"idx": 337, "label": 0}, {"idx": 338, "label": 0}]}, {"idx": 74, "answers": [{"idx": 339, "label": 0}, {"idx": 340, "label": 0}, {"idx": 341, "label": 0}, {"idx": 342, "label": 1}, {"idx": 343, "label": 1}]}, {"idx": 75, "answers": [{"idx": 344, "label": 1}, {"idx": 345, "label": 1}, {"idx": 346, "label": 0}, {"idx": 347, "label": 0}, {"idx": 348, "label": 0}]}, {"idx": 76, "answers": [{"idx": 349, "label": 0}, {"idx": 350, "label": 1}, {"idx": 351, "label": 0}, {"idx": 352, "label": 0}, {"idx": 353, "label": 0}]}, {"idx": 77, "answers": [{"idx": 354, "label": 0}, {"idx": 355, "label": 0}, {"idx": 356, "label": 0}, {"idx": 357, "label": 1}, {"idx": 358, "label": 0}]}, {"idx": 78, "answers": [{"idx": 359, "label": 0}, {"idx": 360, "label": 1}, {"idx": 361, "label": 0}, {"idx": 362, "label": 0}, {"idx": 363, "label": 0}]}, {"idx": 79, "answers": [{"idx": 364, "label": 0}, {"idx": 365, "label": 0}, {"idx": 366, "label": 0}, {"idx": 367, "label": 0}, {"idx": 368, "label": 0}]}, {"idx": 80, "answers": [{"idx": 369, "label": 0}, {"idx": 370, "label": 0}, {"idx": 371, "label": 0}, {"idx": 372, "label": 0}, {"idx": 373, "label": 0}]}, {"idx": 81, "answers": [{"idx": 374, "label": 0}, {"idx": 375, "label": 0}, {"idx": 376, "label": 0}, {"idx": 377, "label": 0}, {"idx": 378, "label": 0}]}]}}
{"idx": 9, "passage": {"questions": [{"idx": 82, "answers": [{"idx": 379, "label": 0}, {"idx": 380, "label": 0}, {"idx": 381, "label": 0}, {"idx": 382, "label": 0}]}, {"idx": 83, "answers": [{"idx": 383, "label": 0}, {"idx": 384, "label": 1}, {"idx": 385, "label": 0}, {"idx": 386, "label": 0}]}, {"idx": 84, "answers": [{"idx": 387, "label": 0}, {"idx": 388, "label": 0}, {"idx": 389, "label": 0}, {"idx": 390, "label": 0}]}, {"idx": 85, "answers": [{"idx": 391, "label": 1}, {"idx": 392, "label": 0}, {"idx": 393, "label": 1}, {"idx": 394, "label": 1}]}, {"idx": 86, "answers": [{"idx": 395, "label": 0}, {"idx": 396, "label": 0}, {"idx": 397, "label": 0}, {"idx": 398, "label": 0}]}, {"idx": 87, "answers": [{"idx": 399, "label": 0}, {"idx": 400, "label": 0}, {"idx": 401, "label": 0}, {"idx": 402, "label": 1}]}, {"idx": 88, "answers": [{"idx": 403, "label": 0}, {"idx": 404, "label": 0}, {"idx": 405, "label": 1}, {"idx": 406, "label": 0}]}, {"idx": 89, "answers": [{"idx": 407, "label": 0}, {"idx": 408, "label": 0}, {"idx": 409, "label": 0}, {"idx": 410, "label": 1}]}]}}
import json
import random

# read each decoded JSON line into a list
with open('test.jsonl', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

# walk the structure and change the labels
for item in data:
    for q in item['passage']['questions']:
        for a in q['answers']:
            a['label'] = random.randint(0, 1)

# write each JSON line back to a new file
with open('test2.jsonl', 'w', encoding='utf8') as f:
    for item in data:
        json.dump(item, f)
        print(file=f)  # add a newline
You could write back to the same file, but safer to delete and rename once written successfully.
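If you do want to rewrite test.jsonl in place, a sketch of that delete-and-rename pattern (os.replace only swaps the file in once everything has been written):

import os
import tempfile

fd, tmp_path = tempfile.mkstemp(dir='.', suffix='.jsonl')
with os.fdopen(fd, 'w', encoding='utf8') as f:
    for item in data:
        json.dump(item, f)
        print(file=f)
os.replace(tmp_path, 'test.jsonl')  # atomically replaces the original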
Hope this different and fast approach will work:
import json
import random

def find(original, sub):
    # index of the digit right after each occurrence of sub ('"label": ' is 9 chars)
    return [i + 9 for i in range(len(original)) if original.startswith(sub, i)]

def split_into_parts(number, n=2):
    # split number into two near-equal halves
    if number % n == 0:
        return [number // 2, number // 2]
    else:
        return [number // 2, number // 2 + 1]

def get_random_list(num):
    # a shuffled list with (roughly) half zeros and half ones
    o, z = split_into_parts(num, 2)
    ls = [0] * o + [1] * z
    random.shuffle(ls)
    return ls
d={"idx": 0, "passage": {"questions": [{"idx": 0, "answers": [{"idx": 0, "label": 0}, {"idx": 1, "label": 0}, {"idx": 2, "label": 0}, {"idx": 3, "label": 0}]}, {"idx": 1, "answers": [{"idx": 4, "label": 0}, {"idx": 5, "label": 0}, {"idx": 6, "label": 0}, {"idx": 7, "label": 0}]}, {"idx": 2, "answers": [{"idx": 8, "label": 1}, {"idx": 9, "label": 0}, {"idx": 10, "label": 0}, {"idx": 11, "label": 0}]}, {"idx": 3, "answers": [{"idx": 12, "label": 0}, {"idx": 13, "label": 0}, {"idx": 14, "label": 0}, {"idx": 15, "label": 0}]}, {"idx": 4, "answers": [{"idx": 16, "label": 0}, {"idx": 17, "label": 0}, {"idx": 18, "label": 0}, {"idx": 19, "label": 0}, {"idx": 20, "label": 0}]}, {"idx": 5, "answers": [{"idx": 21, "label": 0}, {"idx": 22, "label": 0}, {"idx": 23, "label": 0}, {"idx": 24, "label": 0}, {"idx": 25, "label": 0}]}, {"idx": 6, "answers": [{"idx": 26, "label": 0}, {"idx": 27, "label": 0}, {"idx": 28, "label": 0}, {"idx": 29, "label": 0}, {"idx": 30, "label": 0}]}, {"idx": 7, "answers": [{"idx": 31, "label": 0}, {"idx": 32, "label": 0}, {"idx": 33, "label": 0}, {"idx": 34, "label": 0}, {"idx": 35, "label": 0}]}, {"idx": 8, "answers": [{"idx": 36, "label": 0}, {"idx": 37, "label": 0}, {"idx": 38, "label": 0}, {"idx": 39, "label": 0}, {"idx": 40, "label": 0}]}, {"idx": 9, "answers": [{"idx": 41, "label": 0}, {"idx": 42, "label": 0}, {"idx": 43, "label": 0}, {"idx": 44, "label": 0}, {"idx": 45, "label": 0}]}]}}
original = json.dumps(d)
result = find(original, '"label": ')
zo_list = get_random_list(len(result))

# overwrite the digit at each found position (assumes all labels are single digits)
temp = list(original)
for counter, i in enumerate(result):
    temp[i] = str(zo_list[counter])
res = ''.join(temp)
print(res)

Create json file with annotation for entity from text file and also append the startIndex and endIndex into the dictionary

I would like to append the startIndex and endIndex into the "check". Can anyone please help me?
Example,
I have startIndex and endIndex in dictionary form like this:
"startIndex":3, "endIndex":5
And
I wanted to append it into the "check" in the text to become something like this.
"check": [{"startIndex":3, "endIndex":5}]
If possible, I wish to add another pair of key:value("gender":"M") into this "check" to become:
"check": [{"gender":"M", "startIndex":3, "endIndex":5}]
My code:
My inputs for the start and end indexes are:
{'startIndex': 6, 'endIndex': 10}
{'startIndex': 31, 'endIndex': 35}
{'startIndex': 15, 'endIndex': 19}
{'startIndex': 11, 'endIndex': 15}
{'startIndex': 22, 'endIndex': 26}
{'startIndex': 6, 'endIndex': 10}
{'startIndex': 4, 'endIndex': 8}
{'startIndex': 5, 'endIndex': 9}
{'startIndex': 24, 'endIndex': 28}
{'startIndex': 32, 'endIndex': 36}
{'startIndex': 12, 'endIndex': 16}
{'startIndex': 15, 'endIndex': 19}
{'startIndex': 13, 'endIndex': 17}
{'startIndex': 24, 'endIndex': 28}
{'startIndex': 23, 'endIndex': 27}
{'startIndex': 19, 'endIndex': 23}
{'startIndex': 20, 'endIndex': 24}
{'startIndex': 24, 'endIndex': 28}
{'startIndex': 16, 'endIndex': 20}
{'startIndex': 10, 'endIndex': 14}
{'startIndex': 4, 'endIndex': 8}
{'startIndex': 28, 'endIndex': 32}
{'startIndex': 27, 'endIndex': 31}
{'startIndex': 25, 'endIndex': 29}
{'startIndex': 21, 'endIndex': 25}
{'startIndex': 3, 'endIndex': 7}
{'startIndex': 20, 'endIndex': 24}
{'startIndex': 25, 'endIndex': 29}
{'startIndex': 9, 'endIndex': 13}
TestSmart = [{"text": "I am smart boy", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [], "id": 0}, {"text": "I am so smart last time", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [], "id": 1}, {"text": "They always call me smart boy", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [], "id": 2}]
import json
import re

with open("TestSmart.txt", "r") as f:
    test = f.read().splitlines()

lemmas = []
sea = []
for x in test:
    for m in re.finditer("smart", x):
        start = m.start()
        end = m.end()
        se = dict([("startIndex", start), ("endIndex", end)])
        sea.append(se)

i = 0
null = None
for x in test:
    text = dict([("text", x), ("name", null), ("originalText", null), ("location", []), ("person", []),
                 ("sound", ""), ("check", []), ("id", i)])
    lemmas.append(text)
    i += 1

new = []
for x in text:
    for se in sea:
        if len(text["check"]) == 0 or len(text["check"]) >= 1:
            text["check"].append(se)

print(lemmas)

with open("new.json", "w", encoding="utf-8") as outfile:
    json.dump(lemmas, outfile, ensure_ascii=False)
I wish it to become like this (by looping and inserting my startIndex and endIndex into the list when "smart" appears in the sentence; if "smart" does not appear, nothing should be appended and "check" should remain an empty list):
[{"text": "I am smart boy", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [{"gender":"M", "startIndex":3, "endIndex":4}], "id": 0}, {"text": "I am so smart last time", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [{"gender":"M", "startIndex":10, "endIndex":12}], "id": 1}, {"text": "They always call me smart boy", "name": "null", "original": null, "location": [], "person": [], "sound": "", "check": [{"gender":"M", "startIndex":5, "endIndex":7}], "id": 2}]
You can do everything in one pass through the file and save yourself having to correlate the data in sea with the line from the text file.
It's a lot quicker (IMO) to create a dictionary using this shorthand (and it looks like JSON!):
d = { "key": "value", "key2": "value2" }
rather than
d = dict([ ("key", "value"), ("key2", "value2") ])
Here's how you could redo your script:
lemmas = []
i = 0
null = None
for x in test:
    # find the positions of 'smart' in the text and store them in the list `checks`
    # checks will be empty if the word 'smart' does not appear in the line
    checks = []
    for m in re.finditer("smart", x):
        # create a dict and append it to the list in one step
        checks.append({ "startIndex": m.start(), "endIndex": m.end(), "gender": 'M' })
    # create a dict and append the data for this line to `lemmas`:
    lemmas.append({ "text": x, "name": null, "originalText": null, "location": [], "person": [],
                    "sound": "", "check": checks, "id": i })
    i += 1
print(lemmas)
...and then dump it out as JSON.
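For completeness, that last step can stay exactly as in the question:

with open("new.json", "w", encoding="utf-8") as outfile:
    json.dump(lemmas, outfile, ensure_ascii=False)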
I hope that what I've done is clear -- if not, please ask.

Illegal_argument_exception when importing Twitter into Elasticsearch

I am new to Elasticsearch and am attempting to do some data analysis of Twitter data by importing it into Elasticsearch and running Kibana on it. I'm getting stuck when importing Twitter data into Elasticsearch. Any help is appreciated!
Here's a sample working program that produces the error.
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
data = json.loads(open("data.json").read())
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
Here's the error:
Traceback (most recent call last):
  File "elasticsearch_import_test.py", line 5, in <module>
    es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
  File "/usr/local/lib/python2.7/site-packages/elasticsearch/client/utils.py", line 69, in _wrapped
    return func(*args, params=params, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/elasticsearch/client/__init__.py", line 279, in index
    _make_path(index, doc_type, id), params=params, body=body)
  File "/usr/local/lib/python2.7/site-packages/elasticsearch/transport.py", line 329, in perform_request
    status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
  File "/usr/local/lib/python2.7/site-packages/elasticsearch/connection/http_urllib3.py", line 109, in perform_request
    self._raise_error(response.status, raw_data)
  File "/usr/local/lib/python2.7/site-packages/elasticsearch/connection/base.py", line 108, in _raise_error
    raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
elasticsearch.exceptions.RequestError: TransportError(400, u'illegal_argument_exception', u'[Raza][127.0.0.1:9300][indices:data/write/index[p]]')
Here's an example Twitter JSON file (data.json):
{
"_id": {
"$oid": "570597358c68d71c16b3b722"
},
"contributors": null,
"coordinates": null,
"created_at": "Wed Apr 06 23:09:41 +0000 2016",
"entities": {
"hashtags": [
{
"indices": [
68,
72
],
"text": "dnd"
},
{
"indices": [
73,
79
],
"text": "Nat20"
},
{
"indices": [
80,
93
],
"text": "CriticalRole"
},
{
"indices": [
94,
103
],
"text": "d20babes"
}
],
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
],
"symbols": [],
"urls": [
{
"display_url": "darkcastlecollectibles.com",
"expanded_url": "http://www.darkcastlecollectibles.com/",
"indices": [
44,
67
],
"url": "https://shortened.url/SJgFTE0o8h"
}
],
"user_mentions": [
{
"id": 2375847847,
"id_str": "2375847847",
"indices": [
3,
19
],
"name": "Zack Chini",
"screen_name": "Zenttsilverwing"
}
]
},
"extended_entities": {
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
},
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953295727009793,
"id_str": "715953295727009793",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
]
},
"favorite_count": 0,
"favorited": false,
"filter_level": "low",
"geo": null,
"id": 717851801417031680,
"id_str": "717851801417031680",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 0,
"retweeted": false,
"retweeted_status": {
"contributors": null,
"coordinates": null,
"created_at": "Fri Apr 01 17:25:42 +0000 2016",
"entities": {
"hashtags": [
{
"indices": [
47,
51
],
"text": "dnd"
},
{
"indices": [
52,
58
],
"text": "Nat20"
},
{
"indices": [
59,
72
],
"text": "CriticalRole"
},
{
"indices": [
73,
82
],
"text": "d20babes"
}
],
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
],
"symbols": [],
"urls": [
{
"display_url": "darkcastlecollectibles.com",
"expanded_url": "http://www.darkcastlecollectibles.com/",
"indices": [
23,
46
],
"url": "https://shortened.url/SJgFTE0o8h"
}
],
"user_mentions": []
},
"extended_entities": {
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
},
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953295727009793,
"id_str": "715953295727009793",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
]
},
"favorite_count": 5,
"favorited": false,
"filter_level": "low",
"geo": null,
"id": 715953298076012545,
"id_str": "715953298076012545",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 1,
"retweeted": false,
"source": "Twitter Web Client",
"text": "coins came in!! Thanks https://shortened.url/SJgFTE0o8h #dnd #Nat20 #CriticalRole #d20babes https://shortened.url/YQoxEuEAXV",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Thu Mar 06 19:59:14 +0000 2014",
"default_profile": true,
"default_profile_image": false,
"description": "DM Geek Critter Con-man. I am here to like your art ^.^",
"favourites_count": 4990,
"follow_request_sent": null,
"followers_count": 57,
"following": null,
"friends_count": 183,
"geo_enabled": false,
"id": 2375847847,
"id_str": "2375847847",
"is_translator": false,
"lang": "en",
"listed_count": 7,
"location": "Flower Mound, TX",
"name": "Zack Chini",
"notifications": null,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2375847847/1430928759",
"profile_image_url": "http://pbs.twimg.com/profile_images/708816622358663168/mNF4Ysr5_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/708816622358663168/mNF4Ysr5_normal.jpg",
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "Zenttsilverwing",
"statuses_count": 551,
"time_zone": null,
"url": null,
"utc_offset": null,
"verified": false
}
},
"source": "Twitter Web Client",
"text": "RT #Zenttsilverwing: coins came in!! Thanks https://shortened.url/SJgFTE0o8h #dnd #Nat20 #CriticalRole #d20babes https://shortened.url/YQoxEuEAXV",
"timestamp_ms": "1459984181156",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Tue Feb 10 04:31:18 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "I use Twitter to primarily retweet Critter artwork of Critical Role and their own creations. I maintain a list of all the Critter artists I've come across.",
"favourites_count": 17586,
"follow_request_sent": null,
"followers_count": 318,
"following": null,
"friends_count": 651,
"geo_enabled": true,
"id": 20491914,
"id_str": "20491914",
"is_translator": false,
"lang": "en",
"listed_count": 33,
"location": "SanDiego, CA",
"name": "UnknownOutrider",
"notifications": null,
"profile_background_color": "EDECE9",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme3/bg.gif",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme3/bg.gif",
"profile_background_tile": false,
"profile_image_url": "http://pbs.twimg.com/profile_images/224346493/cartoon_dragon_tattoo_designs_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/224346493/cartoon_dragon_tattoo_designs_normal.jpg",
"profile_link_color": "088253",
"profile_sidebar_border_color": "D3D2CF",
"profile_sidebar_fill_color": "E3E2DE",
"profile_text_color": "634047",
"profile_use_background_image": true,
"protected": false,
"screen_name": "UnknownOutrider",
"statuses_count": 12760,
"time_zone": "Pacific Time (US & Canada)",
"url": null,
"utc_offset": -25200,
"verified": false
}
}
The reason that doesn't work is that you are trying to index a document containing a field named _id, which already exists as a built-in metadata field, so Elasticsearch rejects it. Delete that field or change the field name:
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
data = json.loads(open("data.json").read())
# data['id_'] = data['_id']  # <= alternatively, rename _id to id_
del data['_id']
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
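If you would rather keep the Mongo identifier than delete it, a variant along the lines of the comment above (the key name id_ is just an example):

data['id_'] = data.pop('_id')  # move the value out of the reserved "_id" field
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)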
