I have created a dataframe from a JSON but want to keep only the first 5 columns of the result.
Here is a part of the JSON:
{
"lat": 52.517,
"lon": 13.3889,
"timezone": "Europe/Berlin",
"timezone_offset": 7200,
"current": {
"dt": 1628156947,
"sunrise": 1628134359,
"sunset": 1628189532,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 79,
"wind_gust": 4.92,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
]
},
"hourly": [
{
"dt": 1628154000,
"temp": 295.26,
"feels_like": 295.09,
"pressure": 1009,
"humidity": 60,
"dew_point": 287.14,
"uvi": 4.01,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.6,
"wind_deg": 83,
"wind_gust": 4.76,
"weather": [
{
"id": 500,
"main": "Rain",
"description": "light rain",
"icon": "10d"
}
],
"pop": 0.49,
"rain": {
"1h": 0.52
}
},
{
"dt": 1628157600,
"temp": 295.54,
"feels_like": 295.43,
"pressure": 1009,
"humidity": 61,
"dew_point": 287.66,
"uvi": 4.53,
"clouds": 20,
"visibility": 10000,
"wind_speed": 3.76,
"wind_deg": 85,
"wind_gust": 4.91,
"weather": [
{
"id": 801,
"main": "Clouds",
"description": "few clouds",
"icon": "02d"
}
],
"pop": 0.55
},
{
"dt": 1628161200,
"temp": 295.58,
"feels_like": 295.42,
"pressure": 1009,
"humidity": 59,
"dew_point": 287.18,
"uvi": 4.9,
"clouds": 36,
"visibility": 10000,
"wind_speed": 3.58,
"wind_deg": 95,
"wind_gust": 4.73,
"weather": [
{
"id": 802,
"main": "Clouds",
"description": "scattered clouds",
"icon": "03d"
}
],
"pop": 0.59
}
]
}
I have flattened the JSON first like this:
df_history = pd.json_normalize(data_history, max_level=1)
That gave me this structure:
lat lon timezone timezone_offset hourly current.dt current.sunrise current.sunset current.temp current.feels_like ... current.humidity current.dew_point current.uvi current.clouds current.visibility current.wind_speed current.wind_deg current.wind_gust current.weather current.rain
0 52.517 13.3889 Europe/Berlin 7200 [{'dt': 1627776000, 'temp': 17.82, 'feels_like... 1627855200 1627874869 1627930649 16.36 16.4 ... 90 14.72 0 0 10000 3.13 254 11.18 [{'id': 500, 'main': 'Rain', 'description': 'l... {'1h': 0.17}
But I want to keep only the columns up to the "hourly" column and then flatten it.
I have tried this but to no avail:
df_history_small = pd.json_normalize(data_history, record_path='hourly',meta=['dt','temp', 'humidity'], errors='ignore')
What am I doing wrong? How can I achieve my goal?
My final goal is to have a dataframe that looks like this:
lat lon timezone timezone_offset timestamp temp feels_like humidity pressure
0 52.517 13.3889 Europe/Berlin 7200 08/01/2021 00:00:00 17.82 17.46 69 1005
Try:
cols = ['lat', 'lon', 'timezone', 'timezone_offset',
'dt', 'temp', 'feels_like', 'humidity']
out = pd.json_normalize(data_history, ['hourly'], meta=cols[:4])[cols]
>>> out
lat lon timezone timezone_offset dt temp feels_like humidity
0 52.517 13.3889 Europe/Berlin 7200 1628154000 295.26 295.09 60
1 52.517 13.3889 Europe/Berlin 7200 1628157600 295.54 295.43 61
2 52.517 13.3889 Europe/Berlin 7200 1628161200 295.58 295.42 59
Feel free to convert dt to timestamp with:
out['timestamp'] = pd.to_datetime(out['dt'], unit='s')
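If you want the exact layout from the question, one possible follow-up is a plain column selection (a sketch; 'pressure' would additionally have to be added to cols above for json_normalize to keep it):
final = out[['lat', 'lon', 'timezone', 'timezone_offset',
             'timestamp', 'temp', 'feels_like', 'humidity']]
print(final)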
In the following data structure:
[
{
"id": 28,
"country": "Brazil",
"country_code": "BR",
"country_population": 201103330,
"province": "",
"last_updated": "2020-04-03T01:40:00.724616Z",
"coordinates": {
"latitude": "-14.235",
"longitude": "-51.9253"
},
"latest": {
"confirmed": 8044,
"deaths": 324,
"recovered": 0
},
"timelines": {
"confirmed": {
"latest": 8044,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"deaths": {
"latest": 324,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"recovered": {
"latest": 0,
"timeline": {}
}
}
}
]
How do I get the "timeline" items from the "timelines" key?
You should at least provide a piece of code showing what you have tried so far.
d = [
{
"id": 28,
"country": "Brazil",
"country_code": "BR",
"country_population": 201103330,
"province": "",
"last_updated": "2020-04-03T01:40:00.724616Z",
"coordinates": {
"latitude": "-14.235",
"longitude": "-51.9253"
},
"latest": {
"confirmed": 8044,
"deaths": 324,
"recovered": 0
},
"timelines": {
"confirmed": {
"latest": 8044,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"deaths": {
"latest": 324,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"recovered": {
"latest": 0,
"timeline": {}
}
}
}
]
print(d[0]["timelines"]["confirmed"]["timeline"])
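If you want all three timelines rather than just the confirmed one, a minimal sketch (using the same d as above) is:
timelines = {kind: d[0]["timelines"][kind]["timeline"] for kind in d[0]["timelines"]}
print(timelines["deaths"])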
By the way:
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
This looks odd to me; shouldn't timeline be an array instead of an object?
print([[i, d[0]['timelines'][i]['timeline']] for i in d[0]['timelines']])
Your JSON does indeed have an issue:
"JSONDecodeError: Expecting property name enclosed in double quotes: line 24 column 9 (char 558)"
which points at the timeline block posted above:
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0, <----
"2020-01-24T00:00:00Z": 0,
JSON files often have formatting issues like this (here it is the trailing commas), and you may have to develop your own method of reading them if they are outside the general norm; I've had to do this a few times.
import json
x = """[{
"id": 28,
"country": "Brazil",
"country_code": "BR",
"country_population": 201103330,
"province": "",
"last_updated": "2020-04-03T01:40:00.724616Z",
"coordinates": {
"latitude": "-14.235",
"longitude": "-51.9253"
},
"latest": {
"confirmed": 8044,
"deaths": 324,
"recovered": 0
},
"timelines": {
"confirmed": {
"latest": 8044,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"deaths": {
"latest": 324,
"timeline": {
"2020-01-22T00:00:00Z": 0,
"2020-01-23T00:00:00Z": 0,
"2020-01-24T00:00:00Z": 0,
}
},
"recovered": {
"latest": 0,
"timeline": {}
}
}
}]"""
y = json.loads(x)  # raises json.decoder.JSONDecodeError because of the trailing commas
print(y)
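If the trailing commas are the only problem with the file, one possible workaround (a sketch, not a general-purpose JSON repair) is to strip any comma that directly precedes a closing brace or bracket before parsing:
import re
cleaned = re.sub(r",\s*([}\]])", r"\1", x)  # drop trailing commas before } or ]
y = json.loads(cleaned)
print(y[0]["timelines"]["confirmed"]["timeline"])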
I have a graph below made in Plotly (Python). I have set the graph's y-axis to a log scale, but I don't know why there are intermediate values between 1000 and 10k like (2, 3, 4, 5, 6, ...). What do they represent, and how can they be removed? I do want to remove them.
The properties of the axes are given below:
trace1 = {
"x": [1, 2, 3, 4, 5],
"y": [500, 900, 1100, 10000, 300],
"marker": {
"color": 'rgb(255,140,0)',
},
}
data = Data([trace1])
layout = {
"autosize": False,
"bargap": 0.0,
"height": 480,
"hovermode": "closest",
"margin": {
"r": 63,
"t": 57,
"b": 52,
"l": 80,
"pad": 0
},
"showlegend": False,
"titlefont": {
"color": "#000000",
"size": 12.0
},
"width": 640,
"xaxis": {
"anchor": "y",
"domain": [0.0, 1.0],
"mirror": "ticks",
"nticks": 9,
"range": [0.5, 10.5],
"showgrid": False,
"showline": True,
"side": "bottom",
"tickfont": {"size": 20.0},
"ticks": "inside",
"title": "x-axes",
"titlefont": {
"color": "black",
"size": 22.0
},
"type": "linear",
"zeroline": False
},
"yaxis": {
"tickmode":'auto',
"ticks":'outside',
"tick0":0,
"dtick":100,
"ticklen":8,
"tickwidth":4,
"anchor": "x",
"mirror": "ticks",
"domain": [0.0, 1.0],
"nticks": 6,
"showgrid": False,
"showline": True,
"side": "left",
"tickfont": {"size": 20.0},
"ticks": "inside",
"title": "y-axis",
"titlefont": {
"color": "black",
"size": 22.0
},
"type": "log",
"zeroline": False,
}
}
fig = dict(data=data, layout=layout)
py.plot(fig, filename='log-no-log')
The numbers represent the intermediate multiples until you get to the next order of magnitude. The first 2, 3, 4, 5, etc. are multiples of 100 until you reach the 1000 tick; after that they mark multiples of 1000 until you reach 10k. Because of the logarithmic scale they are not drawn equidistant.
So they might be useful if you want to read from the graph what your value at e.g. x=2 is.
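To hide those intermediate labels, one option (a sketch against the layout dict above; on a log axis dtick is measured in decades, so dtick=1 keeps only the ticks at powers of ten) is to override the yaxis tick settings before plotting:
layout["yaxis"].update({
    "tickmode": "linear",  # use tick0/dtick instead of automatic ticks
    "tick0": 0,
    "dtick": 1,            # one tick per power of 10 on a log axis
})
fig = dict(data=data, layout=layout)
py.plot(fig, filename='log-no-log')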
I have the following JSON which I got from Xero.
It is a nested JSON and I'm trying to create a flat table and then export it to CSV.
I have written this Python code, but I'm struggling to flatten the nested JSON.
Initially I get the data from Xero and use json.dumps to serialise the datetimes. The JSON export displayed here comes from the Postman software. When I get the JSON using Python, the date format is the following: 'UpdatedDateUTC': datetime.datetime(2018, 10, 24, 12, 53, 55, 930000), so I use json.dumps to serialise it.
When I produce the first export:
df = pd.read_json(b_str)
df.to_csv(path+'invoices.csv')
The CSV file looks like this:
The next step is to flatten the Contact and CreditNotes columns and make them part of the main table. So instead of the Contact column I will have 8 new columns: ContactID, ContactNumber, Name, Addresses, Phones, ContactGroups, ContactPersons, HasValidationErrors. A similar process applies to the CreditNotes column.
I'm trying to replicate the methodology in this link, but with no luck. I get an export which looks like this: the contacts_with_id dataframe is spread over multiple rows instead of multiple columns. I can't figure out what I am doing wrong.
I have also tried the flatten_json function, but with no luck either.
I don't really need to make this particular methodology work; I just want to find a way to export the nested JSON to a readable CSV file.
Python Code:
from xero import Xero
from xero.auth import PrivateCredentials
with open("E:\\privatekey.pem") as keyfile:
rsa_key = keyfile.read()
credentials = PrivateCredentials('BHK1ZBEKIL4WM0BLKLIOT65PSIA43N', rsa_key)
xero = Xero(credentials)
import json
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df
# The following is a list
a_list = xero.invoices.all()
# The following is a string. Serialised Datetime
b_str = json.dumps(a_list, default=str)
path='E:\\MyDrive\\Python Workspaces\\'
df = pd.read_json(b_str)
df.to_csv(path+'invoices.csv')
# ********************* FLATTEN JSON *****************
dd = json.loads(b_str)
contacts_with_id = pd.io.json.json_normalize(dd, record_path='Contact', meta='InvoiceID',
record_prefix='Contact.')
df_final = pd.merge(contacts_with_id, df, how='inner', on='InvoiceID')
df_final.to_csv(path+'invoices_final.csv')
Json Script Below:
{
"Id": "568d1686-7c53-4f22-a93f-754589a246a7",
"Status": "OK",
"ProviderName": "Rest API",
"DateTimeUTC": "/Date(1552234854959)/",
"Invoices": [
{
"Type": "ACCPAY",
"InvoiceID": "8289ab9d-2134-4601-8622-e7fdae4b6d89",
"InvoiceNumber": "10522",
"Reference": "10522",
"Payments": [],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 102,
"AmountPaid": 0,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": false,
"Contact": {
"ContactID": "d1dba397-0f0b-4819-a6ce-2839b7be5008",
"ContactNumber": "c03bbcb5-fb0b-4f46-83f0-8687f754488b",
"Name": "Micro",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2017-02-06T00:00:00",
"Date": "/Date(1486339200000+0000)/",
"DueDateString": "2017-03-08T00:00:00",
"DueDate": "/Date(1488931200000+0000)/",
"Status": "AUTHORISED",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 85,
"TotalTax": 17,
"Total": 102,
"UpdatedDateUTC": "/Date(1529940362110+0000)/",
"CurrencyCode": "GBP"
},
{
"Type": "ACCREC",
"InvoiceID": "9e37150f-88a5-4213-a085-b30c5e01c2bf",
"InvoiceNumber": "(13)",
"Reference": "",
"Payments": [],
"CreditNotes": [
{
"CreditNoteID": "3c5c7dec-534a-46e0-ad1b-f0f69822cfd5",
"CreditNoteNumber": "(12)",
"ID": "3c5c7dec-534a-46e0-ad1b-f0f69822cfd5",
"AppliedAmount": 1200,
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"LineItems": [],
"Total": 7800
},
{
"CreditNoteID": "af38e37f-4ba3-4208-a193-a32b418c2bbc",
"CreditNoteNumber": "(14)",
"ID": "af38e37f-4ba3-4208-a193-a32b418c2bbc",
"AppliedAmount": 2600,
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"LineItems": [],
"Total": 2600
}
],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 0,
"AmountPaid": 0,
"AmountCredited": 3800,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": false,
"Contact": {
"ContactID": "58164bd6-5225-4f30-ad89-35140db5b624",
"ContactNumber": "d0b420b8-4a58-40d1-9717-8525edda7658",
"Name": "FSales (1)",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"DueDateString": "2011-06-03T00:00:00",
"DueDate": "/Date(1307059200000+0000)/",
"Status": "PAID",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 3166.67,
"TotalTax": 633.33,
"Total": 3800,
"UpdatedDateUTC": "/Date(1529943661150+0000)/",
"CurrencyCode": "GBP",
"FullyPaidOnDate": "/Date(1304467200000+0000)/"
},
{
"Type": "ACCPAY",
"InvoiceID": "1ddea7ec-a0d5-457a-a8fd-cfcdc2099d51",
"InvoiceNumber": "01596057543",
"Reference": "",
"Payments": [
{
"PaymentID": "fd639da3-c009-47df-a4bf-98ccd5c68e43",
"Date": "/Date(1551657600000+0000)/",
"Amount": 173.86,
"Reference": "",
"CurrencyRate": 1,
"HasAccount": false,
"HasValidationErrors": false
}
],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 0,
"AmountPaid": 173.86,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": true,
"Contact": {
"ContactID": "309afb74-0a3b-4d68-85e8-2259ca5acd13",
"ContactNumber": "91eef1f0-5fe6-45d7-b739-1ab5352a5523",
"Name": "Company AAA",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2019-02-23T00:00:00",
"Date": "/Date(1550880000000+0000)/",
"DueDateString": "2019-03-21T00:00:00",
"DueDate": "/Date(1553126400000+0000)/",
"Status": "PAID",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 144.88,
"TotalTax": 28.98,
"Total": 173.86,
"UpdatedDateUTC": "/Date(1551777481907+0000)/",
"CurrencyCode": "GBP",
"FullyPaidOnDate": "/Date(1551657600000+0000)/"
},
{
"Type": "ACCPAY",
"InvoiceID": "ba5ff3b1-1058-4645-80da-5475c23da949",
"InvoiceNumber": "Q0603",
"Reference": "",
"Payments": [],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 213.24,
"AmountPaid": 0,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": true,
"Contact": {
"ContactID": "f0473b41-da92-4397-9d2c-741812f2475c",
"ContactNumber": "1f124969-de8d-40b8-8140-d4997511b0dc",
"Name": "BTelcom",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2019-03-05T00:00:00",
"Date": "/Date(1551744000000+0000)/",
"DueDateString": "2019-03-21T00:00:00",
"DueDate": "/Date(1553126400000+0000)/",
"Status": "SUBMITTED",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 177.7,
"TotalTax": 35.54,
"Total": 213.24,
"UpdatedDateUTC": "/Date(1552068778417+0000)/",
"CurrencyCode": "GBP"
}
]
}
I've had to do something like this before:
Basically, flatten out the entire nested JSON, then iterate over the flattened keys (which follow a pattern that encodes which row each value belongs to) to build the new rows.
There are 4 invoices, and this creates 4 rows (one for each invoice). Hopefully this is what you are looking for.
NOTE: where you might run into some issues:
When you flatten a JSON file that contains nested lists of different lengths, any value that appears for only a single row still forces its own column, even though that column is null for every other row. Each entry in the Payments list has 7 fields, so if some IDs have 8 payments (as opposed to all the others having only 1), the flat file needs 56 additional columns to store them.
jsonStr = '''{
"Id": "568d1686-7c53-4f22-a93f-754589a246a7",
"Status": "OK",
"ProviderName": "Rest API",
"DateTimeUTC": "/Date(1552234854959)/",
"Invoices": [
{
"Type": "ACCPAY",
"InvoiceID": "8289ab9d-2134-4601-8622-e7fdae4b6d89",
"InvoiceNumber": "10522",
"Reference": "10522",
"Payments": [],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 102,
"AmountPaid": 0,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": false,
"Contact": {
"ContactID": "d1dba397-0f0b-4819-a6ce-2839b7be5008",
"ContactNumber": "c03bbcb5-fb0b-4f46-83f0-8687f754488b",
"Name": "Micro",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2017-02-06T00:00:00",
"Date": "/Date(1486339200000+0000)/",
"DueDateString": "2017-03-08T00:00:00",
"DueDate": "/Date(1488931200000+0000)/",
"Status": "AUTHORISED",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 85,
"TotalTax": 17,
"Total": 102,
"UpdatedDateUTC": "/Date(1529940362110+0000)/",
"CurrencyCode": "GBP"
},
{
"Type": "ACCREC",
"InvoiceID": "9e37150f-88a5-4213-a085-b30c5e01c2bf",
"InvoiceNumber": "(13)",
"Reference": "",
"Payments": [],
"CreditNotes": [
{
"CreditNoteID": "3c5c7dec-534a-46e0-ad1b-f0f69822cfd5",
"CreditNoteNumber": "(12)",
"ID": "3c5c7dec-534a-46e0-ad1b-f0f69822cfd5",
"AppliedAmount": 1200,
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"LineItems": [],
"Total": 7800
},
{
"CreditNoteID": "af38e37f-4ba3-4208-a193-a32b418c2bbc",
"CreditNoteNumber": "(14)",
"ID": "af38e37f-4ba3-4208-a193-a32b418c2bbc",
"AppliedAmount": 2600,
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"LineItems": [],
"Total": 2600
}
],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 0,
"AmountPaid": 0,
"AmountCredited": 3800,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": false,
"Contact": {
"ContactID": "58164bd6-5225-4f30-ad89-35140db5b624",
"ContactNumber": "d0b420b8-4a58-40d1-9717-8525edda7658",
"Name": "FSales (1)",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2011-05-04T00:00:00",
"Date": "/Date(1304467200000+0000)/",
"DueDateString": "2011-06-03T00:00:00",
"DueDate": "/Date(1307059200000+0000)/",
"Status": "PAID",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 3166.67,
"TotalTax": 633.33,
"Total": 3800,
"UpdatedDateUTC": "/Date(1529943661150+0000)/",
"CurrencyCode": "GBP",
"FullyPaidOnDate": "/Date(1304467200000+0000)/"
},
{
"Type": "ACCPAY",
"InvoiceID": "1ddea7ec-a0d5-457a-a8fd-cfcdc2099d51",
"InvoiceNumber": "01596057543",
"Reference": "",
"Payments": [
{
"PaymentID": "fd639da3-c009-47df-a4bf-98ccd5c68e43",
"Date": "/Date(1551657600000+0000)/",
"Amount": 173.86,
"Reference": "",
"CurrencyRate": 1,
"HasAccount": false,
"HasValidationErrors": false
}
],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 0,
"AmountPaid": 173.86,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": true,
"Contact": {
"ContactID": "309afb74-0a3b-4d68-85e8-2259ca5acd13",
"ContactNumber": "91eef1f0-5fe6-45d7-b739-1ab5352a5523",
"Name": "Company AAA",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2019-02-23T00:00:00",
"Date": "/Date(1550880000000+0000)/",
"DueDateString": "2019-03-21T00:00:00",
"DueDate": "/Date(1553126400000+0000)/",
"Status": "PAID",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 144.88,
"TotalTax": 28.98,
"Total": 173.86,
"UpdatedDateUTC": "/Date(1551777481907+0000)/",
"CurrencyCode": "GBP",
"FullyPaidOnDate": "/Date(1551657600000+0000)/"
},
{
"Type": "ACCPAY",
"InvoiceID": "ba5ff3b1-1058-4645-80da-5475c23da949",
"InvoiceNumber": "Q0603",
"Reference": "",
"Payments": [],
"CreditNotes": [],
"Prepayments": [],
"Overpayments": [],
"AmountDue": 213.24,
"AmountPaid": 0,
"AmountCredited": 0,
"CurrencyRate": 1,
"HasErrors": false,
"IsDiscounted": false,
"HasAttachments": true,
"Contact": {
"ContactID": "f0473b41-da92-4397-9d2c-741812f2475c",
"ContactNumber": "1f124969-de8d-40b8-8140-d4997511b0dc",
"Name": "BTelcom",
"Addresses": [],
"Phones": [],
"ContactGroups": [],
"ContactPersons": [],
"HasValidationErrors": false
},
"DateString": "2019-03-05T00:00:00",
"Date": "/Date(1551744000000+0000)/",
"DueDateString": "2019-03-21T00:00:00",
"DueDate": "/Date(1553126400000+0000)/",
"Status": "SUBMITTED",
"LineAmountTypes": "Exclusive",
"LineItems": [],
"SubTotal": 177.7,
"TotalTax": 35.54,
"Total": 213.24,
"UpdatedDateUTC": "/Date(1552068778417+0000)/",
"CurrencyCode": "GBP"
}
]
}'''
import json
import pandas as pd
import re
def flatten_json(y):
    """Flatten a nested dict/list structure into a single-level dict whose
    keys encode the path to each value (list positions become indices)."""
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out
jsonObj = json.loads(jsonStr)
flat = flatten_json(jsonObj)
results = pd.DataFrame()
special_cols = []
columns_list = list(flat.keys())
for item in columns_list:
    # Keys for invoice fields look like 'Invoices_<row>_<column>';
    # keys without a numeric index are top-level fields shared by every row.
    try:
        row_idx = re.findall(r'\_(\d+)\_', item)[0]
    except IndexError:
        special_cols.append(item)
        continue
    column = re.findall(r'\_\d+\_(.*)', item)[0]
    column = column.replace('_', '')
    row_idx = int(row_idx)
    value = flat[item]
    results.loc[row_idx, column] = value
# Broadcast the shared top-level fields onto every row.
for item in special_cols:
    results[item] = flat[item]
Output:
print (results.to_string())
Type InvoiceID InvoiceNumber Reference AmountDue AmountPaid AmountCredited CurrencyRate HasErrors IsDiscounted HasAttachments ContactContactID ContactContactNumber ContactName ContactHasValidationErrors DateString Date DueDateString DueDate Status LineAmountTypes SubTotal TotalTax Total UpdatedDateUTC CurrencyCode CreditNotes0CreditNoteID CreditNotes0CreditNoteNumber CreditNotes0ID CreditNotes0AppliedAmount CreditNotes0DateString CreditNotes0Date CreditNotes0Total CreditNotes1CreditNoteID CreditNotes1CreditNoteNumber CreditNotes1ID CreditNotes1AppliedAmount CreditNotes1DateString CreditNotes1Date CreditNotes1Total FullyPaidOnDate Payments0PaymentID Payments0Date Payments0Amount Payments0Reference Payments0CurrencyRate Payments0HasAccount Payments0HasValidationErrors Id ProviderName DateTimeUTC
0 ACCPAY 8289ab9d-2134-4601-8622-e7fdae4b6d89 10522 10522 102.00 0.00 0.0 1.0 False False False d1dba397-0f0b-4819-a6ce-2839b7be5008 c03bbcb5-fb0b-4f46-83f0-8687f754488b Micro False 2017-02-06T00:00:00 /Date(1486339200000+0000)/ 2017-03-08T00:00:00 /Date(1488931200000+0000)/ OK Exclusive 85.00 17.00 102.00 /Date(1529940362110+0000)/ GBP NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 568d1686-7c53-4f22-a93f-754589a246a7 Rest API /Date(1552234854959)/
1 ACCREC 9e37150f-88a5-4213-a085-b30c5e01c2bf (13) 0.00 0.00 3800.0 1.0 False False False 58164bd6-5225-4f30-ad89-35140db5b624 d0b420b8-4a58-40d1-9717-8525edda7658 FSales (1) False 2011-05-04T00:00:00 /Date(1304467200000+0000)/ 2011-06-03T00:00:00 /Date(1307059200000+0000)/ OK Exclusive 3166.67 633.33 3800.00 /Date(1529943661150+0000)/ GBP 3c5c7dec-534a-46e0-ad1b-f0f69822cfd5 (12) 3c5c7dec-534a-46e0-ad1b-f0f69822cfd5 1200.0 2011-05-04T00:00:00 /Date(1304467200000+0000)/ 7800.0 af38e37f-4ba3-4208-a193-a32b418c2bbc (14) af38e37f-4ba3-4208-a193-a32b418c2bbc 2600.0 2011-05-04T00:00:00 /Date(1304467200000+0000)/ 2600.0 /Date(1304467200000+0000)/ NaN NaN NaN NaN NaN NaN NaN 568d1686-7c53-4f22-a93f-754589a246a7 Rest API /Date(1552234854959)/
2 ACCPAY 1ddea7ec-a0d5-457a-a8fd-cfcdc2099d51 01596057543 0.00 173.86 0.0 1.0 False False True 309afb74-0a3b-4d68-85e8-2259ca5acd13 91eef1f0-5fe6-45d7-b739-1ab5352a5523 Company AAA False 2019-02-23T00:00:00 /Date(1550880000000+0000)/ 2019-03-21T00:00:00 /Date(1553126400000+0000)/ OK Exclusive 144.88 28.98 173.86 /Date(1551777481907+0000)/ GBP NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN /Date(1551657600000+0000)/ fd639da3-c009-47df-a4bf-98ccd5c68e43 /Date(1551657600000+0000)/ 173.86 1.0 False False 568d1686-7c53-4f22-a93f-754589a246a7 Rest API /Date(1552234854959)/
3 ACCPAY ba5ff3b1-1058-4645-80da-5475c23da949 Q0603 213.24 0.00 0.0 1.0 False False True f0473b41-da92-4397-9d2c-741812f2475c 1f124969-de8d-40b8-8140-d4997511b0dc BTelcom False 2019-03-05T00:00:00 /Date(1551744000000+0000)/ 2019-03-21T00:00:00 /Date(1553126400000+0000)/ OK Exclusive 177.70 35.54 213.24 /Date(1552068778417+0000)/ GBP NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 568d1686-7c53-4f22-a93f-754589a246a7 Rest API /Date(1552234854959)/
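As an alternative to reconstructing rows by hand, pandas' json_normalize can get most of the way there directly. This is only a sketch against the same jsonStr (it assumes pandas >= 1.0, where pd.json_normalize is available; older versions expose it as pandas.io.json.json_normalize). The nested Contact dict is flattened into Contact.* columns, while list-valued fields such as Payments and CreditNotes stay as lists in single cells:
jsonObj = json.loads(jsonStr)
invoices = pd.json_normalize(
    jsonObj,
    record_path='Invoices',
    meta=['Id', 'Status', 'ProviderName', 'DateTimeUTC'],
    meta_prefix='Response.',  # avoids a clash with the per-invoice Status column
)
invoices.to_csv('invoices_flat.csv', index=False)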
I am new to Elasticsearch and am attempting to do some data analysis of Twitter data by importing it into Elasticsearch and running Kibana on it. I'm getting stuck when importing Twitter data into Elasticsearch. Any help is appreciated!
Here's a sample working program that produces the error.
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
data = json.loads(open("data.json").read())
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
Here's the error:
Traceback (most recent call last):
File "elasticsearch_import_test.py", line 5, in <module>
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
File "/usr/local/lib/python2.7/site-packages/elasticsearch/client/utils.py", line 69, in _wrapped
return func(*args, params=params, **kwargs)
File "/usr/local/lib/python2.7/site-packages/elasticsearch/client/__init__.py", line 279, in index
_make_path(index, doc_type, id), params=params, body=body)
File "/usr/local/lib/python2.7/site-packages/elasticsearch/transport.py", line 329, in perform_request
status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
File "/usr/local/lib/python2.7/site-packages/elasticsearch/connection/http_urllib3.py", line 109, in perform_request
self._raise_error(response.status, raw_data)
File "/usr/local/lib/python2.7/site-packages/elasticsearch/connection/base.py", line 108, in _raise_error
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
elasticsearch.exceptions.RequestError: TransportError(400, u'illegal_argument_exception', u'[Raza][127.0.0.1:9300][indices:data/write/index[p]]')
Here's an example Twitter JSON file (data.json)
{
"_id": {
"$oid": "570597358c68d71c16b3b722"
},
"contributors": null,
"coordinates": null,
"created_at": "Wed Apr 06 23:09:41 +0000 2016",
"entities": {
"hashtags": [
{
"indices": [
68,
72
],
"text": "dnd"
},
{
"indices": [
73,
79
],
"text": "Nat20"
},
{
"indices": [
80,
93
],
"text": "CriticalRole"
},
{
"indices": [
94,
103
],
"text": "d20babes"
}
],
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
],
"symbols": [],
"urls": [
{
"display_url": "darkcastlecollectibles.com",
"expanded_url": "http://www.darkcastlecollectibles.com/",
"indices": [
44,
67
],
"url": "https://shortened.url/SJgFTE0o8h"
}
],
"user_mentions": [
{
"id": 2375847847,
"id_str": "2375847847",
"indices": [
3,
19
],
"name": "Zack Chini",
"screen_name": "Zenttsilverwing"
}
]
},
"extended_entities": {
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
},
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953295727009793,
"id_str": "715953295727009793",
"indices": [
104,
127
],
"media_url": "http://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 715953298076012545,
"source_status_id_str": "715953298076012545",
"source_user_id": 2375847847,
"source_user_id_str": "2375847847",
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
]
},
"favorite_count": 0,
"favorited": false,
"filter_level": "low",
"geo": null,
"id": 717851801417031680,
"id_str": "717851801417031680",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 0,
"retweeted": false,
"retweeted_status": {
"contributors": null,
"coordinates": null,
"created_at": "Fri Apr 01 17:25:42 +0000 2016",
"entities": {
"hashtags": [
{
"indices": [
47,
51
],
"text": "dnd"
},
{
"indices": [
52,
58
],
"text": "Nat20"
},
{
"indices": [
59,
72
],
"text": "CriticalRole"
},
{
"indices": [
73,
82
],
"text": "d20babes"
}
],
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
],
"symbols": [],
"urls": [
{
"display_url": "darkcastlecollectibles.com",
"expanded_url": "http://www.darkcastlecollectibles.com/",
"indices": [
23,
46
],
"url": "https://shortened.url/SJgFTE0o8h"
}
],
"user_mentions": []
},
"extended_entities": {
"media": [
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953292849754112,
"id_str": "715953292849754112",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TugAUsAASZht.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
},
{
"display_url": "pic.twitter.com/YQoxEuEAXV",
"expanded_url": "http://twitter.com/Zenttsilverwing/status/715953298076012545/photo/1",
"id": 715953295727009793,
"id_str": "715953295727009793",
"indices": [
83,
106
],
"media_url": "http://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"media_url_https": "https://pbs.twimg.com/media/Ce-TuquUIAEsVn9.jpg",
"sizes": {
"large": {
"h": 768,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 450,
"resize": "fit",
"w": 600
},
"small": {
"h": 255,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://shortened.url/YQoxEuEAXV"
}
]
},
"favorite_count": 5,
"favorited": false,
"filter_level": "low",
"geo": null,
"id": 715953298076012545,
"id_str": "715953298076012545",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 1,
"retweeted": false,
"source": "Twitter Web Client",
"text": "coins came in!! Thanks https://shortened.url/SJgFTE0o8h #dnd #Nat20 #CriticalRole #d20babes https://shortened.url/YQoxEuEAXV",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Thu Mar 06 19:59:14 +0000 2014",
"default_profile": true,
"default_profile_image": false,
"description": "DM Geek Critter Con-man. I am here to like your art ^.^",
"favourites_count": 4990,
"follow_request_sent": null,
"followers_count": 57,
"following": null,
"friends_count": 183,
"geo_enabled": false,
"id": 2375847847,
"id_str": "2375847847",
"is_translator": false,
"lang": "en",
"listed_count": 7,
"location": "Flower Mound, TX",
"name": "Zack Chini",
"notifications": null,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2375847847/1430928759",
"profile_image_url": "http://pbs.twimg.com/profile_images/708816622358663168/mNF4Ysr5_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/708816622358663168/mNF4Ysr5_normal.jpg",
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "Zenttsilverwing",
"statuses_count": 551,
"time_zone": null,
"url": null,
"utc_offset": null,
"verified": false
}
},
"source": "Twitter Web Client",
"text": "RT #Zenttsilverwing: coins came in!! Thanks https://shortened.url/SJgFTE0o8h #dnd #Nat20 #CriticalRole #d20babes https://shortened.url/YQoxEuEAXV",
"timestamp_ms": "1459984181156",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Tue Feb 10 04:31:18 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "I use Twitter to primarily retweet Critter artwork of Critical Role and their own creations. I maintain a list of all the Critter artists I've come across.",
"favourites_count": 17586,
"follow_request_sent": null,
"followers_count": 318,
"following": null,
"friends_count": 651,
"geo_enabled": true,
"id": 20491914,
"id_str": "20491914",
"is_translator": false,
"lang": "en",
"listed_count": 33,
"location": "SanDiego, CA",
"name": "UnknownOutrider",
"notifications": null,
"profile_background_color": "EDECE9",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme3/bg.gif",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme3/bg.gif",
"profile_background_tile": false,
"profile_image_url": "http://pbs.twimg.com/profile_images/224346493/cartoon_dragon_tattoo_designs_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/224346493/cartoon_dragon_tattoo_designs_normal.jpg",
"profile_link_color": "088253",
"profile_sidebar_border_color": "D3D2CF",
"profile_sidebar_fill_color": "E3E2DE",
"profile_text_color": "634047",
"profile_use_background_image": true,
"protected": false,
"screen_name": "UnknownOutrider",
"statuses_count": 12760,
"time_zone": "Pacific Time (US & Canada)",
"url": null,
"utc_offset": -25200,
"verified": false
}
}
The reason that doesn't work is that you are trying to index a document with a field named _id, which already exists as a built-in metadata field. So delete that field or rename it:
import json
from elasticsearch import Elasticsearch
es = Elasticsearch()
data = json.loads(open("data.json").read())
# data['id_'] = data['_id']  # alternatively, copy _id into id_ before deleting it
del data['_id']
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)
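If you would rather keep the Mongo-style identifier than drop it, a small variant of the snippet above renames the field before indexing (the name 'mongo_id' is just an illustrative choice, not anything Elasticsearch requires):
data['mongo_id'] = data.pop('_id')  # rename instead of del data['_id']
es.index(index='tweets5', doc_type='tweets', id=data['id'], body=data)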