I have this JSON file. Since the JSON data is nested, I flattened it, but now when I try to upload it to the SQL server I get the following error.
[1 rows x 21 columns]
Traceback (most recent call last):
File "c:\Users\Anjan\OneDrive\Desktop\BIG_DATA\dataframe.py", line 50, in <module>
df = pd.read_json(final_df)
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\util\_decorators.py", line 211, in wrapper
return func(*args, **kwargs)
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\util\_decorators.py", line 331, in wrapper
return func(*args, **kwargs)
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\io\json\_json.py", line 733, in read_json
json_reader = JsonReader(
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\io\json\_json.py", line 818, in __init__
data = self._get_data_from_filepath(filepath_or_buffer)
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\io\json\_json.py", line 858, in _get_data_from_filepath
self.handles = get_handle(
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\io\common.py", line 704, in get_handle
if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
File "C:\Users\Anjan\AppData\Roaming\Python\Python310\site-packages\pandas\io\common.py", line 1163, in _is_binary_mode
return isinstance(handle, _get_binary_io_classes()) or "b" in getattr(
TypeError: argument of type 'method' is not iterable
Here is the JSON data present inside the file.
{
  "reporting_entity_name": "medicare",
  "reporting_entity_type": "medicare",
  "plan_name": "medicaid",
  "plan_id_type": "hios",
  "plan_id": "1111111111",
  "plan_market_type": "individual",
  "last_updated_on": "2020-08-27",
  "version": "1.0.0",
  "in_network": [
    {
      "negotiation_arrangement": "ffs",
      "name": "Knee Replacement",
      "billing_code_type": "CPT",
      "billing_code_type_version": "2020",
      "billing_code": "27447",
      "description": "Arthroplasty, knee condyle and plateau, medial and lateral compartments",
      "negotiated_rates": [
        {
          "provider_groups": [
            {
              "npi": [0],
              "tin": {
                "type": "ein",
                "value": "11-1111111"
              }
            }
          ],
          "negotiated_prices": [
            {
              "negotiated_type": "negotiated",
              "negotiated_rate": 123.45,
              "expiration_date": "2022-01-01",
              "billing_class": "institutional"
            }
          ]
        }
      ]
    }
  ]
}
I flattened the JSON data and tried uploading it to SQL using this Python code:
import json
import pandas as pd
from sqlalchemy import create_engine
from functools import reduce

with open('new_ravi_test.json', 'r') as f:
    data = json.loads(f.read())

df_main = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network"]
).drop(columns="negotiated_rates")

df_provider = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network", "negotiated_rates", "provider_groups"]
)

df_prices = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network", "negotiated_rates", "negotiated_prices"]
)

dfs = [df_main, df_provider, df_prices]

final_df = reduce(lambda left, right: pd.merge(
    left,
    right,
    on=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
        "plan_id", "plan_market_type", "last_updated_on", "version"]
), dfs).explode("npi")

print(final_df)

engine = create_engine('mysql+pymysql://root:#localhost/json')
df = pd.read_json(final_df)
df.to_sql("test_file_01", con=engine, if_exists='replace', index=False)
Here is what the flattened table looks like:
negotiation_arrangement name billing_code_type billing_code_type_version billing_code description reporting_entity_name reporting_entity_type plan_name plan_id_type plan_id plan_market_type last_updated_on version npi tin.type tin.value negotiated_type negotiated_rate expiration_date billing_class
0 ffs Knee Replacement CPT 2020 27447 Arthroplasty, knee condyle and plateau, medial and lateral compartments medicare medicare medicaid hios 1111111111 individual 2020-08-27 1.0.0 0 ein 11-1111111 negotiated 123.45 2022-01-01 institutional
Can anyone please provide a block of code that I can use to upload the JSON data to SQL?
Your problem appears to be with this line:
df = pd.read_json(final_df)
The read_json function takes a source to read JSON from: a path string, a file-like object, or a JSON string. final_df is none of these; it is already a DataFrame, and that is why you're getting the TypeError.
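Since final_df is already a flattened DataFrame, you can drop the read_json call entirely and pass the frame straight to to_sql. A minimal sketch under that assumption (the connection string below is a placeholder; substitute your real MySQL user, password, host, and database):
from sqlalchemy import create_engine

# Placeholder credentials; fill in your own MySQL connection details.
engine = create_engine("mysql+pymysql://user:password@localhost/json")

# final_df is the DataFrame you already built with json_normalize/merge/explode,
# so write it to the table directly instead of re-parsing it with pd.read_json.
final_df.to_sql("test_file_01", con=engine, if_exists="replace", index=False)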
Related
I have a nested JSON file. How can I flatten it so that it is compatible for uploading to SQL? This JSON file is nested, and up to the "in_network" key it works fine, but the value of "in_network" is a list of dicts, so I guess my code cannot determine how to perform the task after that. I might be missing some lines of code. Help with the code would be very much appreciated.
{
  "reporting_entity_name": "medicare",
  "reporting_entity_type": "medicare",
  "plan_name": "medicaid",
  "plan_id_type": "hios",
  "plan_id": "1111111111",
  "plan_market_type": "individual",
  "last_updated_on": "2020-08-27",
  "version": "1.0.0",
  "in_network": [
    {
      "negotiation_arrangement": "ffs",
      "name": "Knee Replacement",
      "billing_code_type": "CPT",
      "billing_code_type_version": "2020",
      "billing_code": "27447",
      "description": "Arthroplasty, knee condyle and plateau, medial and lateral compartments",
      "negotiated_rates": [
        {
          "provider_groups": [
            {
              "npi": [0],
              "tin": {
                "type": "ein",
                "value": "11-1111111"
              }
            }
          ],
          "negotiated_prices": [
            {
              "negotiated_type": "negotiated",
              "negotiated_rate": 123.45,
              "expiration_date": "2022-01-01",
              "billing_class": "institutional"
            }
          ]
        }
      ]
    }
  ]
}
Here is the Python code I am using:
import json
import pandas as pd

with open('new_test.json', 'r') as f:
    data = json.loads(f.read())

nested_data = pd.json_normalize(data, max_level=10)
After running this code, the dataframe looks like the table shown below.
Whatever is inside in_network is stored as-is, but I want it stored so that in_network.negotiation_arrangement becomes a new column with its value, in_network.name another column with its value, and so on.
In other words, every key should get its own column.
Here is what the table looks like:
reporting_entity_name reporting_entity_type plan_name plan_id_type plan_id plan_market_type last_updated_on version in_network
0 medicare medicare medicaid hios 1111111111 individual 2020-08-27 1.0.0 [{'negotiation_arrangement': 'ffs', 'name': 'Knee Replacement', 'billing_code_type': 'CPT', 'billing_code_type_version': '2020', 'billing_code': '27447', 'description': 'Arthroplasty, knee condyle and plateau, medial and lateral compartments', 'negotiated_rates': [{'provider_groups': [{'npi': [0], 'tin': {'type': 'ein', 'value': '11-1111111'}}], 'negotiated_prices': [{'negotiated_type': 'negotiated', 'negotiated_rate': 123.45, 'expiration_date': '2022-01-01', 'billing_class': 'institutional'}]}]}]
Using json_normalize() to parse and functools to merge:
from functools import reduce
import pandas as pd

df_main = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network"]
).drop(columns="negotiated_rates")

df_provider = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network", "negotiated_rates", "provider_groups"]
)

df_prices = pd.json_normalize(
    data=data,
    meta=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
          "plan_id", "plan_market_type", "last_updated_on", "version"],
    record_path=["in_network", "negotiated_rates", "negotiated_prices"]
)

dfs = [df_main, df_provider, df_prices]

final_df = reduce(lambda left, right: pd.merge(
    left,
    right,
    on=["reporting_entity_name", "reporting_entity_type", "plan_name", "plan_id_type",
        "plan_id", "plan_market_type", "last_updated_on", "version"]
), dfs).explode("npi")

print(final_df)
Output:
negotiation_arrangement name billing_code_type billing_code_type_version billing_code description reporting_entity_name reporting_entity_type plan_name plan_id_type plan_id plan_market_type last_updated_on version npi tin.type tin.value negotiated_type negotiated_rate expiration_date billing_class
0 ffs Knee Replacement CPT 2020 27447 Arthroplasty, knee condyle and plateau, medial and lateral compartments medicare medicare medicaid hios 1111111111 individual 2020-08-27 1.0.0 0 ein 11-1111111 negotiated 123.45 2022-01-01 institutional
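For clarity, the trailing .explode("npi") is what turns the list-valued npi column into one row per NPI. A toy illustration with made-up data (not taken from the question):
import pandas as pd

demo = pd.DataFrame({"npi": [[0, 1]], "tin.value": ["11-1111111"]})
# explode("npi") repeats the other columns once per element of the npi list.
print(demo.explode("npi"))
#   npi   tin.value
# 0   0  11-1111111
# 0   1  11-1111111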
I have a data dump in this format:
[
  {
    "vaultId": "429d60edc11df0a576cd9173e8c0d0de8792538862db0122848b87a96ecdf537",
    "loanSchemeId": "MIN150",
    "ownerAddress": "df1q7crh6d3dscj3sajpklehpnwdwhvs0l0jm9fr7s",
    "state": "inLiquidation",
    "liquidationHeight": 1488540,
    "batchCount": 2,
    "liquidationPenalty": 5,
    "batches": [
      {
        "index": 0,
        "collaterals": [
          "282.34948388#DFI"
        ],
        "loan": "6.38045570#BABA"
      },
      {
        "index": 1,
        "collaterals": [
          "300.16175126#DFI"
        ],
        "loan": "0.26745972#GOOGL"
      }
    ]
  },
]
which I have written to a txt file. I am trying to use Pandas to make a dataframe and then write it to CSV using the code below, however I am receiving this error: ValueError: Length mismatch: Expected axis has 1729 elements, new values have 5 elements. This is the code I have used to form the dataframe. Many thanks in advance.
import pandas as pd
import simplejson as json
df = pd.read_json('auctions.txt')
df[['index', 'collaterals', 'loan']] = df.batches.apply(pd.Series)
Edit:
This is the error I get with the changed code:
Traceback (most recent call last):
File "c:\Users\iones\Documents\DeFI Chain Auction Bot\start.py", line 3, in <module>
df = pd.read_json('auctions.json')
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\util\_decorators.py", line 207, in wrapper
return func(*args, **kwargs)
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\io\json\_json.py", line 614, in read_json
return json_reader.read()
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\io\json\_json.py", line 748, in read
obj = self._get_object_parser(self.data)
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\io\json\_json.py", line 770, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\io\json\_json.py", line 885, in parse
self._parse_no_numpy()
File "C:\Users\iones\Documents\DeFI Chain Auction Bot\.venv\lib\site-packages\pandas\io\json\_json.py", line 1140, in _parse_no_numpy
loads(json, precise_float=self.precise_float), dtype=None
ValueError: Expected object or value
Load the JSON using read_json and then convert the batches dictionaries into columns:
import pandas as pd
df = pd.read_json('data.json')
df[['index', 'collaterals', 'loan']] = df.batches.apply(pd.Series)
If you want the collaterals in separate rows
df = df.explode('collaterals')
Output
print(df[['vaultId', 'liquidationHeight', 'index', 'collaterals', 'loan']])
vaultId liquidationHeight index collaterals loan
0 6af21886adcb92c4669a8a901975eb9b9d5544c67e4292... 1489770 0 2326.00000000#DFI 2.24978028#GOOGL
1 6af21886adcb92c4669a8a901975eb9b9d5544c67e4292... 1489770 1 2326.00000000#DFI 2.24978028#GOOGL
EDIT:
According to what you have said, it looks like the data is corrupt, i.e. in an invalid JSON format. Please correct it at the source. I have used the data sample below.
{
  "vaultId": "6af21886adcb92c4669a8a901975eb9b9d5544c67e429267841491649810958a",
  "ownerAddress": "df1qhh9ek2d98mxjeh58xdsfj7ad2k7q4d4kwshsxr",
  "liquidationHeight": 1489770,
  "batchCount": 2,
  "batches": [
    {
      "index": 0,
      "collaterals": [
        "2326.00000000#DFI"
      ],
      "loan": "2.24978028#GOOGL"
    },
    {
      "index": 1,
      "collaterals": [
        "2326.00000000#DFI"
      ],
      "loan": "2.24978028#GOOGL"
    }
  ]
}
Fixed by using this
def jsontocsv():
    df = pd.read_json('auctions.json')
    df = df.explode('batches')
    df[['index', 'collaterals', 'loan']] = df.batches.apply(pd.Series)
    print(df[['vaultId', 'liquidationHeight', 'index', 'collaterals', 'loan']])
    df.to_csv('auctions.csv')
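As an alternative to explode('batches') followed by apply(pd.Series), pd.json_normalize can flatten the batches in a single pass. A minimal sketch, assuming auctions.json has been repaired into valid JSON (the file name and column names mirror the ones above):
import json
import pandas as pd

with open('auctions.json') as f:
    data = json.load(f)

# One row per batch, carrying selected vault-level fields along as meta columns.
flat = pd.json_normalize(
    data,
    record_path="batches",
    meta=["vaultId", "liquidationHeight"]
).explode("collaterals")

flat.to_csv('auctions.csv', index=False)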
I am using Pandas 0.22.0 on Python 2.7 with PyCharm as the IDE.
I am trying to sort multiple DataFrames using a loop. These DataFrames are created from .csv files and are then converted to .xlsx using the 'xlsxwriter' engine in pandas.
I have created a sorting list that holds all the sorting requirements, so that when I run my loop it will pick up a CSV file, convert it to a DataFrame, sort it (where I'm getting stuck), and then output the whole thing as an .xlsx file so it can be worked with in MS Excel.
If I use df = df.sort_values(by=['SITE', 'DEPARTMENT', 'LOCATION', 'ASSET_TYPE', 'ASSET_NAME']) then there are no issues.
But, if I use this: df = df.sort_values(by=sorts[0]), the code comes crashing down.
Traceback (most recent call last):
File "D:/OneDrive/Programming Practice/Python/Rubaiyat/test1.py", line 55, in <module>
df = df.sort_values(by=(sorts[0]))
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 3619, in sort_values
k = self.xs(by, axis=other_axis).values
File "C:\Python27\lib\site-packages\pandas\core\generic.py", line 2335, in xs
return self[key]
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "C:\Python27\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "C:\Python27\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "C:\Python27\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "C:\Python27\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: "'SITE', 'DEPARTMENT', 'LOCATION', 'ASSET_TYPE', 'ASSET_NAME'"
The entire code is as follows:
import pandas
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
reportDF = ["assetReport", "assetTypeReport", "assetStatusReport", "locationReport", "departmentReport", "siteReport",
"userReport"]
sheetNames = ["Asset Report", "Asset Types", "Asset Status", "Locations", "Cost Centers", "Sites", "Users"]
columnNames = [("EPC", "Creation Date", "Modification Date", "Inventory Date", "Asset Name", "Asset Status",
"Asset Type", "Asset User", "Location", "Site", "Cost Center", "Description"),
"Asset Type Name",
("Asset Status", "Asset Status Description"),
("Location Name", "EPC", "Floor", "GPS", "Capacity", "Lead Time", "Site Name"),
"Cost Center",
("Site", "Country", "Postal Code", "City", "Address", "GPS"),
("User Name", "User Role", "First Name", "Last Name", "Email", "User Disabled?")]
sorts = ["'SITE', 'DEPARTMENT', 'LOCATION', 'ASSET_TYPE', 'ASSET_NAME'",
'ASSET_TYPE_NAME',
'ASSET_STATUS_NAME',
"'SITE_NAME', 'LOCATION_NAME'",
'DEPARTMENT_NAME',
'SITE_NAME',
'USER_NAME']
writer = pandas.ExcelWriter('mergedSheet.xlsx')
for i in range(0, 7):
    df = pandas.read_csv(reportDF[i], delimiter=';')
    df = df.sort_values(by=sorts[i])
    df.to_excel(writer, sheet_name=sheetNames[i], engine='xlsxwriter', header=columnNames[i], freeze_panes=(1, 0))
writer.save()
writer.close()
Any help or guidance will be very much appreciated.
Thank You.
You are creating one string, which is: "'SITE', 'DEPARTMENT', 'LOCATION', 'ASSET_TYPE', 'ASSET_NAME'".
I think it should look like this:
sorts = [['SITE', 'DEPARTMENT', 'LOCATION', 'ASSET_TYPE', 'ASSET_NAME'],
'ASSET_TYPE_NAME',
'ASSET_STATUS_NAME',
['SITE_NAME', 'LOCATION_NAME'],
'DEPARTMENT_NAME',
'SITE_NAME',
'USER_NAME']
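For what it's worth, sort_values(by=...) accepts either a single column name or a list of column names, but not one comma-separated string. A small illustration with made-up data:
import pandas as pd

df = pd.DataFrame({"SITE": ["B", "A", "A"], "ASSET_NAME": ["x", "z", "y"]})

# A list sorts by each column in turn; a plain string sorts by that one column.
print(df.sort_values(by=["SITE", "ASSET_NAME"]))
print(df.sort_values(by="SITE"))

# by="'SITE', 'ASSET_NAME'" raises KeyError, because pandas looks for a single
# column literally named "'SITE', 'ASSET_NAME'".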
I have a huge number of keywords written in a file. I put them in an array and tried to run a query for each item in the array to retrieve the documents that contain any of the keywords. It shows me the number of returned documents for only 50 of them, and at the end I get a couple of errors.
Here is my code:
A subset of keywords:
C0001396 SYNDROME STOKES-ADAMS
C0001396 Syndrome, Adams-Stokes
C0001396 Syndrome, Stokes-Adams
C0002962 3-12 ANGINAL SYNDROMES
C0002962 ANGINA
The CODE:
from elasticsearch import Elasticsearch
import json

count = 0
keyword_array = []
es = Elasticsearch(['http://IP:9200/'])

with open('localDrive\\C0577785C.txt') as my_keywordfile:
    for keyword in my_keywordfile.readlines():
        keyword_ex = keyword[9:]
        print(keyword_ex)
        keyword_array.append(keyword_ex.strip().strip("'"))

with open('localDrive\\out.txt', 'wb') as f:
    for x in keyword_array:
        doc = {
            "from": 0, "size": 1000000,
            "query": {
                "query_string": {
                    "fields": ["description", "title"],
                    "query": x
                }
            }
        }
        res = es.search(index='INDEXED_REPO', body=doc)
        print("Got %d Hits:" % res['hits']['total'])
        count += 1
        print(count)
        f.write(json.dumps(res).encode("utf-8"))
        f.flush()
    f.close()
Errors:
GET http://INDEX_REPO/indexed/_search [status:400 request:0.012s]
Traceback (most recent call last):
File "localDrive/PycharmProjects/extract_keywords/elastic_serach5.py", line 32, in <module>
res = es.search(index='INDEXED_REPO', body=doc)
File "......\Local\Programs\Python\Python36-32\lib\site-packages\elasticsearch\client\utils.py", line 73, in _wrapped
return func(*args, params=params, **kwargs)
File "....\AppData\Local\Programs\Python\Python36-32\lib\site-packages\elasticsearch\client\__init__.py", line 623, in search
doc_type, '_search'), params=params, body=body)
File "......\AppData\Local\Programs\Python\Python36-32\lib\site-packages\elasticsearch\transport.py", line 312, in perform_request
status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
File "......\AppData\Local\Programs\Python\Python36-32\lib\site-packages\elasticsearch\connection\http_urllib3.py", line 128, in perform_request
self._raise_error(response.status, raw_data)
File "......\AppData\Local\Programs\Python\Python36-32\lib\site-packages\elasticsearch\connection\base.py", line 125, in _raise_error
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
elasticsearch.exceptions.RequestError: <exception str() failed>
Any idea why this is happening?
Thanks,
The Elasticsearch query was not in the right format. I changed it to the following and it worked:
doc = {
    "query": {
        "multi_match": {
            "query": x,
            "type": "phrase",
            "fields": ["title", "description"],
            "operator": "and"
        }
    }
}
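A likely reason for the 400 errors, as an aside: query_string parses its input as Lucene query syntax, so keywords containing characters it treats as operators (quotes, colons, slashes, parentheses and so on) can fail to parse, while multi_match treats the query as plain text. A minimal sketch of the corrected loop, assuming the client setup, index name, output file, and keyword_array are the same as in the question:
from elasticsearch import Elasticsearch
import json

es = Elasticsearch(['http://IP:9200/'])

with open('localDrive\\out.txt', 'wb') as f:
    for x in keyword_array:
        doc = {
            "query": {
                "multi_match": {
                    "query": x,
                    "type": "phrase",
                    "fields": ["title", "description"]
                }
            }
        }
        res = es.search(index='INDEXED_REPO', body=doc)
        print("Got %d Hits:" % res['hits']['total'])
        f.write(json.dumps(res).encode("utf-8"))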
I am currently exporting a database from Firebase into JSON and want to upload it to BigQuery. However, some of the field names in the database contain nested information, and BigQuery does not accept it this way. How can I delete 'Peripherals' from every record in my JSON in which it is present? It is not present in every record, though. I have provided an example of what the JSON looks like below. Thanks for the help!
{"AppName": "DataWorks", "foundedPeripheralCount": 1, "version": "1.6.1(8056)", "deviceType": "iPhone 6", "createdAt": "2017-04-05T07:05:30.408Z", "updatedAt": "2017-04-05T07:08:49.569Z", "Peripherals": {"1CA726ED-32B1-43B4-9071-B58BBACE20A8": "Arduino"}, "connectedPeripheralCount": 1, "iOSVersion": "10.2.1"}
{"objectId": "20H5Hg2INB", "foundedPeripheralCount": 0, "DeviceVendorID": "5B7F085E-B3B6-4270-97DC-F42903CDEAC1", "version": "1.3.5(5801)", "deviceType": "iPhone 6", "createdAt": "2015-11-10T06:16:45.459Z", "updatedAt": "2015-11-10T06:16:45.459Z", "connectedPeripheralCount": 0, "iOSVersion": "9.1"}
{"AppName": "DataWorks", "foundedPeripheralCount": 2, "version": "1.6.2(8069)", "deviceType": "iPhone 6s", "createdAt": "2017-04-12T10:05:05.937Z", "updatedAt": "2017-07-06T07:33:02.006Z", "Peripherals": {"060EBAFD-3120-4AAD-8B0A-EC14A323FA25": "28902 ", "identifierInternalSensors": "Internal Sensors", "0521A273-FAA5-462E-B9EC-FBB3D60F5E99": "28895 "}, "connectedPeripheralCount": 8, "iOSVersion": "10.2.1"}
I have tried this
import json

with open('firetobq_peripheral.json') as out_file:
    out = json.load(out_file)

for element in out:
    del element['Peripherals']

print(out)
but I receive this error
Traceback (most recent call last):
File "editjson.py", line 3, in <module>
out = json.load(out_file)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 290, in load
**kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.py", line 369, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 2 column 1 - line 629 column 1 (char 311 - 203056)
It looks like the data in 'firetobq_peripheral.json' is not a single valid JSON document; it appears to be newline-delimited JSON, i.e. one object per line. If each record is on its own line, you can use this code:
import json

with open('firetobq_peripheral.json', 'r') as in_file:
    dicts = []
    for line in in_file.readlines():
        d = json.loads(line.strip())
        if d.get('Peripherals'):
            del d['Peripherals']
        dicts += [d]

with open('firetobq_peripheral.json', 'w') as out_file:
    out_file.write('[\n')
    for i, v in enumerate(dicts):
        out_file.write(json.dumps(v) + ('\n' if i == len(dicts) - 1 else ',\n'))
    out_file.write(']')
Use this code for properly formatted json data:
with open('firetobq_peripheral.json', 'r') as in_file:
    dicts = json.load(in_file)

for d in dicts:
    if d.get('Peripherals'):
        del d['Peripherals']

with open('firetobq_peripheral.json', 'w') as out_file:
    out_file.write(json.dumps(dicts, indent=2))
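One caveat: BigQuery's JSON load jobs generally expect newline-delimited JSON (one object per line) rather than a JSON array, so for the upload step you may want to write the cleaned records back out in that shape. A minimal sketch, assuming dicts already holds the cleaned records from the code above (the output filename is just a placeholder):
import json

# Write newline-delimited JSON, the format BigQuery load jobs expect.
with open('firetobq_peripheral_clean.json', 'w') as out_file:
    for d in dicts:
        out_file.write(json.dumps(d) + '\n')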