CSV to elasticsearch with python SerializationError - python

When i try to send the bulk_data to the local elasticsearch, my data isn't loaded because of the SerializationError.
I already tried to fill the empty cells in the csv file, but that wasn't the solution.
from elasticsearch import Elasticsearch
bulk_data = []
header = []
count = 0
for row in csv_file_object:
if count > 0 :
data_dict = {}
for i in range(len(row)):
row = row.rstrip()
data_dict[header[i]] = row[i]
op_dict = {
"index": {
"_index": INDEX_NAME,
"_type": TYPE_NAME,
}
}
bulk_data.append(op_dict)
bulk_data.append(data_dict)
else:
header = row
count = count+1
# create ES client, create index
es = Elasticsearch(hosts = [ES_HOST])
if es.indices.exists(INDEX_NAME):
print("deleting '%s' index..." % (INDEX_NAME))
res = es.indices.delete(index = INDEX_NAME)
res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True)
See image for the SerializationError and bulk_data values:
Please note: the \n is added by the serialization process itself.

I try to repond to you but I can't understand one thing. How you retrieve your field name from data? In your code I see that you retrieve it from a list called header that is empty? I can't understand how you take this value.. Check my answer i don't know if i understand well
from elasticsearch import Elasticsearch
from elasticsearch import helpers
index_name = "your_index_name"
doc_type = "your_doc_type"
esConnector = Elasticsearch(["http://192.168.1.1:9200/"])
# change your ip here
count = 0
def generate_data(csv_file_object)
with open(csv_file_object, "r") as f:
for line in f:
line = line.split(",").rstrip()
data_dict = {header[count]: line}
obj={
'_op_type': 'index',
'_index': index_name,
'_type': doc_type,
'_id': count+1,
'_source': data_dict
}
count +=1
yield obj
for success, info in helpers.parallel_bulk(client=esConnector, actions=generate_data(csv_file_object), thread_count=4):
if not success:
print 'Doc failed', info

Related

IP URL Mapping in JSON log file

I have a JSON log file and want to print and count the number of times a URL(requestURL) has been hit by an IP in the same log file. The output should be like the below:
IP(remoteIp): URL1-(Count), URL2-(Count), URL3...
127.0.0.1: http://www.google.com - 12, www.bing.com/servlet-server.jsp - 2, etc..
The Sample of the Logfile is like below
"insertId": "kdkddkdmdkd",
"jsonPayload": {
"#type": "type.googleapis.com/google.cloud.loadbalancing.type.LoadBalancerLogEntry",
"enforcedSecurityPolicy": {
"configuredAction": "DENY",
"outcome": "DENY",
"preconfiguredExprIds": [
"owasp-crs-v030001-id942220-sqli"
],
"name": "shbdbbddjdjdjd",
"priority": 2000
},
"statusDetails": "body_denied_by_security_policy"
},
"httpRequest": {
"requestMethod": "POST",
"requestUrl": "https://dknnkkdkddkd/token",
"requestSize": "3004",
"status": 403,
"responseSize": "274",
"userAgent": "okhttp/3.12.2",
"remoteIp": "127.0.0.1",
"serverIp": "123.123.33.31",
"latency": "0.018728s"
}
The solution that I am using is below. I am able to get the total hits per IP or how many total times a URL has been hit etc.
import json
from collections import Counter
unique_ip = {}
request_url = {}
def getAndSaveValueSafely(freqTable, searchDict, key):
try:
tmp = searchDict['httpRequest'][key]
if tmp in freqTable:
freqTable[tmp] += 1
else:
freqTable[tmp] = 1
except KeyError:
if 'not_present' in freqTable:
freqTable['not_present'] += 1
else:
freqTable['not_present'] = 1
with open("threat_intel_1.json") as file:
data = json.load(file)
for d2 in data:
getAndSaveValueSafely(unique_ip, d2, 'remoteIp')
getAndSaveValueSafely(request_url, d2, 'requestUrl')
mc_unique_ip = (dict(Counter(unique_ip).most_common()))
mc_request_url = (dict(Counter(request_url).most_common()))
def printing():
a = str(len(unique_ip))
b = str(len(request_url))
with open("output.txt", "w") as f1:
print(
f' Start Time of log = {minTs}'
f' \n\n End Time of log = {maxTs} \n\n\n {a} Unique IP List = {mc_unique_ip} \n\n\n {b} Unique URL = {mc_request_url},file=f1)
I dont think you need to use counter and are unlikely to see any benifit
from collections import defaultdict
result = {} # start empty
with open("threat_intel_1.json") as file:
data = json.load(file)
for d2 in data:
req = d2.get('httpRequest',None)
if not req:
continue
url = req['requestUrl']
ip = req['remoteIp']
result.setdefault(url,defaultdict(int))[ip] += 1
print(result)
# {"/endpoint.html": {"127.2.3.4":15,"222.11.31.22":2}}
if instead you want it the other way thats easy also
for d2 in data:
req = d2.get('httpRequest',None)
if not req:
continue
url = req['requestUrl']
ip = req['remoteIp']
result.setdefault(ip,defaultdict(int))[url] += 1
#{"127.1.2.3",{"/endpoint1.html":15,"/endpoint2.php":1},"33.44.55.66":{"/endpoint1.html":5}, ...}
instead of using defaultdict you could add a line
# result.setdefault(ip,defaultdict(int))[url] += 1
result.setdefault(ip,{})
result[ip][url] = result[ip].get(url,0) + 1
which arguably is more readable anyway...

Error when posting Payload data string to Hubspot using an AWS Lambda Python API call

I have recently uploaded contact records to HubSpot using Postman. Here is a raw JSON data example and POST method that I use to successfully upload a contact:
https://api.hubapi.com/crm/v3/objects/contacts?hapikey={{hapikey}}
{properties": {
"smbapi": "yes",
"email": "fcgrinding#junkstermail.com",
"business_name":"Forest City Grinding Inc",
"srvc_address_1":"3544 Stenstrom Rd",
"srvc_city_1":"",
"srvc_state_1":"IL",
"srvc_zip_1":"61109",
"proposal_date":"2021-12-07",
"proposal_start_date": "2022-01-01",
"udc_code_1": "COMED",
"eog":"electric",
"fixedprice_1_gas_mcf": 6.63,
"fixedprice_2_gas_mcf": 6.11,
"fixedprice_3_gas_mcf": 5.9,
"term_1": 12,
"term_2": 24,
"term_3": 36,
"smb_bdm_name": "Timothy Chin",
"smb_bdm_phone": "833-999-9999",
"smb_bdm_email": "tim.chin#junkstermail.com"
}
}
Next, I then created a python lambda function to automate this process because we want to ingest CSV files that may have many records to extract. So, I had constructed the dictionary to look the same as the string above which had worked great/fine with Postman. However, when I try and do a Post method API call to HubSpot, using my dictionary payload, I am getting this error:
Invalid input JSON : Cannot build ObjectSchemaEgg, Invalid input JSON
on line 1, column 2: Cannot build ObjectSchemaEgg, some of required
attributes are not set [name, labels]
Here is the processed dictionary string that my code constructed for the API call:
{'properties': '{"smbapi": "yes", "business_name": "Forest City Grinding Inc", "srvc_address_1": "4844 Stenstrom Rd", "srvc_state_1": "IL", "srvc_zip_1": "61109", "proposal_date": "2021-12-07", "proposal_start_date": "2022-01-01", "udc_code_1": "COMED", "fixedprice_1": "6.63", "fixedprice_2": "6.11", "fixedprice_3": "5.9", "term_2": "24", "term_3": "36", "smb_bdm_name": "Gary Wysong", "smb_bdm_phone": "833-389-0881", "smb_bdm_email": "gary.wysong#constellation.com"}'}
Here is my Lambda code in full (give special attention to both the call to post_to_hubspot() and also the post_to_hubspot() function itself). The code that loads the dynamo table is working correctly.:
import boto3
import json
import decimal
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key, Attr
import re
import pandas as pd
import numpy as np
import os
import datetime
from os import urandom
import email
import base64
import requests
from datetime import datetime, timedelta, timezone
import mailparser
import calendar
global payload_data
landing_zone_bucket_name = str(os.environ['BUCKETNAME'])
s3 = boto3.resource('s3')
landing_zone_bucket = s3.Bucket(landing_zone_bucket_name )
s3r = boto3.client('s3')
dynamodb = boto3.resource('dynamodb', region_name='us-west-2')
table = dynamodb.Table(str(os.environ['DYNAMOTABLE']))
unprocessed_records_table = dynamodb.Table(str(os.environ['UNPROCESSEDTABLE']))
email_table = dynamodb.Table(str(os.environ['EMAILSTATUSTABLE']))
endpoint_url=os.environ['ENDPOINT_URL']
access_key = os.environ['ACCESSKEY']
now = datetime.now()
today_date = datetime.strftime(now,'%d')
today_month = datetime.strftime(now,'%m')
today_year = datetime.strftime(now,'%Y')
time_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
payload_data = {}
#WRITE RECORDS TO DYNAMO
def dynamoPut(dObj,table_name=None):
try:
for each in list(dObj['Information']):
if dObj['Information'][each]:
dObj['Information'][each] = str(dObj['Information'][each])
else:
del dObj['Information'][each]
dObj['Month'] = today_month
dObj['Year'] = today_year
dObj['Day'] = today_date
for each in list(dObj):
if dObj[each] != '':
dObj[each] = dObj[each]
else:
del dObj[each]
if table_name != None:
response = unprocessed_records_table.put_item(Item = dObj)
else:
response = table.put_item(Item = dObj)
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
return True
else:
return False
except Exception as e:
print(e)
return False
def dynamoPutFileName(filename,source_type):
try:
dObj = {}
dObj['id'] = urandom(20).hex()
dObj['CreatedAt'] = str(datetime.now())
dObj['FileName'] = filename
dObj['Type'] = source_type
dObj['EmailSent'] = False
response = email_table.put_item(Item = dObj)
if response['ResponseMetadata']['HTTPStatusCode'] == 200:
return True
else:
return False
except Exception as e:
print(e)
return False
def parse_csv_hubspot(event, obj):
#parsing CSV file to write to dynamo
try:
def auto_truncate(val):
return val[:255 ]
print('<< IN PARSE CSV HUBSPOT >>')
print(event)
csv = pd.read_csv(obj['Body'], encoding = "ISO-8859-1")
csv_nn = csv.replace(np.nan, 'null', regex=True)
d = csv_nn.to_dict(orient='records')
source_id = urandom(20).hex()
file_name = event['file_path'].split('/')[-1]
print('<< FILE NAME >>', file_name)
for each in d:
try:
dbObj = {}
#PASSING THE EXTERNAL KEY
UniqueKey = ''
if 'smbapi' in each and each['smbapi'] != 'null':
dbObj['smbapi' ] = each['smbapi']
print('<< SMB API>>', dbObj['smbapi' ])
if 'business_name' in each and each['business_name'] != 'null':
dbObj['business_name'] = each['business_name']
print('<< BUSINESS NAME >>', dbObj['business_name'])
if 'srvc_address_1' in each and each['srvc_address_1'] != 'null':
dbObj['srvc_address_1'] = each['srvc_address_1']
print('<< ADDRESS 1 >>', dbObj['srvc_address_1'])
if 'srvc_city_1' in each and each['srvc_city_1'] != 'null':
dbObj['srvc_city_1'] = each['srvc_city_1']
if 'srvc_state_1' in each and each['srvc_state_1'] != 'null':
dbObj['srvc_state_1'] = each['srvc_state_1']
if 'srvc_zip_1' in each and each['srvc_zip_1'] != 'null':
dbObj['srvc_zip_1']= str(each['srvc_zip_1']).zfill(5)
if 'proposal_date' in each and each['proposal_date'] != 'null':
dbObj['proposal_date']= try_parsing_date(each['proposal_date']).date().isoformat()
if 'proposal_start_date' in each and each['proposal_start_date'] != 'null':
dbObj['proposal_start_date']= try_parsing_date(each['proposal_start_date']).date().isoformat()
if 'udc_code_1' in each and each['udc_code_1'] != 'null':
dbObj['udc_code_1']= each['udc_code_1']
if 'eog' in each and each['eog'] != 'null':
dbObj['eog']= each['eog']
if 'fixedprice_1' in each and each['fixedprice_1'] != 'null':
dbObj['fixedprice_1']= each['fixedprice_1']
if 'fixedprice_2' in each and each['fixedprice_2'] != 'null':
dbObj['fixedprice_2']= each['fixedprice_2']
if 'fixedprice_3' in each and each['fixedprice_3'] != 'null':
dbObj['fixedprice_3']= each['fixedprice_3']
if 'fixedprice_1_gas_therm' in each and each['fixedprice_1_gas_therm'] != 'null':
dbObj['fixedprice_1_gas_therm']= each['fixedprice_1_gas_therm']
if 'fixedprice_2_gas_therm' in each and each['fixedprice_2_gas_therm'] != 'null':
dbObj['fixedprice_2_gas_therm']= each['fixedprice_2_gas_therm']
if 'fixedprice_3_gas_therm' in each and each['fixedprice_3_gas_therm'] != 'null':
dbObj['fixedprice_3_gas_therm']= each['fixedprice_3_gas_therm']
if 'fixedprice_1_gas_ccf' in each and each['fixedprice_1_gas_ccf'] != 'null':
dbObj['fixedprice_1_gas_ccf']= each['fixedprice_1_gas_ccf']
if 'fixedprice_2_gas_ccf' in each and each['fixedprice_2_gas_ccf'] != 'null':
dbObj['fixedprice_2_gas_ccf']= each['fixedprice_2_gas_ccf']
if 'fixedprice_3_gas_ccf' in each and each['fixedprice_3_gas_ccf'] != 'null':
dbObj['fixedprice_3_gas_ccf']= each['fixedprice_3_gas_ccf']
if 'fixedprice_1_gas_dth' in each and each['fixedprice_1_gas_dth'] != 'null':
dbObj['fixedprice_1_gas_dth']= each['fixedprice_1_gas_dth']
if 'fixedprice_2_gas_dth' in each and each['fixedprice_2_gas_dth'] != 'null':
dbObj['fixedprice_2_gas_dth']= each['fixedprice_2_gas_dth']
if 'fixedprice_3_gas_dth' in each and each['fixedprice_3_gas_dth'] != 'null':
dbObj['fixedprice_3_gas_dth']= each['fixedprice_3_gas_dth']
if 'fixedprice_1_gas_mcf' in each and each['fixedprice_1_gas_mcf'] != 'null':
dbObj['fixedprice_1_gas_mcf']= each['fixedprice_1_gas_mcf']
if 'fixedprice_2_gas_mcf' in each and each['fixedprice_2_gas_mcf'] != 'null':
dbObj['fixedprice_2_gas_mcf']= each['fixedprice_2_gas_mcf']
if 'fixedprice_3_gas_mcf' in each and each['fixedprice_3_gas_mcf'] != 'null':
dbObj['fixedprice_3_gas_mcf']= each['fixedprice_3_gas_mcf']
if 'term_1' in each and each['term_1'] != 'null':
dbObj['term_1']= each['term_1']
if 'term_2' in each and each['term_2'] != 'null':
dbObj['term_2']= each['term_2']
if 'term_3' in each and each['term_3'] != 'null':
dbObj['term_3']= each['term_3']
if 'smb_bdm_name' in each and each['smb_bdm_name'] != 'null':
dbObj['smb_bdm_name']= each['smb_bdm_name']
if 'smb_bdm_phone' in each and each['smb_bdm_phone'] != 'null':
if '.' in str(each['smb_bdm_phone']):
dbObj['smb_bdm_phone']= str(int(float(each['smb_bdm_phone'])))
else:
dbObj['smb_bdm_phone']= str(each['smb_bdm_phone'])
if 'smb_bdm_email' in each and each['smb_bdm_email'] != 'null' and each['smb_bdm_email'].strip() != '' and each['smb_bdm_email'] != None:
dbObj['smb_bdm_email']= each['smb_bdm_email']
print('<< OBJ >> ',dbObj)
N = urandom(20).hex()
now = str(datetime.now())
#<< END of HUBSPOT INGESTION >>
# table.put_item(
Item = {
'CogId' : str(N),
'CreatedAt': now,
'ExternalId': UniqueKey,
'Information' : dbObj,
'SourceBucket': landing_zone_bucket_name,
'SourcePath' : event['file_path'],
'Source' : 'HubSpot',
'SourceId' : source_id,
'SourceFileName': time_stamp + '_' + file_name
}
#WRITE-TO-DYNAMO
files_processing = dynamoPut(Item)
if not files_processing:
print('Writing {} record to dynamodb Failed'.format(Item))
except Exception as e:
print(e)
N = urandom(20).hex()
Item = {
'CogId' : str(N),
'CreatedAt': now,
'Information' : each,
'SourceBucket': landing_zone_bucket_name,
'SourcePath' : event['file_path'],
'Source' : 'HubSpot',
'message': str(e),
'SourceId' : source_id,
'ExternalId': UniqueKey
}
files_processing = dynamoPut(Item,'Fail')
pass
temp_file_name = time_stamp + '_' + file_name
isert_file_name = dynamoPutFileName(temp_file_name,'HubSpot')
post_to_hubspot(dbObj)
return True
except Exception as e:
print(e)
new_folder_path = os.environ['CSV_NEW_FOLDER_HUBSPOT']
unprocessed_folder_path = os.environ['CSV_ERROR_FOLDER_HUBSPOT']
# MOVING PROCESSED FILES FROM NEW TO UNPROCESSED FOLDER
move_file_to_processed = moving_files_new_to_processed(event, new_folder_path,unprocessed_folder_path)
return False
def try_parsing_date(text):
for fmt in ('%m/%d/%Y','%Y-%m-%dT%H:%M:%S-%f', '%m/%d/%y', '%Y-%m-%d', '%m.%d.%Y','%Y-%m-%dT%I', '%Y-%m-%dT%I%p', '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S.%f+','%Y-%m-%dT%H:%M:%S'):#2018-11-20T08:05:54-0500
try:
return datetime.strptime(text, fmt)
except ValueError:
print('in except')
pass
return ValueError('no valid date format found')
def post_to_hubspot(list_contacts):
print('<< IN POST-To-HUBSPOT >>')
data_string = **json.dumps(list_contacts)**
payload_data = {"properties": data_string}
print('<< dbOBJ LIST >> ',payload_data)
response = requests.request("POST", endpoint_url+access_key, headers={'Content-Type': 'application/json'}, data=payload_data)
token_response=json.loads(response.text)
print('<< TOKEN RESPONSE >>',token_response)
def moving_files_new_to_processed(event, new_folder,processed_folder):
#MOVING-FILES-TO-PROCESSED
try:
copy_source = {
'Bucket': landing_zone_bucket_name,
'Key': event['file_path']
}
path = event['file_path']
processed_folder = processed_folder + time_stamp + '_'
new_key = path.replace(new_folder, processed_folder)
new_obj = landing_zone_bucket.Object(new_key)
new_obj.copy(copy_source)
s3.Object(landing_zone_bucket_name, event['file_path']).delete()
return True
except Exception as e:
print(e)
return False
def lambda_handler(event,context):
print("Starting to Push Records to Dynamo Lambda")
print(event)
try:
parse_flag = False
new_folder_path = ''
processed_folder_path = ''
#Gets file path and calls required function to parse it out
key = str(os.environ['CSV_NEW_FOLDER_HUBSPOT'])
obj = s3r.get_object(Bucket=landing_zone_bucket_name, Key=event['file_path'])
print('after obj')
print(os.environ['CSV_NEW_FOLDER_HUBSPOT'])
print('in HubSpot parse_csv')
parse_csv_func = parse_csv_hubspot(event, obj)
# Checks if parse_csv return empty dictionary
if parse_csv_func:
parse_flag = True
new_folder_path = os.environ['CSV_NEW_FOLDER_HUBSPOT']
processed_folder_path = os.environ['CSV_PROCESSED_FOLDER_HUBSPOT']
else:
print('File Format not Supported for {}'.format(event['file_path']))
if parse_flag:
# UPLOADING CONTACT.MOVING PROCESSED FILES FROM NEW TO PROCESSED FOLDER
#print('<< PAYLOAD >> ',payload)
#response = requests.request("POST", "https://api.hubapi.com/crm/v3/schemas/?hapikey="+access_key, headers={'Content-Type': 'application/json'}, data=json.dumps(str(payload)))
#token_response=json.loads(response.text)
#print('<< TOKEN RESPONSE >>',token_response)
#MOVING PROCESSED FILES FROM NEW TO PROCESSED FOLDER
move_file_to_processed = moving_files_new_to_processed(event, new_folder_path,processed_folder_path)
if move_file_to_processed:
print('File {} moved Successfully from {} to {}'.format(event['file_path'],new_folder_path,processed_folder_path))
else:
print('Moving {} file from new to processing folder Failed'.format(event['file_path']))
except Exception as e:
print(e)
What could be the problem? Thanks for your help.
The problem was caused by two issues:
The dictionary should have been placed in json.dumps() to convert it to JSON string when doing a POST so the dictionary didn't need to change its structure. Here's the response from the POST:
<< TOKEN RESPONSE >> {
"id": "135120801",
"properties": {
"business_name": "Millers Brand Oats",
"createdate": "2021-12-21T02:31:12.452Z",
"fixedprice_1": "6.63",
"fixedprice_2": "6.11",
"fixedprice_3": "5.9",
"hs_all_contact_vids": "135120801",
"hs_is_contact": "true",
"hs_is_unworked": "true",
"hs_marketable_until_renewal": "false",
"hs_object_id": "135120801",
"hs_pipeline": "contacts-lifecycle-pipeline",
"lastmodifieddate": "2021-12-21T02:31:12.452Z",
"proposal_date": "2021-12-07",
"proposal_start_date": "2022-01-01",
"smb_bdm_email": "Tim.Chu#junkster.com",
"smb_bdm_name": "Tim Chu",
"smb_bdm_phone": "833-999-9999",
"smbapi": "yes",
"srvc_address_1": "4844 Stenstrom Rd",
"srvc_state_1": "IL",
"srvc_zip_1": "61109",
"term_2": "24",
"term_3": "36",
"udc_code_1": "COMED"
},
"createdAt": "2021-12-21T02:31:12.452Z",
"updatedAt": "2021-12-21T02:31:12.452Z",
"archived": false
}
I was using the wrong endpoint:
https://api.hubapi.com/crm/v3/schemas/
instead of:
https://api.hubapi.com/crm/v3/objects/contacts/
Now I just need to find out why the AWS Lambda POSTs allow for duplicate contacts to be created in HubSpot while Postman POSTs prohibit duplicate contacts to be created.

constructing a message format from the fetchall result in python

*New to Programming
Question: I need to use the below "Data" (two rows as arrays) queried from sql and use it to create the message structure below.
data from sql using fetchall()
Data = [[100,1,4,5],[101,1,4,6]]
##expected message structure
message = {
"name":"Tom",
"Job":"IT",
"info": [
{
"id_1":"100",
"id_2":"1",
"id_3":"4",
"id_4":"5"
},
{
"id_1":"101",
"id_2":"1",
"id_3":"4",
"id_4":"6"
},
]
}
I tried to create below method to iterate over the rows and then input the values, this is was just a starting, but this was also not working
def create_message(data)
for row in data:
{
"id_1":str(data[0][0],
"id_2":str(data[0][1],
"id_3":str(data[0][2],
"id_4":str(data[0][3],
}
Latest Code
def create_info(data):
info = []
for row in data:
temp_dict = {"id_1_tom":"","id_2_hell":"","id_3_trip":"","id_4_clap":""}
for i in range(0,1):
temp_dict["id_1_tom"] = str(row[i])
temp_dict["id_2_hell"] = str(row[i+1])
temp_dict["id_3_trip"] = str(row[i+2])
temp_dict["id_4_clap"] = str(row[i+3])
info.append(temp_dict)
return info
Edit: Updated answer based on updates to the question and comment by original poster.
This function might work for the example you've given to get the desired output, based on the attempt you've provided:
def create_info(data):
info = []
for row in data:
temp_dict = {}
temp_dict['id_1_tom'] = str(row[0])
temp_dict['id_2_hell'] = str(row[1])
temp_dict['id_3_trip'] = str(row[2])
temp_dict['id_4_clap'] = str(row[3])
info.append(temp_dict)
return info
For the input:
[[100, 1, 4, 5],[101,1,4,6]]
This function will return a list of dictionaries:
[{"id_1_tom":"100","id_2_hell":"1","id_3_trip":"4","id_4_clap":"5"},
{"id_1_tom":"101","id_2_hell":"1","id_3_trip":"4","id_4_clap":"6"}]
This can serve as the value for the key info in your dictionary message. Note that you would still have to construct the message dictionary.

Python Append Data from Loop into Data frame

I created this code where I am able to pull the data I want but not able to sort it as it should be. I am guessing it has to do with the way I am appending each item by ignoring index but I can't find my way around it.
This is my code:
import json
import pandas as pd
#load json object
with open("c:\Sample.json","r",encoding='utf-8') as file:
data = file.read()
data2 = json.loads(data)
print("Type:", type(data2))
cls=['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns = cls )
for d in data2['mydata']:
for k,v in d.items():
#print(k)
if k == 'attachments':
#print(d.get('attachments')[0]['id'])
image = (d.get('attachments')[0]['id'])
df=df.append({'Image':image},ignore_index = True)
#df['Message'] = image
if k == 'author_user_email':
#print(d.get('author_user_email'))
email = (d.get('author_user_email'))
df=df.append({'Email':email}, ignore_index = True)
#df['Email'] = email
if k == 'author_user_name':
#print(d.get('author_user_name'))
user = (d.get('author_user_name'))
df=df.append({'User':user}, ignore_index = True)
#df['User'] = user
if k == 'room_name':
#print(d.get('room_name'))
members = (d.get('room_name'))
df=df.append({'Members':members}, ignore_index = True)
#df['Members'] = members
if k == 'ts_iso':
#print(d.get('ts_iso'))
time = (d.get('ts_iso'))
df=df.append({'Time':time}, ignore_index = True)
#df['Time'] = time
df
print('Finished getting Data')
df1 = (df.head())
print(df)
print(df.head())
df.to_csv(r'c:\sample.csv', encoding='utf-8')
The code gives me this as the result
I am looking to get this
Data of the file is this:
{
"mydata": [
{
"attachments": [
{
"filename": "image.png",
"id": "888888888"
}
],
"author_user_email": "email#email.com",
"author_user_id": "91",
"author_user_name": "Marlone",
"message": "",
"room_id": "999",
"room_members": [
{
"room_member_id": "91",
"room_member_name": "Marlone"
},
{
"room_member_id": "9191",
"room_member_name": " +16309438985"
}
],
"room_name": "SMS [Marlone] [ +7777777777]",
"room_type": "sms",
"ts": 55,
"ts_iso": "2021-06-13T18:17:32.877369+00:00"
},
{
"author_user_email": "email#email.com",
"author_user_id": "21",
"author_user_name": "Chris",
"message": "Hi",
"room_id": "100",
"room_members": [
{
"room_member_id": "21",
"room_member_name": "Joe"
},
{
"room_member_id": "21",
"room_member_name": "Chris"
}
],
"room_name": "Direct [Chris] [Joe]",
"room_type": "direct",
"ts": 12345678910,
"ts_iso": "2021-06-14T14:42:07.572479+00:00"
}]}
Any help would be appreciated. I am new to python and am learning on my own.
Try:
import json
import pandas as pd
with open("your_data.json", "r") as f_in:
data = json.load(f_in)
tmp = []
for d in data["mydata"]:
image = d.get("attachments", [{"id": None}])[0]["id"]
email = d.get("author_user_email")
user = d.get("author_user_name")
members = d.get("room_name")
time = d.get("ts_iso")
tmp.append((image, email, user, members, time))
df = pd.DataFrame(tmp, columns=["Image", "Email", "User", "Members", "Time"])
print(df)
Prints:
Image Email User Members Time
0 888888888 email#email.com Marlone SMS [Marlone] [ +7777777777] 2021-06-13T18:17:32.877369+00:00
1 None email#email.com Chris Direct [Chris] [Joe] 2021-06-14T14:42:07.572479+00:00
Although the other answer does work, pandas has a built in reader for json files pd.read_json: https://pandas.pydata.org/pandas-docs/version/1.1.3/reference/api/pandas.read_json.html
It has the benefit of being able to handle very large datasets via chunking, as well as processing quite a few different formats. The other answer would not be performant for a large dataset.
This would get you started:
import pandas as pd
df = pd.read_json("c:\Sample.json")
The probblem is that append() adds a new row. So, you have to use at[] https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html specifying the index/row. Se below. Some print/debug messages were left and path to input and output files was changed a little because I'm on Linux.
import json
import pandas as pd
import pprint as pp
#load json object
with open("Sample.json","r",encoding='utf-8') as file:
data = file.read()
data2 = json.loads(data)
#pp.pprint(data2)
cls=['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns = cls )
pp.pprint(df)
index = 0
for d in data2['mydata']:
for k,v in d.items():
#print(k)
if k == 'attachments':
#print(d.get('attachments')[0]['id'])
image = (d.get('attachments')[0]['id'])
df.at[index, 'Image'] = image
#df['Message'] = image
if k == 'author_user_email':
#print(d.get('author_user_email'))
email = (d.get('author_user_email'))
df.at[index, 'Email'] = email
#df['Email'] = email
if k == 'author_user_name':
#print(d.get('author_user_name'))
user = (d.get('author_user_name'))
df.at[index, 'User'] = user
#df['User'] = user
if k == 'room_name':
#print(d.get('room_name'))
members = (d.get('room_name'))
df.at[index, 'Members'] = members
#df['Members'] = members
if k == 'ts_iso':
#print(d.get('ts_iso'))
time = (d.get('ts_iso'))
df.at[index, 'Time'] = time
#df['Time'] = time
index += 1
# start indexing from 0
df.reset_index()
# replace empty str/cells witn None
df.fillna('None', inplace=True)
pp.pprint(df)
print('Finished getting Data')
df1 = (df.head())
print(df)
print(df.head())
df.to_csv(r'sample.csv', encoding='utf-8')

JSON Parsing help in Python

I have below data in JSON format, I have started with code below which throws a KEY ERROR.
Not sure how to get all data listed in headers section.
I know I am not doing it right in json_obj['offers'][0]['pkg']['Info']: but not sure how to do it correctly.
how can I get to different nodes like info,PricingInfo,Flt_Info etc?
{
"offerInfo":{
"siteID":"1",
"language":"en_US",
"currency":"USD"
},
"offers":{
"pkg":[
{
"offerDateRange":{
"StartDate":[
2015,
11,
8
],
"EndDate":[
2015,
11,
14
]
},
"Info":{
"Id":"111"
},
"PricingInfo":{
"BaseRate":1932.6
},
"flt_Info":{
"Carrier":"AA"
}
}
]
}
}
import os
import json
import csv
f = open('api.csv','w')
writer = csv.writer(f,delimiter = '~')
headers = ['Id' , 'StartDate', 'EndDate', 'Id', 'BaseRate', 'Carrier']
default = ''
writer.writerow(headers)
string = open('data.json').read().decode('utf-8')
json_obj = json.loads(string)
for pkg in json_obj['offers'][0]['pkg']['Info']:
row = []
row.append(json_obj['id']) # just to test,but I need column values listed in header section
writer.writerow(row)
It looks like you're accessing the json incorrectly. After you have accessed json_obj['offers'], you accessed [0], but there is no array there. json_obj['offers'] gives you another dictionary.
For example, to get PricingInfo like you asked, access like this:
json_obj['offers']['pkg'][0]['PricingInfo']
or 11 from the StartDate like this:
json_obj['offers']['pkg'][0]['offerDateRange']['StartDate'][1]
And I believe you get the KEY ERROR because you access [0] in the dictionary, which since that isn't a key, you get the error.
try to substitute this piece of code:
for pkg in json_obj['offers'][0]['pkg']['Info']:
row = []
row.append(json_obj['id']) # just to test,but I need column values listed in header section
writer.writerow(row)
With this:
for pkg in json_obj['offers']['pkg']:
row.append(pkg['Info']['Id'])
year = pkg['offerDateRange']['StartDate'][0]
month = pkg['offerDateRange']['StartDate'][1]
day = pkg['offerDateRange']['StartDate'][2]
StartDate = "%d-%d-%d" % (year,month,day)
print StartDate
writer.writerow(row)
Try this
import os
import json
import csv
string = open('data.json').read().decode('utf-8')
json_obj = json.loads(string)
print json_obj["offers"]["pkg"][0]["Info"]["Id"]
print str(json_obj["offers"]["pkg"][0]["offerDateRange"]["StartDate"][0]) +'-'+ str(json_obj["offers"]["pkg"][0]["offerDateRange"]["StartDate"][1])+'-'+str(json_obj["offers"]["pkg"][0]
["offerDateRange"]["StartDate"][2])
print str(json_obj["offers"]["pkg"][0]["offerDateRange"]["EndDate"][0]) +'-'+ str(json_obj["offers"]["pkg"][0]["offerDateRange"]["EndDate"][1])+'-'+str(json_obj["offers"]["pkg"][0]
["offerDateRange"]["EndDate"][2])
print json_obj["offers"]["pkg"][0]["Info"]["Id"]
print json_obj["offers"]["pkg"][0]["PricingInfo"]["BaseRate"]
print json_obj["offers"]["pkg"][0]["flt_Info"]["Carrier"]

Categories

Resources