How to split a large JSON file into smaller ones in Python

I have a base.json file that I would like to split into smaller files containing only filtered data, using a Python script with the json module. At the moment I can filter the data, but I don't know how to save it to the appropriate files, such as user, computer, group, printer, etc.
I would like the script to check the base file and then save the data into the appropriate smaller files:
users --> users.json
computers --> computers.json
groups --> groups.json
etc.
First, I would like to know how to save the data to separate files; the repeated part can then be wrapped in a function such as def save_to_json(data_to_save).
My base json looks like this:
{
    "entries": [
        {
            "attributes": {
                all attributes from ldap like cn, sn, objectClass, etc
            },
            "dn": "CN=group1,OU=Groups,OU=UNIVERSUM,DC=universum,DC=local"
        },
        {
            "attributes": {
                all attributes from ldap like cn, sn, objectClass, etc
            },
            "dn": "CN=cmptr-01,OU=Computers,OU=UNIVERSUM,DC=universum,DC=local"
        },
        {
            "attributes": {
                all attributes from ldap like cn, sn, objectClass, etc
            },
            "dn": "CN=John Doe,OU=Users,OU=UNIVERSUM,DC=universum,DC=local"
        }
    ]
}
and my code looks like this:
# --- reading encoded accounts
encoded_retrieved_accounts = root_path + data_dir + encoded_accounts_file_name

with open(encoded_retrieved_accounts, 'r', encoding="UTF-8") as file:
    data = json.load(file)
retrieved_accounts = data['entries']

def save_to_json(data_to_save, fname):
    with open(f"./{fname}.json", 'w', encoding="UTF-8") as file:
        return json.dump(data_to_save, file, ensure_ascii=False, indent=4, sort_keys=True)

for account in retrieved_accounts:
    attributes = account['attributes']
    objectCategory = attributes['objectCategory']
    if str(objectCategory[3:-46]) == 'Person':
        u_data = account
        save_to_json(u_data, 'Person')
    elif str(objectCategory[3:-46]) == 'Computer':
        c_data = account
        save_to_json(c_data, 'Computer')
    elif str(objectCategory[3:-46]) == 'Organizational-Unit':
        ou_data = account
        save_to_json(ou_data, 'Organizational-Unit')
    elif str(objectCategory[3:-46]) == 'Group':
        g_data = account
        save_to_json(g_data, 'Group')
    elif str(objectCategory[3:-46]) == 'Print-Queue':
        pq_data = account
        save_to_json(pq_data, 'Print-Queue')
    elif str(objectCategory[3:-46]) == 'MSMQ-Configuration':
        msmq_data = account
        save_to_json(msmq_data, 'MSMQ-Configuration')
    else:
        unknow_data = account
        save_to_json(unknow_data, 'to-clarify')
but this saves only the last account found for each category in the base file, even though there are, say, 'n' users, 'm' groups, 'i' printers and 'j' hosts.

I changed the code and now everything works fine
import os
import json
from dotenv import load_dotenv

# --- loading .env file
load_dotenv('.env')

# ------ variables
root_path = os.environ.get('ROOT_PATH')
data_dir = os.environ.get('DATA_DIR')
tmp_dir = os.environ.get('TMP_DIR')
encoded_accounts_file_name = os.environ.get('ENCODED_ACCOUNTS_FILE_NAME')
print(" - .env loaded")

# --- reading encoded accounts
encoded_retrieved_accounts = root_path + data_dir + tmp_dir + encoded_accounts_file_name

with open(encoded_retrieved_accounts, 'r', encoding="UTF-8") as file:
    data = json.load(file)
retrieved_accounts = data['entries']

def save_to_json(data_to_save, fname):
    with open(f"./{fname}.json", 'a', encoding="UTF-8") as file:
        return json.dump(data_to_save, file, ensure_ascii=False, indent=4, sort_keys=True)

for account in retrieved_accounts:
    attributes = account['attributes']
    objectCategory = attributes['objectCategory']
    if str(objectCategory[3:-46]) == 'Person':
        u_data = account
        save_to_json(u_data, 'Person')
    elif str(objectCategory[3:-46]) == 'Computer':
        c_data = account
        save_to_json(c_data, 'Computer')
    elif str(objectCategory[3:-46]) == 'Organizational-Unit':
        ou_data = account
        save_to_json(ou_data, 'Organizational-Unit')
    elif str(objectCategory[3:-46]) == 'Group':
        g_data = account
        save_to_json(g_data, 'Group')
    elif str(objectCategory[3:-46]) == 'Print-Queue':
        pq_data = account
        save_to_json(pq_data, 'Print-Queue')
    elif str(objectCategory[3:-46]) == 'MSMQ-Configuration':
        msmq_data = account
        save_to_json(msmq_data, 'MSMQ-Configuration')
    else:
        unknow_data = account
        save_to_json(unknow_data, 'to-clarify')
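Note that appending each account one by one works, but a file containing several JSON objects written back to back is not itself a single valid JSON document. A minimal sketch of an alternative, assuming the same 'entries' structure and the same objectCategory slicing (the helper name split_accounts and the category tuple are only illustrative): collect the accounts per category in memory and dump each list once, so every output file stays valid JSON.
import json
from collections import defaultdict

def split_accounts(input_path, categories=('Person', 'Computer', 'Group',
                                           'Organizational-Unit', 'Print-Queue',
                                           'MSMQ-Configuration')):
    """Group entries by object category and write one valid JSON file per group."""
    with open(input_path, 'r', encoding="UTF-8") as file:
        entries = json.load(file)['entries']

    grouped = defaultdict(list)
    for account in entries:
        category = str(account['attributes']['objectCategory'][3:-46])
        key = category if category in categories else 'to-clarify'
        grouped[key].append(account)

    # one write per category, so each output file is a single JSON array
    for key, accounts in grouped.items():
        with open(f"./{key}.json", 'w', encoding="UTF-8") as file:
            json.dump(accounts, file, ensure_ascii=False, indent=4, sort_keys=True)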

Related

IP URL Mapping in JSON log file

I have a JSON log file and want to print and count the number of times a URL (requestUrl) has been hit by an IP in the same log file. The output should look like this:
IP(remoteIp): URL1-(Count), URL2-(Count), URL3...
127.0.0.1: http://www.google.com - 12, www.bing.com/servlet-server.jsp - 2, etc..
A sample of the log file is below:
"insertId": "kdkddkdmdkd",
"jsonPayload": {
"#type": "type.googleapis.com/google.cloud.loadbalancing.type.LoadBalancerLogEntry",
"enforcedSecurityPolicy": {
"configuredAction": "DENY",
"outcome": "DENY",
"preconfiguredExprIds": [
"owasp-crs-v030001-id942220-sqli"
],
"name": "shbdbbddjdjdjd",
"priority": 2000
},
"statusDetails": "body_denied_by_security_policy"
},
"httpRequest": {
"requestMethod": "POST",
"requestUrl": "https://dknnkkdkddkd/token",
"requestSize": "3004",
"status": 403,
"responseSize": "274",
"userAgent": "okhttp/3.12.2",
"remoteIp": "127.0.0.1",
"serverIp": "123.123.33.31",
"latency": "0.018728s"
}
The solution that I am using is below. With it I can get the total hits per IP, or how many times in total a URL has been hit, etc.
import json
from collections import Counter

unique_ip = {}
request_url = {}

def getAndSaveValueSafely(freqTable, searchDict, key):
    try:
        tmp = searchDict['httpRequest'][key]
        if tmp in freqTable:
            freqTable[tmp] += 1
        else:
            freqTable[tmp] = 1
    except KeyError:
        if 'not_present' in freqTable:
            freqTable['not_present'] += 1
        else:
            freqTable['not_present'] = 1

with open("threat_intel_1.json") as file:
    data = json.load(file)
    for d2 in data:
        getAndSaveValueSafely(unique_ip, d2, 'remoteIp')
        getAndSaveValueSafely(request_url, d2, 'requestUrl')

mc_unique_ip = dict(Counter(unique_ip).most_common())
mc_request_url = dict(Counter(request_url).most_common())

def printing():
    a = str(len(unique_ip))
    b = str(len(request_url))
    with open("output.txt", "w") as f1:
        print(
            f' Start Time of log = {minTs}'
            f' \n\n End Time of log = {maxTs} \n\n\n {a} Unique IP List = {mc_unique_ip} \n\n\n {b} Unique URL = {mc_request_url}', file=f1)
I don't think you need to use Counter, and you are unlikely to see any benefit.
from collections import defaultdict

result = {}  # start empty

with open("threat_intel_1.json") as file:
    data = json.load(file)

for d2 in data:
    req = d2.get('httpRequest', None)
    if not req:
        continue
    url = req['requestUrl']
    ip = req['remoteIp']
    result.setdefault(url, defaultdict(int))[ip] += 1

print(result)
# {"/endpoint.html": {"127.2.3.4": 15, "222.11.31.22": 2}}
If instead you want it the other way around, that's easy also:
for d2 in data:
    req = d2.get('httpRequest', None)
    if not req:
        continue
    url = req['requestUrl']
    ip = req['remoteIp']
    result.setdefault(ip, defaultdict(int))[url] += 1

# {"127.1.2.3": {"/endpoint1.html": 15, "/endpoint2.php": 1}, "33.44.55.66": {"/endpoint1.html": 5}, ...}
Instead of using defaultdict, you could add a line:
# result.setdefault(ip,defaultdict(int))[url] += 1
result.setdefault(ip,{})
result[ip][url] = result[ip].get(url,0) + 1
which arguably is more readable anyway...
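To get the output format asked for above (IP: URL - count, ...), a short formatting pass over the nested dict is enough. A sketch, assuming the IP-keyed variant of result built just above:
# Print each IP followed by its URLs and hit counts, e.g.
# 127.0.0.1: https://dknnkkdkddkd/token - 12, /endpoint2.php - 2
for ip, urls in result.items():
    hits = ", ".join(f"{url} - {count}" for url, count in urls.items())
    print(f"{ip}: {hits}")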

Error when posting Payload data string to Hubspot using an AWS Lambda Python API call

I have recently uploaded contact records to HubSpot using Postman. Here is a raw JSON data example and POST method that I use to successfully upload a contact:
https://api.hubapi.com/crm/v3/objects/contacts?hapikey={{hapikey}}
{
    "properties": {
        "smbapi": "yes",
        "email": "fcgrinding#junkstermail.com",
        "business_name": "Forest City Grinding Inc",
        "srvc_address_1": "3544 Stenstrom Rd",
        "srvc_city_1": "",
        "srvc_state_1": "IL",
        "srvc_zip_1": "61109",
        "proposal_date": "2021-12-07",
        "proposal_start_date": "2022-01-01",
        "udc_code_1": "COMED",
        "eog": "electric",
        "fixedprice_1_gas_mcf": 6.63,
        "fixedprice_2_gas_mcf": 6.11,
        "fixedprice_3_gas_mcf": 5.9,
        "term_1": 12,
        "term_2": 24,
        "term_3": 36,
        "smb_bdm_name": "Timothy Chin",
        "smb_bdm_phone": "833-999-9999",
        "smb_bdm_email": "tim.chin#junkstermail.com"
    }
}
Next, I created a Python Lambda function to automate this process, because we want to ingest CSV files that may have many records to extract. I constructed the dictionary to look the same as the string above, which had worked fine with Postman. However, when I try to do a POST API call to HubSpot using my dictionary payload, I get this error:
Invalid input JSON : Cannot build ObjectSchemaEgg, Invalid input JSON
on line 1, column 2: Cannot build ObjectSchemaEgg, some of required
attributes are not set [name, labels]
Here is the processed dictionary string that my code constructed for the API call:
{'properties': '{"smbapi": "yes", "business_name": "Forest City Grinding Inc", "srvc_address_1": "4844 Stenstrom Rd", "srvc_state_1": "IL", "srvc_zip_1": "61109", "proposal_date": "2021-12-07", "proposal_start_date": "2022-01-01", "udc_code_1": "COMED", "fixedprice_1": "6.63", "fixedprice_2": "6.11", "fixedprice_3": "5.9", "term_2": "24", "term_3": "36", "smb_bdm_name": "Gary Wysong", "smb_bdm_phone": "833-389-0881", "smb_bdm_email": "gary.wysong#constellation.com"}'}
Here is my Lambda code in full (pay special attention to the call to post_to_hubspot() and to the post_to_hubspot() function itself). The code that loads the DynamoDB table is working correctly:
import boto3
import json
import decimal
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key, Attr
import re
import pandas as pd
import numpy as np
import os
import datetime
from os import urandom
import email
import base64
import requests
from datetime import datetime, timedelta, timezone
import mailparser
import calendar
global payload_data
landing_zone_bucket_name = str(os.environ['BUCKETNAME'])
s3 = boto3.resource('s3')
landing_zone_bucket = s3.Bucket(landing_zone_bucket_name )
s3r = boto3.client('s3')
dynamodb = boto3.resource('dynamodb', region_name='us-west-2')
table = dynamodb.Table(str(os.environ['DYNAMOTABLE']))
unprocessed_records_table = dynamodb.Table(str(os.environ['UNPROCESSEDTABLE']))
email_table = dynamodb.Table(str(os.environ['EMAILSTATUSTABLE']))
endpoint_url=os.environ['ENDPOINT_URL']
access_key = os.environ['ACCESSKEY']
now = datetime.now()
today_date = datetime.strftime(now,'%d')
today_month = datetime.strftime(now,'%m')
today_year = datetime.strftime(now,'%Y')
time_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
payload_data = {}
# WRITE RECORDS TO DYNAMO
def dynamoPut(dObj, table_name=None):
    try:
        for each in list(dObj['Information']):
            if dObj['Information'][each]:
                dObj['Information'][each] = str(dObj['Information'][each])
            else:
                del dObj['Information'][each]
        dObj['Month'] = today_month
        dObj['Year'] = today_year
        dObj['Day'] = today_date
        for each in list(dObj):
            if dObj[each] != '':
                dObj[each] = dObj[each]
            else:
                del dObj[each]
        if table_name != None:
            response = unprocessed_records_table.put_item(Item=dObj)
        else:
            response = table.put_item(Item=dObj)
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            return True
        else:
            return False
    except Exception as e:
        print(e)
        return False
def dynamoPutFileName(filename, source_type):
    try:
        dObj = {}
        dObj['id'] = urandom(20).hex()
        dObj['CreatedAt'] = str(datetime.now())
        dObj['FileName'] = filename
        dObj['Type'] = source_type
        dObj['EmailSent'] = False
        response = email_table.put_item(Item=dObj)
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            return True
        else:
            return False
    except Exception as e:
        print(e)
        return False
def parse_csv_hubspot(event, obj):
    # parsing CSV file to write to dynamo
    try:
        def auto_truncate(val):
            return val[:255]
        print('<< IN PARSE CSV HUBSPOT >>')
        print(event)
        csv = pd.read_csv(obj['Body'], encoding="ISO-8859-1")
        csv_nn = csv.replace(np.nan, 'null', regex=True)
        d = csv_nn.to_dict(orient='records')
        source_id = urandom(20).hex()
        file_name = event['file_path'].split('/')[-1]
        print('<< FILE NAME >>', file_name)
        for each in d:
            try:
                dbObj = {}
                # PASSING THE EXTERNAL KEY
                UniqueKey = ''
                if 'smbapi' in each and each['smbapi'] != 'null':
                    dbObj['smbapi'] = each['smbapi']
                    print('<< SMB API>>', dbObj['smbapi'])
                if 'business_name' in each and each['business_name'] != 'null':
                    dbObj['business_name'] = each['business_name']
                    print('<< BUSINESS NAME >>', dbObj['business_name'])
                if 'srvc_address_1' in each and each['srvc_address_1'] != 'null':
                    dbObj['srvc_address_1'] = each['srvc_address_1']
                    print('<< ADDRESS 1 >>', dbObj['srvc_address_1'])
                if 'srvc_city_1' in each and each['srvc_city_1'] != 'null':
                    dbObj['srvc_city_1'] = each['srvc_city_1']
                if 'srvc_state_1' in each and each['srvc_state_1'] != 'null':
                    dbObj['srvc_state_1'] = each['srvc_state_1']
                if 'srvc_zip_1' in each and each['srvc_zip_1'] != 'null':
                    dbObj['srvc_zip_1'] = str(each['srvc_zip_1']).zfill(5)
                if 'proposal_date' in each and each['proposal_date'] != 'null':
                    dbObj['proposal_date'] = try_parsing_date(each['proposal_date']).date().isoformat()
                if 'proposal_start_date' in each and each['proposal_start_date'] != 'null':
                    dbObj['proposal_start_date'] = try_parsing_date(each['proposal_start_date']).date().isoformat()
                if 'udc_code_1' in each and each['udc_code_1'] != 'null':
                    dbObj['udc_code_1'] = each['udc_code_1']
                if 'eog' in each and each['eog'] != 'null':
                    dbObj['eog'] = each['eog']
                if 'fixedprice_1' in each and each['fixedprice_1'] != 'null':
                    dbObj['fixedprice_1'] = each['fixedprice_1']
                if 'fixedprice_2' in each and each['fixedprice_2'] != 'null':
                    dbObj['fixedprice_2'] = each['fixedprice_2']
                if 'fixedprice_3' in each and each['fixedprice_3'] != 'null':
                    dbObj['fixedprice_3'] = each['fixedprice_3']
                if 'fixedprice_1_gas_therm' in each and each['fixedprice_1_gas_therm'] != 'null':
                    dbObj['fixedprice_1_gas_therm'] = each['fixedprice_1_gas_therm']
                if 'fixedprice_2_gas_therm' in each and each['fixedprice_2_gas_therm'] != 'null':
                    dbObj['fixedprice_2_gas_therm'] = each['fixedprice_2_gas_therm']
                if 'fixedprice_3_gas_therm' in each and each['fixedprice_3_gas_therm'] != 'null':
                    dbObj['fixedprice_3_gas_therm'] = each['fixedprice_3_gas_therm']
                if 'fixedprice_1_gas_ccf' in each and each['fixedprice_1_gas_ccf'] != 'null':
                    dbObj['fixedprice_1_gas_ccf'] = each['fixedprice_1_gas_ccf']
                if 'fixedprice_2_gas_ccf' in each and each['fixedprice_2_gas_ccf'] != 'null':
                    dbObj['fixedprice_2_gas_ccf'] = each['fixedprice_2_gas_ccf']
                if 'fixedprice_3_gas_ccf' in each and each['fixedprice_3_gas_ccf'] != 'null':
                    dbObj['fixedprice_3_gas_ccf'] = each['fixedprice_3_gas_ccf']
                if 'fixedprice_1_gas_dth' in each and each['fixedprice_1_gas_dth'] != 'null':
                    dbObj['fixedprice_1_gas_dth'] = each['fixedprice_1_gas_dth']
                if 'fixedprice_2_gas_dth' in each and each['fixedprice_2_gas_dth'] != 'null':
                    dbObj['fixedprice_2_gas_dth'] = each['fixedprice_2_gas_dth']
                if 'fixedprice_3_gas_dth' in each and each['fixedprice_3_gas_dth'] != 'null':
                    dbObj['fixedprice_3_gas_dth'] = each['fixedprice_3_gas_dth']
                if 'fixedprice_1_gas_mcf' in each and each['fixedprice_1_gas_mcf'] != 'null':
                    dbObj['fixedprice_1_gas_mcf'] = each['fixedprice_1_gas_mcf']
                if 'fixedprice_2_gas_mcf' in each and each['fixedprice_2_gas_mcf'] != 'null':
                    dbObj['fixedprice_2_gas_mcf'] = each['fixedprice_2_gas_mcf']
                if 'fixedprice_3_gas_mcf' in each and each['fixedprice_3_gas_mcf'] != 'null':
                    dbObj['fixedprice_3_gas_mcf'] = each['fixedprice_3_gas_mcf']
                if 'term_1' in each and each['term_1'] != 'null':
                    dbObj['term_1'] = each['term_1']
                if 'term_2' in each and each['term_2'] != 'null':
                    dbObj['term_2'] = each['term_2']
                if 'term_3' in each and each['term_3'] != 'null':
                    dbObj['term_3'] = each['term_3']
                if 'smb_bdm_name' in each and each['smb_bdm_name'] != 'null':
                    dbObj['smb_bdm_name'] = each['smb_bdm_name']
                if 'smb_bdm_phone' in each and each['smb_bdm_phone'] != 'null':
                    if '.' in str(each['smb_bdm_phone']):
                        dbObj['smb_bdm_phone'] = str(int(float(each['smb_bdm_phone'])))
                    else:
                        dbObj['smb_bdm_phone'] = str(each['smb_bdm_phone'])
                if 'smb_bdm_email' in each and each['smb_bdm_email'] != 'null' and each['smb_bdm_email'].strip() != '' and each['smb_bdm_email'] != None:
                    dbObj['smb_bdm_email'] = each['smb_bdm_email']
                print('<< OBJ >> ', dbObj)
                N = urandom(20).hex()
                now = str(datetime.now())
                # << END of HUBSPOT INGESTION >>
                # table.put_item(
                Item = {
                    'CogId': str(N),
                    'CreatedAt': now,
                    'ExternalId': UniqueKey,
                    'Information': dbObj,
                    'SourceBucket': landing_zone_bucket_name,
                    'SourcePath': event['file_path'],
                    'Source': 'HubSpot',
                    'SourceId': source_id,
                    'SourceFileName': time_stamp + '_' + file_name
                }
                # WRITE-TO-DYNAMO
                files_processing = dynamoPut(Item)
                if not files_processing:
                    print('Writing {} record to dynamodb Failed'.format(Item))
            except Exception as e:
                print(e)
                N = urandom(20).hex()
                Item = {
                    'CogId': str(N),
                    'CreatedAt': now,
                    'Information': each,
                    'SourceBucket': landing_zone_bucket_name,
                    'SourcePath': event['file_path'],
                    'Source': 'HubSpot',
                    'message': str(e),
                    'SourceId': source_id,
                    'ExternalId': UniqueKey
                }
                files_processing = dynamoPut(Item, 'Fail')
                pass
        temp_file_name = time_stamp + '_' + file_name
        isert_file_name = dynamoPutFileName(temp_file_name, 'HubSpot')
        post_to_hubspot(dbObj)
        return True
    except Exception as e:
        print(e)
        new_folder_path = os.environ['CSV_NEW_FOLDER_HUBSPOT']
        unprocessed_folder_path = os.environ['CSV_ERROR_FOLDER_HUBSPOT']
        # MOVING PROCESSED FILES FROM NEW TO UNPROCESSED FOLDER
        move_file_to_processed = moving_files_new_to_processed(event, new_folder_path, unprocessed_folder_path)
        return False
def try_parsing_date(text):
    # e.g. 2018-11-20T08:05:54-0500
    for fmt in ('%m/%d/%Y', '%Y-%m-%dT%H:%M:%S-%f', '%m/%d/%y', '%Y-%m-%d', '%m.%d.%Y', '%Y-%m-%dT%I', '%Y-%m-%dT%I%p', '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S.%f+', '%Y-%m-%dT%H:%M:%S'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            print('in except')
            pass
    return ValueError('no valid date format found')
def post_to_hubspot(list_contacts):
    print('<< IN POST-To-HUBSPOT >>')
    data_string = json.dumps(list_contacts)  # <-- the line in question
    payload_data = {"properties": data_string}
    print('<< dbOBJ LIST >> ', payload_data)
    response = requests.request("POST", endpoint_url + access_key, headers={'Content-Type': 'application/json'}, data=payload_data)
    token_response = json.loads(response.text)
    print('<< TOKEN RESPONSE >>', token_response)
def moving_files_new_to_processed(event, new_folder, processed_folder):
    # MOVING-FILES-TO-PROCESSED
    try:
        copy_source = {
            'Bucket': landing_zone_bucket_name,
            'Key': event['file_path']
        }
        path = event['file_path']
        processed_folder = processed_folder + time_stamp + '_'
        new_key = path.replace(new_folder, processed_folder)
        new_obj = landing_zone_bucket.Object(new_key)
        new_obj.copy(copy_source)
        s3.Object(landing_zone_bucket_name, event['file_path']).delete()
        return True
    except Exception as e:
        print(e)
        return False
def lambda_handler(event, context):
    print("Starting to Push Records to Dynamo Lambda")
    print(event)
    try:
        parse_flag = False
        new_folder_path = ''
        processed_folder_path = ''
        # Gets file path and calls required function to parse it out
        key = str(os.environ['CSV_NEW_FOLDER_HUBSPOT'])
        obj = s3r.get_object(Bucket=landing_zone_bucket_name, Key=event['file_path'])
        print('after obj')
        print(os.environ['CSV_NEW_FOLDER_HUBSPOT'])
        print('in HubSpot parse_csv')
        parse_csv_func = parse_csv_hubspot(event, obj)
        # Checks if parse_csv returned an empty dictionary
        if parse_csv_func:
            parse_flag = True
            new_folder_path = os.environ['CSV_NEW_FOLDER_HUBSPOT']
            processed_folder_path = os.environ['CSV_PROCESSED_FOLDER_HUBSPOT']
        else:
            print('File Format not Supported for {}'.format(event['file_path']))
        if parse_flag:
            # UPLOADING CONTACT. MOVING PROCESSED FILES FROM NEW TO PROCESSED FOLDER
            # print('<< PAYLOAD >> ', payload)
            # response = requests.request("POST", "https://api.hubapi.com/crm/v3/schemas/?hapikey=" + access_key, headers={'Content-Type': 'application/json'}, data=json.dumps(str(payload)))
            # token_response = json.loads(response.text)
            # print('<< TOKEN RESPONSE >>', token_response)
            # MOVING PROCESSED FILES FROM NEW TO PROCESSED FOLDER
            move_file_to_processed = moving_files_new_to_processed(event, new_folder_path, processed_folder_path)
            if move_file_to_processed:
                print('File {} moved Successfully from {} to {}'.format(event['file_path'], new_folder_path, processed_folder_path))
            else:
                print('Moving {} file from new to processing folder Failed'.format(event['file_path']))
    except Exception as e:
        print(e)
What could be the problem? Thanks for your help.
The problem was caused by two issues:
First, the whole dictionary should have been passed to json.dumps() to convert it to a JSON string when doing the POST, so the dictionary didn't need to change its structure. Here's the response from the POST:
<< TOKEN RESPONSE >> {
    "id": "135120801",
    "properties": {
        "business_name": "Millers Brand Oats",
        "createdate": "2021-12-21T02:31:12.452Z",
        "fixedprice_1": "6.63",
        "fixedprice_2": "6.11",
        "fixedprice_3": "5.9",
        "hs_all_contact_vids": "135120801",
        "hs_is_contact": "true",
        "hs_is_unworked": "true",
        "hs_marketable_until_renewal": "false",
        "hs_object_id": "135120801",
        "hs_pipeline": "contacts-lifecycle-pipeline",
        "lastmodifieddate": "2021-12-21T02:31:12.452Z",
        "proposal_date": "2021-12-07",
        "proposal_start_date": "2022-01-01",
        "smb_bdm_email": "Tim.Chu#junkster.com",
        "smb_bdm_name": "Tim Chu",
        "smb_bdm_phone": "833-999-9999",
        "smbapi": "yes",
        "srvc_address_1": "4844 Stenstrom Rd",
        "srvc_state_1": "IL",
        "srvc_zip_1": "61109",
        "term_2": "24",
        "term_3": "36",
        "udc_code_1": "COMED"
    },
    "createdAt": "2021-12-21T02:31:12.452Z",
    "updatedAt": "2021-12-21T02:31:12.452Z",
    "archived": false
}
Second, I was using the wrong endpoint:
https://api.hubapi.com/crm/v3/schemas/
instead of:
https://api.hubapi.com/crm/v3/objects/contacts/
Now I just need to find out why the AWS Lambda POSTs allow duplicate contacts to be created in HubSpot, while the Postman POSTs prevent duplicate contacts from being created.
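Based on that fix, a minimal sketch of a corrected post_to_hubspot(), assuming the same endpoint_url and access_key variables as in the Lambda above and the /crm/v3/objects/contacts endpoint; the key change is serializing the whole payload with json.dumps() rather than only the inner properties dict:
import json
import requests

def post_to_hubspot(contact_properties):
    # Wrap the flat property dict in the structure HubSpot expects,
    # then serialize the whole payload to a JSON string.
    payload = {"properties": contact_properties}
    response = requests.post(
        endpoint_url + access_key,  # e.g. ".../crm/v3/objects/contacts?hapikey=<key>"
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload),
    )
    print("<< TOKEN RESPONSE >>", response.json())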

Saving list data in python

I am wondering how I can save whatever I added to a list when I close a Python program. For example, in this "my contact" program that I wrote below, if I add information about 'Jane Doe', what could I do so that the next time I run the same file, Jane Doe still exists?
def main():
    myBook = Book([{"name": 'John Doe', "phone": '123-456-7890', "address": '1000 Constitution Ave'}])

class Book:
    def __init__(self, peoples):
        self.peoples = peoples
        self.main_menu()

    def main_menu(self):
        print('Main Menu')
        print('1. Display Contact Names')
        print('2. Search For Contacts')
        print('3. Edit Contact')
        print('4. New Contact')
        print('5. Remove Contact')
        print('6. Exit')
        self.selection = input('Enter a # form the menu: ')
        if (self.selection == "1"):
            self.display_names()
        if (self.selection == "2"):
            self.search()
        if (self.selection == "3"):
            self.edit()
        if (self.selection == "4"):
            self.new()
        if (self.selection == "5"):
            self.delete()
        if (self.selection == "6"):
            self.end()

    def display_names(self):
        for people in self.peoples:
            print("Name: " + people["name"])
        self.main_menu()

    def search(self):
        searchname = input('What is the name of your contact: ')
        for index in range(len(self.peoples)):
            if (self.peoples[index]["name"] == searchname):
                print("Name: " + self.peoples[index]["name"])
                print("Address: " + self.peoples[index]["address"])
                print("Phone: " + self.peoples[index]["phone"])
        self.main_menu()

    def edit(self):
        searchname = input('What is the name of the contact that you want to edit: ')
        for index in range(len(self.peoples)):
            if (self.peoples[index]["name"] == searchname):
                self.peoples.pop(index)
                name = input('What is your name: ')
                address = input('What is your address: ')
                phone = input('What is your phone number: ')
                self.peoples.append({"name": name, "phone": phone, "address": address})
        self.main_menu()

    def new(self):
        name = input('What is your name: ')
        address = input('What is your address: ')
        phone = input('What is your phone number: ')
        self.peoples.append({"name": name, "phone": phone, "address": address})
        self.main_menu()

    def delete(self):
        searchname = input('What is the name of the contact that you want to delete: ')
        for index in reversed(range(len(self.peoples))):
            if (self.peoples[index]["name"] == searchname):
                self.peoples.pop(index)
                print(searchname, 'has been removed')
        self.main_menu()

    def end(self):
        print('Thank you for using the contact book, have a nice day')
        print('Copyright Carson147 2019©, All Rights Reserved')

main()
Use a module from the Data Persistence section of the standard library, or save it as json, or as a csv file.
You can just convert your list to a NumPy array inside your function and save it:
import numpy as np

np.save('path/to/save', np.array(your_list))
To load:
arr = np.load('path/to/save.npy').tolist()
I hope it will be helpful
There are innumerable kinds of serialization options, but a time-tested favorite is JSON. JavaScript Object Notation looks like:
[
    "this",
    "is",
    "a",
    "list",
    "of",
    "strings",
    "with",
    "a",
    {
        "dictionary": "of",
        "values": 4,
        "an": "example"
    },
    "can strings be single-quoted?",
    false,
    "can objects nest?",
    {
        "I": {
            "Think": {
                "They": "can"
            }
        }
    }
]
JSON is widely used, and the Python stdlib has a method of converting objects to and from JSON in the json package.
>>> import json
>>> data = ['a', 'list', 'full', 'of', 'entries']
>>> json.dumps(data)  # dumps will dump to a string
'["a", "list", "full", "of", "entries"]'
You can then save your Book data to json before the program shuts down, and read from json after it starts up.
# at the top
import json
from pathlib import Path

# at the bottom of your program:
if __name__ == '__main__':
    persistence = Path('book.json')
    if persistence.exists():
        with persistence.open() as f:
            data = json.load(f)
    else:
        data = [{"name": 'John Doe', "phone": '123-456-7890', "address": '1000 Constitution Ave'}]
    book = Book(data)
    # save the (possibly modified) contact list once the menu exits
    with persistence.open('w') as f:
        json.dump(book.peoples, f, indent=4)
There is no way you can do that without using a module, such as numpy or pickle. Using pickle, you can do this (I am assuming you want to save the myBook variable):
import pickle

# foo is the file name and bar is the extension; "wb" is the saving mode
# (write binary), you can find the documentation online
pickle.dump(myBook, open("foo.bar", "wb"))
To load:
myBook = pickle.load(open("foo.bar", "rb"))
EDIT:
I was wrong in my first statement. There is a way to save without importing a module. Here is how:
myBook.save(foo.bar) #foo is file name and bar is extention
To load:
myBook=open(foo.bar)
As evinced by the many other answers, there are many ways to do this, but I thought it was helpful to have an example.
By changing the top of your file as so, you can use the shelve module.
There are a variety of other things you can fix in your code if you are curious, you could try https://codereview.stackexchange.com/ if you want more feedback.
import shelve

def main():
    default = [
        {'name': 'John Doe', 'phone': '123-456-7890',
         'address': '1000 Constitution Ave'}
    ]
    with Book('foo', default=default) as myBook:
        myBook.main_menu()

class Book:
    def __init__(self, filename, default=None):
        if default is None:
            default = []
        self._db = shelve.open(filename)
        self.people = self._db.setdefault('people', default)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._db['people'] = self.people
        self._db.close()

Add entries into JSON

I am working with an API that doesn't have all the information I need in a single call, so I need to add the project it came from into the data returned by the call that I am making. Right now it appends the project data to the list, but I really need it to be part of the original commit object. Here is my output now:
[{"committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}, {"project_id": "2"}]
Here is the output I want to achieve:
[{"project_id": "2", "committer_email": "justin.m.boucher#example.com", "short_id": "981147b9", "title": "Added .gitignore", "author_email": "justin.m.boucher#example.com", "authored_date": "2017-08-29T08:31:11.000-07:00", "created_at": "2017-08-29T08:31:11.000-07:00", "author_name": "Justin Boucher", "parent_ids": [], "committed_date": "2017-08-29T08:31:11.000-07:00", "message": "Added .gitignore\n", "committer_name": "Justin Boucher", "id": "981147b905913a60796283ce10f915c53679df49"}]
Here is my code so far:
get_commits.py:
import gitlab
import json

gitlab = gitlab.Gitlab()
projects = gitlab.getProjectID()

for i in projects:
    api_str = '/projects/' + str(i) + '/repository/commits'
    connect = gitlab.connectAPI(apiCall=api_str)
    data = json.dumps(connect)
    # Append project id to json, since it isn't created
    # in the commits from Gitlab
    commit = json.loads(data)
    commit.append({'project_id': str(i)})
    # make it pretty again for Splunk to read
    commit = json.dumps(commit)
    print commit
gitlab.py
import os
import ConfigParser
import requests
import json

# Setup Splunk Environment
APPNAME = 'splunk_gitlab'
CONFIG = 'appconfig.conf'
SPLUNK_HOME = os.environ['SPLUNK_HOME']
parser = ConfigParser.SafeConfigParser()

class Gitlab():
    # # Load Settings
    # parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/local/' + CONFIG)
    # if parser.has_section('Authentication'):
    #     pass
    # else:
    #     parser.read(SPLUNK_HOME + '/etc/apps/' + APPNAME + '/default/' + CONFIG)
    #
    # GITLAB_URL = parser.get('Authentication', 'GITLAB_URL')
    # API_KEY = parser.get('Authentication', 'API_KEY')

    # Used for testing only
    GITLAB_URL = 'http://<my_address>'
    API_KEY = '<my_key>'
    API_SERVER = GITLAB_URL + '/api/v4'

    # Place api call to retrieve data
    def connectAPI(self, apiCall='/projects'):
        headers = {
            'PRIVATE-TOKEN': self.API_KEY
        }
        final_url = self.API_SERVER + apiCall
        resp = requests.get(final_url, headers=headers)
        status_code = resp.status_code
        resp = resp.json()
        if status_code == 200:
            return resp
        else:
            raise Exception("Something went wrong requesting (%s): %s" % (
                resp['errors'][0]['errorType'], resp['errors'][0]['message']))

    def getProjectID(self):
        connect = self.connectAPI(apiCall='/projects')
        data = json.dumps(connect)
        projects = json.loads(data)
        project_list = []
        for i in projects:
            project_list.append(i['id'])
        return project_list
If you want to add a new element to the first dictionary in the list instead of appending a new dictionary to the list, try using assignment instead of append.
commit[0]['project_id'] = str(i)
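If a project's call returns more than one commit, the same idea can be applied to every element of the list rather than just the first. A sketch, assuming the loop from get_commits.py above:
for i in projects:
    api_str = '/projects/' + str(i) + '/repository/commits'
    commits = gitlab.connectAPI(apiCall=api_str)
    # Stamp the originating project id onto each commit dict
    for c in commits:
        c['project_id'] = str(i)
    print(json.dumps(commits))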

converting json file to csv in Python returns nothing?

Here we have a large JSON file with nested content. We want to convert it to a CSV file so that we can use it for data modeling; however, I feel the code is missing something, which I am unable to spot. I am very new to Python and need help.
Following is how the content in the file looks:
[{
    "address": " -, Gulbarga-585102",
    "college": "College (Architecture)",
    "courses": [
        {
            "brief_details": "",
            "college_name": "School of ArchitecturePoojya Doddappa Appa College of Engineering",
            "course_branch": "B.Arch",
            "course_duration": " 5-year",
            "course_nature": " Full-Time",
            "course_title": "",
            "course_type": " B.Arch",
            "no_of_seats": " 60",
            "qualifications": "",
            "selection_process": ""
        }
    ],
    "email": " principal#pdaengg.com",
    "fax": "08472-255685",
    "name": "School Of Architecturepoojya Doddappa Appa College Of Engineering",
    "phone": "08472-224262 Extn. 435, 220742",
    "recognition": " V.t.u. Belgaum",
    "website": ""
}]
And following is my code
from bs4 import BeautifulSoup
from os import listdir
import os
from os.path import isfile, join
import fnmatch
import shelve
import json
import csv

def write_csv(read_file_path):
    data = json.loads(open(read_file_path).read())
    file_colleges = csv.writer(open(r"/home/maitreyee/SchoolCollege.com/collegesdb/colleges.csv", "w", newline=""))
    list_colleges_headers = ['name', 'recognition', 'address', 'phone', 'fax', 'email', 'website']
    file_colleges.writerow(list_colleges_headers)
    list_courses.list_colleges_headers = ['course_title', 'course_type', 'course_duration', 'course_nature', 'qualifications', 'brief_details', 'selection_process', 'course_branch', 'no_of_seats']
    for d in data:
        file_colleges.writerow(
            [d['name'], d['college'], d['recognition'], d['address'], d['phone'], d['fax'], d['website']])
        file_course.writerow(list_courses_headers)
        for course in d['courses']:
            file_course.writerow(
                [
                    (course['course_title'] if course['course_title'] is not None or course['course_title'] != '' else 'NA'),
                    (course['course_type'] if course['course_type'] is not None or course['course_type'] != '' else 'NA'),
                    (course['course_duration'] if course['course_duration'] is not None or course['course_duration'] != '' else 'NA'),
                    (course['course_nature'] if course['course_nature'] is not None or course['course_nature'] != '' else 'NA'),
                    (course['qualifications'] if course['qualifications'] is not None or course['qualifications'] != '' else 'NA'),
                    (course['brief_details'] if course['brief_details'] is not None or course['brief_details'] != '' else 'NA'),
                    (course['selection_process'] if course['selection_process'] is not None or course['selection_process'] != '' else 'NA'),
                    (course['course_branch'] if course['course_branch'] is not None or course['course_branch'] != '' else 'NA'),
                    (course['no_of_seats'] if course['no_of_seats'] is not None or course['no_of_seats'] != '' else 'NA')])
    pass

#def write_file(file, colleges):
#    db = shelve.open(file)
#    for college in colleges:
#        db[college.name] = college
#    db.close()

read_file_path = r'/home/maitreyee/Downloads/SchoolCollege.com/collegesdb/collegesdb1.json'
#colleges = read_colleges(r"/home/maitreyee/Downloads/SchoolCollege.com1/collegedb1.json")
#new_write_file(r'/home/maitreyee/Downloads/SchoolCollege.com1/')
And the code is returning an empty file
Below is the code from @7stud; I have just modified the file locations.
import json
import csv

def write_csv(jsonfile, outfile):
    with open(jsonfile) as f:
        data = json.loads(f.read())
    college_dict = data[0]
    college_keys = list(college_dict.keys())
    college_keys.remove('courses')
    college_keys.remove('college')
    courses_dict = data[0]['courses'][0]
    courses_keys = list(courses_dict.keys())
    courses_keys.remove('brief_details')
    with open(outfile, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        headers = college_keys + courses_keys
        csv_writer.writerow(headers)
        row = (
            [
                college_dict[key] if college_dict[key] else 'NA'
                for key in college_keys
            ]
            +
            [
                courses_dict[key] if courses_dict[key] else 'NA'
                for key in courses_keys
            ]
        )
        csv_writer.writerow(row)

jsonfile = '/home/maitreyee/Downloads/SchoolCollege.com/collegesdb/collegesdb1.json'
outfile = '/home/maitreyee/Downloads/SchoolCollege.com/collegesdb/collegesout.csv'
write_csv(jsonfile, outfile)
And below is the error
maitreyee@Maitreyee:~/Downloads/SchoolCollege.com$ python json2csv4.py
Traceback (most recent call last):
  File "json2csv4.py", line 41, in <module>
    write_csv(jsonfile, outfile)
  File "json2csv4.py", line 15, in write_csv
    courses_dict = data[0]['courses'][0]
IndexError: list index out of range
Do you plan on calling your write_csv() function in your program?
If you do call write_csv(), you will get the error:
NameError: name 'list_courses' is not defined
If you just do this:
import json
import csv

def write_csv(read_file_path):
    data = json.loads(open(read_file_path).read())
    file_colleges = csv.writer(open('out.txt', "w", newline=""))
    list_colleges_headers = ['name', 'recognition', 'address', 'phone', 'fax', 'email', 'website']
    file_colleges.writerow(list_colleges_headers)

infile = "json.txt"
write_csv(infile)
you'll see that the file does contain output:
$ cat out.txt
name,recognition,address,phone,fax,email,website
Edit:
If the column order in the csv file isn't important:
import json
import csv

def write_csv(jsonfile, outfile):
    with open(jsonfile) as f:
        data = json.loads(f.read())
    college_dict = data[0]
    college_keys = list(college_dict.keys())
    college_keys.remove('courses')
    college_keys.remove('college')
    courses_dict = data[0]['courses'][0]
    courses_keys = list(courses_dict.keys())
    courses_keys.remove('brief_details')
    with open(outfile, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        headers = college_keys + courses_keys
        csv_writer.writerow(headers)
        row = (
            [
                college_dict[key] if college_dict[key] else 'NA'
                for key in college_keys
            ]
            +
            [
                courses_dict[key] if courses_dict[key] else 'NA'
                for key in courses_keys
            ]
        )
        csv_writer.writerow(row)

jsonfile = 'data.json'
outfile = 'out.csv'
write_csv(jsonfile, outfile)
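The snippet above only writes the first college and its first course. A sketch of extending it to every record, assuming the same key layout and skipping records whose courses list is empty (which would also avoid the IndexError shown in the traceback earlier):
import json
import csv

def write_all_csv(jsonfile, outfile):
    with open(jsonfile) as f:
        data = json.load(f)

    college_keys = ['name', 'recognition', 'address', 'phone', 'fax', 'email', 'website']
    course_keys = ['course_title', 'course_type', 'course_duration', 'course_nature',
                   'qualifications', 'selection_process', 'course_branch', 'no_of_seats']

    with open(outfile, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(college_keys + course_keys)
        for college in data:
            college_row = [college.get(key) or 'NA' for key in college_keys]
            # one output row per course; colleges with no courses listed are skipped
            for course in college.get('courses', []):
                course_row = [course.get(key) or 'NA' for key in course_keys]
                writer.writerow(college_row + course_row)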
