How to extract nested json from api - python

I need help getting some JSON data from the Pipedrive API (just a requests.get); however, as expected, the JSON response is not formatted at all.
As you can see in my Python script below, I upload that file to my S3 bucket so I can create tables in AWS Glue.
I'm having a lot of trouble making this JSON file "readable" for the AWS structure.
Sample of the JSON file I'm working with (I need the "deals" section):
{"success":true,"data":[{"period_start":"2020-11-12 00:00:00","period_end":"2020-11-12 23:59:59","deals":[{"id":xx,"creator_user_id":xx,"user_id":XX,"person_id":XX,"org_id":XX,"stage_id":X}]}]}
Python script:
from datetime import datetime
import json
import boto3
import requests
from jsonpath_ng import parse
import pandas as pd

now = datetime.now()
day_int = now.strftime("%d")
month_int = now.strftime("%m")
year_int = now.strftime("%Y")
yesterday = int(day_int) - 1
if yesterday == 0:
    yesterday = 31
# today = now.strftime("%Y" + "-" + "%m" + "-" + str(yesterday))
today = '2020-11-12'

# days, currency and token_api are assumed to be defined earlier in the full script
response_01 = requests.get(
    'https://api.pipedrive.com/v1/deals/timeline?start_date=' + str(today)
    + '&interval=day&amount=' + str(days)
    + '&field_key=update_time&totals_convert_currency=' + currency
    + '&api_token=' + token_api)
raw_data = response_01.json()

# pull the nested "deals" objects out of the response
x = [match.value for match in parse('$..deals.[*].[*]').find(raw_data)]
y = json.dumps(x, default=str)

s3 = boto3.resource(
    's3',
    region_name=data_region,
    aws_access_key_id=key_id,
    aws_secret_access_key=access_key
)
client = boto3.client('s3')
client.put_object(Body=str(raw_data), Bucket='bucket_key', Key='object_key')
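One way to make the payload Glue-friendly is to pull out just the "deals" objects and write them as newline-delimited JSON (one object per line), which Glue crawlers handle well. A minimal sketch, assuming raw_data has the shape shown in the sample above; the bucket name and object key are placeholders:

import json
import boto3

def deals_to_ndjson(raw_data):
    """Flatten the nested 'deals' arrays into one newline-delimited JSON string."""
    deals = []
    for period in raw_data.get('data', []):
        deals.extend(period.get('deals', []))
    # one JSON object per line is the layout Glue/Athena crawlers parse cleanly
    return '\n'.join(json.dumps(deal, default=str) for deal in deals)

body = deals_to_ndjson(raw_data)  # raw_data from the requests call above
boto3.client('s3').put_object(
    Bucket='my-bucket',           # placeholder bucket name
    Key='pipedrive/deals.json',   # placeholder object key
    Body=body.encode('utf-8'),
)

Note also that Body=str(raw_data) uploads the Python repr of the dict (single quotes, True/False), which Glue cannot parse as JSON; passing a json.dumps result instead avoids that.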

Related

AWS Lambda loop through files using boto3

I have the following Lambda function that searches my S3 bucket with the prefix being the current time in milliseconds. I have about 600-800k files per hour that I would like to do some manipulation on. This code works as intended but takes forever to scan the prefix, and I have a feeling that this part of my code is not efficient. Since this Lambda function is scheduled to run every 10 minutes, I have my minimum range set to go back up to 11 minutes in milliseconds. I would greatly appreciate it if someone could help me make this piece more efficient.
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import _datetime
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    result = []
    now = int(round(time.time() * 1000))
    now_min = now - 660000  # 11 mins
    times = list(range(now_min, now + 1))
    for t in times:
        prefix = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(t)
        pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
        for page in pages:
            if page.get('KeyCount') != 0:
                for obj in page['Contents']:
                    keys.append(obj['Key'])
The goal is to take these 800k files and condense them into multiple larger files instead of having 800k small files:
    for key in keys[1:]:
        local_filepath = os.path.join(tempfile.gettempdir(), key)
        regex_local_filepath = '/tmp/' + re.search('([^\/]+$)', local_filepath).group(0)
        re_key = re.search('([^-/]+$)', key).group(0)
        re_key = re_key.replace('.json', '')
        s3_resource.Bucket('bucket').download_file(key, regex_local_filepath)
        with open(regex_local_filepath, 'r') as infile:
            result.append(json.load(infile))
    file_name = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(now) + '.json'
    s3object = s3_resource.Object('new-bucket', file_name)
    s3object.put(
        Body=(bytes(json.dumps(result, indent=2, sort_keys=True).encode('UTF-8')))
    )
    return None
I have figured out how to loop through efficiently. It turns out I was looping through multiple times and appending the times to the keys.
If you need to condense S3 files into larger single files, this approach works amazingly well. Cheers!
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import _datetime
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    now = int(round(time.time() * 1000))
    min_now = now - 360000  # Go back 6 mins since lambda function runs every 5 mins
    max_now = now + 60000  # This is to handle minute 59 after the hour.
    keys = []
    regex_keys = []
    result = []
    content_keys = []
    my_bucket = s3_resource.Bucket('bucket')
    prefix = 'Uploads/'
    key_objects = iter(my_bucket.objects.filter(Prefix=prefix))
    next(key_objects)
    for object_summary in key_objects:
        obj_key = object_summary.key  # This gives me all the keys in the above prefix
        keys.append(obj_key)
    for key in keys:
        regex_key = re.search('\/(.*?)\-', key).group(0).replace('/', '').replace('-', '')  # I just want the timestamp (milliseconds)
        regex_keys.append(regex_key)
    for regex_key in regex_keys:
        if min_now <= int(regex_key) <= max_now:
            prefix = 'Uploads/' + str(regex_key)
            pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
            for page in pages:
                for obj in page['Contents']:
                    content_keys.append(obj['Key'])
    print(len(content_keys))
    return None
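Since the first listing already returns every key under the prefix, the second paginator pass could in principle be skipped by filtering the keys that were just collected. A minimal sketch, assuming (as the regex above does) that the millisecond timestamp sits between the first '/' and the first '-' of each key:

import re

# keep only keys whose embedded millisecond timestamp falls inside the window
content_keys = []
for key in keys:
    match = re.search(r'/(.*?)-', key)
    if match and min_now <= int(match.group(1)) <= max_now:
        content_keys.append(key)
print(len(content_keys))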

Getting BTC historical data by Kraken API

I'm trying to get data from the Kraken exchange through the krakenex API, but I'm facing several problems because I want to get data over a longer time range than the API allows.
The API only returns a dataframe with up to 720 rows, so I need a while loop to fetch more data and concatenate it into another dataframe.
I've already read other topics about this, but I'm still not getting good results.
import krakenex
import time
import pandas as pd
from pykrakenapi import KrakenAPI
from datetime import datetime

k = krakenex.API()

start = '28/01/2021 00:00:00'
start = datetime.strptime(start, "%d/%m/%Y %H:%M:%S")
start = int(time.mktime(start.timetuple()))
stop = '03/02/2021 00:00:00'
stop = datetime.strptime(stop, "%d/%m/%Y %H:%M:%S")
stop = int(time.mktime(stop.timetuple()))

prices = pd.DataFrame()
while start < stop:
    time.sleep(5)
    data = k.query_public('OHLC', {'pair': 'XXBTZUSD', 'interval': 1, 'since': start})
    df = pd.DataFrame(data['result']['XXBTZUSD'])
    daily_prices = df[0].to_list()
    start = int(daily_prices[0])
    prices = pd.concat([prices, df])
For weeks I have been working on a script that does exactly that. In my case I collect all pairs with BTC and ETH, but you can use the script with any pair. To do this I used the REST API and defined some functions that automate everything. I download the data with a 1-minute timeframe, but it can be used for any timeframe.
First I defined a function that downloads the data in full or from a specific date. This is necessary because on the first run it downloads all the data, and afterwards it downloads only the new data. The parameter 'interval' defines the number of minutes of the timeframe, while 'since' defines the beginning of the data to download.
import requests
import pandas as pd

def get_ohlc(pair, interval=1, since='last'):
    endpoint = 'https://api.kraken.com/0/public/OHLC'
    payLoad = {
        'pair': pair,
        'interval': interval,
        'since': since
    }
    response = requests.get(endpoint, payLoad)
    data = response.json()

    OHLC = data['result'][pair]
    data = pd.DataFrame.from_records(OHLC, columns=['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count'])
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    data.set_index('Time', inplace=True)
    data = data.drop(['vwap', 'volume', 'count'], axis=1)
    data['Open'] = data.Open.astype(float)
    data['High'] = data.High.astype(float)
    data['Low'] = data.Low.astype(float)
    data['Close'] = data.Close.astype(float)
    return data
Then I defined a function to load the saved .json file into memory. The function returns the dataframe with the old data and a timestamp that indicates where to start downloading the new data. I also created a helper function to compute that timestamp.
import datetime
import pandas as pd
from dateutil import tz

def load_data(pair, path):
    data = pd.read_json(path + pair + '.json', orient='split')
    tmp = data.tail(1).index
    tmp = tmp.strftime('%Y-%m-%d %H:%M:%S')
    dt = str_to_datetime(tmp[0])
    ts = dt.timestamp()
    return data, ts

def str_to_datetime(datestr):
    Y = int(datestr[0:4])
    M = int(datestr[5:7])
    D = int(datestr[8:10])
    H = int(datestr[11:13])
    m = int(datestr[14:16])
    return datetime.datetime(Y, M, D, H, m, 0, tzinfo=tz.gettz("Etc/GMT"))
Now your main should be something like:
from countdown import countdown
import pandas as pd
import datetime
import os
path = os.getcwd() + '/historical_data/'
pair = 'XBTUSD'
while True:
    if os.path.exists(path + pair + '.json') == False:
        data = get_ohlc(pair, 1)  # 1 minute timeframe
        data.to_json(path + pair + '.json', orient='split')
    else:
        data1, ts = load_data(pair, path)
        data2 = get_ohlc(pair, 1, ts)
        data3 = pd.concat([data1, data2])
        data3.drop(data3.tail(1).index, inplace=True)  # delete last record because it's not ended
        data3.to_json(path + pair + '.json', orient='split')
    countdown(60)  # update every hour
I delete the last record because, at the time it is downloaded, that candle is not finished yet, so it will be downloaded again at the next update. I haven't tested whether this works as posted because I took pieces of code from my program; if it doesn't work, let me know and I'll fix it.
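Note that countdown is not a standard library module; it appears to be a small helper from the answerer's own project. A minimal stand-in, assuming countdown(n) simply waits n minutes (which would match the "update every hour" comment next to countdown(60)):

import time

def countdown(minutes):
    """Hypothetical stand-in for the answerer's helper: wait the given
    number of minutes, printing the remaining time once per minute."""
    for remaining in range(minutes, 0, -1):
        print(f'{remaining} minute(s) until next update')
        time.sleep(60)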

Python ftplib and downloading filemasks

I've made a lot of progress on this and can now download 3 of the 4 files just fine. However, one of them, the Wisconsin file, contains timestamps that I can't have removed and that vary from day to day, and I'm struggling to figure out how to get wildcards to work on those values with regular expressions. I've posted my revised code below.
Examples of the file names are:
BCW_Daily SDP Yield.rpt2020-02-17-09-02-32.csv (the -09-02-32 part is the timestamp that varies)
hbc_platelet_daily_02102020.csv
MBC_ROLLING_YIELD_02172020.CSV
IBC_SDP_Rolling_7Days_021720.CSV
Any help is appreciated.
import datetime
import ftplib
import os
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
filenameWI = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
dlfiles = [filenameMI,filenameIN,filenameWI,filenameIL]
connection = ftplib.FTP(host='xxx',user='xxx',passwd='xxx')
welcome = ftplib.FTP.getwelcome(connection)
print(welcome)
connection.cwd(ftpdir)
ftp_list = connection.nlst()
print(ftp_list)
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR " + x, open(os.path.join(savedir, x), 'wb').write)
    else:
        print(x + ' fail')
connection.quit()
Solved it:
# import modules
import fnmatch
import datetime
import ftplib
import os
#define variables
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
filedir = "C:/DailyData/SDPS/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
pattern = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
#create FTP connection
connection = ftplib.FTP(xxxxxxx)
connection.cwd(ftpdir)
#generate file list on FTP
ftp_list = connection.nlst()
#create wildcard string for WI file
wistring = fnmatch.filter(ftp_list,pattern)
filenameWI = str(wistring[0])
dlfiles = [filenameMI,filenameIN,filenameIL,filenameWI]
#download files from FTP to local
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR " + x, open(os.path.join(savedir, x), 'wb').write)
connection.quit()
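For reference, fnmatch.filter does shell-style wildcard matching rather than full regular expressions, so the '*' in the pattern covers the variable timestamp portion of the Wisconsin file name. A small illustration with made-up file names:

import fnmatch

ftp_list = [
    'BCW_SDP_Rolling_7Days.rpt2020-02-17-09-02-32.csv',  # hypothetical listing entries
    'MBC_ROLLING_YIELD_02172020.CSV',
]
pattern = 'BCW_SDP_Rolling_7Days.rpt2020-02-17*.csv'
print(fnmatch.filter(ftp_list, pattern))
# ['BCW_SDP_Rolling_7Days.rpt2020-02-17-09-02-32.csv']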

AWS Lambda - S3 put_object Invalid type for parameter Body

I have a hosted zone in Route 53 and would like to have the contents of the hosted-zone object stored in S3, but I am getting an error. I am thinking Body is the correct parameter, but maybe this is because the object is in JSON format?
import boto3
import json

def allwork():
    client = boto3.client('route53')
    hostzone = client.list_hosted_zones()
    bucket_name = "testlambda"
    file_name = "r53data.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "10102018/" + file_name

    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=hostzone)

allwork()
Here is the error:
module initialization error: Parameter validation failed:
Invalid type for parameter Body, value: {u'HostedZones':
[{u'ResourceRecordSetCount': 7, u'CallerReference': '814E3.........
tkausl answered the question in the comments:
Looks like it returns a dict, so you need to json encode it manually before passing it to put_object
Update:
import boto3
import json

def allwork():
    client = boto3.client('route53')
    hostzone = client.list_hosted_zones()
    bucket_name = "testlambda"
    file_name = "r53data.txt"
    lambda_path = "/tmp/" + file_name
    s3_path = "10102018/" + file_name

    hostzone2 = json.dumps(hostzone, ensure_ascii=False)
    s3 = boto3.resource("s3")
    s3.Bucket(bucket_name).put_object(Key=s3_path, Body=hostzone2)

allwork()

Need to return result of for loop as lambda result

I'm new to AWS Lambda; I have to return the result of my for loop as the result of the Lambda function.
The code below lists users who have not used their AWS password for the last 60 days; it has been tested in a Jupyter notebook and is working fine.
import json
import boto3
import datetime
from dateutil.tz import tzutc

resource = boto3.resource('iam')
today = datetime.datetime.now()
for user in resource.users.all():
    if user.password_last_used is not None:
        delta = (today - user.password_last_used.replace(tzinfo=None)).days
        if delta > 60:
            print("username: ", [user.user_name][0], " - ", delta, "days")
How can I make the Lambda return the result in the same form as this Jupyter notebook output?
Username: abc- 96 Days
Username: def- 64 Days
Username: ghi- 184 Days
Username: mno- 158 Days
Username: xyz- 95 Days
Lambda code in which I tried to append the output to an array and return the array, but it returns null:
import json
import boto3
import datetime
from dateutil.tz import tzutc

def lambda_handler(event, context):
    resource = boto3.resource('iam')
    client = boto3.client('iam')
    today = datetime.datetime.now()
    userName = []
    for user in resource.users.all():
        if user.password_last_used is not None:
            delta = (today - user.password_last_used.replace(tzinfo=None)).days
            if delta > 30:
                userName.append("username: ", [user.user_name][0], " - ", delta, "days")
    return userName
The problem is in how you are appending the results. list.append does not take comma-separated values the way print does. Use the following line to make it work.
userName.append(f"username: {[user.user_name][0]}- {delta} days")
or the following if you are using python 2
userName.append("username: {}- {} days".format([user.user_name][0],delta))
Also make sure that the function's IAM policy allows both iam:GetUser and iam:ListUsers.
import json
import boto3
import datetime
from dateutil.tz import tzutc

def lambda_handler(event, context):
    resource = boto3.resource('iam')
    client = boto3.client('iam')
    sns = boto3.client('sns')
    today = datetime.datetime.now()
    physicalString = 'Below users not logged into AWS console for 90 days\n'
    final_report = ''
    final_result = ''
    number = int(1)
    for user in resource.users.all():
        if user.password_last_used is not None:
            delta = (today - user.password_last_used.replace(tzinfo=None)).days
            if delta >= 0:
                final_result = str(number) + " username: " + [user.user_name][0] + " - " + str(delta) + "days\n"
                final_report = final_report + final_result
                number = number + 1
    response = sns.publish(
        TopicArn='arn:aws:sns:XXXXXXXXXXXX:AWS_KMSKEY',
        Message=physicalString + final_report,
        Subject='IAM USER'
    )
The above code is working and we are able to get the list of users; the output is as below:
1 username: abc- 102days
2 username: efg- 190days
3 username: zyz- 164days
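For the original goal of returning the list from the handler itself rather than publishing it to SNS, a minimal sketch that combines the loop with the f-string fix above (the 60-day threshold comes from the question); Lambda serializes the returned list as JSON:

import datetime
import boto3

def lambda_handler(event, context):
    resource = boto3.resource('iam')
    today = datetime.datetime.now()
    stale_users = []
    for user in resource.users.all():
        if user.password_last_used is not None:
            delta = (today - user.password_last_used.replace(tzinfo=None)).days
            if delta > 60:  # threshold from the question
                stale_users.append(f"username: {user.user_name} - {delta} days")
    return stale_users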
