I've made a lot of progress on this and can now download 3 of the 4 files just fine. However, one of them, the Wisconsin file, contains a timestamp that I can't have removed and that varies from day to day, and I'm struggling to figure out how to get wildcards working on those values with regular expressions. I've posted my revised code below:
Examples of the file names are:
BCW_Daily SDP Yield.rpt2020-02-17-09-02-32.csv
hbc_platelet_daily_02102020.csv
MBC_ROLLING_YIELD_02172020.CSV
IBC_SDP_Rolling_7Days_021720.CSV
Any help is appreciated.
import datetime
import ftplib
import os

ftpdir = '/home/hospserv/inbound/platelet/'
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
os.chdir(savedir)

today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = f"{today:%Y-%m-%d}"
ildate = f"{iltoday:%m%d%Y}"
midate = f"{today:%m%d%Y}"
indate = f"{today:%m%d%y}"

filenameIN = 'IBC_SDP_Rolling_7Days_' + indate + '.CSV'
filenameWI = 'BCW_SDP_Rolling_7Days.rpt' + widate + '*' + '.csv'
filenameIL = 'hbc_platelet_daily_' + ildate + '.csv'
filenameMI = 'MBC_ROLLING_YIELD_' + midate + '.CSV'
dlfiles = [filenameMI, filenameIN, filenameWI, filenameIL]

connection = ftplib.FTP(host='xxx', user='xxx', passwd='xxx')
welcome = connection.getwelcome()
print(welcome)
connection.cwd(ftpdir)
ftp_list = connection.nlst()
print(ftp_list)

# download each expected file if it appears in the FTP listing
for x in dlfiles:
    if x in ftp_list:
        with open(os.path.join(savedir, x), 'wb') as f:
            connection.retrbinary("RETR " + x, f.write)
    else:
        print(x + ' fail')

connection.quit()
Solved it:
# import modules
import fnmatch
import datetime
import ftplib
import os

# define variables
ftpdir = '/home/hospserv/inbound/platelet/'
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
filedir = "C:/DailyData/SDPS/"
os.chdir(savedir)

today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = f"{today:%Y-%m-%d}"
ildate = f"{iltoday:%m%d%Y}"
midate = f"{today:%m%d%Y}"
indate = f"{today:%m%d%y}"

filenameIN = 'IBC_SDP_Rolling_7Days_' + indate + '.CSV'
pattern = 'BCW_SDP_Rolling_7Days.rpt' + widate + '*' + '.csv'
filenameIL = 'hbc_platelet_daily_' + ildate + '.csv'
filenameMI = 'MBC_ROLLING_YIELD_' + midate + '.CSV'

# create FTP connection
connection = ftplib.FTP(xxxxxxx)
connection.cwd(ftpdir)

# generate file list on FTP
ftp_list = connection.nlst()

# resolve the wildcard pattern for the WI file against the listing
wistring = fnmatch.filter(ftp_list, pattern)
filenameWI = str(wistring[0])
dlfiles = [filenameMI, filenameIN, filenameIL, filenameWI]

# download files from FTP to local
for x in dlfiles:
    if x in ftp_list:
        with open(os.path.join(savedir, x), 'wb') as f:
            connection.retrbinary("RETR " + x, f.write)

connection.quit()
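For reference, this is roughly how the fnmatch wildcard behaves against a directory listing. The listing below is made up to mimic the naming scheme, so the report prefix and timestamp are illustrative only:

import fnmatch

# Made-up listing that mimics the FTP directory; the timestamp portion varies day to day.
listing = [
    'BCW_SDP_Rolling_7Days.rpt2020-02-17-09-02-32.csv',
    'hbc_platelet_daily_02102020.csv',
    'MBC_ROLLING_YIELD_02172020.CSV',
]
pattern = 'BCW_SDP_Rolling_7Days.rpt2020-02-17' + '*' + '.csv'
print(fnmatch.filter(listing, pattern))
# ['BCW_SDP_Rolling_7Days.rpt2020-02-17-09-02-32.csv']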
I have the following Lambda function that searches my S3 bucket using the current time in milliseconds as the prefix. I have about 600-800k files per hour that I would like to do some manipulation to. The code works as intended, but it takes forever to scan the prefix, and I suspect this part of my code is not efficient. Since this Lambda function is scheduled to run every 10 minutes, my minimum range goes back up to 11 minutes in milliseconds. I would greatly appreciate it if someone could help me make this piece more efficient if possible.
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import time


def lambda_handler(event, context):
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    result = []

    now = int(round(time.time() * 1000))
    now_min = now - 660000  # 11 mins
    times = list(range(now_min, now + 1))

    # list every object whose key starts with one of the ~660k millisecond prefixes
    for t in times:
        prefix = ('Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/'
                  + str(datetime.utcnow().strftime('%H')) + '/' + str(t))
        pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
        for page in pages:
            if page.get('KeyCount') != 0:
                for obj in page['Contents']:
                    keys.append(obj['Key'])
The goal is to take these 800k files and condense them into multiple larger files instead of having 800k small files.
    # read each small file from /tmp and accumulate the parsed JSON
    for key in keys[1:]:
        local_filepath = os.path.join(tempfile.gettempdir(), key)
        regex_local_filepath = '/tmp/' + re.search(r'([^/]+$)', local_filepath).group(0)
        re_key = re.search(r'([^-/]+$)', key).group(0)
        re_key = re_key.replace('.json', '')
        s3_resource.Bucket('bucket').download_file(key, regex_local_filepath)
        with open(regex_local_filepath, 'r') as infile:
            result.append(json.load(infile))

    # write the combined result back out as one larger object
    file_name = ('Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/'
                 + str(datetime.utcnow().strftime('%H')) + '/' + str(now) + '.json')
    s3object = s3_resource.Object('new-bucket', file_name)
    s3object.put(
        Body=bytes(json.dumps(result, indent=2, sort_keys=True).encode('UTF-8'))
    )
    return None
I have figured out the correct way to loop through this efficiently. It turns out I was looping through multiple times and appending times to the keys.
If you need to condense S3 files into larger single files, this approach works amazingly well. Cheers!
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import time


def lambda_handler(event, context):
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    now = int(round(time.time() * 1000))
    min_now = now - 360000  # go back 6 mins since the Lambda function runs every 5 mins
    max_now = now + 60000   # this is to handle minute 59 after the hour

    keys = []
    regex_keys = []
    result = []
    content_keys = []

    my_bucket = s3_resource.Bucket('bucket')
    prefix = 'Uploads/'
    key_objects = iter(my_bucket.objects.filter(Prefix=prefix))
    next(key_objects)  # skip the first listed key
    for object_summary in key_objects:
        obj_key = object_summary.key  # all the keys under the prefix above
        keys.append(obj_key)

    for key in keys:
        # keep just the timestamp (milliseconds) portion of the key
        regex_key = re.search(r'/(.*?)-', key).group(0).replace('/', '').replace('-', '')
        regex_keys.append(regex_key)

    for regex_key in regex_keys:
        if min_now <= int(regex_key) <= max_now:
            prefix = 'Uploads/' + str(regex_key)
            pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
            for page in pages:
                for obj in page['Contents']:
                    content_keys.append(obj['Key'])

    print(len(content_keys))
    return None
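The code above stops at counting the matching keys. For completeness, a minimal sketch of the condensing step itself, which would slot into lambda_handler just before the return; it assumes the content_keys, s3_resource and now variables from above, keeps the placeholder bucket names ('bucket' and 'new-bucket'), and reads each small object directly instead of downloading to /tmp. The output key name is a placeholder too:

    # Minimal condensing sketch (assumes content_keys, s3_resource and now from above).
    merged = []
    for key in content_keys:
        obj = s3_resource.Object('bucket', key)
        merged.append(json.loads(obj.get()['Body'].read()))  # each small file holds one JSON document

    out_key = 'Uploads/' + str(now) + '.json'  # naming is a placeholder; adapt to your layout
    s3_resource.Object('new-bucket', out_key).put(
        Body=json.dumps(merged, indent=2, sort_keys=True).encode('utf-8')
    )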
I need help getting some JSON data from the Pipedrive API (just a requests.get); however, as expected, the JSON file is not formatted at all.
As you can see in my Python script below, I'll upload that file to my S3 bucket so I can create tables in AWS Glue.
I'm having a lot of trouble making this JSON file "readable" in the AWS structure.
Sample of the JSON file I'm working with (I need the "deals" section):
{"success":true,"data":[{"period_start":"2020-11-12 00:00:00","period_end":"2020-11-12 23:59:59","deals":[{"id":xx,"creator_user_id":xx,"user_id":XX,"person_id":XX,"org_id":XX,"stage_id":X}]}]}
Python script:
from datetime import datetime
import json
import boto3
import requests
from jsonpath_ng import parse
import pandas as pd

now = datetime.now()
day_int = now.strftime("%d")
month_int = now.strftime("%m")
year_int = now.strftime("%Y")
yesterday = int(day_int) - 1
if yesterday == 0:
    yesterday = 31

#today = now.strftime("%Y" + "-" + "%m" + "-" + str(yesterday))
today = '2020-11-12'

# days, currency, token_api, data_region, key_id and access_key are defined elsewhere
response_01 = requests.get(
    'https://api.pipedrive.com/v1/deals/timeline?start_date=' + str(today) + '&interval=day&amount=' + str(
        days) + '&field_key=update_time&totals_convert_currency=' + currency + '&api_token=' + token_api)
raw_data = response_01.json()

x = [match.value for match in parse('$..deals.[*].[*]').find(raw_data)]
y = json.dumps(x, default=str)

s3 = boto3.resource(
    's3',
    region_name=data_region,
    aws_access_key_id=key_id,
    aws_secret_access_key=access_key
)
client = boto3.client('s3')
client.put_object(Body=str(raw_data), Bucket='bucket_key', Key='object_key')
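As a quick sanity check on the extraction, this is roughly what the jsonpath lookup yields against the sample payload above (the field values are placeholders). Note that $..deals.[*] pulls out whole deal objects, whereas the $..deals.[*].[*] path in the script yields the individual field values:

import json
from jsonpath_ng import parse

# Sample payload mirroring the structure shown above; the numbers are placeholders.
sample = {"success": True,
          "data": [{"period_start": "2020-11-12 00:00:00",
                    "period_end": "2020-11-12 23:59:59",
                    "deals": [{"id": 1, "creator_user_id": 2, "user_id": 3,
                               "person_id": 4, "org_id": 5, "stage_id": 6}]}]}

deals = [match.value for match in parse('$..deals.[*]').find(sample)]
print(json.dumps(deals, indent=2))  # a flat list of deal objects, one per deal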
I've written a script that connects to SharePoint and downloads the metadata from all the new files added into a DataFrame.
I developed it in Spyder and it works just fine. But after I compile it into an .exe file with PyInstaller and run it, I keep getting the following error: maximum recursion depth exceeded.
I'm using Windows 10 and Python 3.6.10 (conda).
from office365.runtime.auth.clientCredential import ClientCredential
from office365.sharepoint.client_context import ClientContext
from datetime import datetime
import pandas as pd

client_id = "XXX"
client_secret = "XXX"
site_url = "XXX"

ctx = ClientContext.connect_with_credentials(site_url, ClientCredential(client_id, client_secret))
lib = ctx.web.lists.get_by_title("REO")

doc = 1
cuenta_error = 0
# df, export_doc and tipo_documento are defined earlier in the full script
while cuenta_error < 10:
    try:
        item = lib.get_item_by_id(doc)
        ctx.load(item)
        ctx.execute_query()
        id_documento = "{0}".format(item.properties["IdDocumentoSistemaOrigen"])
        id_sharepoint = "{0}".format(item.properties["Id"])
        ts_alta = datetime.strptime(str(datetime.now()), '%Y-%m-%d %H:%M:%S.%f').strftime('%Y-%m-%d %H:%M:%S')
        origen = "{0}".format(item.properties["AplicacionOrigen"])
        tipo_doc_SH = "{0}".format(item.properties["TipoDocumento"]['Label'])
        fecha_alta_origen = datetime.strptime("{0}".format(item.properties["Created"])[0:10], '%Y-%m-%d').strftime('%Y-%m-%d')
        id_inmueble = "{0}".format(item.properties["IdActivoPrinex"])
        dni = "{0}".format(item.properties["NumeroIdentificacion"])
        id_promocion = "{0}".format(item.properties["CodigoPromocion"])
        file = item.file
        ctx.load(file)
        ctx.execute_query()
        nombre_documento = "{0}".format(file.properties["Name"])
        fecha_envio = None
        IAObjeto = None
        #print(id_documento)
        doc += 1
        cuenta_error = 0
        fecha_envio = None
        ind_error = None
        to_append = [id_documento, id_sharepoint, tipo_documento, ts_alta, nombre_documento,
                     origen, tipo_doc_SH, fecha_alta_origen, id_inmueble, fecha_envio, ind_error,
                     dni, id_promocion, IAObjeto]
        a_series = pd.Series(to_append, index=df.columns)
        df = df.append(a_series, ignore_index=True)
        export_doc += 1
    except Exception as e:
        print(e)
        doc += 1
        cuenta_error += 1  # count consecutive failures so the loop can terminate
I have tried sys.setrecursionlimit(1500) but I still get the same error, and with sys.setrecursionlimit(1000**6) the code ends up crashing.
Does anybody have any suggestions about how to fix this?
Good morning.
I finally figured out the problem.
ClientContext works in a recursive way and crashes after a few calls.
I edited my code so that I create the context each time I request a new item, and now it's working.
I created two functions that I call every time I want to get the metadata from a file.
With these functions I close the context and open it again after each call. It's probably not the best way, but I ran out of ideas and this works.
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.client_context import ClientContext
import pandas as pd
from datetime import datetime, date


def call_lib(site_url, client_id, client_secret, serv):
    ctx = ClientContext.connect_with_credentials(site_url, ClientCredential(client_id, client_secret))
    if serv == 'PRN':
        lib = ctx.web.lists.get_by_title("REO")
    elif serv == 'GIA':
        lib = ctx.web.lists.get_by_title("DESINVERSIÓN")
    return ctx, lib


def get_metadata_from_sh(ctx, lib, docnum):
    item = lib.get_item_by_id(docnum)
    ctx.load(item)
    ctx.execute_query()
    id_documento = "{0}".format(item.properties["IdDocumentoSistemaOrigen"])
    id_sharepoint = "{0}".format(item.properties["Id"])
    ts_alta = datetime.strptime(str(datetime.now()), '%Y-%m-%d %H:%M:%S.%f').strftime('%Y-%m-%d %H:%M:%S')
    origen = "{0}".format(item.properties["AplicacionOrigen"])
    tipo_doc_SH = "{0}".format(item.properties["TipoDocumento"]['Label'])
    fecha_alta_origen = datetime.strptime("{0}".format(item.properties["Created"])[0:10], '%Y-%m-%d').strftime('%Y-%m-%d')
    id_inmueble = "{0}".format(item.properties["IdActivoPrinex"])
    dni = "{0}".format(item.properties["NumeroIdentificacion"])
    id_promocion = "{0}".format(item.properties["CodigoPromocion"])
    file = item.file
    ctx.load(file)
    ctx.execute_query()
    nombre_documento = "{0}".format(file.properties["Name"])
    fecha_envio = None
    IAObjeto = None
    return (id_documento, id_sharepoint, ts_alta, origen, tipo_doc_SH, fecha_alta_origen,
            id_inmueble, nombre_documento, fecha_envio, dni, int(id_promocion), IAObjeto)


ctx, lib = call_lib(site_url, client_id, client_secret, serv_origen[i])
id_documento, id_sharepoint, ts_alta, origen, tipo_doc_SH, fecha_alta_origen, id_inmueble, nombre_documento, fecha_envio, dni, id_promocion, IAObjeto = get_metadata_from_sh(ctx, lib, doc)
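For context, here is a minimal sketch of how these two functions might be driven for a batch of documents, reopening the context on every iteration as described above. The document-ID range and the 'PRN' value are placeholders, and site_url, client_id and client_secret come from your own configuration:

rows = []
for doc in range(1, 101):  # placeholder range of document IDs
    try:
        # A fresh context per item keeps the client from recursing too deep.
        ctx, lib = call_lib(site_url, client_id, client_secret, 'PRN')
        rows.append(get_metadata_from_sh(ctx, lib, doc))
    except Exception as e:
        print(doc, e)

columns = ['id_documento', 'id_sharepoint', 'ts_alta', 'origen', 'tipo_doc_SH',
           'fecha_alta_origen', 'id_inmueble', 'nombre_documento', 'fecha_envio',
           'dni', 'id_promocion', 'IAObjeto']
df = pd.DataFrame(rows, columns=columns)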
I am trying to consume HP NNMi web services, but using milliseconds as the value in filter2.value = d_in_ms is not working for me. I am able to see results when I use a value like filter2.value = "1493078400000". Please tell me whether I can use int values as below:
#!/usr/bin/python
from suds.client import Client
from suds.transport.http import HttpAuthenticated
import datetime
import time
now = datetime.datetime.now()
currenttime = now - datetime.timedelta(hours=12)
epochtime = time.mktime(currenttime.timetuple())
print epochtime
d_in_ms = int(epochtime)*1000
t = HttpAuthenticated(username='xxxxx', password='xxxx')
url = 'http://example.com/IncidentBeanService/IncidentBean?wsdl'
client = Client(url, transport=t)
filter1 = client.factory.create('ns2:condition')
filter1.name = "sourceNodeName"
filter1.operator = "EQ"
filter1.value = "DEVICE"
filter2 = client.factory.create('ns2:condition')
filter2.name = "lastOccurrenceTime"
filter2.operator = "GT"
filter2.value = d_in_ms
filter = client.factory.create('ns2:expression')
filter.operator = "AND"
filter.subFilters = [filter1, filter2]
allincidents = client.service.getIncidents(filter)
print "Nodes in topology:", len(allincidents.item)
for i in allincidents.item[:]:
    print i
I am able to see results when I use a value like filter2.value = "1493078400000".
According to this statement, it looks like filter2.value should be a string. That suggests you need to use:
filter2.value = str(d_in_ms)
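A minimal variant of the timestamp computation from the question, with the str conversion folded in so the filter value is a string from the start (filter2 is the condition object built above):

import datetime
import time

# "now minus 12 hours" as epoch milliseconds, formatted as a string for the filter
cutoff = datetime.datetime.now() - datetime.timedelta(hours=12)
d_in_ms = str(int(time.mktime(cutoff.timetuple())) * 1000)
filter2.value = d_in_ms  # e.g. "1493078400000"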
So I've finally managed to figure out how to select a portion of a pandas DataFrame, but now I am lost on how to use the data within it. I want to be able to add together all of the entries within a minute.
for minute in rrule.rrule(rrule.MINUTELY,
                          dtstart=pd.datetime.strptime(dateToday+"T15:30", "%Y-%m-%dT%H:%M"),
                          until=pd.datetime.strptime(dateToday+"T22:00", "%Y-%m-%dT%H:%M")):
    temp = pageData[(pageData['time'] >= minute) & (pageData['time'] < minute + timedelta(seconds=60))]
    print(temp)
Figured it out. What I needed was a loop cycling through the created sub-DataFrames (whatever you call them).
# References
from time import *
from dateutil import rrule
from datetime import timedelta
import urllib.request as web
import pandas as pd
import os

forToday = 'http://netfonds.no/quotes/tradedump.php?csv_format=csv&paper='
dateToday = strftime("%Y-%m-%d", localtime())
#dateToday = "2014-10-07"


def pullToday(exchange, stock):
    hold = []
    fileName = 'data/'+exchange+'/'+stock+'/'+dateToday+'.txt'
    try:
        if not os.path.isdir(os.path.dirname(fileName)):
            os.makedirs(os.path.dirname(fileName))
    except OSError:
        print("Something went very wrong. Review the dir creation section")
    pageBuffer = web.urlopen(forToday+stock+'.'+exchange)
    pageData = pd.read_csv(pageBuffer, usecols=['time', 'price', 'quantity'])
    for i in pageData.index:
        pageData['time'][i] = pd.datetime.strptime(pageData['time'][i], '%Y%m%dT%H%M%S')
    print(hold)
    #pageData = pageData.set_index('time',drop=False)
    # build one bar per minute from 15:30 through 21:58
    for minute in rrule.rrule(rrule.MINUTELY,
                              dtstart=pd.datetime.strptime(dateToday+"T15:30", "%Y-%m-%dT%H:%M"),
                              until=pd.datetime.strptime(dateToday+"T21:58", "%Y-%m-%dT%H:%M")):
        temp = pageData[(pageData['time'] >= minute) & (pageData['time'] < minute+timedelta(seconds=60))]
        volume = 0
        priceSum = 0
        low = 123456
        high = 0
        for i in temp.index:
            volume += temp['quantity'][i]
            priceSum += temp['quantity'][i]*temp['price'][i]
            if temp['price'][i] > high:
                high = temp['price'][i]
            if temp['price'][i] < low:
                low = temp['price'][i]
        priceSum /= volume
        hold.append([minute, volume, low, high, round(priceSum, 4)])
    # build one final bar for 21:59 onward (a 180-second window)
    minute = pd.datetime.strptime(dateToday+"T21:59", "%Y-%m-%dT%H:%M")
    temp = pageData[(pageData['time'] >= minute) & (pageData['time'] < minute+timedelta(seconds=180))]
    volume = 0
    priceSum = 0
    low = 123456
    high = 0
    for i in temp.index:
        volume += temp['quantity'][i]
        priceSum += temp['quantity'][i]*temp['price'][i]
        if temp['price'][i] > high:
            high = temp['price'][i]
        if temp['price'][i] < low:
            low = temp['price'][i]
    priceSum /= volume
    hold.append([minute, volume, low, high, round(priceSum, 4)])
    compiledData = pd.DataFrame(hold, columns=['TimeStamp', 'Volume', 'Low', 'High', 'Average'])
    #for i in compiledData.index:
    #    compiledData['TimeStamp'][i] -= pd.datetime.strptime(dateToday+"TZ06","%Y-%m-%dTZ%H")
    print(compiledData)
    dataFile = open(fileName, 'w')
    dataFile.write('#Format: Timestamp;Volume;Low;High;Median\n')
    dataFile.close()
    compiledData.to_csv(fileName, index=False, sep=';', mode='a', header=False)  # append the per-minute bars under the header row


def getList(fileName):
    stockList = []
    file = open(fileName+'.txt', 'r').read()
    fileByLines = file.split('\n')
    for eachLine in fileByLines:
        if '#' not in eachLine:
            lineByValues = eachLine.split('.')
            stockList.append(lineByValues)
    return stockList


start_time = time()
stockList = getList('stocks')
#for eachEntry in stockList:
#    pullToday(eachEntry[0],eachEntry[1])
pullToday('O', 'AAPL')
delay = str(round(time()-start_time))
print('Finished in ' + delay)
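As an aside, a reasonably recent pandas (0.25+ for named aggregation) can do the per-minute aggregation without the manual rrule loop. A minimal sketch using the time, price and quantity columns from the CSV read above:

import pandas as pd

# pageData as returned by pd.read_csv above
pageData['time'] = pd.to_datetime(pageData['time'], format='%Y%m%dT%H%M%S')  # raw CSV strings; skip if already parsed
pageData['notional'] = pageData['price'] * pageData['quantity']
bars = (pageData
        .groupby(pd.Grouper(key='time', freq='1min'))
        .agg(Volume=('quantity', 'sum'),
             Low=('price', 'min'),
             High=('price', 'max'),
             Notional=('notional', 'sum')))
bars = bars[bars['Volume'] > 0]  # drop minutes with no trades
bars['Average'] = (bars['Notional'] / bars['Volume']).round(4)
bars = bars.drop(columns='Notional').reset_index().rename(columns={'time': 'TimeStamp'})
print(bars)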