I'm seeing a memory leak when using boto to upload files. Am I doing something wrong here? Memory usage seems to increase less consistently if I remove the sleep or if I don't alternate between two different buckets.
import time, resource, os
import boto

conn = boto.connect_s3()
for i in range(20):
    print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    path = 'test.png'
    bucket = conn.lookup('jca-screenshots-' + ('thumbs' if i % 2 == 0 else 'normal'))
    k = boto.s3.key.Key(bucket)
    k.key = os.path.basename(path)
    k.set_contents_from_filename(path)
    time.sleep(5)
Sample output:
12406784
13123584
13242368
13344768
13398016
13422592
13484032
13524992
13553664
13590528
13656064
13664256
Solved by switching libs: https://github.com/tax/python-requests-aws
import time, resource, os
import requests
from awsauth import S3Auth

with open(os.path.expanduser("~/.boto")) as f:
    lines = f.read().splitlines()
ACCESS_KEY = lines[1].split(' = ')[1]
SECRET_KEY = lines[2].split(' = ')[1]

for i in range(20):
    print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    url = 'http://{}.s3.amazonaws.com/{}'.format(
        'jca-screenshots-' + ('thumbs' if i % 2 == 0 else 'normal'), 'test.png')
    with open('test.png', 'rb') as f:
        resp = requests.put(url, data=f, auth=S3Auth(ACCESS_KEY, SECRET_KEY))
    print 'resp:', resp
    time.sleep(5)
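For anyone who would rather stay within the AWS SDK: the same upload loop can also be written against boto3, boto's successor. This is only a minimal sketch using the bucket names and file path from the snippets above; I haven't measured its memory behaviour:

# Hedged sketch: the same upload loop with boto3 (not part of the original post).
import os
import time

import boto3

s3 = boto3.client('s3')
path = 'test.png'

for i in range(20):
    bucket = 'jca-screenshots-' + ('thumbs' if i % 2 == 0 else 'normal')
    # upload_file streams the file from disk instead of buffering it in memory
    s3.upload_file(path, bucket, os.path.basename(path))
    time.sleep(5)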
I am running a simple Python web server (SimpleHTTPServer) on my Linux machine and wrote a Python program to download all the files hosted on that server to my Windows machine. For some reason the program throws FileNotFoundError, even though the directory exists and I've provided an absolute path.
Here is the code: https://drive.google.com/file/d/1CDrueDJcbu2z1XeeB_iYv0zmfIX1cCkx/view?usp=sharing
It works correctly on Linux, but I have trouble on Windows. Thanks.
import requests
import argparse
from sys import argv
from urllib.parse import unquote
import os
from time import time
import random
from colorama import Fore, Style
import platform

def formatFiles(name):
    name = name[13:-9]
    nameLen = len(name) - 2
    nameLen = int(nameLen/2)
    name = name[:nameLen]
    return name

# Creating a Temporary Folder to Download all the files in it
def fileCreate(saveFolder):
    random.seed(int(time()))
    text = ""
    for x in range(5):
        y = random.randrange(65, 91)
        text += chr(y)
    saveFolder += text
    os.popen("mkdir {}".format(saveFolder))
    print("Temp Directory {} created to save files/folders".format(text))
    return saveFolder

def winDows(endPoint, banner):
    resp = requests.get(endPoint, allow_redirects=True)
    resp = resp.text.split("\n")
    resp = list(map(unquote, resp[10:-5]))  # URL decoding using unquote
    resp = list(map(formatFiles, resp))
    for dir in resp:
        tempPath = ""
        tempEndpoint = endPoint[len(serverURL):]  # Getting directory structure by removing IP:PORT in URL
        tempPath = "\\".join(tempEndpoint.split("/"))  # Removing / and adding \\ for Windows path
        print(banner + dir)
        tdir = dir
        if(dir[-1] == "/"):
            if( dir.split(" ")!=1 ):  # If the directory name has spaces
                tdir = dir[:-1]
                tdir = "\""+tdir+"\""+"\\"
            os.popen("mkdir "+saveFolder+"\\"+tempPath+tdir)
            r = winDows(endPoint+dir, banner[:-4]+" |___")
        else:
            data = open(saveFolder+"\\"+tempPath+dir, "wb")
            fileData = requests.get(endPoint+dir, allow_redirects=True)
            data.write(fileData.content)
            data.close()
    return 1

parser = argparse.ArgumentParser()
sideBanner = " |___ "
parser.add_argument("ip", help = "IP address of FTP Server", type=ip_valid)
parser.add_argument("port", help = "FTP Server Port you want to access", type=port_valid)
parser.add_argument("dst", help="Destination Path to save your files")
args = parser.parse_args()

ip = argv[1]
port = argv[2]
saveFolder = argv[3]
serverURL = "http://"+ip+":"+port+"/"
saveFolder = fileCreate(saveFolder)
print("Destination Folder - {}".format(saveFolder))
if(platform.system() == "Linux"):
    linuX(serverURL, sideBanner)
else:
    winDows(serverURL, sideBanner)
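One guess about the FileNotFoundError, since only the linked file has the full picture: the script builds Windows paths by string concatenation and creates directories by shelling out to mkdir via os.popen, both of which break easily when names contain spaces or when a parent directory doesn't exist yet. Below is a sketch of the same steps using os.path.join and os.makedirs instead of a shell; the helper name save_file and its arguments are hypothetical, for illustration only.

import os
import requests

def save_file(save_folder, rel_path, url):
    # rel_path is the file's path relative to the server root, e.g. "docs/a b/file.txt"
    local_path = os.path.join(save_folder, *rel_path.split("/"))  # portable join, no manual "\\"
    os.makedirs(os.path.dirname(local_path), exist_ok=True)       # create parent dirs without a shell
    resp = requests.get(url, allow_redirects=True)
    with open(local_path, "wb") as fh:
        fh.write(resp.content)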
I have the following Lambda function that searches my S3 bucket with the prefix being the current time in milliseconds. I have about 600-800k files per hour that I would like to do some manipulation on. This code works as intended but takes forever to scan the prefix; I suspect this part of my code is not efficient. Since this Lambda function is scheduled to run every 10 minutes, I have my minimum range set to go back up to 11 minutes in milliseconds. I would greatly appreciate it if someone could help me make this piece more efficient.
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import _datetime
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    keys = []
    result = []
    now = int(round(time.time() * 1000))
    now_min = now - 660000  # 11 mins
    times = list(range(now_min, now + 1))
    for t in times:
        prefix = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(t)
        pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
        for page in pages:
            if page.get('KeyCount') != 0:
                for obj in page['Contents']:
                    keys.append(obj['Key'])
    for key in keys[1:]:
The goal is to take these 800k files and condense them into multiple larger files instead of having 800k small files.
for key in keys[1:]:
    local_filepath = os.path.join(tempfile.gettempdir(), key)
    regex_local_filepath = '/tmp/' + re.search('([^\/]+$)', local_filepath).group(0)
    re_key = re.search('([^-/]+$)', key).group(0)
    re_key = re_key.replace('.json', '')
    s3_resource.Bucket('bucket').download_file(key, regex_local_filepath)
    with open(regex_local_filepath, 'r') as infile:
        result.append(json.load(infile))
file_name = 'Uploads/' + str(datetime.now(timezone.utc).strftime("%Y-%m-%d")) + '/' + str(datetime.utcnow().strftime('%H')) + '/' + str(now) + '.json'
s3object = s3_resource.Object('new-bucket', file_name)
s3object.put(
    Body=(bytes(json.dumps(result, indent=2, sort_keys=True).encode('UTF-8')))
)
return None
I have figured out how to loop through this efficiently. It turns out I was looping through multiple times and appending times to the keys.
If you need to condense S3 files into larger single files, this approach works well. Cheers!
import os
import boto3
import json
import tempfile
import re
from datetime import date, datetime, timezone
import _datetime
import time

def lambda_handler(event, context):
    # TODO implement
    s3_client = boto3.client("s3")
    s3_resource = boto3.resource('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    now = int(round(time.time() * 1000))
    min_now = now - 360000  # Go back 6 mins since lambda function runs every 5 mins
    max_now = now + 60000   # This is to handle minute 59 after the hour.
    keys = []
    regex_keys = []
    result = []
    content_keys = []
    my_bucket = s3_resource.Bucket('bucket')
    prefix = 'Uploads/'
    key_objects = iter(my_bucket.objects.filter(Prefix=prefix))
    next(key_objects)
    for object_summary in key_objects:
        obj_key = object_summary.key  # This gives me all the keys in the above prefix
        keys.append(obj_key)
    for key in keys:
        regex_key = re.search('\/(.*?)\-', key).group(0).replace('/', '').replace('-', '')  # I just want the timestamp (milliseconds)
        regex_keys.append(regex_key)
    for regex_key in regex_keys:
        if min_now <= int(regex_key) <= max_now:
            prefix = 'Uploads/' + str(regex_key)
            pages = paginator.paginate(Bucket='bucket', Prefix=prefix)
            for page in pages:
                for obj in page['Contents']:
                    content_keys.append(obj['Key'])
    print(len(content_keys))
    return None
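For the condensing step itself (which the question code did via temp files in /tmp), here is a minimal sketch that would continue inside lambda_handler above and merge the matched objects in memory instead. The 'bucket' / 'new-bucket' names are the placeholders already used in this thread, and the output key is just an example name:

# Continues inside lambda_handler above, after content_keys has been filled.
merged = []
for key in content_keys:
    obj = s3_client.get_object(Bucket='bucket', Key=key)
    merged.append(json.loads(obj['Body'].read().decode('UTF-8')))

s3_client.put_object(
    Bucket='new-bucket',
    Key='Uploads/condensed-' + str(now) + '.json',  # example output key
    Body=json.dumps(merged, indent=2, sort_keys=True).encode('UTF-8'),
)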
I've made a lot of progress on this and can now download three of the four files just fine. However, one of them, the Wisconsin file, contains timestamps that I can't have removed and that vary from day to day, and I'm struggling to figure out how to get wildcards to work on those values with regular expressions. I've posted my revised code below:
Examples of the file names are:
BCW_Daily SDP Yield.rpt2020-02-17-09-02-32.csv
hbc_platelet_daily_02102020.csv
MBC_ROLLING_YIELD_02172020.CSV
IBC_SDP_Rolling_7Days_021720.CSV
Any help is appreciated.
import datetime
import ftplib
import os
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
filenameWI = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
dlfiles = [filenameMI,filenameIN,filenameWI,filenameIL]
connection = ftplib.FTP(host='xxx',user='xxx',passwd='xxx')
welcome = ftplib.FTP.getwelcome(connection)
print(welcome)
connection.cwd(ftpdir)
ftp_list = connection.nlst()
print(ftp_list)
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
    else:
        print(x+' fail')
connection.quit()
Solved it:
# import modules
import fnmatch
import datetime
import ftplib
import os
#define variables
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
filedir = "C:/DailyData/SDPS/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
pattern = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
#create FTP connection
connection = ftplib.FTP(xxxxxxx)
connection.cwd(ftpdir)
#generate file list on FTP
ftp_list = connection.nlst()
#create wildcard string for WI file
wistring = fnmatch.filter(ftp_list,pattern)
filenameWI = str(wistring[0])
dlfiles = [filenameMI,filenameIN,filenameIL,filenameWI]
#download files from FTP to local
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
connection.quit()
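One small hardening worth considering, not part of the original solution: if the Wisconsin file hasn't landed on the server yet, fnmatch.filter returns an empty list and wistring[0] raises an IndexError. A hedged sketch of a guard, reusing the names from the script above:

wistring = fnmatch.filter(ftp_list, pattern)
if wistring:
    dlfiles = [filenameMI, filenameIN, filenameIL, str(wistring[0])]
else:
    print('No WI file matching ' + pattern + ' found on the FTP server')
    dlfiles = [filenameMI, filenameIN, filenameIL]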
I'm running a Python script on a Sun Grid Engine supercompute cluster that reads in a list of file ids, sends each to a worker process for analysis, and writes one output per input file to disk.
The trouble is I'm getting IOError(110, 'Connection timed out') somewhere inside the worker function, and I'm not sure why. I've received this error in the past when making network requests that were severely delayed, but in this case the worker is only trying to read data from disk.
My question is: what would cause a Connection timed out error when reading from disk, and how can it be resolved? Any help would be much appreciated.
Full script (the IOError crops up in minhash_text()):
from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import json
import sys
import codecs
import config

cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4

def minhash_text(args):
    '''Return a list of hashband strings for an input doc'''
    try:
        file_id, path = args
        with codecs.open(path, 'r', 'utf8') as f:
            f = f.read()
        all_hashbands = []
        for window_idx, window in enumerate(ngrams(f.split(), window_len)):
            window_hashbands = []
            if window_idx % step != 0:
                continue
            minhash = MinHash(num_perm=permutations, seed=1)
            for ngram in set(ngrams(' '.join(window), 3)):
                minhash.update( ''.join(ngram).encode('utf8') )
            hashband_vals = []
            for i in minhash.hashvalues:
                hashband_vals.append(i)
                if len(hashband_vals) == hashband_len:
                    window_hashbands.append( '.'.join([str(j) for j in hashband_vals]) )
                    hashband_vals = []
            all_hashbands.append(window_hashbands)
        return {'file_id': file_id, 'hashbands': all_hashbands}
    except Exception as exc:
        print(' ! error occurred while processing', file_id, exc)
        return {'file_id': file_id, 'hashbands': []}

if __name__ == '__main__':
    file_ids = json.load(open('file_ids.json'))
    file_id_path_tuples = [(file_id, path) for file_id, path in file_ids.items()]
    worker_id = int(sys.argv[1])
    worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]
    hashband_to_ids = defaultdict(list)
    pool = Pool(cores)
    for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
        print(' * processed', idx, 'results')
        file_id = result['file_id']
        hashbands = result['hashbands']
        for window_idx, window_hashbands in enumerate(hashbands):
            for hashband in window_hashbands:
                hashband_to_ids[hashband].append(file_id + '.' + str(window_idx))
    with open(config.out_dir + 'minhashes-' + str(worker_id) + '.json', 'w') as out:
        json.dump(dict(hashband_to_ids), out)
It turned out I was hammering the filesystem too hard, making too many concurrent read requests for files on the same server. That server could only allow a fixed number of reads in a given period, so any requests over that limit received a Connection Timed Out response.
The solution was to wrap each file read request in a while loop: inside that loop, try to read the file from disk; if the Connection timed out error occurs, sleep for a second and try again; the loop only exits once the file has been read successfully.
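For concreteness, here is a minimal sketch of that retry loop; the one-second back-off and the errno check are assumptions, since the original post doesn't show the exact code:

import codecs
import errno
import time

def read_with_retry(path):
    # Retry the read until the filesystem stops refusing it with
    # 'Connection timed out' (errno 110 / ETIMEDOUT on Linux).
    while True:
        try:
            with codecs.open(path, 'r', 'utf8') as f:
                return f.read()
        except IOError as exc:
            if exc.errno != errno.ETIMEDOUT:
                raise            # unrelated errors should still surface
            time.sleep(1)        # back off for a second, then try again

Inside minhash_text(), the codecs.open/read pair would then be replaced by a call to this helper.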
I have a script that parses XML files using the lxml ElementTree XPathEvaluator. It works fine as it is, but it takes a long time to finish, so I tried to make a multithreaded implementation:
import fnmatch
import operator
import os
import lxml.etree
from nltk import FreqDist
from nltk.corpus import stopwords
from collections import defaultdict
from datetime import datetime
import threading
import Queue

STOPWORDS = stopwords.words('dutch')
STOPWORDS.extend(stopwords.words('english'))
DIR_NAME = 'A_DIRNAME'
PATTERN = '*.A_PATTERN'

def loadData(dir_name, pattern):
    nohyphen_files = []
    dir_names = []
    dir_paths = []
    for root, dirnames, filenames in os.walk(dir_name):
        dir_names.append(dirnames)
        dir_paths.append(root)
        for filename in fnmatch.filter(filenames, pattern):
            nohyphen_files.append(os.path.join(root, filename))
    return nohyphen_files, dir_names, dir_paths

def freq(element_list, descending = True):
    agglomerated = defaultdict(int)
    for e in element_list:
        agglomerated[e] += 1
    return sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=descending)

def lexDiv(amount_words):
    return 1.0*len(set(amount_words))/len(amount_words)

def anotherFreq(list_types, list_words):
    fd = FreqDist(list_types)
    print 'top 10 most frequent types:'
    for t, freq in fd.items()[:10]:
        print t, freq
    print '\ntop 10 most frequent words:'
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    print sorted_dict[:10]

def extractor(f):
    print "check file: {}".format(f)
    try:
        # doc = lxml.etree.ElementTree(lxml.etree.XML(f))
        doc = lxml.etree.ElementTree(file=f)
    except lxml.etree.XMLSyntaxError, e:
        print e
        return
    doc_evaluator = lxml.etree.XPathEvaluator(doc)
    entities = doc_evaluator('//entity/*/externalRef/@reference')
    places_dbpedia = doc_evaluator('//entity[contains(@type, "Schema:Place")]/*/externalRef/@reference')
    non_people_dbpedia = set(doc_evaluator('//entity[not(contains(@type, "Schema:Person"))]'))
    people = doc_evaluator('//entity[contains(@type, "Schema:Person")]/*/externalRef/@reference')
    words = doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/text()',\
        namespaces={"re": "http://exslt.org/regular-expressions"})
    unique_words = set(words)
    other_tokens = doc.xpath('text/wf[re:match(text(), "[^A-Za-z-]")]/text()',\
        namespaces={"re": "http://exslt.org/regular-expressions"})
    amount_of_sentences = doc_evaluator('text/wf/@sent')[-1]
    types = doc_evaluator('//term/@morphofeat')
    longest_sentence = freq(doc.xpath('text/wf[re:match(text(), "[A-Za-z-]")]/@sent',\
        namespaces={"re": "http://exslt.org/regular-expressions"}))[0]
    top_people = freq([e.split('/')[-1] for e in people])[:10]
    top_entities = freq([e.split('/')[-1] for e in entities])[:10]
    top_places = freq([e.split('/')[-1] for e in places_dbpedia])[:10]

def worker():
    while 1:
        job_number = q.get()
        extractor(job_number)
        q.task_done()  # this thread is complete, move on

if __name__ == '__main__':
    startTime = datetime.now()
    files, dirs, path = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()
    q = Queue.Queue()  # job queue
    for f in files:
        q.put(f)
    for i in range(20):  # make 20 worker threads ready
        worker_thread = threading.Thread(target=worker)
        worker_thread.daemon = True
        worker_thread.start()
    q.join()
    print datetime.now() - startTime
This does something, but when I time it, it isn't any faster than the single-threaded version. I think it has something to do with opening and reading the files preventing the threads from actually running in parallel. If I instead use a function that just sleeps for a couple of seconds and prints something rather than parsing the XML file, it does work and is a lot faster. What do I have to account for to have a multithreaded XML parser?
Threading in Python doesn't work as it does in other languages. It relies on the Global Interpreter Lock (GIL), which makes sure only one thread is active at a time (running Python bytecode, to be exact).
What you want to do is use the multiprocessing library instead.
You can read more about the GIL and Threading here:
https://docs.python.org/2/glossary.html#term-global-interpreter-lock
https://docs.python.org/2/library/threading.html
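A minimal sketch of that change, assuming extractor and loadData stay as defined in the question (they must remain top-level functions so they can be pickled and sent to the worker processes); the pool size of 20 mirrors the thread count used above:

from datetime import datetime
from multiprocessing import Pool

if __name__ == '__main__':
    files, dirs, path = loadData(DIR_NAME, PATTERN)
    startTime = datetime.now()
    pool = Pool(20)             # one process per worker instead of one thread
    pool.map(extractor, files)  # each file is parsed in its own process, bypassing the GIL
    pool.close()
    pool.join()
    print(datetime.now() - startTime)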