I have a Python script which downloads shell scripts from Amazon S3 and then executes them (each script is about 3GB in size). The function that downloads and executes a file looks like this:
import os
import stat

import boto3

def parse_object_key(key):
    key_parts = key.split(':::')
    return key_parts[1]

def process_file(file):
    client = boto3.client('s3')
    node = parse_object_key(file)
    file_path = "/tmp/" + node + "/tmp.sh"
    os.makedirs("/tmp/" + node)
    client.download_file('category', file, file_path)
    os.chmod(file_path, stat.S_IXUSR)
    os.system(file_path)
The node is unique for each file.
I created a for loop to execute this:
s3 = boto3.resource('s3')
bucket = s3.Bucket('category')
for object in bucket.objects.page_size(count=50):
    process_file(object.key)
This works perfectly, but when I try to create a separate thread for each file, I get the following error:
sh: 1: /path/to/file: Text file busy
The script with threading looks like this:
import threading

s3 = boto3.resource('s3')
bucket = s3.Bucket('category')
threads = []
for object in bucket.objects.page_size(count=50):
    t = threading.Thread(target=process_file, args=(object.key,))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
Out of all the threads, exactly one thread succeeds and all the others fail with the "Text file busy" error. Can someone help me figure out what I am doing incorrectly?
Boto3 sessions are not thread-safe, so you cannot re-use the same S3 connection for every download; give each thread its own. See here for details of a workaround.
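In practice the workaround is usually a sketch like the one below: each thread constructs its own Session and builds its client from that Session (parse_object_key and the 'category' bucket are reused from the question; treat this as an illustration of the idea rather than a verified fix for the "Text file busy" error):

import threading

import boto3

def download_and_run(key):
    # One Session and one client per thread, never shared between threads.
    session = boto3.session.Session()
    client = session.client('s3')
    node = parse_object_key(key)
    file_path = "/tmp/" + node + "/tmp.sh"
    client.download_file('category', key, file_path)
    # ... chmod and execute as in process_file ...

threads = []
for obj in boto3.resource('s3').Bucket('category').objects.page_size(count=50):
    t = threading.Thread(target=download_and_run, args=(obj.key,))
    threads.append(t)
    t.start()
for t in threads:
    t.join()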
I have a function that uploads files to S3, but it asks for an MFA code before the upload starts. I am passing the function to a multiprocessing pool, which creates two processes and runs the function twice concurrently.
When I run my script, it asks for the MFA code twice in the terminal, but the script crashes.
How do I enter the MFA code in both processes and authenticate both of them?
Here is my Python code:
import multiprocessing

import boto3

session = boto3.Session()
s3_client = session.client('s3')

def load_to_s3(file_path):
    response = s3_client.upload_file(file_path, bucket, target_path)  # This line asks for MFA
    return response

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=2)
    response_list = pool.map(load_to_s3, file_path_chunks)
You can try using multiprocessing.Lock to make sure that only one process is authenticated at a time.
You may also need to create a new client for each process:
Resource instances are not thread safe and should not be shared across
threads or processes. These special classes contain additional meta
data that cannot be shared. It's recommended to create a new Resource
for each thread or process
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/resources.html?highlight=multithreading#multithreading-or-multiprocessing-with-resources
Example:
import multiprocessing
import time

import boto3

lock = multiprocessing.Lock()

def load_to_s3(file_path):
    with lock:
        print(file_path)
        # there will be only one process at a time
        # do your work here
        # session = boto3.Session()
        # s3_client = session.client('s3')
        time.sleep(1)

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)
    file_path_chunks = ["1", "2", "3", "4"]
    response_list = pool.map(load_to_s3, file_path_chunks)
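If the goal is to type the MFA code only once, a different approach (not part of the answer above, just a sketch) is to call STS get_session_token in the parent process and hand the temporary credentials to the workers; the MFA device ARN and bucket name below are placeholders:

import multiprocessing

import boto3

MFA_SERIAL = "arn:aws:iam::123456789012:mfa/my-user"   # hypothetical MFA device ARN
BUCKET = "my-bucket"                                    # hypothetical bucket name

def get_temporary_credentials():
    # Prompt for the MFA code once, in the parent process only.
    token_code = input("Enter MFA code: ")
    sts = boto3.client("sts")
    return sts.get_session_token(SerialNumber=MFA_SERIAL,
                                 TokenCode=token_code)["Credentials"]

def load_to_s3(args):
    creds, file_path = args
    # Each worker builds its own client from the already-issued temporary
    # credentials, so no further MFA prompt is triggered.
    s3_client = boto3.client("s3",
                             aws_access_key_id=creds["AccessKeyId"],
                             aws_secret_access_key=creds["SecretAccessKey"],
                             aws_session_token=creds["SessionToken"])
    return s3_client.upload_file(file_path, BUCKET, file_path)

if __name__ == "__main__":
    creds = get_temporary_credentials()
    file_path_chunks = ["a.parquet", "b.parquet"]
    with multiprocessing.Pool(processes=2) as pool:
        pool.map(load_to_s3, [(creds, p) for p in file_path_chunks])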
Under CloudWatch, I created a rule to capture the Athena Query State Change Event that will (1) write a log to a log group and (2) trigger a Lambda function that captures the Athena query execution details and pipes them to an S3 bucket. Point 2 fails: no Athena query execution details are piped to the S3 bucket. Below is the Lambda function I used:
import json

import boto3
from botocore.config import Config

my_config = Config(
    region_name = '<my_region>')

print('Loading function')

def lambda_handler(event, context):
    print("Received event: " + json.dumps(event))
    print("QueryID: " + event['id'])
    #get query statistics
    client = boto3.client('athena', config=my_config)

queries = client.get_query_execution(QueryExecutionId=event['detail']['QueryExecutionId'])
del queries['QueryExecution']['Status']

#saving the query statistics to s3
s3 = boto3.resource('s3')
object = s3.Object('<s3_bucket_path>', 'query_statistics_json/' + event['detail']['QueryExecutionId'])
object.put(Body=str(queries['QueryExecution']))
return 0
I used this AWS Documentation as reference:
https://docs.aws.amazon.com/athena/latest/ug/control-limits.html
The body should be binary data:
object.put(Body=some binary data)
Maybe you can write str(queries['QueryExecution']) to a txt file in Lambda's /tmp directory and upload it.
content="String content to write to a new S3 file"
s3.Object('my-bucket-name', '/tmp/newfile.txt').put(Body=content)
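For reference, a minimal sketch of putting the statistics directly as bytes instead (json.dumps with default=str to cope with the datetime values in the Athena response; the bucket name is the same placeholder as in the question):

import json

import boto3

s3 = boto3.resource('s3')

def save_query_statistics(query_execution, query_execution_id):
    # Serialize the dict to JSON; default=str handles the datetime values
    # that appear in the Athena GetQueryExecution response.
    body = json.dumps(query_execution, default=str).encode('utf-8')
    obj = s3.Object('<s3_bucket_path>',
                    'query_statistics_json/' + query_execution_id)
    obj.put(Body=body)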
It's just an indentation problem: everything after the client = boto3.client('athena', config=my_config) line should also be indented so that it runs inside lambda_handler.
I have created a Lambda function which gets triggered when any R script file is uploaded to an S3 bucket.
test.R code:
a <- 1
b <- 3
c <- a + b
data = 1:20
print(data)
lambda function code:
import subprocess
import json
import urllib.parse
import boto3

print('Loading function')

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    message = "Hey file is got uploaded " + key + " to this bucket " + bucket_name
    print(message)

    #3 - Fetch the file from S3
    response = s3.get_object(Bucket=bucket_name, Key=key)
    text = response["Body"].read().decode()
    print(text)

    command = "Rscript"
    arg = "--vanlla"
    path2script = key
    retcode = subprocess.call([command, arg, path2script], shell=True)
The trigger is working fine, but it gives a "--vanlla: Rscript: command not found" error. When I run a simple Python script to execute the R script, it works fine; it just does not work with AWS Lambda.
I need help with this.
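For context, calling an external interpreter from a handler normally means writing the fetched object to /tmp and invoking the interpreter by an absolute path; the sketch below assumes an Rscript binary has been made available to the runtime (for example via a Lambda layer at /opt/bin/Rscript, which is an assumption, not something the Python runtime provides by default):

import subprocess

RSCRIPT = "/opt/bin/Rscript"   # hypothetical path to an Rscript binary shipped in a layer

def run_r_script(script_text, name="script.R"):
    # Lambda only allows writes under /tmp, so persist the fetched code there.
    local_path = "/tmp/" + name
    with open(local_path, "w") as f:
        f.write(script_text)
    # Pass the arguments as a list and avoid shell=True so that "--vanilla"
    # and the script path are handed straight to Rscript.
    result = subprocess.run([RSCRIPT, "--vanilla", local_path],
                            capture_output=True, text=True)
    print(result.stdout, result.stderr)
    return result.returncode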
I have a Flask Python REST API which is called by another Flask REST API.
The input for my API is one parquet file (a FileStorage object) plus ECS connection and bucket details.
I want to save the parquet file to ECS in a specific folder using boto or boto3.
The code I have tried:
from tempfile import NamedTemporaryFile

import boto.s3.key
from boto.s3.connection import S3Connection, OrdinaryCallingFormat

def uploadFileToGivenBucket(self, inputData, file):
    BucketName = inputData.ecsbucketname
    calling_format = OrdinaryCallingFormat()
    client = S3Connection(inputData.access_key_id, inputData.secret_key, port=inputData.ecsport,
                          host=inputData.ecsEndpoint, debug=2,
                          calling_format=calling_format)
    #client.upload_file(BucketName, inputData.filename, inputData.folderpath)
    bucket = client.get_bucket(BucketName, validate=False)
    key = boto.s3.key.Key(bucket, inputData.filename)
    fileName = NamedTemporaryFile(delete=False, suffix=".parquet")
    file.save(fileName)
    with open(fileName.name) as f:
        key.send_file(f)
but it is not working and gives me an error like this:
signature_host = '%s:%d' % (self.host, port)
TypeError: %d format: a number is required, not str
I tried Google but had no luck. Can anyone help me with this, or share any sample code for the same?
After a lot of hit and trial and time, I finally got the solution. I am posting it for everyone else who is facing the same issue.
You need to use boto3, and here is the code:
import logging
from tempfile import NamedTemporaryFile

import boto3
from botocore.exceptions import ClientError

def uploadFileToGivenBucket(self, inputData, file):
    BucketName = inputData.ecsbucketname
    #bucket = client.get_bucket(BucketName,validate=False)
    f = NamedTemporaryFile(delete=False, suffix=".parquet")
    file.save(f)

    endpointurl = "<your endpoints>"
    s3_client = boto3.client('s3', endpoint_url=endpointurl,
                             aws_access_key_id=inputData.access_key_id,
                             aws_secret_access_key=inputData.secret_key)
    try:
        newkey = 'yourfolderpath/anotherfolder' + inputData.filename
        response = s3_client.upload_file(f.name, BucketName, newkey)
    except ClientError as e:
        logging.error(e)
        return False
    return True
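As a side note on the design: if writing a temporary file is undesirable, boto3's upload_fileobj can stream the incoming FileStorage directly; a minimal sketch using the same placeholder endpoint, credential, and path names as above:

import boto3

def upload_filestorage_to_bucket(inputData, file):
    s3_client = boto3.client('s3', endpoint_url="<your endpoints>",
                             aws_access_key_id=inputData.access_key_id,
                             aws_secret_access_key=inputData.secret_key)
    # FileStorage.stream is a file-like object, so it can be streamed
    # to the bucket without saving a temporary copy on disk.
    s3_client.upload_fileobj(file.stream, inputData.ecsbucketname,
                             'yourfolderpath/anotherfolder' + inputData.filename)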
I know there are a lot of questions on here about the same issue; however, I have gone through each and every one of them and tried the suggestions and answers given there, to no avail. That's why I am posting this question here.
I am trying to upload a file to my bucket. Since this file is larger than 100 MB, I am uploading it using the multipart upload which boto supports. I was able to achieve that. Then I tried to increase upload speed by using the Pool class from the multiprocessing module, with the code given below. When I run the program, nothing happens. I used "from multiprocessing.dummy import Pool" for debugging purposes, and the program raises a
boto.exception.S3ResponseError: S3ResponseError: 403 Forbidden
<?xml version="1.0" encoding="UTF-8"?>
<Error><Code>AccessDenied</Code><Message>Access Denied</Message><RequestId>55D423C42E8A9D94</RequestId><HostId>kxxX+UmBlGaT4X8adUAp9XQV/1jiiK83IZKQuKxAIMEmzdC3g9IRqDqIVXGLPAOe</HostId></Error>
at the raise exc line (marked with a '#' comment) under _upload. I don't understand why I get this. I have full read and write access to the bucket, and standard uploads work like a charm without any error. I can also delete any file I want from the bucket. The only issue arises when I try parallel uploads. The code is pasted below and can also be found here.
My code (I have removed my keys and bucket name from the code):
import logging
import math
import os
from multiprocessing import Pool

from boto.s3.connection import S3Connection
from filechunkio import FileChunkIO

def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
                 source_path, offset, bytes, amount_of_retries=10):
    """
    Uploads a part with retries.
    """
    def _upload(retries_left=amount_of_retries):
        try:
            logging.info('Start uploading part #%d ...' % part_num)
            conn = S3Connection(aws_key, aws_secret)
            bucket = conn.get_bucket(bucketname)
            for mp in bucket.get_all_multipart_uploads():
                if mp.id == multipart_id:
                    with FileChunkIO(source_path, 'r', offset=offset,
                                     bytes=bytes) as fp:
                        mp.upload_part_from_file(fp=fp, part_num=part_num)
                    break
        except Exception, exc:
            if retries_left:
                _upload(retries_left=retries_left - 1)
            else:
                logging.info('... Failed uploading part #%d' % part_num)
                raise exc  # this line raises the error
        else:
            logging.info('... Uploaded part #%d' % part_num)

    _upload()

def upload(bucketname, aws_key, aws_secret, source_path, keyname,
           acl='private', headers={}, parallel_processes=4):
    """
    Parallel multipart upload.
    """
    conn = S3Connection(aws_key, aws_secret)
    bucket = conn.get_bucket(bucketname)
    mp = bucket.initiate_multipart_upload(keyname, headers=headers)

    source_size = os.stat(source_path).st_size
    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)),
                          5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))

    pool = Pool(processes=parallel_processes)
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret, mp.id,
                                        part_num, source_path, offset, bytes])
    pool.close()
    pool.join()

    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
        key = bucket.get_key(keyname)
        key.set_acl(acl)
    else:
        mp.cancel_upload()

upload(default_bucket, acs_key, sec_key, '/path/to/folder/testfile.txt', 'testfile.txt')
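For comparison, and not as an answer to the 403 itself: boto3 performs the same parallel multipart upload internally when you pass a TransferConfig, so a hand-rolled pool is not needed there; a minimal sketch with placeholder bucket and key names:

import boto3
from boto3.s3.transfer import TransferConfig

# Chunk size and concurrency mirror the hand-rolled version above
# (5 MB minimum parts, 4 parallel workers); names are placeholders.
config = TransferConfig(multipart_threshold=5 * 1024 * 1024,
                        multipart_chunksize=5 * 1024 * 1024,
                        max_concurrency=4)

s3 = boto3.client('s3')
s3.upload_file('/path/to/folder/testfile.txt', 'my-bucket', 'testfile.txt',
               Config=config)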