I have a multithreading script that operates on a data set. Each thread gets a chunk of the data set, then iterates over its part of the data frame and calls an API (MS Graph user create) for every row.
What I have seen is that my script tends to get stuck when it is almost finished. I am running this on an Ubuntu Linux server with 8 vCPUs, but this only happens when the total dataset size is in the millions (it takes around 9-10 hours for 2 million records).
This is the first time I am writing a long-running script, so I would like an opinion on whether I am doing things correctly.
Specifically:
Is my code the reason why my script hangs?
Have I done the multithreading correctly? Have I created the threads and waited for them to finish correctly?
UPDATE
Even after applying the answers below, the threads still seem to get stuck at the end.
import pandas as pd
import requests
import sys
import os
import threading
import time
import logging
import string
import secrets
import random
##### ----- Logging Setup -------
logging.basicConfig(filename="pylogs.log", format='%(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
# Creating an object
logger = logging.getLogger()
# Setting the threshold of the logger to ERROR
logger.setLevel(logging.ERROR)
#####------ Function Definitions -------
# generates random password
def generateRandomPassword(lengthOfPassword):
    # logic for random password gen (placeholder; the real implementation was omitted in the question)
    alphabet = string.ascii_letters + string.digits
    return ''.join(secrets.choice(alphabet) for _ in range(lengthOfPassword))
# the most important function
#
def createAccounts(splitData, threadID):
batchProgress = 0
batch_size = splitData.shape[0]
for row in splitData.itertuples():
try:
headers = {"Content-Type": "application/json", "Authorization":"Bearer "+access_token}
randomLength = [8,9,12,13,16]
passwordLength = random.choice(randomLength)
password = generateRandomPassword(passwordLength) # will be generated randomly - for debugging purpose
batchProgress+=1
post_request_body = {
"accountEnabled": True,
"displayName": row[5],
"givenName": row[3],
"surname": row[4],
"mobilePhone": row[1],
"mail": row[2],
"passwordProfile" : {
"password": password,
"forceChangePasswordNextSignIn": False
},
"state":"",
"identities": [
{
"signInType": "emailAddress",
"issuer": tenantName,
"issuerAssignedId": row[2]
}
]
}
# if phone number exists then only add - since phone number needs to have length between 1 and 64, cannot leave empty
if(len(row[4])):
post_request_body["identities"].append({"signInType": "phoneNumber","issuer": tenantName,"issuerAssignedId": row[1]})
responseFromApi = requests.post(graph_api_create, headers=headers, json=post_request_body)
status = responseFromApi.status_code
if(status == 201): #success
id = responseFromApi.json().get("id")
print(f" {status} | {batchProgress} / {batch_size} | Success {id}")
                errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^Success'
elif(status == 429): #throttling issues
print(f" Thread {threadID} | Throttled by server ! Sleeping for 150 seconds")
errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^Throttled'
time.sleep(150)
elif(status == 401): #token expiry
print(f" Thread {threadID} | Token Expired. Getting it back !")
errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^Token Expired'
getRefreshToken()
else: #any other error
msg = ""
try:
msg = responseFromApi.json().get("error").get("message")
except Exception as e:
msg = f"Error {e}"
errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^{msg}'
print(f" {status} | {batchProgress} / {batch_size} | {msg} {row[2]}")
logger.error(errorDict)
except Exception as e:
# check for refresh token errors
errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^Exception_{e}'
logger.error(errorDict)
            msg = " Error "
            print(f" Exception | {batchProgress} / {batch_size} | {msg} {row[2]}")
print(f"Thread {threadID} completed ! {batchProgress} / {batch_size}")
batchProgress = 0
###### ------ Main Script ------
if __name__ == "__main__":
# get file name and appid from command line arguments
storageFileName = sys.argv[1]
appId = sys.argv[2]
# setup credentials
bigFilePath = f"./{storageFileName}"
CreatUserUrl = "https://graph.microsoft.com/v1.0/users"
B2C_Tenant_Name = "tenantName"
tenantName = B2C_Tenant_Name + ".onmicrosoft.com"
applicationID = appId
accessSecret = "" # will be taken from command line in future revisions
token_api_body = {
"grant_type": "client_credentials",
"scope": "https://graph.microsoft.com/.default",
"client_Id" : applicationID,
"client_secret": accessSecret
}
# Get initial access token from MS
print("Connecting to MS Graph API")
token_api = "https://login.microsoftonline.com/"+tenantName+"/oauth2/v2.0/token"
response = {}
try:
responseFromApi = requests.post(token_api, data=token_api_body)
responseJson = responseFromApi.json()
print(f"Token API Success ! Expires in {responseJson.get('expires_in')} seconds")
    except Exception as e:
        print(f"ERROR | Token auth failed: {e}")
        sys.exit(1)
# if we get the token proceed else abort
if(responseFromApi.status_code == 200):
migrationData = pd.read_csv(bigFilePath)
print(" We got the data from Storage !", migrationData.shape[0])
global access_token
access_token = responseJson.get('access_token')
graph_api_create = "https://graph.microsoft.com/v1.0/users"
dataSetSize = migrationData.shape[0]
partitions = 50 # No of partitions # will be taken from command line in future revisions
size = int(dataSetSize/partitions) # No of rows per file
remainder = dataSetSize%partitions
print(f"Data Set Size : {dataSetSize} | Per file size = {size} | Total Files = {partitions} | Remainder: {remainder} | Start...... \n")
    ##### ------- Dataset partitioning.
datasets = []
range_val = partitions + 1 if remainder !=0 else partitions
for partition in range(range_val):
if(partition == partitions):
df = migrationData[size*partition:dataSetSize]
else:
df = migrationData[size*partition:size*(partition+1)]
datasets.append(df)
number_of_threads = len(datasets)
start_time = time.time()
spawned_threads = []
######## ---- Threads are spawned ! here --------
for i in range(number_of_threads): # spawn threads
t = threading.Thread(target=createAccounts, args=(datasets[i], i))
t.start()
spawned_threads.append(t)
number_spawned = len(spawned_threads)
print(f"Started {number_spawned} threads !")
###### - Threads are killed here ! ---------
for thread in spawned_threads: # let the script wait for thread execution
thread.join()
print(f"Done! It took {time.time() - start_time}s to execute") # time check
#### ------ Retry Mechanism -----
print("RETRYING....... !")
os.system(f'python3 retry.py pylogs.log {appId}')
else:
        print(f"Token Missing ! API response {responseJson}")
Here's a refactoring of your code to use the standard library's multiprocessing.pool.ThreadPool for simplicity.
Naturally I couldn't test it since I don't have your data, but the basic idea should work. I removed the logging and the separate retry script, since I couldn't see why you'd need them (feel free to add them back); instead, this version retries each row in place if the problem appears to be transient.
import random
import sys
import time
from multiprocessing.pool import ThreadPool
import pandas as pd
import requests
sess = requests.Session()
# globals filled in by `main`
tenantName = None
access_token = None
def submit_user_create(row):
headers = {"Content-Type": "application/json", "Authorization": "Bearer " + access_token}
randomLength = [8, 9, 12, 13, 16]
passwordLength = random.choice(randomLength)
    password = generateRandomPassword(passwordLength)  # uses generateRandomPassword from your original script
post_request_body = {
"accountEnabled": True,
"displayName": row[5],
"givenName": row[3],
"surname": row[4],
"mobilePhone": row[1],
"mail": row[2],
"passwordProfile": {"password": password, "forceChangePasswordNextSignIn": False},
"state": "",
"identities": [{"signInType": "emailAddress", "issuer": tenantName, "issuerAssignedId": row[2]}],
}
# if phone number exists then only add - since phone number needs to have length between 1 and 64, cannot leave empty
if len(row[4]):
post_request_body["identities"].append({"signInType": "phoneNumber", "issuer": tenantName, "issuerAssignedId": row[1]})
return sess.post("https://graph.microsoft.com/v1.0/users", headers=headers, json=post_request_body)
def get_access_token(tenantName, applicationID, accessSecret):
token_api_body = {
"grant_type": "client_credentials",
"scope": "https://graph.microsoft.com/.default",
"client_Id": applicationID,
"client_secret": accessSecret,
}
token_api = f"https://login.microsoftonline.com/{tenantName}/oauth2/v2.0/token"
resp = sess.post(token_api, data=token_api_body)
if resp.status_code != 200:
raise RuntimeError(f"Token Missing ! API response {resp.content}")
json = resp.json()
print(f"Token API Success ! Expires in {json.get('expires_in')} seconds")
return json["access_token"]
def process_row(row):
while True:
response = submit_user_create(row)
status = response.status_code
if status == 201: # success
id = response.json().get("id")
print(f"Success {id}")
return True
if status == 429: # throttling issues
print(f"Throttled by server ! Sleeping for 150 seconds")
time.sleep(150)
continue
if status == 401: # token expiry?
print(f"Token Expired. Getting it back !")
getRefreshToken() # TODO
continue
try:
msg = response.json().get("error").get("message")
except Exception as e:
msg = f"Error {e}"
print(f" {status} | {msg} {row[2]}")
return False
def main():
global tenantName, access_token
# get file name and appid from command line arguments
bigFilePath = sys.argv[1]
appId = sys.argv[2]
# setup credentials
B2C_Tenant_Name = "tenantName"
tenantName = f"{B2C_Tenant_Name}.onmicrosoft.com"
accessSecret = "" # will be taken from command line in future revisions
access_token = get_access_token(tenantName, appId, accessSecret)
migrationData = pd.read_csv(bigFilePath)
start_time = time.time()
with ThreadPool(10) as pool:
for i, result in enumerate(pool.imap_unordered(process_row, migrationData.itertuples()), 1):
progress = i / len(migrationData) * 100
print(f"{i} / {len(migrationData)} | {progress:.2f}% | {time.time() - start_time:.2f} seconds")
print(f"Done! It took {time.time() - start_time}s to execute")
if __name__ == "__main__":
main()
Unfair use of MS Graph
Due to possible throttling by the server, usage of the MS Graph resource might be unfair between threads. I use "fair" in the resource-starvation sense.
elif(status == 429): #throttling issues
print(f" Thread {threadID} | Throttled by server ! Sleeping for 150 seconds")
errorDict = f'{row[1]}^{row[2]}^{row[3]}^{row[4]}^{row[5]}^{row[6]}^{row[7]}^{row[8]}^{row[9]}^{row[10]}{row[11]}{row[12]}{row[13]}^Throttled'
time.sleep(150)
One thread making a million calls can receive a disproportionate number of 429 responses, each followed by a 150-second penalty. That sleep doesn't stop the other threads from making calls and achieving forward progress, though.
This can leave one thread lagging far behind the others, giving the appearance of being stuck.
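One way to make the backoff fairer (a minimal sketch, untested against your workload; the names throttle_gate, wait_if_throttled and handle_throttle are mine, not from your script) is to share the penalty across threads and honour the Retry-After header Graph sends with 429 responses instead of a fixed 150-second sleep:
import threading
import time

# Shared gate: "set" means requests are allowed. Every worker checks it before
# sending, so when one thread is throttled they all pause together.
throttle_gate = threading.Event()
throttle_gate.set()

def wait_if_throttled():
    # Call in each worker just before the POST to Graph.
    throttle_gate.wait()

def handle_throttle(response):
    # Call in each worker when it receives a 429.
    retry_after = int(response.headers.get("Retry-After", 150))
    if throttle_gate.is_set():
        throttle_gate.clear()     # first thread to notice closes the gate...
        time.sleep(retry_after)   # ...sleeps on everyone's behalf...
        throttle_gate.set()       # ...then reopens it
    else:
        throttle_gate.wait()      # someone else is already sleeping; just wait it out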
Related
The purpose of my Lambda function is to check whether a success file is present at a certain S3 bucket path. If it's not there, it should send a failure message to Slack; if it is there, it should trigger a DAG run. At the moment I've got the code working for that, but I want to retry the function every 15 minutes for the next 10 hours if there is no success file, and only send the failure notification to Slack if it is still missing after all those attempts. Right now my EventBridge cron schedule is spamming the Slack failure message x times, or triggering the DAG x times. I want the schedule to stop (and trigger the DAG only once) as soon as the success file appears, to keep retrying while it's not there, and to send a failure message only on the last scheduled retry. Is this possible? My code is below, and my EventBridge cron runs Monday-Friday, every 15 minutes from 4 am to 1 pm. Any guidance on a possible solution would be appreciated!
import ast
import base64
import http.client
import json
import logging
from datetime import date, timedelta

import boto3
import botocore.exceptions
import requests
from boto3 import resource

def lambda_handler(event, context):
#slack and airflow config
slack_webhook_url_details = ""
dag_id = 'sample_dag'
airflow_url=''
def send_slack_message(slack_webhook_url,slack_message):
slack_payload = {'text':slack_message}
response = requests.post(slack_webhook_url, json.dumps(slack_payload))
response_json = response.text
print('response after posting to slack: '+ str(response_json))
def initialize_paths():
global bucket
global path
global dt
two_days_ago = date.today() - timedelta(days=2)
dt = two_days_ago.strftime('%Y%m%d')
#add for loop logic for iterating through countries in bucket
bucket = ""
path = f"test_lambda/US/{dt}/_SUCCESS"
def check_file_exists():
print(bucket,path)
s3 = resource('s3')
try:
#check if file exists
s3.Object(bucket, path).load()
logging.info(f'_SUCCESS file exists with at path: {bucket}/{path} for the following date: {dt}')
#trigger dag run
mwaa_env_name = 'airflow-prod-env'
dag_name= 'sample_dag'
mwaa_cli_command = 'dags trigger'
client = boto3.client('mwaa')
mwaa_cli_token = client.create_cli_token(Name=mwaa_env_name)
conn = http.client.HTTPSConnection(mwaa_cli_token['WebServerHostname'])
payload = mwaa_cli_command + " " + dag_name
headers = {
'Authorization': 'Bearer ' + mwaa_cli_token['CliToken'],
'Content-Type': 'text/plain'
}
conn.request("POST", "/aws_mwaa/cli", payload, headers)
res = conn.getresponse()
data = res.read()
dict_str = data.decode("UTF-8")
mydata = ast.literal_eval(dict_str)
return base64.b64decode(mydata['stdout'])
except botocore.exceptions.ClientError as errorStdOut:
if errorStdOut.response['Error']['Code'] >= "401":
error_message= f'_SUCCESS file NOT detected at path: {bucket}/{path} for the following date: {dt}'
logging.info(error_message)
send_slack_message(slack_webhook_url_details,error_message)
else:
logging.info('Error, something went wrong connecting to lambda')
initialize_paths()
    check_file_exists()
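One possible way to get the retry/alert-once behaviour (a rough sketch only; it assumes the 4 am-1 pm schedule is in UTC, and the helper names below are mine rather than any AWS API) is to let every scheduled invocation exit quietly unless it either finds the success file or is the final run of the window:
from datetime import datetime, time as dtime

def is_last_scheduled_run(now=None):
    # Assumes the every-15-minutes window ends at 13:00 UTC, so 12:45 is the last slot.
    now = now or datetime.utcnow()
    return now.time() >= dtime(hour=12, minute=45)

def decide(success_file_exists, trigger_dag, notify_failure):
    if success_file_exists:
        # Trigger the DAG. To guarantee it only fires once, you could also write a
        # small marker object to S3 here and skip triggering when it already exists.
        trigger_dag()
        return "triggered"
    if is_last_scheduled_run():
        notify_failure("_SUCCESS file never appeared within the scheduled window")
        return "alerted"
    return "waiting"  # not the last run yet: exit silently and let the next run retry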
I am having some trouble with my program: when it reaches the end of the third() function, it continues trying to execute transactions. I tried having it return None to break out of the seemingly infinite loop it is in, with no success. I am sure I am missing something very simple here, and I'm guessing it has something to do with the recursion I used. Thanks for any help you can provide.
import asyncio
import base64
import json
import os
import os.path
import time
import httpcore
import requests
from typing import Awaitable
import solana
import httpx
from rich import print
from solana.keypair import Keypair
from solana.publickey import PublicKey
from solana.rpc.api import Client
from solana.rpc.async_api import AsyncClient
from solana.rpc.commitment import Confirmed
from solana.rpc.types import TxOpts
from solana.transaction import Transaction
# Notes
# This is meant as a bare bones hello world and as such does not have :
#
# - error handling on http calls
# - checks / retries to ensure solana transactions go through
# - logging - just your basic print statement here. But at least you get the Rich pretty printing variant :)
#
# Libraries used
# - https://www.python-httpx.org/ - cause it's shinier and better than requests
# - https://michaelhly.github.io/solana-py/
# - https://github.com/Textualize/rich for pretty printing - because it rocks.
# I use poetry to manage dependencies but am not including the project file here for brevity.
# Mint constants
USDC_MINT = "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", 6
SOL_MINT = "So11111111111111111111111111111111111111112", 9
FAB_MINT = "EdAhkbj5nF9sRM7XN7ewuW8C9XEUMs8P7cnoQ57SYE96", 9
FUSD_MINT = "B7mXkkZgn7abwz1A3HnKkb18Y6y18WcbeSkh1DuLMkee", 8
# This works ok - most of the time
rpc_host = "https://api.mainnet-beta.solana.com"
filename = r"C:\Users\myname\.config\solana\burner.json"
def get_wallet_keypair(filename: str) -> Keypair:
"""Load a keypair from a filesystem wallet."""
if not os.path.isfile(filename):
raise Exception(f"Wallet file '{filename}' is not present.")
with open(filename) as json_file:
data = json.load(json_file)
mid = len(data) // 2
secret_key = data[:mid]
secret_bytes = bytes(secret_key)
keypair = Keypair.from_secret_key(secret_bytes)
print(f"Public Key is: {keypair.public_key}")
return keypair
async def get_quote(
        input_mint: str, output_mint: str, amount: int, slippage: float = 0.2):
url_query = f"https://quote-api.jup.ag/v1/quote?outputMint={output_mint}&inputMint={input_mint}&amount={amount}&slippage={slippage}"
print(url_query)
async with httpx.AsyncClient() as client:
r = await client.get(url_query)
return r.json()
async def get_transaction(route: dict, user_key: str) -> dict:
swap_url = "https://quote-api.jup.ag/v1/swap"
input = {"route": route, "userPublicKey": user_key, "wrapUnwrapSOL": True}
print(json.dumps(input, indent=2))
async with httpx.AsyncClient() as client:
        r = await client.post(swap_url, json=input,
                              timeout=6.0)  # slightly longer timeout as the free rpc server can be a bit laggy
return r.json()
def send_transaction(payer: Keypair, cc: Client, swap_transaction: str, opts: TxOpts) -> str:
""" Send a serialized transaction to the RPC node """
trans = Transaction.deserialize(base64.b64decode(swap_transaction))
result = cc.send_transaction(trans, payer, opts=opts)
txid = result["result"]
print(f"transaction details :https://solscan.io/tx/{txid}")
return txid
async def async_main(from_mint, from_decimals, to_mint, quantity):
cc = Client(rpc_host)
print(f" Converting {quantity} {from_mint} to {to_mint} with {from_decimals} Decimals")
quote_quantity = quantity * (10 ** from_decimals)
r = await get_quote(str(from_mint), str(to_mint), quote_quantity, slippage=2)
quote, outAmount = r["data"][0], int(r['data'][0]['outAmountWithSlippage']) / (10 ** from_decimals)
print("Out Amount =", outAmount)
if quote := r["data"][0]:
print(quote)
# get the relevant transaction details
trans = await get_transaction(quote, str(pubkey))
setup_transaction = trans["setupTransaction"] if "setupTransaction" in trans else None
swap_transaction = trans["swapTransaction"] if "swapTransaction" in trans else None
cleanup_transaction = trans["cleanupTransaction"] if "cleanupTransaction" in trans else None
opts = TxOpts(skip_preflight=True)
# Setup transaction. Will create any missing accounts if required.
if setup_transaction:
print("Sending setup transaction")
#print(setup_transaction)
send_transaction(payer, cc, setup_transaction, opts)
# This one actually does the business
if swap_transaction:
print("Sending swap transaction")
txid = send_transaction(payer, cc, swap_transaction, opts)
# Wait for the transaction to complete before looking it up on chain.
# Clearly this is *not* the right way to do this. Retry in a loop or something fancy.
await asyncio.sleep(20)
result = cc.get_transaction(txid, commitment=Confirmed)
print(result)
# Haven't seen one of these needed yet. Hopefully the jup.ag devs can explain when it's required.
if cleanup_transaction:
            print("Sending cleanup transaction")
send_transaction(payer, cc, cleanup_transaction, opts)
print("Swap Complete !")
return outAmount
def get_balance(input_mint):
url = "https://api.mainnet-beta.solana.com"
headers = {'Content-type': 'application/json'}
if input_mint == "So11111111111111111111111111111111111111112":
data = {"jsonrpc": "2.0", "id": 1, "method": "getBalance", "params": [f"{pubkey}"]}
response = requests.post(url, data=json.dumps(data), headers=headers)
response = response.text
parsed = json.loads(response)
# print(json.dumps(parsed, indent=4, sort_keys=True))
accountBal = (parsed['result']['value']) / 10 ** SOL_MINT[1]
print(accountBal)
else:
data = {"jsonrpc": "2.0", "id": 1, "method": "getTokenAccountsByOwner",
"params": [f"{pubkey}",
{"mint": f"{input_mint}"}, {"encoding": "jsonParsed"}]}
response = requests.post(url, data=json.dumps(data), headers=headers)
response = response.text
parsed = json.loads(response)
# print(json.dumps(parsed, indent=4, sort_keys=True))
accountBal = parsed['result']['value'][0]['account']['data']['parsed']['info']['tokenAmount']['uiAmount']
print(accountBal)
return accountBal
# USDC buys FUSD, FUSD is sold for SOL, and SOL is sold for USDC
# (from_mint, from_decimals, to_mint, quantity):
class swaps:
def __init__(self, input_mint, decimals, output_mint, amount):
self.input_mint = input_mint
self.decimals = decimals
self.output_mint = output_mint
self.amount = amount
def swap(self):
asyncio.run(async_main(self.input_mint, self.decimals, self.output_mint, self.amount))
def first(count, previous = 0):
try:
if get_balance(USDC_MINT[0]) <= 1:
time.sleep(1)
count += 1
if count >= 60:
third(0)
first(count)
except TypeError:
first(0)
step1 = swaps(USDC_MINT[0], USDC_MINT[1], FUSD_MINT[0], get_balance(USDC_MINT[0]) if previous == 0 else previous)
try:
step1.swap()
except httpx.ReadTimeout:
print("Retrying")
time.sleep(10)
first(0)
second(0)
def second(count, previous = 0):
try:
if get_balance(FUSD_MINT[0]) <= 1:
time.sleep(1)
count += 1
if count >= 60:
first(0)
second(count)
except TypeError:
second(0)
step2 = swaps(FUSD_MINT[0], FUSD_MINT[1], SOL_MINT[0], get_balance(FUSD_MINT[0]) if previous == 0 else previous)
try:
step2.swap()
except:
print("Retrying")
time.sleep(10)
second(0)
count = 0
third(0)
def third(count, previous = 0):
if get_balance(SOL_MINT[0]) < .6:
time.sleep(1)
count += 1
if count >= 60:
second(0)
third(count)
step3 = swaps(SOL_MINT[0], SOL_MINT[1], USDC_MINT[0], get_balance(SOL_MINT[0]) - 0.5 if previous == 0 else previous)
try:
step3.swap()
except:
print("Retrying")
time.sleep(10)
third(previous)
print("All Swaps Completed")
return None
payer = get_wallet_keypair(filename)
pubkey = payer.public_key
loops = 0
if __name__ == "__main__":
previousBalence = get_balance(USDC_MINT[0])
    print(f"Starting Balance: {previousBalence}")
#for loops in range(5):
first(0)
loops += 1
endBalance = get_balance((USDC_MINT[0]))
    print(f"End balance is {endBalance}")
totalProfit = endBalance-previousBalence
print(f"Total Profit is: {totalProfit}")
Edit: when the code keeps going, the output shows it repeatedly trying to swap fUSD for SOL and SOL for USDC, over and over again.
Solution: https://pastebin.com/8id7gfe4
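For what it's worth, the behaviour described in the edit is what you would expect when the retry/wait recursion falls through: once a nested call such as third(0) or first(count) returns, execution resumes in the caller and runs the remaining swap steps again. A tiny self-contained illustration of the pattern (not the pastebin solution; step_a and step_b are hypothetical stand-ins for your functions):
def step_b():
    print("step_b runs")

def step_a(count):
    if count >= 60:
        step_b()
        return                          # without this return, step_a carries on below after step_b finishes
    print("step_a keeps swapping")      # reached whenever we don't return above

step_a(60)  # prints only "step_b runs" because of the explicit return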
I've been successfully querying S3 via Athena from inside a Lambda function for quite some time, but it has suddenly stopped working. Further investigation shows that the response from get_query_execution() returns a state of 'QUEUED' (which I was led to believe is not used?!).
My code is as follows:
def run_query(query, database, s3_output, max_execution=5):
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={
'Database': database
},
ResultConfiguration={
'OutputLocation': s3_output
})
execution_id = response['QueryExecutionId']
print("QueryExecutionId = " + str(execution_id))
state = 'RUNNING'
while (max_execution > 0 and state in ['RUNNING']):
max_execution = max_execution - 1
print("maxexecution=" + str(max_execution))
response = client.get_query_execution(QueryExecutionId = execution_id)
if 'QueryExecution' in response and \
'Status' in response['QueryExecution'] and \
'State' in response['QueryExecution']['Status']:
state = response['QueryExecution']['Status']['State']
print(state)
if state == 'SUCCEEDED':
print("Query SUCCEEDED: {}".format(execution_id))
s3_key = 'athena_output/' + execution_id + '.csv'
print(s3_key)
local_filename = '/tmp/' + execution_id + '.csv'
print(local_filename)
rows = []
try:
print("s3key =" + s3_key)
print("localfilename = " + local_filename)
s3.Bucket(BUCKET).download_file(s3_key, local_filename)
with open(local_filename) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
rows.append(row)
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == "404":
print("The object does not exist.")
print(e)
else:
raise
return json.dumps(rows)
elif state == 'FAILED':
return False
time.sleep(10)
return False
So the query itself is obviously working as it should; it's just that the 'QUEUED' state is completely unexpected and I'm not sure what to do about it. What can cause the query execution to become 'QUEUED', and what needs to change in my code to cater for it?
Take a look at the Athena hook in Apache Airflow. Athena has final states (SUCCEEDED, FAILED and CANCELLED) and intermediate states (RUNNING and QUEUED). QUEUED is a normal state for a query before it gets started. So you could use code like this:
def run_query(query, database, s3_output, max_execution=5):
response = client.start_query_execution(
QueryString=query,
QueryExecutionContext={
'Database': database
},
ResultConfiguration={
'OutputLocation': s3_output
})
execution_id = response['QueryExecutionId']
print("QueryExecutionId = " + str(execution_id))
state = 'QUEUED'
while (max_execution > 0 and state in ['RUNNING', 'QUEUED']):
max_execution = max_execution - 1
print("maxexecution=" + str(max_execution))
response = client.get_query_execution(QueryExecutionId = execution_id)
if 'QueryExecution' in response and \
'Status' in response['QueryExecution'] and \
'State' in response['QueryExecution']['Status']:
state = response['QueryExecution']['Status']['State']
print(state)
if state == 'SUCCEEDED':
print("Query SUCCEEDED: {}".format(execution_id))
s3_key = 'athena_output/' + execution_id + '.csv'
print(s3_key)
local_filename = '/tmp/' + execution_id + '.csv'
print(local_filename)
rows = []
try:
print("s3key =" + s3_key)
print("localfilename = " + local_filename)
s3.Bucket(BUCKET).download_file(s3_key, local_filename)
with open(local_filename) as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
rows.append(row)
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == "404":
print("The object does not exist.")
print(e)
else:
raise
return json.dumps(rows)
elif state == 'FAILED' or state == 'CANCELLED':
return False
time.sleep(10)
return False
I got this response from AWS; there have been changes to Athena that caused this issue (although QUEUED has been in the state enum for some time, it hasn't been used until now):
The Athena team recently deployed a host of new functionality for Athena, including more granular CloudWatch metrics for Athena queries.
For more information:
AWS What's New page
Athena docs on CloudWatch metrics
As part of the deployment of more granular metrics, Athena now includes a QUEUED status for queries. This status indicates that an Athena query is waiting for resources to be allocated for processing. Query flow is roughly:
SUBMITTED -> QUEUED -> RUNNING -> COMPLETED/FAILED
Note that queries that fail due to system errors can be put back into the queue and retried.
I apologise for the frustration that this change has caused.
It seems like the forum formatting has stripped some elements from your code snippets.
However, I think that your while loop is checking against an array of the possible query statuses, which didn't previously cater for QUEUED.
If that is the case, then yes, adding QUEUED to that array will allow your application to handle the new status.
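Concretely, a minimal sketch of that change (the get_state callable here is just a stand-in for reading response['QueryExecution']['Status']['State'] from get_query_execution()):
import time

def wait_for_query(get_state, max_execution=5, poll_seconds=10):
    # QUEUED and RUNNING are in-flight; SUCCEEDED, FAILED and CANCELLED are terminal.
    state = 'QUEUED'
    while max_execution > 0 and state in ('QUEUED', 'RUNNING'):
        max_execution -= 1
        state = get_state()
        if state == 'SUCCEEDED':
            return True
        if state in ('FAILED', 'CANCELLED'):
            return False
        time.sleep(poll_seconds)
    return False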
I currently have a master Python script which launches 6 jobs on remote hosts and polls whether the jobs are done or not over a long period (days, usually). However, in my code below, the first element in the self.job_result list is always 'sh: 1: mv: not found'. The 6 job values are also always in that list, i.e. there are 7 elements when there should only be 6. It appears that rq.job.Job is returning this value; any idea why?
hosts = HOSTS.keys()
job_ids = []
for host in hosts:
r = requests.get(HOSTS[host] + 'launch_jobs', auth=('admin', 'secret'))
job_ids.append(r.text)
host_job_dict = dict(zip(hosts, job_ids))
print "HOST_JOB_DICT: %s " % host_job_dict
launch_time = datetime.datetime.now()
self.job_result = []
complete = False
status = [False]*len(hosts)
host_job_keys = host_job_dict.keys()
while not complete:
check_time = datetime.datetime.now()
time_diff = check_time - launch_time
if time_diff.seconds > JOB_TIMEOUT:
sys.exit('Job polling has lasted 10 days, something is wrong')
print "HOST_JOB_KEYS %s " % host_job_keys
for idx, key in enumerate(host_job_keys):
if not status[idx]:
host = HOSTS[key]
j_id = host_job_dict[key]
req = requests.get(host + 'check_job/' + j_id, auth=('admin', 'secret'))
if req.status_code == 202:
continue
elif req.status_code == 200:
self.job_result.append(req.json()['results'].encode('ascii').split())
status[idx] = True
complete = all(status)
time.sleep(1)
And on the server side of things...
@app.route("/check_job/<job_key>", methods=['GET'])
@requires_auth
def check_job(job_key):
job = Job.fetch(job_key, connection=conn)
if job.is_finished:
data = job.return_value
json_data = jsonify({"results": data})
# return Response(response=json_data, status=200, mimetype="application/json")
return json_data
elif job.status == 'failed':
return "Failed", 202
else:
return "Not yet", 202
This turned out to be an extremely convoluted issue where mv and other commands in /bin weren't being recognized. To get around it, we were explicit and used /bin/mv instead. We believe this issue cropped up as a result of a complication from the systemctl instantiation of the worker.
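For anyone hitting the same thing, here's a minimal sketch of the two workarounds (the file paths are placeholders, not from the original setup): either call the binary by its absolute path, or hand the child process an explicit PATH.
import os
import subprocess

# Option 1: be explicit about the binary's location, as we did with /bin/mv.
subprocess.check_call(["/bin/mv", "/tmp/example-src", "/tmp/example-dst"])

# Option 2: pass the child a known-good PATH, which helps when the worker was
# started by a service manager with a minimal environment.
env = dict(os.environ, PATH="/usr/local/bin:/usr/bin:/bin")
subprocess.check_call(["mv", "/tmp/example-src", "/tmp/example-dst"], env=env)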
I'm using the Bloomberg API for Python to get option data. First, I get all the symbols of the option chain, then I use them to get the bid and ask prices. Through the getOptionChain function there are more than 400 options, and I checked the result; it was fine. However, when I run the getPX function, I only get 10 results in the end. Could anyone help me look into this? Thanks in advance!
import blpapi
import pandas
import csv
options = blpapi.SessionOptions()
options.setServerHost('localhost')
options.setServerPort(8194)
SECURITY_DATA = blpapi.Name("securityData")
SECURITY = blpapi.Name("security")
FIELD_DATA = blpapi.Name("fieldData")
FIELD_ID = blpapi.Name("fieldId")
OPT_CHAIN = blpapi.Name("OPT_CHAIN")
SECURITY_DES = blpapi.Name("Security Description")
def getOptionChain (sec_list):
session = blpapi.Session(options)
session.start()
session.openService('//blp/refdata')
refDataService = session.getService("//blp/refdata")
request = refDataService.createRequest("ReferenceDataRequest")
for s in sec_list:
request.append("securities",s)
request.append("fields", "OPT_CHAIN")
cid = session.sendRequest(request)
try:
# Process received events
while(True):
# We provide timeout to give the chance to Ctrl+C handling:
ev = session.nextEvent(500)
response = []
for msg in ev:
if cid in msg.correlationIds():
securityDataArray = msg.getElement(SECURITY_DATA)
for securityData in securityDataArray.values():
fieldData = securityData.getElement(FIELD_DATA)
for field in fieldData.elements():
for n in range(field.numValues()):
fld = field.getValueAsElement(n)
response.append (fld.getElement(SECURITY_DES).getValueAsString())
# Response completely received, so we could exit
if ev.eventType() == blpapi.Event.RESPONSE:
break
finally:
# Stop the session
session.stop()
return response
def getPX (sec_list, fld_list):
opt_chain_list = getOptionChain(sec_list)
session = blpapi.Session(options)
session.start()
session.openService('//blp/refdata')
refDataService = session.getService("//blp/refdata")
request = refDataService.createRequest("ReferenceDataRequest")
for s in opt_chain_list:
request.append("securities",s)
for f in fld_list:
request.append("fields",f)
cid = session.sendRequest(request)
try:
# Process received events
while(True):
# We provide timeout to give the chance to Ctrl+C handling:
ev = session.nextEvent(500)
response = {}
for msg in ev:
if cid in msg.correlationIds():
securityDataArray = msg.getElement(SECURITY_DATA)
for securityData in securityDataArray.values():
secName = securityData.getElementAsString(SECURITY)
fieldData = securityData.getElement(FIELD_DATA)
response[secName] = {}
for field in fieldData.elements():
response[secName][field.name()] = field.getValueAsFloat()
# Response completely received, so we could exit
if ev.eventType() == blpapi.Event.RESPONSE:
break
finally:
# Stop the session
session.stop()
tempdict = {}
for r in response:
tempdict[r] = pandas.Series(response[r])
data = pandas.DataFrame(tempdict)
return data
sec = ["IBM US Equity"]
fld = ["PX_ASK","PX_BID"]
getPX(sec,fld)
It looks like you've got the "response = {}" in the wrong place.
Currently you're clearing it on each iteration of your loop, so each incoming event refills it.
If you move the "response = {}" to just before the "while(True):", each iteration will append to it rather than clearing and refilling it.
The same is true of the first function, but in that case the bulk data happens to come back in a single event. If you were requesting multiple securities you would see the same issue there too, since a single Bloomberg refdata (partial) response contains data for at most 10 securities.
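A small stand-alone sketch of that fix (using a plain list of fake partial responses instead of a Bloomberg session so it runs anywhere): the accumulator is created once, before the event loop, and every partial response adds to it.
def collect(partial_responses):
    response = {}                        # created once, before the loop
    for partial in partial_responses:    # stands in for `while True: ev = session.nextEvent(500)`
        for sec_name, fields in partial: # stands in for walking securityData / fieldData
            response.setdefault(sec_name, {}).update(fields)
    return response

# Two partial responses (each limited to 10 securities); both survive, instead of
# only the final batch as with the original placement of `response = {}`.
batches = [
    [("IBM US 12/15/23 C140 Equity", {"PX_BID": 1.00, "PX_ASK": 1.10})],
    [("IBM US 12/15/23 C145 Equity", {"PX_BID": 0.50, "PX_ASK": 0.60})],
]
print(collect(batches))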