I have been trying to optimize my code, which fetches data from a given IP address with pagination involved. I have tried applying multiprocessing/multithreading but haven't been able to implement them successfully.
My goal in using multiprocessing is to reduce the execution time. Please help me out with this.
has_pagination = True
session_obj = some_value  # like this <requests.sessions.Session object at 0x7fac952c4fa0>
headers = {'X-XSRF-TOKEN': token, 'Content-type': 'application/json', 'Accept': 'application/json', 'Cookie': jsessionid}
while has_pagination:
    url = f"https://{self.ip}/data/page?count=100&scrollId={scroll_id}"
    response = session_obj.get(url=url, headers=headers, verify=False)
    try:
        resp_json = response.json()
        data = resp_json['data']
        has_pagination = resp_json['pageInfo']['hasMoreData']
    except Exception as e:
        print(f'status code: {response.status_code}, {e}')
        # Logging out if session is established
        logout_url = f'https://{self.ip}/logout?nocache=123456'
        logout_resp = session_obj.get(url=logout_url, headers=headers, verify=False, allow_redirects=False)
        print(f'Logging out. Response code: {str(logout_resp.status_code)}')
        # Re-login
        print(f'2. Trying to re-login with connection details Token: {str(token)}. Jsessionid: {str(jsessionid)}. Headers: {str(headers)}.')
        login_result = self.login()
        if login_result is False:
            return False
        else:
            number_of_relogin += 1
            session_obj, token, jsessionid = login_result[0], login_result[1], login_result[2]
            headers = {'X-XSRF-TOKEN': token, 'Content-type': 'application/json', 'Accept': 'application/json', 'Cookie': jsessionid}
            logger.info(f'2. New session established with Token: {str(token)}. Jsessionid: {str(jsessionid)}. Headers: {str(headers)}')
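One note on parallelizing this: with scroll-based pagination, each response yields the scroll id for the next page, so the fetches themselves have to stay sequential; what can run concurrently is the handling of each page. A minimal sketch under that assumption, using a thread pool inside the same method as above (process_page is a hypothetical placeholder for whatever is done with each page):

from concurrent.futures import ThreadPoolExecutor

def process_page(data):
    ...  # hypothetical placeholder: parse/store one page of records

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = []
    has_pagination = True
    while has_pagination:
        url = f"https://{self.ip}/data/page?count=100&scrollId={scroll_id}"
        resp_json = session_obj.get(url=url, headers=headers, verify=False).json()
        # hand the page off to a worker thread and keep fetching the next one
        futures.append(pool.submit(process_page, resp_json['data']))
        # scroll_id presumably comes from the response; updating it is elided here, as in the original loop
        has_pagination = resp_json['pageInfo']['hasMoreData']
    for f in futures:
        f.result()  # surface any exceptions raised in the workers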
I want to extract the drug class using the RxNorm API, starting from an NDC code. My Python code is:
#!/usr/bin/python
# pip install simplejson
import os
import sys
import requests
import simplejson as json

def connectionCheck():
    url = 'http://rxnav.nlm.nih.gov/REST/version'
    header = {'Accept': 'application/json'}
    getCheck = requests.get(url, headers=header)
    if getCheck.status_code != requests.codes.ok:
        response = "RXNorm server response error. Response code: %s" % getCheck.status_code
    else:
        response = "Connection check complete. RXNorm online. Response code: %s" % getCheck.status_code
    return response

def rxNorm(ndc):
    # ndc value coming from master.py
    # ndc = [array of ndc values]
    if ndc[0] is None:
        return {"rxcui": "", "rxtty": "", "rxstring": ""}
    else:
        # if internet or request throws an error, print out to check connection and exit
        try:
            baseurl = 'http://rxnav.nlm.nih.gov/REST/'
            # Searching RXNorm API, Search by identifier to find RxNorm concepts
            # http://rxnav.nlm.nih.gov/REST/rxcui?idtype=NDC&id=0591-2234-10
            # Set url parameters for searching RXNorm for SETID
            ndcSearch = 'rxcui?idtype=NDC&id='
            # Search RXNorm API, Return all properties for a concept
            rxPropSearch = 'rxcui/'
            rxttySearch = '/property?propName=TTY'
            rxstringSearch = '/property?propName=RxNorm%20Name'
            # Request RXNorm API to return json
            header = {'Accept': 'application/json'}

            def getTTY(rxCUI):
                # Search RXNorm again using RXCUI to return RXTTY
                getTTY = requests.get(baseurl+rxPropSearch+rxCUI+rxttySearch, headers=header)
                ttyJSON = json.loads(getTTY.text, encoding="utf-8")
                return ttyJSON['propConceptGroup']['propConcept'][0]['propValue']

            def getSTRING(rxCUI):
                # Search RXNorm again using RXCUI to return RXSTRING
                getString = requests.get(baseurl+rxPropSearch+rxCUI+rxstringSearch, headers=header)
                stringJSON = json.loads(getString.text, encoding="utf-8")
                return stringJSON['propConceptGroup']['propConcept'][0]['propValue']

            # Search RXNorm using NDC code, return RXCUI id
            # ndc = [ndc1, ndc2, ... ]
            for item in ndc:
                getRXCUI = requests.get(baseurl+ndcSearch+item, headers=header)
                if getRXCUI.status_code != requests.codes.ok:
                    print("RXNorm server response error. Response code: %s" % getRXCUI.status_code)
                rxcuiJSON = json.loads(getRXCUI.text, encoding="utf-8")
                # Check if first value in list returns a RXCUI, if not go to next value
                try:
                    if rxcuiJSON['idGroup']['rxnormId']:
                        rxCUI = rxcuiJSON['idGroup']['rxnormId'][0]
                        rxTTY = getTTY(rxCUI)
                        rxSTRING = getSTRING(rxCUI)
                        return {"rxcui": rxCUI, "rxtty": rxTTY, "rxstring": rxSTRING}
                except KeyError:
                    # if last item, return null values
                    if item == ndc[-1]:
                        return {"rxcui": "", "rxtty": "", "rxstring": ""}
        except requests.exceptions.RequestException:
            sys.exit("RXNorm connection")
Test using a toy NDC code:

dataTest = rxNorm(['69238131109'])
print(dataTest)
which gave me the following output:
{'rxcui': '483448', 'rxtty': 'SCD', 'rxstring': 'pregabalin 50 MG Oral Capsule'}
Now I am interested in getting the drug class using the 'rxcui': '483448' info via the RxClass API. However, I couldn't make sense of that API. How can I use 'rxcui': '483448' here to get the desired drug class? I appreciate your time. Thanks!
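For reference, a minimal sketch of looking up classes by RXCUI with the RxClass API. The endpoint and response keys below follow the RxClass getClassByRxNormDrugId service, but verify them against the current documentation before relying on this:

import requests

def get_drug_classes(rxcui):
    # RxClass: find the classes that contain a given RxNorm concept.
    url = 'https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json'
    resp = requests.get(url, params={'rxcui': rxcui}, headers={'Accept': 'application/json'})
    resp.raise_for_status()
    data = resp.json()
    classes = []
    # Each entry pairs the drug with one class (ATC, EPC, MoA, ...).
    for info in data.get('rxclassDrugInfoList', {}).get('rxclassDrugInfo', []):
        concept = info['rxclassMinConceptItem']
        classes.append((concept['classType'], concept['className']))
    return classes

print(get_drug_classes('483448'))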
I am trying to build a class to automate requests to our supplier, but I am having trouble with pagination.
This is the working snippet so far, but the API limits each request to 1000 records, and if the endpoint has more, then I have to paginate:
response = requests.get(url, data=params, headers=headers).json()
return response
This is what I tried, which fails with a KeyError: 'next':
response = requests.get(url, data=params, headers=headers).json()
results = response['data']
while response['links']['next']:
    response = requests.get(response['links']['next'], data=params, headers=headers).json()
    results.extend(response['data'])
return results
You can check the basic structure of the response in the API doc.
Please enlighten me, thank you very much!
You could simply check

while "next" in response['links']:
    # ... code ...

But if other problems can occur, then putting it all in try/except can also be useful.
results = []  # empty list
try:
    response = requests.get(url, data=params, headers=headers)
    #print(response.status_code)  # for debug
    #print(response.text)  # for debug
    json_data = response.json()
    results += json_data['data']
    while "next" in json_data['links']:
        response = requests.get(json_data['links']['next'], data=params, headers=headers)
        #print(response.status_code)  # for debug
        #print(response.text)  # for debug
        json_data = response.json()
        results += json_data['data']
except Exception as ex:
    print("Exception:", ex)
return results
But I would reduce it to a single while True loop:
results = []  # empty list
try:
    while True:
        response = requests.get(url, data=params, headers=headers)
        #print(response.status_code)  # for debug
        #print(response.text)  # for debug
        json_data = response.json()
        results += json_data['data']
        if "next" not in json_data['links']:
            break
        url = json_data['links']['next']
except Exception as ex:
    print("Exception:", ex)
return results
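If you would rather process records as they arrive instead of collecting everything first, the same while True loop can be written as a generator. This is just a sketch of the identical logic (get_pages is a name I made up):

import requests

def get_pages(url, params, headers):
    # Yield each page's 'data' list until there is no 'next' link.
    while True:
        json_data = requests.get(url, data=params, headers=headers).json()
        yield json_data['data']
        if "next" not in json_data['links']:
            break
        url = json_data['links']['next']

results = [record for page in get_pages(url, params, headers) for record in page]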
I have a Python class that sends payloads to AWS with the boto3 and requests libraries. However, sometimes the HTTP requests fail with various status codes, so I wanted to write a wrapper function inside the class that retries sending the payload 5 times if it gets certain codes, and raises an exception if it completely fails. Here is the class method (assume the method calls work as expected):
import logging
import requests
from boto3 import Session

def update_status(self, status):
    payload = status
    auth = self.sign_request()
    response = requests.patch(self.url, auth=auth, data=payload)
    status_code = response.status_code
    response_text = response.text
    if not response.ok:
        logging.error("Failed updating status of request: " + str(
            {'host': self.host, 'region': self.region,
             'service': self.service, 'url': self.url, 'status': str(status)}))
        raise IOError('Update training status failed with status code: ' + str(status_code) + '\n' + response_text)
    logging.info("Updated status")
Sometimes this API call fails with status 504. I would like to write a wrapper retry method around this class method that by default retries 5 times, with a wait of retry^2 between tries, and exits the loop on a success with code 200.
I found this code, which seems to be along the lines of what I would use; I'm just not sure how to wrap my current method inside it and call it:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def requests_retry_session(
    retries=5,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
The issue with the above code is that it uses requests.Session and returns it, while my class is already using boto3.Session. Any help would be appreciated!
I'd try something like this:
import time
import requests
from functools import wraps
import logging

logging.basicConfig(level=logging.DEBUG)

def retry(delay=10, retries=4):
    def retry_decorator(f):
        @wraps(f)
        def f_retry(*args, **kwargs):
            opt_dict = {'retries': retries, 'delay': delay}
            while opt_dict['retries'] > 1:
                try:
                    return f(*args, **kwargs)
                except Exception as e:
                    msg = "Exception: {}, Retrying in {} seconds...".format(e, delay)
                    print(msg)
                    time.sleep(opt_dict['delay'])
                    opt_dict['retries'] -= 1
            return f(*args, **kwargs)
        return f_retry
    return retry_decorator
class YourClass:
    # JUST MOCK FOR PROOF OF CONCEPT
    url = 'YOUR URL'
    status = 'YOUR STATUS'

    def sign_request(self):
        return ''

    host = 'YOUR HOST'
    region = 'YOUR REGION'
    service = 'YOUR SERVICE'
    # MOCK END

    def update_status(self, status):
        payload = status
        auth = self.sign_request()

        @retry(1, 5)
        def get_status():
            response = requests.patch(self.url, auth=auth, data=payload)
            if not response.ok:
                logging.error("Failed updating status of request: " + str(
                    {'host': self.host, 'region': self.region,
                     'service': self.service, 'url': self.url, 'status': str(status)}))
                raise IOError('Update training status failed with status code: ' + str(response.status_code) + '\n' + response.text)
            return response

        res = get_status()
        status_code = res.status_code
        response_text = res.text
        logging.info("Updated status")

x = YourClass()
x.url = 'https://httpstat.us/200'
x.update_status('')
x.url = 'https://httpstat.us/504'
x.update_status('')
Of course you may want to adjust it to your needs.
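One point worth clearing up from the question: boto3.Session manages AWS credentials and is unrelated to requests.Session, which manages HTTP connections, so the two coexist without conflict. That means the Retry approach from the question can be used directly. Below is a minimal sketch of update_status rewritten that way, assuming the rest of the class is unchanged; note that urllib3's Retry only retries idempotent methods by default, so PATCH needs allowed_methods=False (older urllib3 versions call this parameter method_whitelist):

import logging
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def update_status(self, status):
    auth = self.sign_request()
    retry = Retry(
        total=5,
        backoff_factor=2,  # waits grow exponentially between attempts
        status_forcelist=(500, 502, 504),
        allowed_methods=False,  # retry all verbs, including PATCH
    )
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    response = session.patch(self.url, auth=auth, data=status)
    if not response.ok:
        raise IOError('Update training status failed with status code: '
                      + str(response.status_code) + '\n' + response.text)
    logging.info("Updated status")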
I need help writing a loop so that each GET request I make always fetches a new page from the API.
I start by getting the first response. It includes a parameter pointing to the next page, next_key:
{
    "result": [
        {
            ...,
            ...
        }
    ],
    "next_key": 123
}
Below is my current attempt
import requests
import json

url = "https://flespi.io/gw/channels/all/messages"
headers = {"Authorization": "FlespiToken 23ggh45"}

def getFirst():
    data = {"limit_count": 100, "limit_size": 10000}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    reqFirst = requests.get(url, params=params, headers=headers).json()
    return reqFirst["next_key"]  ## this returns "123"

def getDataNext():
    data = {"limit_count": 100, "limit_size": 10000, "curr_key": getFirst()}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    reqNext = requests.get(url, params=params, headers=headers)
    jsonData = reqNext.json()
    while True:
        if "next_key" in jsonData:
            data = {"limit_count": 100, "limit_size": 10000, "curr_key": jsonData["next_key"]}
            params = {"data": json.dumps(data, separators=(",", ":"))}
            req = requests.get(url, params=params, headers=headers).json()  ## this should do GET request for the third page and so on...
            print(req["next_key"])  # this returns "3321" which is the value for "next_key" in second page
        else:
            pass

getDataNext()
The full URL, including limit count, limit size, and curr key, is as follows: https://flespi.io/gw/channels/all/messages?data=%7B%22curr_key%22%123%2C%22limit_count%22%3A100%2C%22limit_size%22%3A10000%7D
As you can see, this only fetches the second page, the one referenced by jsonData["next_key"]. What I want is for the program to read the next_key from each response and put it in the next GET request.
I thought about incrementing the curr_key, but the key is random and I also do not know how many pages there are.
I believe there must be a simple solution for this, but apparently I could not think of it. Thank you for your help and suggestions.
Try this:
has_next_key = False
nextKey = ""
if "next_key" in jsonData:
    has_next_key = True
    nextKey = jsonData["next_key"]
while has_next_key:
    data = {"limit_count": 100, "limit_size": 10000, "curr_key": nextKey}
    params = {"data": json.dumps(data, separators=(",", ":"))}
    req = requests.get(url, params=params, headers=headers).json()  ## this should do GET request for the third page and so on...
    if "next_key" in req:
        nextKey = req["next_key"]
        print(nextKey)  # this returns "3321" which is the value for "next_key" in second page
    else:
        has_next_key = False
        # no next_key, stop the loop
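The same idea can also collect the records as it goes. A minimal sketch building on the answer above, assuming each page stores its records under "result" as in the sample response, and url/headers as defined in the question:

import json
import requests

def get_all_messages():
    results = []
    curr_key = None
    while True:
        data = {"limit_count": 100, "limit_size": 10000}
        if curr_key is not None:
            data["curr_key"] = curr_key
        params = {"data": json.dumps(data, separators=(",", ":"))}
        page = requests.get(url, params=params, headers=headers).json()
        results.extend(page["result"])
        if "next_key" not in page:
            break  # last page reached
        curr_key = page["next_key"]
    return results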
I'm trying to do some analytics on Instagram photos posted with a specified hashtag. So now I'm trying to store all the images in a temporary database that will be used for the analysis.
I'm using Python, and I have a Celery task to get all the images, but it does not work when I run it with a next_max_tag_id, which is probably wrong.
Does someone know how to get the correct next_max_tag_id?
This is the code I'm using:
@task()
def get_latest_photos():
    next_max_tag_id = get_option('next_max_tag_id')
    if not next_max_tag_id:
        next_max_tag_id = 0
    url = BASE + '/tags/{tag}/media/recent?client_id={client_id}' \
                 '&max_tag_id={max_id}'.format(**{
        'tag': a_tag,
        'client_id': getattr(settings, 'INSTAGRAM_CLIENT_ID'),
        'max_id': next_max_tag_id
    })
    while url:
        request = requests.get(url)
        if request.status_code != 200:
            pass  # TODO: error
        json_response = request.json()
        if json_response['meta']['code'] != 200:
            pass  # TODO: error
        # do something with json_response['data']:
        url = None
        if 'pagination' in json_response:
            pagination = json_response['pagination']
            if 'next_url' in pagination:
                url = json_response['pagination']['next_url']
            if 'next_max_tag_id' in pagination:
                next_max_tag_id = pagination['next_max_tag_id']
                update_option('next_max_tag_id', next_max_tag_id)
The flow is basically this:
get next_max_tag_id from the db (defaults to 0)
while we have a valid URL, fetch the data, the next URL, and the next_max_tag_id
update the next_max_tag_id
The only thing that seems wrong to me is the next_max_tag_id, because every time I hit the API URL with the last next_max_tag_id I get the old images again.
Yes. Here's how to use pagination correctly: you have to loop through the pages, referencing the function you're in. You can adapt the script below, which gets everyone you're following, and query for next_max_id as well.
# Python 2 (urllib2); auth_token, client_id, and headers are defined elsewhere.
import json
import urllib
import urllib2

currently_following = set([])

def parse_following(next_url=None):
    if next_url == None:
        urlUserMedia = "https://api.instagram.com/v1/users/self/follows?access_token=%s" % (auth_token)
    else:
        urlUserMedia = next_url
    values = {'client_id': client_id}
    try:
        data = urllib.urlencode(values)
        req = urllib2.Request(urlUserMedia, None, headers)
        response = urllib2.urlopen(req)
        result = response.read()
        dataObj = json.loads(result)
        next_url = None
        if dataObj.get('pagination') is not None:
            next_url = dataObj.get('pagination').get('next_url')
        currently_following.update(user['id'] for user in dataObj['data'])
        if next_url is not None:
            parse_following(next_url)
    except Exception as e:
        print(e)
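For what it's worth, the recursion can also be flattened into a loop. Here is a minimal sketch of the same pattern using the requests library; the endpoint and auth_token come from the answer above, and the exact response fields should be treated as assumptions, since the Instagram API has changed over the years:

import requests

currently_following = set()

def parse_following(auth_token):
    url = "https://api.instagram.com/v1/users/self/follows?access_token=%s" % auth_token
    while url:
        data_obj = requests.get(url).json()
        currently_following.update(user['id'] for user in data_obj['data'])
        # Keep following the pagination cursor until it disappears.
        url = (data_obj.get('pagination') or {}).get('next_url')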