I'm trying to do some analytics analysis on Instagram photos that are posted with a specified hashtag. So now I'm trying to store all the images in a temporary database that'll be used for the analysis.
I'm using python and I've a celery task to get all the images, but it is not working when I run with a next_max_tag_id, which is probably wrong.
Does someone know how to get the correct next_max_tag_id?
this is the code I'm using:
#task()
def get_latest_photos():
next_max_tag_id = get_option('next_max_tag_id')
if not next_max_tag_id:
next_max_tag_id = 0
url = BASE + '/tags/{tag}/media/recent?client_id={cliend_id}' \
'&max_tag_id={max_id}'.format(**{
'tag': a_tag,
'cliend_id': getattr(settings, 'INSTAGRAM_CLIENT_ID'),
'max_id': next_max_tag_id
})
while url:
request = requests.get(url)
if request.status_code != 200:
pass #TODO: error
json_response = request.json()
if json_response['meta']['code'] != 200:
pass #TODO: error
# do something with json_response['data']:
url = None
if json_response.has_key('pagination'):
pagination = json_response['pagination']
if pagination.has_key('next_url'):
url = json_response['pagination']['next_url']
if pagination.has_key('next_max_tag_id'):
next_max_tag_id = pagination['next_max_tag_id']
update_option('next_max_tag_id', next_max_tag_id)
The flow is basically this:
get next_max_tag_id from the db (defaults to 0)
while we have a valid URL it fetches the data, the next url and the next_max_tag_id
updates the next_max_tag_id
The only thing that seems wrong to me is the next_max_tag_id, because every time I go to the API URL with the last next_max_tag_id I get the old images.
Yes. Here's how to use pagination correctly. You have to loop through the pages and reference the function you're in. You can update the script below that gets everyone you're following and query for next_max_id as well.
currently_following = set([])
def parse_following(next_url=None):
if next_url == None:
urlUserMedia = "https://api.instagram.com/v1/users/self/follows?access_token=%s" % (auth_token)
else:
urlUserMedia = next_url
values = {
'client_id' : client_id}
try:
data = urllib.urlencode(values)
req = urllib2.Request(urlUserMedia,None,headers)
response = urllib2.urlopen(req)
result = response.read()
dataObj = json.loads(result)
next_url = None
if dataObj.get('pagination') is not None:
next_url = dataObj.get('pagination').get('next_url')
currently_following.update(user['id'] for user in dataObj['data'])
if next_url is not None:
parse_following(next_url)
except Exception as e:
print e
Related
my code is about send get request using query parameters which depends on a page number
After that o have to do for loop to get some ids from the response and also getting the next page number of the same response
and send a new get request with the new next page number that I got from the first response, and I need to get the ids also from the new response
My code works fine , but I’m using two loop which it’s not the right way I think? I couldn’t do it with one loop any ideas?
def get():
response = requests.get(url, headers=header)
data = response.text
data = json.loads(data)
check_if_theres_next_page = data['pagination']['hasMorePages']
check_for_next_page_number = data['pagination']['nextPage']
last_page_number = data['pagination']['lastPage']
orders = data['orders']
list_of_ids = []
for manufacturingOrderId in orders:
ids = manufacturingOrderId['manufacturingOrderId']
list_of_ids.append(ids)
if check_for_next_page_number == 4:
check_for_next_page_number = last_page_number
if check_if_theres_next_page:
url_ = url + '&page_number=' + str(check_for_next_page_number)
response = requests.get(url_, headers=header)
data = response.text
data = json.loads(data)
orders = data['orders']
for manufacturingOrderId_ in orders:
ids = manufacturingOrderId_['manufacturingOrderId']
list_of_ids.append(ids)
if "nextPage" in data['pagination']:
check_for_next_page_number = data['pagination']['nextPage']
else:
check_if_theres_next_page = False
return list_of_ids
I'm trying to get the destination of a bunch of t.co links from Twitter. I can get this for active links, but when they are 404 or dead links, the program dies. If I enter this into the browser, it shows me the destination URL.
Is there a way to do this in Python 3?
This is my existing code:
import requests
import pandas as pd
from requests.models import Response
# Loading my array of links
data = pd.read_json('tco-links.json')
links = pd.DataFrame(data)
output = []
session = requests.Session() # so connections are recycled
with open('output.json', 'w') as f:
for index, row in links.iterrows():
fullLink = 'http://' + row['link']
try:
response = session.head(fullLink, allow_redirects=True)
except:
# how I'm handling errors right now
response = Response()
response.url = 'Failed'
output.append({
'link': fullLink,
'id': row['id'],
'unshortened': response.url
})
for x in output:
f.write(json.dumps(x) + '\n')
I wanted to extract drug class using Rxnorm API (RxNorm API) using NDC code. My python codes are:
#!/usr/bin/python
#pip install simplejson
import os
import sys
import requests
import simplejson as json
def connectionCheck():
url = 'http://rxnav.nlm.nih.gov/REST/version'
header = {'Accept': 'application/json'}
getCheck = requests.get(url, headers=header)
if getCheck.status_code != requests.codes.ok:
response = "RXNorm server response error. Response code: %s" % getCheck.status_code
else:
response = "Connection check complete. RXNorm online. Response code: %s" % getCheck.status_code
return response
def rxNorm(ndc):
# ndc value coming from master.py
# ndc = [array of ndc values]
if ndc[0] is None:
return {"rxcui": "", "rxtty": "", "rxstring": ""}
else:
# if internet or request throws an error, print out to check connection and exit
try:
baseurl = 'http://rxnav.nlm.nih.gov/REST/'
# Searching RXNorm API, Search by identifier to find RxNorm concepts
# http://rxnav.nlm.nih.gov/REST/rxcui?idtype=NDC&id=0591-2234-10
# Set url parameters for searching RXNorm for SETID
ndcSearch = 'rxcui?idtype=NDC&id='
# Search RXNorm API, Return all properties for a concept
rxPropSearch = 'rxcui/'
rxttySearch = '/property?propName=TTY'
rxstringSearch = '/property?propName=RxNorm%20Name'
# Request RXNorm API to return json
header = {'Accept': 'application/json'}
def getTTY(rxCUI):
# Search RXNorm again using RXCUI to return RXTTY & RXSTRING
getTTY = requests.get(baseurl+rxPropSearch+rxCUI+rxttySearch, headers=header)
ttyJSON = json.loads(getTTY.text, encoding="utf-8")
return ttyJSON['propConceptGroup']['propConcept'][0]['propValue']
def getSTRING(rxCUI):
# Search RXNorm again using RXCUI to return RXTTY & RXSTRING
getString = requests.get(baseurl+rxPropSearch+rxCUI+rxstringSearch, headers=header)
stringJSON = json.loads(getString.text, encoding="utf-8")
return stringJSON['propConceptGroup']['propConcept'][0]['propValue']
# Search RXNorm using NDC code, return RXCUI id
# ndc = [ndc1, ndc2, ... ]
for item in ndc:
getRXCUI = requests.get(baseurl+ndcSearch+item, headers=header)
if getRXCUI.status_code != requests.codes.ok:
print ("RXNorm server response error. Response code: %s" % getRXCUI.status_code)
rxcuiJSON = json.loads(getRXCUI.text, encoding="utf-8")
# Check if first value in list returns a RXCUI, if not go to next value
try:
if rxcuiJSON['idGroup']['rxnormId']:
rxCUI = rxcuiJSON['idGroup']['rxnormId'][0]
rxTTY = getTTY(rxCUI)
rxSTRING = getSTRING(rxCUI)
return {"rxcui": rxCUI, "rxtty": rxTTY, "rxstring": rxSTRING}
except:
# if last item return null values
if item == ndc[-1]:
return {"rxcui": "", "rxtty": "", "rxstring": ""}
pass
except:
sys.exit("RXNorm connection")
Test using Toy NDC ID Code:
dataTest=rxNorm(['69238131109'])
print(dataTest)
which gave me the following output:
{'rxcui': '483448', 'rxtty': 'SCD', 'rxstring': 'pregabalin 50 MG Oral Capsule'}
Now I am interested to get the drug class using 'rxcui': '483448' info using RxClass API. However, I couldn't make sense of this API. How can I use 'rxcui': '483448' info here to get the desired drug class. I appreciate your time. Thanks!
So I'm new to coding in general, but for my first project I'm trying to create a monitor to monitor product changes to a Shopify site.
My method was grab publicly shared code online and work backwards from there to understand it, so I've got the following code in a wider class which seems to take the products.json by looping through the pages.
But when I load up https://www.hanon-shop.com/collections/all/products.json but then print my Items list below, the first few products are different, how does that make sense?
def scrape_site(self):
"""
Scrapes the specified Shopify site and adds items to array
:return: None
"""
self.items = []
s = rq.Session()
page = 1
while page > 0:
try:
html = s.get(self.url + '?page=' + str(page) + '&limit=250', headers=self.headers, proxies=self.proxy, verify=False, timeout=20)
output = json.loads(html.text)['products']
if output == []:
page = 0
else:
for product in output:
product_item = [{'title': product['title'], 'image': product['images'][0]['src'], 'handle': product['handle'], 'variants':product['variants']}]
self.items.append(product_item)
logging.info(msg='Successfully scraped site')
page += 1
except Exception as e:
logging.error(e)
page = 0
time.sleep(0.5)
s.close()
Requests takes a dict of parameters and also has a json method, so this can be much cleaner.
import time
import requests
def scrape_site(self):
self.items = []
page = 1
with requests.Session() as s:
while True:
params = {
'page': page,
'limit': 250
}
try:
r = s.get(self.url, params=params, headers=self.headers, proxies=self.proxy, verify=False, timeout=20)
r.raise_for_status()
output = r.json()
if not output:
break
for product in output['products']:
product_item = {
'title': product['title'],
'image': product['images'][0]['src'],
'handle': product['handle'],
'variants':product['variants']
}
self.items.append(product_item)
logging.info(f'Successfully scraped page {page}')
page += 1
time.sleep(1)
except Exception as e:
logging.error(e)
break
return self.items
I am want to extract all Wikipedia titles via API.Each response contains continue key which is used to get next logical batch,but after 30 requests continue key starts to repeat it mean I am receiving same pages.
I have tried the following code above and Wikipedia documentation
https://www.mediawiki.org/wiki/API:Allpages
def get_response(self, url):
resp = requests.get(url=url)
return resp.json()
appcontinue = []
url = 'https://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=500'
json_resp = self.get_response(url)
next_batch = json_resp["continue"]["apcontinue"]
url +='&apcontinue=' + next_batch
appcontinue.append(next_batch)
while True:
json_resp = self.get_response(url)
url = url.replace(next_batch, json_resp["continue"]["apcontinue"])
next_batch = json_resp["continue"]["apcontinue"]
appcontinue.append(next_batch)
I am expecting to receive more than 10000 unique continue keys as one response could contains max 500 Titles.
Wikipedia has 5,673,237 articles in English.
Actual response. I did more than 600 requests and there is only 30 unique continue keys.
json_resp["continue"] contains two pairs of values, one is apcontinue and the other is continue. You should add them both to your query. See https://www.mediawiki.org/wiki/API:Query#Continuing_queries for more details.
Also, I think it'll be easier to use the params parameter of request.get instead of manually replacing the continue values. Perhaps something like this:
import requests
def get_response(url, params):
resp = requests.get(url, params)
return resp.json()
url = 'https://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=500'
params = {}
while True:
json_resp = get_response(url, params)
params = json_resp["continue"]
...