I'm using the Google APIs Python client to download some data from Google Analytics. I basically copied one of their examples and modified it to do exactly what I need.
I took this piece of code from the examples:
request = service.data().ga().get(
    ids=ids,
    start_date=str(start_date),
    end_date=str(end_date),
    dimensions=','.join(dimensions),
    filters=filters,
    sort="ga:date",
    metrics=','.join(metrics)
)
Then I add it to the batch object and execute the batch once it has collected 10 requests. This all works well, but the problem is that some of those requests return a nextLink. Now I could just create a new request object (with the above code) with a different start-index, but isn't there a better way?
Is there a way to just parse the nextLink into a new request object?
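Something like this rough sketch is what I have in mind (the helper is hypothetical; I'm assuming the nextLink carries the next page's start-index as a query parameter):
from urllib.parse import urlparse, parse_qs

def next_start_index(next_link):
    # Pull the start-index query parameter out of the nextLink URL so it
    # can be fed back into the same params for the next request.
    query = parse_qs(urlparse(next_link).query)
    return int(query['start-index'][0])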
I'm using this approach:
firstRun = True
params = {'ids': 'ga:00000001',
          'start_date': '2013-07-01',
          'end_date': '2013-07-31',
          'metrics': 'ga:visits',
          'dimensions': 'ga:source',
          'sort': '-ga:visits',
          'start_index': 1,
          'max_results': 10000}

while firstRun == True or result.get('nextLink'):
    if firstRun == False:
        params['start_index'] = int(params['start_index']) + int(params['max_results'])
    result = service.data().ga().get(**params).execute()
    firstRun = False
I could not find a way to parse the nextLink object into a new request, but this was my solution and it works fine:
max_results = 10000
params = {
    'ids': 'ga:' + profile_id,
    'start_date': start_date,
    'end_date': end_date,
    'metrics': ','.join(metrics),
    'dimensions': ','.join(dimensions),
    'start_index': 1,
    'max_results': max_results
}

has_more = True
while has_more:
    results = service.data().ga().get(**params).execute()
    # do something with results
    params['start_index'] = int(params['start_index']) + int(params['max_results'])
    has_more = results.get('nextLink')
Why can't we just do:
params = {'ids': profile_id,
          'start_date': start_date,
          'end_date': end_date,
          'metrics': metrics,
          'dimensions': dimensions,
          'sort': sort,
          'start_index': 1,
          'max_results': 1}

dummy_call = service.data().ga().get(**params).execute()  # just to find out the totalResults number
params['max_results'] = dummy_call[u'totalResults']  # replace max_results with the totalResults number
all_rows = service.data().ga().get(**params).execute()[u'rows']
???
I'm trying to get the latest data from the BitMEX API.
Base URI: https://www.bitmex.com/api/v1
I don't really understand how to get the latest data (from today) using filters: https://www.bitmex.com/app/restAPI
Here is my code:
from datetime import date
import requests
import json
import pandas as pd

today = date.today()
d1 = today.strftime("%Y-%m-%d")
# print("d1 =", d1)

def parser():
    today = date.today()
    # yy/dd/mm
    d1 = today.strftime("%Y-%m-%d")
    # print("d1 =", d1)
    return f'https://www.bitmex.com/api/v1/trade?symbol=.BVOL24H&startTime={d1}&timestamp.time=12:00:00.000&columns=price'

# Making a get request
response = requests.get(parser()).json()
# print(response)

for elem in response:
    print(elem)
and the response is:
...
{'symbol': '.BVOL24H', 'timestamp': '2021-12-27T08:05:00.000Z', 'price': 2.02}
{'symbol': '.BVOL24H', 'timestamp': '2021-12-27T08:10:00.000Z', 'price': 2.02}
{'symbol': '.BVOL24H', 'timestamp': '2021-12-27T08:15:00.000Z', 'price': 2.02}
It's missing a few hours. I tried using endTime, startTime and count without success.
I think I need to pass another filter like endTime = now and timestamp.time = now, but I don't know how to send a payload or how to URL-encode it.
As the Filtering section of the API docs explains, many table endpoints take a filter parameter, which is expected to be JSON.
These parameters are not keys in the query string, but keys in a dictionary passed under the filter key:
url = "https://www.bitmex.com/api/v1/trade"
filters = {
'startTime': date(2021, 12, 20).strftime("%Y-%m-%d"),
'timestamp.time': '12:00:00.000'
}
params = {
'symbol': '.BVOL24H',
'filter': json.dumps(filters),
}
response = requests.get(url, params=params)
for elem in response.json():
print(elem)
Example
/trade?symbol=.BVOL24H&filter={%22startTime%22:%222021-12-20%22,%22timestamp.time%22:%2212:00:00.000%22}
You can add additional parameters to the URL with & like below.
'https://www.bitmex.com/api/v1/trade?symbol=.BVOL24H&startTime={d1}&timestamp.time=12:00:00.000&columns=price&endTime={date.today()}&timestamp.time={date.today()}'
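The same URL can also be built with a requests params dict instead of hand-encoding it. This is only a sketch of that idea: the endTime value is an assumption for "up to today", and I haven't verified exactly which combination of top-level parameters and filter keys BitMEX honors.
from datetime import date
import json
import requests

url = "https://www.bitmex.com/api/v1/trade"
params = {
    'symbol': '.BVOL24H',
    'startTime': date.today().strftime("%Y-%m-%d"),
    'endTime': date.today().strftime("%Y-%m-%d"),   # assumed upper bound for "today"
    'columns': 'price',
    'filter': json.dumps({'timestamp.time': '12:00:00.000'}),
}

response = requests.get(url, params=params)  # requests URL-encodes everything for us
for elem in response.json():
    print(elem)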
I have successfully written code that calls an API and then converts the results into a DataFrame.
wax_wallet = "zqsfm.wam"
# Get Assets from AtomicHub API
response1 = requests.get(
"https://wax.api.atomicassets.io/atomicassets/v1/assets?"
f"owner={wax_wallet}"
"&collection_whitelist=nftdraft2121"
"&page=1"
"&limit=1000"
"&order=asc"
"&sort=name")
# Save Response as JSON
json_assets = response1.json()
# Convert JSON to DataFrame
df = pd.json_normalize(json_assets['data'])
This API returns at most 1000 items per page so I need to have it loop through as many pages as needed and ultimately get the results stored into a DataFrame.
I attempted to solve it with the below code, but was unsuccessful.
asset_count = 2500
pages = int(math.ceil(asset_count / 1000))

# Get Assets from AtomicHub API
all_assets = []
for page in range(1, pages):
    url = f'https://wax.api.atomicassets.io/atomicassets/v1/assets?owner={wax_wallet}' \
          f'&collection_whitelist=nftdraft2121&page={page}&limit=1000&order=asc&sort=name'
    response = rq.get(url)
    all_assets.append(json.loads(response.text))["response"]
Thanks in advance for any help!
You can turn them into dataframes and then concatenate the individual frames into a final result:
def get_page(page_num):
    wax_wallet = "zqsfm.wam"
    response = requests.get(
        "https://wax.api.atomicassets.io/atomicassets/v1/assets",
        params={
            "owner": wax_wallet,
            "collection_whitelist": "nftdraft2121",
            "page": page_num,
            "limit": "1000",
            "order": "asc",
            "sort": "name"
        }
    )
    json_assets = response.json()
    return pd.json_normalize(json_assets['data'])

# The number of pages you want
number_of_pages_requested = 10

# Get all pages as dataframes
pages = [get_page(n + 1) for n in range(number_of_pages_requested)]

# Combine pages to single dataframe
df = pd.concat(pages)
Edit: updated using params based on Olvin Roght's comment
Edit 2: fixed indexing error
I think this should help:
import requests

all_assets = []

URL = 'https://wax.api.atomicassets.io/atomicassets/v1/assets'
params = {
    'owner': 'zqsfm.wam',
    'collection_whitelist': 'nftdraft2121',
    'page': 1,
    'order': 'asc',
    'sort': 'name',
    'limit': 1000
}

with requests.Session() as session:
    while True:
        print(f"Getting page {params['page']}")
        response = session.get(URL, params=params)
        response.raise_for_status()
        _j = response.json()
        data = _j['data']
        if len(data) > 0:
            all_assets.append(data)
            params['page'] += 1
        else:
            break

print('Done')
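Since the original goal was a DataFrame, the pages collected in all_assets can then be flattened into one frame; a small sketch building on the loop above:
import pandas as pd

# all_assets is a list of pages, each page being the list of asset dicts
# returned under 'data'; flatten it and normalize into a single DataFrame.
df = pd.json_normalize([asset for page in all_assets for asset in page])
print(df.shape)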
I have the below code, and want to get it to return a dataframe properly. The polling logic works, but the dataframe doesn't seem to get created/returned. Right now it just returns None when called.
import boto3
import pandas as pd
import io
import re
import time

AK = 'mykey'
SAK = 'mysecret'

params = {
    'region': 'us-west-2',
    'database': 'default',
    'bucket': 'my-bucket',
    'path': 'dailyreport',
    'query': 'SELECT * FROM v_daily_report LIMIT 100'
}

session = boto3.Session(aws_access_key_id=AK, aws_secret_access_key=SAK)
def athena_query(client, params):
    response = client.start_query_execution(
        QueryString=params["query"],
        QueryExecutionContext={
            'Database': params['database']
        },
        ResultConfiguration={
            'OutputLocation': 's3://' + params['bucket'] + '/' + params['path']
        }
    )
    return response

def athena_to_s3(session, params, max_execution=5):
    client = session.client('athena', region_name=params["region"])
    execution = athena_query(client, params)
    execution_id = execution['QueryExecutionId']
    df = poll_status(execution_id, client)
    return df
def poll_status(_id, client):
    '''
    poll query status
    '''
    result = client.get_query_execution(
        QueryExecutionId=_id
    )
    state = result['QueryExecution']['Status']['State']
    if state == 'SUCCEEDED':
        print(state)
        print(str(result))
        s3_key = 's3://' + params['bucket'] + '/' + params['path'] + '/' + _id + '.csv'
        print(s3_key)
        df = pd.read_csv(s3_key)
        return df
    elif state == 'QUEUED':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'RUNNING':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'FAILED':
        return result
    else:
        print(state)
        raise Exception
df_data = athena_to_s3(session, params)
print(df_data)
I plan to move the dataframe load out of the polling function, but just trying to get it to work as is right now.
I recommend you take a look at AWS Wrangler instead of using the traditional boto3 Athena API. It is a newer and more specific interface to all things data in AWS, including queries to Athena, and it offers more functionality.
import awswrangler as wr

df = wr.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
Thanks to RagePwn's comment, it is also worth checking PyAthena as an alternative to the boto3 option for querying Athena.
If it is returning None, then it is because state == 'FAILED'. You need to investigate the reason it failed, which may be in 'StateChangeReason'.
{
    'QueryExecution': {
        'QueryExecutionId': 'string',
        'Query': 'string',
        'StatementType': 'DDL'|'DML'|'UTILITY',
        'ResultConfiguration': {
            'OutputLocation': 'string',
            'EncryptionConfiguration': {
                'EncryptionOption': 'SSE_S3'|'SSE_KMS'|'CSE_KMS',
                'KmsKey': 'string'
            }
        },
        'QueryExecutionContext': {
            'Database': 'string'
        },
        'Status': {
            'State': 'QUEUED'|'RUNNING'|'SUCCEEDED'|'FAILED'|'CANCELLED',
            'StateChangeReason': 'string',
            'SubmissionDateTime': datetime(2015, 1, 1),
            'CompletionDateTime': datetime(2015, 1, 1)
        },
        'Statistics': {
            'EngineExecutionTimeInMillis': 123,
            'DataScannedInBytes': 123,
            'DataManifestLocation': 'string',
            'TotalExecutionTimeInMillis': 123,
            'QueryQueueTimeInMillis': 123,
            'QueryPlanningTimeInMillis': 123,
            'ServiceProcessingTimeInMillis': 123
        },
        'WorkGroup': 'string'
    }
}
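For example, a minimal check along these lines (assuming result is the dict returned by get_query_execution inside poll_status) would surface that reason:
# Sketch: inside the FAILED branch of poll_status, print why the query failed.
status = result['QueryExecution']['Status']
if status['State'] == 'FAILED':
    print('Query failed:', status.get('StateChangeReason', 'no reason given'))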
Just to elaborate on RagePwn's answer of using PyAthena: that's what I ultimately did as well. For some reason AWS Wrangler choked on me and couldn't handle the JSON that was being returned from S3. Here's the code snippet that worked for me, based on PyAthena's PyPI page:
import os
from pyathena import connect
from pyathena.util import as_pandas

aws_access_key_id = os.getenv('ATHENA_ACCESS_KEY')
aws_secret_access_key = os.getenv('ATHENA_SECRET_KEY')
region_name = os.getenv('ATHENA_REGION_NAME')
staging_bucket_dir = os.getenv('ATHENA_STAGING_BUCKET')

cursor = connect(aws_access_key_id=aws_access_key_id,
                 aws_secret_access_key=aws_secret_access_key,
                 region_name=region_name,
                 s3_staging_dir=staging_bucket_dir,
                 ).cursor()
cursor.execute(sql)
df = as_pandas(cursor)
The above assumes you have defined as environment variables the following:
ATHENA_ACCESS_KEY: the AWS access key id for your AWS account
ATHENA_SECRET_KEY: the AWS secret key
ATHENA_REGION_NAME: the AWS region name
ATHENA_STAGING_BUCKET: a bucket in the same account that has the correct access settings (explanation of which is outside the scope of this answer)
I am writing an API request that gives paginated results.
To get results from the next page I need to take a value of 'next_page_cursor' and put it in the parameters of my request that is a dictionary.
This is what I have tried so far. I need to keep changing the cursor value in params until there are no more pages.
params = {'title': 'Cybertruck',
          'per_page': 100,
          'cursor': '*'
          }

response = requests.get("https://api.aylien.com/news/stories",
                        headers=headers, params=params).json()

if "next_page_cursor" in response:
    cursor = response["next_page_cursor"]
You can use a while loop:
params = {
    "title": "Cybertruck",
    "per_page": 100,
    "cursor": "initial_cursor"
}

def make_request(params):
    return requests.get("https://api.aylien.com/news/stories",
                        headers=headers, params=params).json()

result = []
response = make_request(params)
while "next_page_cursor" in response:
    params["cursor"] = response["next_page_cursor"]
    response = make_request(params)
    result.append(response["information_your_are_interested_in"])
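Note that, as written, the loop above only collects data from the second response onward; a variant that also keeps the first page could look like this (the field name is the same placeholder used above, adjust it to the part of the response you need):
result = []
response = make_request(params)
# Keep the first page's data as well
result.append(response["information_your_are_interested_in"])
while "next_page_cursor" in response:
    params["cursor"] = response["next_page_cursor"]
    response = make_request(params)
    result.append(response["information_your_are_interested_in"])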
I have followed a tutorial on how to download data from Google Analytics with Python using the GA Reporting API. I was able to query the data I wanted, although I hit the row limit.
I saw in the documentation that there is a pageToken to avoid the issue. I have added this field to my request (as described in the documentation), but I am not able to make it work.
sample_request = {
    'viewId': '12345678',
    'dateRanges': {
        'startDate': datetime.strftime(datetime.now() - timedelta(days=30), '%Y-%m-%d'),
        'endDate': datetime.strftime(datetime.now(), '%Y-%m-%d')
    },
    'dimensions': [
        {'name': 'ga:date'},
        {'name': 'ga:dimension7'},
        {'name': 'ga:dimension6'},
        {'name': 'ga:dimension9'}
    ],
    'metrics': [
        {'expression': 'ga:users'},
        {'expression': 'ga:totalevents'}
    ],
    'pageSize': 100000,
    'pageToken': 'abc'
}

response = api_client.reports().batchGet(
    body={
        'reportRequests': sample_request
    }).execute()
You will hit the limit, but the parameter nextPageToken will allow you to page through multiple rows. For example:
def processReport(self, aDimensions):
    """Get a full report, returning the rows"""
    # Get the first set
    oReport = self.getReport(aDimensions)
    oResponse = self.getResponse(oReport, True)
    aRows = oResponse.get('rows')

    # Add any additional sets
    while oResponse.get('nextPageToken') != None:
        oResponse = self.getReport(aDimensions, oResponse.get('nextPageToken'))
        oResponse = self.getResponse(oResponse, False)
        aRows.extend(oResponse.get('rows'))

    return aRows
You can see the complete program here:
https://github.com/aiqui/ga-download
I solved it like this:
def handle_report(analytics, pagetoken, rows):
    response = get_report(analytics, pagetoken)
    columnHeader = response.get("reports")[0].get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    pagetoken = response.get("reports")[0].get('nextPageToken', None)
    rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
    rows = rows + rowsNew
    print("len(rows): " + str(len(rows)))
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
    else:
        return rows

def main():
    analytics = initialize_analyticsreporting()
    global dfanalytics
    dfanalytics = []
    rows = []
    rows = handle_report(analytics, '0', rows)
    dfanalytics = pd.DataFrame(list(rows))
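If you also want named columns instead of the raw row dicts, the headers that handle_report already reads can be used to flatten each row. This is just a sketch under the assumption that you return (or recompute) the dimension and metric names from the columnHeader, e.g. [h['name'] for h in metricHeaders]:
import pandas as pd

def rows_to_frame(rows, dimension_names, metric_names):
    # Each GA v4 row has 'dimensions' (list of strings) and 'metrics'
    # (a list with one dict of 'values' for a single date range).
    records = []
    for row in rows:
        record = dict(zip(dimension_names, row.get('dimensions', [])))
        values = row.get('metrics', [{}])[0].get('values', [])
        record.update(dict(zip(metric_names, values)))
        records.append(record)
    return pd.DataFrame(records)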