Use of pageToken with Google Analytics Reporting API v4 and Python

I have followed a tutorial on how to download data from Google Analytics with Python using the GA Reporting API. I was able to query the data I wanted, but I hit the row limit.
I saw in the documentation that there is a pageToken field to get around this. I have added the field to my request (as described in the documentation), but I am not able to make it work.
sample_request = {
    'viewId': '12345678',
    'dateRanges': {
        'startDate': datetime.strftime(datetime.now() - timedelta(days=30), '%Y-%m-%d'),
        'endDate': datetime.strftime(datetime.now(), '%Y-%m-%d')
    },
    'dimensions': [
        {'name': 'ga:date'},
        {'name': 'ga:dimension7'},
        {'name': 'ga:dimension6'},
        {'name': 'ga:dimension9'}
    ],
    'metrics': [
        {'expression': 'ga:users'},
        {'expression': 'ga:totalevents'}
    ],
    'pageSize': 100000,
    'pageToken': 'abc'
}

response = api_client.reports().batchGet(
    body={
        'reportRequests': sample_request
    }).execute()
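For reference, the v4 API expects both reportRequests and dateRanges to be lists, and pageToken should carry the nextPageToken from the previous response rather than a placeholder like 'abc' (the first call simply omits it). A minimal sketch of a corrected request, assuming api_client is the authorized analyticsreporting service from the question and using relative dates for brevity:

# Sketch: corrected request body. The first call omits pageToken;
# subsequent calls pass the previous response's nextPageToken.
sample_request = {
    'viewId': '12345678',
    'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
    'dimensions': [{'name': 'ga:date'}],
    'metrics': [{'expression': 'ga:users'}],
    'pageSize': 100000
}

response = api_client.reports().batchGet(
    body={'reportRequests': [sample_request]}).execute()

next_token = response['reports'][0].get('nextPageToken')
if next_token:
    sample_request['pageToken'] = next_token  # request the next page with the same body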

You will hit the limit, but the nextPageToken parameter allows you to page through all of the rows. For example:
def processReport(self, aDimensions):
    """Get a full report, returning the rows"""
    # Get the first set
    oReport = self.getReport(aDimensions)
    oResponse = self.getResponse(oReport, True)
    aRows = oResponse.get('rows')

    # Add any additional sets
    while oResponse.get('nextPageToken') != None:
        oResponse = self.getReport(aDimensions, oResponse.get('nextPageToken'))
        oResponse = self.getResponse(oResponse, False)
        aRows.extend(oResponse.get('rows'))

    return aRows
You can see the complete program here:
https://github.com/aiqui/ga-download

I solved it like this
def handle_report(analytics, pagetoken, rows):
    response = get_report(analytics, pagetoken)
    columnHeader = response.get("reports")[0].get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    pagetoken = response.get("reports")[0].get('nextPageToken', None)
    rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
    rows = rows + rowsNew
    print("len(rows): " + str(len(rows)))
    if pagetoken != None:
        return handle_report(analytics, pagetoken, rows)
    else:
        return rows

def main():
    analytics = initialize_analyticsreporting()
    global dfanalytics
    dfanalytics = []
    rows = []
    rows = handle_report(analytics, '0', rows)
    dfanalytics = pd.DataFrame(list(rows))
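The get_report and initialize_analyticsreporting helpers are not shown in this answer; a minimal sketch of what get_report might look like, assuming a standard batchGet request where only pageToken changes between calls (VIEW_ID and the metric/dimension names are placeholders):

def get_report(analytics, pagetoken):
    # Sketch: identical request on every call; only pageToken changes.
    return analytics.reports().batchGet(
        body={
            'reportRequests': [{
                'viewId': VIEW_ID,
                'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
                'metrics': [{'expression': 'ga:users'}],
                'dimensions': [{'name': 'ga:date'}],
                'pageSize': 100000,
                'pageToken': pagetoken
            }]
        }).execute()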

Related

How to show specific page views using Google Analytics API?

For an API that I am using, we need to be able to see which specific pages are being clicked on and output that data to a CSV file. I am able to see the average session duration and the number of page views. What do I need to add to my code below to make exporting to a CSV file possible? Thank you!
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = 'client_secrets.json'
VIEW_ID = 'insert here'

credentials = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION, SCOPES)

# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)

response = analytics.reports().batchGet(body={
    'reportRequests': [{
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
        'metrics': [
            {"expression": "ga:pageviews"},
            {"expression": "ga:avgSessionDuration"}
        ],
        "dimensions": [
            {"name": "ga:deviceCategory"}
        ]
    }]}).execute()
response
{'reports': [{'columnHeader': {'dimensions': ['ga:deviceCategory'],
                               'metricHeader': {'metricHeaderEntries': [
                                   {'name': 'ga:pageviews', 'type': 'INTEGER'},
                                   {'name': 'ga:avgSessionDuration', 'type': 'TIME'}]}},
              'data': {'isDataGolden': True,
                       'maximums': [{'values': ['485', '94.95454545454545']}],
                       'minimums': [{'values': ['29', '51.21186440677966']}],
                       'rowCount': 3,
                       'rows': [{'dimensions': ['desktop'],
                                 'metrics': [{'values': ['485', '51.21186440677966']}]},
                                {'dimensions': ['mobile'],
                                 'metrics': [{'values': ['409', '69.30859375']}]},
                                {'dimensions': ['tablet'],
                                 'metrics': [{'values': ['29', '94.95454545454545']}]}],
                       'totals': [{'values': ['923', '60.06487341772152']}]}}]}
import pandas as pd

df = pd.DataFrame(columns=['Name', 'Age'])

def ga_response_dataframe(response):
    row_list = []
    # Get each collected report
    for report in response.get('reports', []):
        # Set column headers
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        # Get each row in the report
        for row in report.get('data', {}).get('rows', []):
            # create dict for each row
            row_dict = {}
            dimensions = row.get('dimensions', [])
            date_range_values = row.get('metrics', [])
            # Fill dict with dimension header (key) and dimension value (value)
            for header, dimension in zip(dimension_headers, dimensions):
                row_dict[header] = dimension
            # Fill dict with metric header (key) and metric value (value)
            for i, values in enumerate(date_range_values):
                for metric, value in zip(metric_headers, values.get('values')):
                    # Set int as int, float as float
                    if ',' in value or '.' in value:
                        row_dict[metric.get('name')] = float(value)
                    else:
                        row_dict[metric.get('name')] = int(value)
            row_list.append(row_dict)
    df = row_list
    return df

df = ga_response_dataframe(response)
#df = pd.DataFrame(row_list)
print(df)
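To get page-level rows and write them to a CSV, one possible approach (a sketch, not a verified answer) is to add ga:pagePath as a dimension in the request above, wrap the list returned by ga_response_dataframe in a DataFrame, and call to_csv; the output file name below is arbitrary:

# Sketch: page-level report exported to CSV. ga:pagePath is a standard
# dimension; 'pageviews_by_page.csv' is an arbitrary output name.
response = analytics.reports().batchGet(body={
    'reportRequests': [{
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
        'metrics': [
            {'expression': 'ga:pageviews'},
            {'expression': 'ga:avgSessionDuration'}
        ],
        'dimensions': [
            {'name': 'ga:pagePath'}
        ]
    }]}).execute()

rows = ga_response_dataframe(response)           # returns a list of dicts
df = pd.DataFrame(rows)                          # build the DataFrame explicitly
df.to_csv('pageviews_by_page.csv', index=False)  # export for further use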

How can I transform the json output from Google Analytics into a Pandas dataframe in Python?

TLDR: How can I transform the json output from Google Analytics into a Pandas dataframe in Python?
I'm using the official documentation to get results out of Google Analytics. This works, but I would like to have the output in a pandas dataframe for further data analysis instead of in json format.
Remarks:
When you look at the code below, you'll see that I've commented out '''print(response)''' because that outputs the results in json format.
I've used the code from this article to write the pandas part.
I've searched extensively on Stackoverflow to find a solution but to no avail.
Thank you in advance.
def get_report(analytics):
    """Queries the Analytics Reporting API V4.
    Args:
        analytics: An authorized Analytics Reporting API V4 service object.
    Returns:
        The Analytics Reporting API V4 response.
    """
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'today'}],
                    'metrics': [{'expression': 'ga:sessions'}],
                    'dimensions': [{'name': 'ga:adMatchedQuery'}]
                }]
        }
    ).execute()

response = get_report(initialize_analyticsreporting())
'''print(response)'''

def print_response(response):
    list = []
    # get report data
    for report in response.get('reports', []):
        # set column headers
        columnHeader = report.get('columnHeader', {})
        dimensionHeaders = columnHeader.get('dimensions', [])
        metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
        rows = report.get('data', {}).get('rows', [])
        for row in rows:
            # create dict for each row
            dict = {}
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])
            # fill dict with dimension header (key) and dimension value (value)
            for header, dimension in zip(dimensionHeaders, dimensions):
                dict[header] = dimension
            # fill dict with metric header (key) and metric value (value)
            for i, values in enumerate(dateRangeValues):
                for metric, value in zip(metricHeaders, values.get('values')):
                    # set int as int, float as float
                    if ',' in value or '.' in value:
                        dict[metric.get('name')] = float(value)
                    else:
                        dict[metric.get('name')] = int(value)
            list.append(dict)
    df = pd.DataFrame(list)
    return df
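Since print_response already builds and returns the DataFrame, a minimal usage sketch is simply to capture its return value (the CSV file name below is arbitrary):

df = print_response(response)
print(df.head())                           # inspect the first rows
df.to_csv('ga_sessions.csv', index=False)  # optional: save for further analysis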

Downloading batch reports from google analytics API v4

I am trying to get a report for 3 months. For this, I need to make multiple requests and append the results to a list, because the API only returns 100,000 rows per request. The API returns a variable named nextPageToken which I need to pass into the next query to get the next 100,000 rows of the report. I am having a difficult time trying to do this.
Here is my code:
def initialize_analyticsreporting():
    '''Initializes an Analytics Reporting API V4 service object.
    Returns:
        An authorized Analytics Reporting API V4 service object.
    '''
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)

    # Build the service object.
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics

list = []

def get_report(analytics, pageTokenVariable):
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'pageSize': 100000,
                    'dateRanges': [{'startDate': '90daysAgo', 'endDate': 'yesterday'}],
                    'metrics': [{'expression': 'ga:adClicks'}, {'expression': 'ga:impressions'}, {'expression': 'ga:adCost'}, {'expression': 'ga:CTR'}, {'expression': 'ga:CPC'}, {'expression': 'ga:costPerTransaction'}, {'expression': 'ga:transactions'}, {'expression': 'ga:transactionsPerSession'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:timeOnPage'}],
                    "pageToken": pageTokenVariable,
                    'dimensions': [{'name': 'ga:adMatchedQuery'}, {'name': 'ga:campaign'}, {'name': 'ga:adGroup'}, {'name': 'ga:adwordsCustomerID'}, {'name': 'ga:date'}],
                    'orderBys': [{'fieldName': 'ga:impressions', 'sortOrder': 'DESCENDING'}],
                    'dimensionFilterClauses': [{
                        'filters': [{
                            'dimension_name': 'ga:adwordsCustomerID',
                            'operator': 'EXACT',
                            'expressions': 'abc',
                            'not': 'True'
                        }]
                    }],
                    'dimensionFilterClauses': [{
                        'filters': [{
                            'dimension_name': 'ga:adMatchedQuery',
                            'operator': 'EXACT',
                            'expressions': '(not set)',
                            'not': 'True'
                        }]
                    }]
                }]
        }
    ).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
    pagetoken = report.get('nextPageToken', None)
    print(pagetoken)
    # ------ printing the pagetoken here returns `100,000`, which is expected
    columnHeader = report.get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    rows = report.get('data', {}).get('rows', [])
    for row in rows:
        # create dict for each row
        dict = {}
        dimensions = row.get('dimensions', [])
        dateRangeValues = row.get('metrics', [])
        # fill dict with dimension header (key) and dimension value (value)
        for header, dimension in zip(dimensionHeaders, dimensions):
            dict[header] = dimension
        # fill dict with metric header (key) and metric value (value)
        for i, values in enumerate(dateRangeValues):
            for metric, value in zip(metricHeaders, values.get('values')):
                # set int as int, float as float
                if ',' in value or ',' in value:
                    dict[metric.get('name')] = float(value)
                else:
                    dict[metric.get('name')] = float(value)
        list.append(dict)
        # Append that data to the list as a dictionary

# pagination loop
while pagetoken:  # while there is a nextPageToken, get the data, process it and add it to the list
    response = get_report(analytics, pagetoken)
    pagetoken = response['reports'][0]['nextPageToken']
    print(pagetoken)
    # ------ printing the pagetoken here returns `200,000` as expected, but the data being pulled
    # is the same as the first batch, and so on. The pagetoken is incremented inside the loop,
    # but it does not retrieve new data
    for row in rows:
        # create dict for each row
        dict = {}
        dimensions = row.get('dimensions', [])
        dateRangeValues = row.get('metrics', [])
        # fill dict with dimension header (key) and dimension value (value)
        for header, dimension in zip(dimensionHeaders, dimensions):
            dict[header] = dimension
        # fill dict with metric header (key) and metric value (value)
        for i, values in enumerate(dateRangeValues):
            for metric, value in zip(metricHeaders, values.get('values')):
                # set int as int, float as float
                if ',' in value or ',' in value:
                    dict[metric.get('name')] = float(value)
                else:
                    dict[metric.get('name')] = float(value)
        list.append(dict)

df = pd.DataFrame(list)
print(df)
df.to_csv('full_dataset.csv', encoding="utf-8", index=False)
Where is my mistake in passing the pagetoken?
Here is the documentation for the pageToken from Google.
So you're updating the pagetoken in pagetoken = response['reports'][0]['nextPageToken'], but shouldn't you also update rows inside the while loop with the new data?
Something like this:
while pagetoken:
    response = get_report(analytics, pagetoken)
    pagetoken = response['reports'][0].get('nextPageToken')
    for report in response.get('reports', []):
        rows = report.get('data', {}).get('rows', [])
        for row in rows:
            # ... process each row as in the first batch
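Putting it together, a full pagination loop might look like the sketch below, where process_rows is a hypothetical helper standing in for the dict-building block from the question, and get_report and analytics are the question's own objects:

import pandas as pd

def process_rows(report, out_list):
    # Same idea as the dict-building block in the question, factored into a helper.
    columnHeader = report.get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    for row in report.get('data', {}).get('rows', []):
        row_dict = dict(zip(dimensionHeaders, row.get('dimensions', [])))
        for values in row.get('metrics', []):
            for metric, value in zip(metricHeaders, values.get('values')):
                row_dict[metric.get('name')] = value
        out_list.append(row_dict)

all_rows = []
pagetoken = '0'
while pagetoken is not None:
    response = get_report(analytics, pagetoken)
    report = response['reports'][0]
    process_rows(report, all_rows)           # process the rows of *this* page
    pagetoken = report.get('nextPageToken')  # absent on the last page, which ends the loop

df = pd.DataFrame(all_rows)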

How to fix "timeout: _ssl.c:1039: The handshake operation timed out"

I am attempting to use the Google Analytics API and I am getting a timeout error. I do not know how to fix this issue.
def initialize_analyticsreporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        KEY_FILE_LOCATION, SCOPES)
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics

def get_report(analytics):
    return analytics.reports().batchGet(
        body={
            'reportRequests': [
                {
                    'viewId': VIEW_ID,
                    'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'today'}],
                    'metrics': [{'expression': 'ga:sessions'}],
                    'dimensions': [{'name': 'ga:country'}]
                }]
        }).execute()

def print_response(response):
    for report in response.get('reports', []):
        columnHeader = report.get('columnHeader', {})
        dimensionHeaders = columnHeader.get('dimensions', [])
        metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
        for row in report.get('data', {}).get('rows', []):
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])
            for header, dimension in zip(dimensionHeaders, dimensions):
                print(header + ': ' + dimension)
            for i, values in enumerate(dateRangeValues):
                print('Date range: ' + str(i))
                for metricHeader, value in zip(metricHeaders, values.get('values')):
                    print(metricHeader.get('name') + ': ' + value)

def main():
    analytics = initialize_analyticsreporting()
    response = get_report(analytics)
    print_response(response)

if __name__ == '__main__':
    main()
I expect the output to be the data I have requested, but instead I get the error "timeout: _ssl.c:1039: The handshake operation timed out".
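This error means the TLS handshake to Google's endpoint did not finish before the client's socket timeout, so it is usually a network or timeout issue rather than a bug in the report code. One commonly suggested mitigation (a sketch, not a guaranteed fix) is to raise the default socket timeout before building the service object:

import socket

# Raise the default timeout (in seconds) for new sockets, which also covers
# the TLS handshake performed by the API client; 300 is an arbitrary value.
socket.setdefaulttimeout(300)

analytics = initialize_analyticsreporting()
response = get_report(analytics)
print_response(response)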

Using nextLink attribute to get the next result page

I'm using the Google APIs Python client to download some data from Google Analytics. I basically copied one of their examples and modified it to do exactly what I need.
I took this piece of code from the examples:
request = service.data().ga().get(
    ids=ids,
    start_date=str(start_date),
    end_date=str(end_date),
    dimensions=','.join(dimensions),
    filters=filters,
    sort="ga:date",
    metrics=','.join(metrics)
)
Then add it to the batch object, and execute it once it has collected 10 requests. This all works well, but the problem is, some of those requests return a nextLink. Now I could just create a new request object (with the above code) with a different start-index, but isn't there a better way?
Is there a way to just parse the nextLink into a new request object?
I'm using this approach:
firstRun = True
params = {'ids': 'ga:00000001',
          'start_date': '2013-07-01',
          'end_date': '2013-07-31',
          'metrics': 'ga:visits',
          'dimensions': 'ga:source',
          'sort': '-ga:visits',
          'start_index': 1,
          'max_results': 10000}

while firstRun == True or result.get('nextLink'):
    if firstRun == False:
        params['start_index'] = int(params['start_index']) + int(params['max_results'])
    result = service.data().ga().get(**params).execute()
    firstRun = False
I could not find a way to parse the nextLink object and build a request from it, but this was my solution and it works fine:
max_results = 10000
params = {
    'ids': 'ga:' + profile_id,
    'start_date': start_date,
    'end_date': end_date,
    'metrics': ','.join(metrics),
    'dimensions': ','.join(dimensions),
    'start_index': 1,
    'max_results': max_results
}

has_more = True
while has_more:
    results = service.data().ga().get(**params).execute()
    # do something with results
    params['start_index'] = int(params['start_index']) + int(params['max_results'])
    has_more = results.get('nextLink')
Why can't we just do:
params = {'ids': profile_id,
          'start_date': start_date,
          'end_date': end_date,
          'metrics': metrics,
          'dimensions': dimensions,
          'sort': sort,
          'start_index': 1,
          'max_results': 1}

dummy_call = service.data().ga().get(**params).execute()  # just to find out the totalResults number
params['max_results'] = dummy_call[u'totalResults']        # replace max_results with the totalResults number
all_rows = service.data().ga().get(**params).execute()[u'rows']
???
(Note that the Core Reporting API caps max_results at 10,000 rows per request, so this shortcut only works when totalResults fits in a single page.)
