Extracting pandas DataFrame from function to a global variable - python

I am new to pandas and python.
I am trying to extract pandas DataFrame that I create in a function to a global variable which I could use in future functions.
My code:
def initialize_analyticsreporting():
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
def get_report(analytics):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'pageSize': 100000,
'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'yesterday'}],
'metrics': [{'expression': 'ga:sessions'}],
'dimensions': [{'name': 'ga:country'}, {'name': 'ga:hostname'}, {'name': 'ga:pagePathLevel1'}, {'name': 'ga:pagePathLevel2'}, {'name': 'ga:keyword'}, {'name': 'ga:adMatchedQuery'}, {'name': 'ga:operatingSystem'}, {'name': 'ga:hour'}, {'name': 'ga:exitPagePath'}]
}]
}
).execute()
def print_response(response):
list = []
for report in response.get('reports', []):
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get(
'metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
if ',' in value or '.' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = int(value)
list.append(dict)
df = pd.DataFrame(list)
return df
def main():
analytics = initialize_analyticsreporting()
response = get_report(analytics)
print_response(response)
if __name__ == '__main__':
main()
At this point after main() is called I can either print(df) inside print_response' function. But I want to be able to call thedfin myprint_response` function outside of the function, like to make df be accessible globally.
Thank you for your suggestions.

You can simply assign to a global variable inside the function or return the value and assign it as you call the function. You can declare global variables beforehand or in the function for the first time, but the keyword global needs to be used.
df1 = None
def f():
global df1, df2
df1 = pd.DataFrame()
df2 = pd.DataFrame()
return pd.DataFrame()
df3 = f()
# Now df 1, 2 and 3 are all global DataFrames
Whether you should use global variables is another topic. You can just return whatever from the function and pass it to other functions just as well.
def f() -> pd.DataFrame:
return pd.DataFrame()
def g(df: pd.DataFrame):
# Do stuff with df
def main():
df = f()
g(df)

Related

Python Google Analytics API - print Date

I have the following code to get to the data of my google analytics api.
"""Hello Analytics Reporting API V4."""
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import csv
import pandas as pd
SCOPES = ['https://w......']
KEY_FILE_LOCATION = '/Users/,,,,test.json'
VIEW_ID = 'xxxxx'
def initialize_analyticsreporting():
"""Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
def get_report(analytics):
"""Queries the Analytics Reporting API V4.
Args:
analytics: An authorized Analytics Reporting API V4 service object.
Returns:
The Analytics Reporting API V4 response.
"""
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'today'}],
'metrics': [{'expression': 'ga:sessions'}],
'dimensions': [{'name': 'ga:country'}]
}]
}
).execute()
def print_response(response):
"""Parses and prints the Analytics Reporting API V4 response.
Args:
response: An Analytics Reporting API V4 response.
"""
for report in response.get('reports', []):
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
ofile = open('/Users/...csv', 'w', newline='')
writer = csv.writer(ofile)
dime= []
item = []
val = []
for row in report.get('data', {}).get('rows', []):
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
for header, dimension in zip(dimensionHeaders, dimensions):
print(header + ': ', dimension)
writer.writerow(dimensions)
dime.append(dimension)
for i, values in enumerate(dateRangeValues):
print('Date range:', str(i))
print(str(i))
print('range')
print(dateRangeValues[0])
writer.writerow(str(i))
item.append(str(i))
for metricHeader, value in zip(metricHeaders, values.get('values')):
print(metricHeader.get('name') + ':', value)
print(value)
writer.writerows(value)
val.append(value)
#open the file in the write mode
with open('/Users/...csv', 'w') as f:
write = csv.writer(f)
write.writerow(dime)
write.writerows(val)
df = pd.DataFrame(
{'Country': dime,
'Value': val,
'Date Range': item
}
)
df.to_csv('/Users/.../pd.csv') ##It will include index also
'''with open('/Users/...document.csv','a') as f:
writer = csv.writer(f)
writer.writerow(dimensions)
writer.writerow(str(i))
writer.writerow(value)'''
'''with open('/Users....csv','a') as fd:
fd.writero(dimension)
fd.write(str(i))
fd.write(value)'''
'''df = pd.DataFrame()
df["Sessions"]=val
df["pagePath"]=dim
df=df[["pagePath","Sessions"]]
#Export to CSV
df.to_csv("page_by_session.csv")'''
def main():
analytics = initialize_analyticsreporting()
response = get_report(analytics)
print_response(response)
'''with open('/Users.../document.csv', 'a') as f:
for row in response:
f.write(f)'''
# open the file in the write mode
#with open('/Users/....', 'w', encoding='UTF8') as f:
# create the csv writer
# writer = csv.writer(f)
# write a row to the csv file
#writer.writerow(response)
if __name__ == '__main__':
main()
My problem now is that when I print the results and store them into my DataFrame I got this result
But I don't want to have the 0 for DateRange
I want to have the actual Range of Dates into this column
I know that I have specified the DateRanges with the different input parameters like "StartDate" , "EndDate", "7DaysAgo" and "today"...
but how can I use those to iterate over them and include them into my pandas data frame?
So how can I add the startDate , endDate and the daterange
to my csv file?
Any help?
Thanks

How to show specific page views using Google Analytics API?

for an API that I am using, we need to be able to view what specific pages are being clicked on and output to a CSV File. I am able to see the average session duration, and the amount of page views. I am curious as to what I need to add into my code attached below to make sure that this is possible for exporting to a CSV file. Thank you!
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = 'client_secrets.json'
VIEW_ID ='insert here'
credentials = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
response = analytics.reports().batchGet(body={
'reportRequests': [{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
'metrics': [
{"expression": "ga:pageviews"},
{"expression": "ga:avgSessionDuration"}
], "dimensions": [
{"name": "ga:deviceCategory"}
]
}]}).execute()
response
{'reports': [{'columnHeader': {'dimensions': ['ga:deviceCategory'],
'metricHeader': {'metricHeaderEntries': [{'name': 'ga:pageviews',
'type': 'INTEGER'},
{'name': 'ga:avgSessionDuration', 'type': 'TIME'}]}},
'data': {'isDataGolden': True,
'maximums': [{'values': ['485', '94.95454545454545']}],
'minimums': [{'values': ['29', '51.21186440677966']}],
'rowCount': 3,
'rows': [{'dimensions': ['desktop'],
'metrics': [{'values': ['485', '51.21186440677966']}]},
{'dimensions': ['mobile'],
'metrics': [{'values': ['409', '69.30859375']}]},
{'dimensions': ['tablet'],
'metrics': [{'values': ['29', '94.95454545454545']}]}],
'totals': [{'values': ['923', '60.06487341772152']}]}}]}
import pandas as pd
df = pd.DataFrame(columns=['Name', 'Age'])
def ga_response_dataframe(response):
row_list = []
# Get each collected report
for report in response.get('reports', []):
# Set column headers
column_header = report.get('columnHeader', {})
dimension_headers = column_header.get('dimensions', [])
metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
# Get each row in the report
for row in report.get('data', {}).get('rows', []):
# create dict for each row
row_dict = {}
dimensions = row.get('dimensions', [])
date_range_values = row.get('metrics', [])
# Fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimension_headers, dimensions):
row_dict[header] = dimension
# Fill dict with metric header (key) and metric value (value)
for i, values in enumerate(date_range_values):
for metric, value in zip(metric_headers, values.get('values')):
# Set int as int, float a float
if ',' in value or '.' in value:
row_dict[metric.get('name')] = float(value)
else:
row_dict[metric.get('name')] = int(value)
row_list.append(row_dict)
df = row_list
return df
df = ga_response_dataframe(response)
#df = pd.DataFrame(row_list)
print(df)

How can I transform the json output from Google Analytics into a Pandas dataframe in Python?

TLDR: How can I transform the json output from Google Analytics into a Pandas dataframe in Python?
I'm using the official documentation to get results out of Google Analytics. This works, but I would like to have the output in a pandas dataframe for further data analysis instead of in json format.
Remarks:
When you look at the code below, you'll see that I've commented out '''print(response)''' because that outputs the results in json format.
I've used the code from this article to write the pandas part.
I've searched extensively on Stackoverflow to find a solution but to no avail.
Thank you in advance.
def get_report(analytics):
"""Queries the Analytics Reporting API V4.
Args:
analytics: An authorized Analytics Reporting API V4 service object.
Returns:
The Analytics Reporting API V4 response.
"""
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': '7daysAgo', 'endDate': 'today'}],
'metrics': [{'expression': 'ga:sessions'}],
'dimensions': [{'name': 'ga:adMatchedQuery'}]
}]
}
).execute()
response = get_report(initialize_analyticsreporting())
'''print(response)'''
def print_response(response):
list = []
# get report data
for report in response.get('reports', []):
# set column headers
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
#set int as int, float a float
if ',' in value or '.' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = int(value)
list.append(dict)
df = pd.DataFrame(list)
return df

Downloading batch reports from google analytics API v4

I am trying to get a report for 3 months, for this, I need to make multiple requests and append the results to a list because API only returns 100,000 rows per request. There is a variable returned from the API named nextPageToken which I need to pass into the next query to get the next 100,000 rows of the report. I am having a difficult time trying to do this.
Here is my code:
def initialize_analyticsreporting():
'''Initializes an Analytics Reporting API V4 service object.
Returns:
An authorized Analytics Reporting API V4 service object.
'''
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
list = []
def get_report(analytics, pageTokenVariable):
return analytics.reports().batchGet(
body={
'reportRequests': [
{
'viewId': VIEW_ID,
'pageSize': 100000,
'dateRanges': [{'startDate': '90daysAgo', 'endDate': 'yesterday'}],
'metrics': [{'expression': 'ga:adClicks'}, {'expression': 'ga:impressions'}, {'expression': 'ga:adCost'}, {'expression': 'ga:CTR'}, {'expression': 'ga:CPC'}, {'expression': 'ga:costPerTransaction'}, {'expression': 'ga:transactions'}, {'expression': 'ga:transactionsPerSession'}, {'expression': 'ga:pageviews'}, {'expression': 'ga:timeOnPage'}],
"pageToken": pageTokenVariable,
'dimensions': [{'name': 'ga:adMatchedQuery'}, {'name': 'ga:campaign'}, {'name': 'ga:adGroup'}, {'name': 'ga:adwordsCustomerID'}, {'name': 'ga:date'}],
'orderBys': [{'fieldName': 'ga:impressions', 'sortOrder': 'DESCENDING'}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adwordsCustomerID',
'operator': 'EXACT',
'expressions': 'abc',
'not': 'True'
}]
}],
'dimensionFilterClauses': [{
'filters': [{
'dimension_name': 'ga:adMatchedQuery',
'operator': 'EXACT',
'expressions': '(not set)',
'not': 'True'
}]
}]
}]
}
).execute()
analytics = initialize_analyticsreporting()
response = get_report(analytics, "0")
for report in response.get('reports', []):
pagetoken = report.get('nextPageToken', None)
print(pagetoken)
#------printing the pagetoken here returns `100,000` which is expected
columnHeader = report.get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get(
'metricHeader', {}).get('metricHeaderEntries', [])
rows = report.get('data', {}).get('rows', [])
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
# Append that data to a list as a dictionary
# pagination function
while pagetoken: # This says while there is info in the nextPageToken get the data, process it and add to the list
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0]['nextPageToken']
print(pagetoken)
#------printing the pagetoken here returns `200,000` as is expected but the data being pulled is the same as for the first batch and so on. While in the loop the pagetoken is being incremented but it does not retrieve new data
for row in rows:
# create dict for each row
dict = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
# fill dict with dimension header (key) and dimension value (value)
for header, dimension in zip(dimensionHeaders, dimensions):
dict[header] = dimension
# fill dict with metric header (key) and metric value (value)
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
# set int as int, float a float
if ',' in value or ',' in value:
dict[metric.get('name')] = float(value)
else:
dict[metric.get('name')] = float(value)
list.append(dict)
df = pd.DataFrame(list)
print(df) # Append that data to a list as a dictionary
df.to_csv('full_dataset.csv', encoding="utf-8", index=False)
Where is my mistake trying to pass the pagetoken?
Here is the documentation for the pageToken from Google.
So you're updating the pagetoken in pagetoken = response['reports'][0]['nextPageToken'] but shouldn't you also update rows in the while loop with new data?
Something like this.
while pagetoken:
response = get_report(analytics, pagetoken)
pagetoken = response['reports'][0].get('nextPageToken')
for report in reponse.get('reports', []):
rows = report.get('data', {}).get('rows', [])
for row in rows:

Use of pageToken with Google Analytics Reporting API v4 and Python

I have followed a tutorial on how to download data from Google Analytics with Python using GA Reporting API. I was able to query the data I wanted, although reaching the rows limit.
I saw in the documentation that there is a pageToken to avoid the issue. I have added this field to my request (as describe in the documentation), but I am not able to make it work.
sample_request = {
'viewId': '12345678',
'dateRanges': {
'startDate': datetime.strftime(datetime.now() - timedelta(days = 30),'%Y-%m-%d'),
'endDate': datetime.strftime(datetime.now(),'%Y-%m-%d')
},
'dimensions': [
{'name': 'ga:date'},
{'name': 'ga:dimension7'},
{'name': 'ga:dimension6'},
{'name': 'ga:dimension9'}
],
'metrics': [
{'expression': 'ga:users'},
{'expression': 'ga:totalevents'}
],
"pageSize": 100000,
'pageToken': 'abc'
}
response = api_client.reports().batchGet(
body={
'reportRequests': sample_request
}).execute()
You will hit the limit, but the parameter nextPageToken will allow you to page through multiple rows. For example:
def processReport (self, aDimensions):
"""Get a full report, returning the rows"""
# Get the first set
oReport = self.getReport(aDimensions)
oResponse = self.getResponse(oReport, True)
aRows = oResponse.get('rows')
# Add any additional sets
while oResponse.get('nextPageToken') != None:
oResponse = self.getReport(aDimensions, oResponse.get('nextPageToken'))
oResponse = self.getResponse(oResponse, False)
aRows.extend(oResponse.get('rows'))
return aRows
You can see the complete program here:
https://github.com/aiqui/ga-download
I solved it like this
def handle_report(analytics,pagetoken,rows):
response = get_report(analytics, pagetoken)
columnHeader = response.get("reports")[0].get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
pagetoken = response.get("reports")[0].get('nextPageToken', None)
rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
rows = rows + rowsNew
print("len(rows): " + str(len(rows)))
if pagetoken != None:
return handle_report(analytics,pagetoken,rows)
else:
return rows
def main():
analytics = initialize_analyticsreporting()
global dfanalytics
dfanalytics = []
rows = []
rows = handle_report(analytics,'0',rows)
dfanalytics = pd.DataFrame(list(rows))

Categories

Resources