Extending Google's HelloAnalytics Python script to pull the data out into a CSV [duplicate] - python

I am trying to use the Google Analytics API to export data from my account into a CSV. I have the authentication code working, but I am now struggling to print the data in the format I would like.
For the time being, I am only pulling the dimensions country and city and the metric sessions. (These will change once I get this working.) Right now, it prints:
Date Range(0)
ga:sessions: 2
ga:country:United States
ga:city:Los Angeles
...
However, I would like to have this in a line:
date Range sessions country city
0 2 USA Los Angeles
...
What code in Python do I need to use? Below is what I have.
import argparse

import httplib2
from googleapiclient.discovery import build
from oauth2client import tools


def initialize_analyticsreporting():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[tools.argparser])
    flags = parser.parse_args([])
    # 'credentials' comes from the (working) authentication code, omitted here
    http = credentials.authorize(httplib2.Http())
    service = build('analytics', 'v4', http=http,
                    discoveryServiceUrl='https://analyticsreporting.googleapis.com/$discovery/rest')
    return service


def get_report(service):
    return service.reports().batchGet(
        body={
            'reportRequests': [
                {
                    "viewId": "ga:52783868",
                    "dimensions": [{"name": "ga:country"},
                                   {"name": "ga:city"}],
                    "metrics": [{"expression": "ga:sessions"}],
                    "dateRanges": [{"startDate": "2017-04-10",
                                    "endDate": "2017-04-12"}]
                }
            ]
        }
    ).execute()


countries = []
cities = []
val = []


def print_response(response):
    for report in response.get('reports', []):
        columnHeader = report.get('columnHeader', {})
        dimensionHeaders = columnHeader.get('dimensions', [])
        metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
        rows = report.get('data', {}).get('rows', [])
        for row in rows:
            dimensions = row.get('dimensions', [])
            dateRangeValues = row.get('metrics', [])
            for header, dimension in zip(dimensionHeaders, dimensions):
                print(header + ':' + dimension)
            for i, values in enumerate(dateRangeValues):
                for metricHeader, value in zip(metricHeaders, values.get('values')):
                    print(metricHeader.get('name') + ':' + value)


def main():
    analytics = initialize_analyticsreporting()
    response = get_report(analytics)
    print_response(response)


if __name__ == '__main__':
    main()

Following lrnzcig's suggestion, we can parse the data with pandas and then export it to a CSV file.
First, import pandas and json_normalize:
import pandas as pd
from pandas.io.json import json_normalize
Use this function to parse the data:
def parse_data(response):
    reports = response['reports'][0]
    columnHeader = reports['columnHeader']['dimensions']
    metricHeader = reports['columnHeader']['metricHeader']['metricHeaderEntries']
    columns = columnHeader
    for metric in metricHeader:
        columns.append(metric['name'])
    data = json_normalize(reports['data']['rows'])
    data_dimensions = pd.DataFrame(data['dimensions'].tolist())
    data_metrics = pd.DataFrame(data['metrics'].tolist())
    data_metrics = data_metrics.applymap(lambda x: x['values'])
    data_metrics = pd.DataFrame(data_metrics[0].tolist())
    result = pd.concat([data_dimensions, data_metrics], axis=1, ignore_index=True)
    result.columns = columns  # label the columns so the output matches the headers shown below
    return result
The result will look like ...
ga:country ga:city ga:sessions
0 (not set) (not set) 64
1 Argentina (not set) 1
2 Australia Adelaide 3
3 Australia Brisbane 9
...
Finally, call to_csv to save the data as a CSV file:
result.to_csv('result.csv')
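Putting the pieces together, main() could then become a small export pipeline. This is just a sketch based on the functions above; the output filename is only an example, and index=False simply drops the row numbers from the file:
def main():
    analytics = initialize_analyticsreporting()
    response = get_report(analytics)
    result = parse_data(response)
    # write the parsed report to CSV without the dataframe index
    result.to_csv('result.csv', index=False)


if __name__ == '__main__':
    main()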

Related

Python for Google Sheets: Create new sheet (in the same workbook) automatically and write a new dataframe into it

I have a loop that runs 'n' times and generates 1 new dataframe in each iteration. How can I automatically create a new Google sheet (in the same workbook) every time a new dataframe is created at the end of each iteration?
I use this code to write one dataframe to a Google Sheet which is already created manually by me:
# Code to write one dataframe to Google Sheets:
cell_range_insert = 'B7'
values = df1.to_json()
body = {'values': values}
response_date = service.spreadsheets().values().append(
    spreadsheetId=spreadsheet_id,
    valueInputOption='RAW',
    range=cell_range_insert,
    body=dict(
        majorDimension='ROWS',
        values=df1.T.reset_index().T.values.tolist()
    )
).execute()

# Code to generate one dataframe in each iteration:
Ticker_List = ["AAPL", "GOOG", "AMZN"]
for Ticker in Ticker_List:
    values = [['=GOOGLEFINANCE("' + str(Ticker) + '", "ALL", "1/1/2014", "2/6/2018" ,"DAILY")']]
    cell_range_insert = 'B7'
    body = {'values': values}
    # Send formula to Google Sheet
    service.spreadsheets().values().update(
        spreadsheetId=spreadsheet_id,
        valueInputOption='USER_ENTERED',
        range=cell_range_insert,
        body=body
    ).execute()
    # Read back stock data from Google Sheets:
    response = service.spreadsheets().values().get(
        spreadsheetId=spreadsheet_id,
        majorDimension='ROWS',
        range='Sheet1'
    ).execute()
    # Read back data from Google Sheets and assign it to dataframe df:
    columns = response['values'][1]
    data = response['values'][2:]
    df = pd.DataFrame(data, columns=columns)
In your situation, how about the following flow?
1. Insert new sheets and put the formula into cell "B7" of each new sheet using the batchUpdate method.
2. Retrieve the values from each new sheet using the values.batchGet method.
When this flow is reflected in your script, it becomes as follows.
Modified script:
spreadsheet_id = "###" # Please set your Spreadsheet ID.
requests = []
Ticker_List = ["AAPL", "GOOG", "AMZN"]
start_sheet_id = 123456
for i, Ticker in enumerate(Ticker_List):
sheet_id = start_sheet_id + i
formula = '=GOOGLEFINANCE("' + str(Ticker) + '", "ALL", "1/1/2014", "2/6/2018" ,"DAILY")'
requests.extend(
[
{"addSheet": {"properties": {"title": Ticker, "sheetId": sheet_id}}},
{
"updateCells": {
"start": {"sheetId": sheet_id, "rowIndex": 6, "columnIndex": 1},
"rows": [
{
"values": [
{"userEnteredValue": {"formulaValue": formula}}
]
}
],
"fields": "userEnteredValue",
}
},
]
)
service.spreadsheets().batchUpdate(spreadsheetId=spreadsheet_id, body={"requests": requests}).execute()
time.sleep(5) # This is required to be used.
res = service.spreadsheets().values().batchGet(spreadsheetId=spreadsheet_id, ranges=[f"'{e}'!B7:G" for e in Ticker_List]).execute()
# Put the values to dataframe.
for e in res["valueRanges"]:
values = e["values"]
columns = values[0]
data = values[1:]
df = pd.DataFrame(data, columns=columns)
print(df)
When this script is run, 3 new sheets named "AAPL", "GOOG", and "AMZN" are inserted, and the formula is put into cell "B7" of each inserted sheet. This is done with the batchUpdate method.
After the batchUpdate method finishes, the script waits 5 seconds. Please adjust this wait for your actual situation; 10 seconds might be suitable.
After the wait, the values are retrieved from each inserted sheet and put into a dataframe.
Note:
This script uses only 2 Sheets API requests (one batchUpdate and one values.batchGet).
The dataframe handling is based on the script you showed, so please modify it for your actual situation.
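For example, if you want to keep each dataframe for later use instead of just printing it, one option is to collect them in a dictionary keyed by ticker. This is only a sketch (the name dfs is illustrative); it relies on batchGet returning the value ranges in the same order as the requested ranges:
# Sketch: collect one dataframe per ticker instead of printing them.
dfs = {}
for ticker, e in zip(Ticker_List, res["valueRanges"]):
    values = e["values"]
    dfs[ticker] = pd.DataFrame(values[1:], columns=values[0])

print(dfs["AAPL"].head())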
References:
Method: spreadsheets.batchUpdate
Method: spreadsheets.values.batchGet

Creating multiple dataframes using a loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass three URLs and get back three different dictionaries containing the values. I'm stuck and I don't understand how to go about it; I have tried using loops but it is not working out for me.
url = {'Bitcoin': 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
    # create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])
    # run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the following example to get the data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(
        r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
    ):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/Hash Rate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
    "Bitcoin": {
        "Date": [
            Timestamp("2009-01-03 00:00:00"),
            Timestamp("2009-01-04 00:00:00"),
            Timestamp("2009-01-05 00:00:00"),
            ...
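If you also want to save the CSV mentioned in the comment above, a minimal line such as the following would do (the filename is just an example):
# Save the combined long-format table; one row per (coin, date).
df.to_csv("hash_rates.csv", index=False)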

Find coordinates in wikipedia pages iterating over a list

This is probably a simple question, but my experience with for loops is very limited.
I was trying to adapt the solution on this page https://www.mediawiki.org/wiki/API:Geosearch to some simple examples that I have, but the result is not what I expected.
For example:
I have this simple data frame:
df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad Juárez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
                   'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})
I created a list based on cities:
lista_cidades = list(df['City'])
and I would like to iterate over this list to get the coordinates (decimal, preferably).
So far I tried this approach:
import requests

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": [lista_cidades],
    "prop": "coordinates"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']

for i in range(len(lista_cidades)):
    for k, v in PAGES.items():
        try:
            lat_dict[lista_cidades[i]] = str(v['coordinates'][0]['lat'])
            lng_dict[lista_cidades[i]] = str(v['coordinates'][0]['lon'])
        except:
            pass
but it looks like the code doesn't iterate over the list and always returns the same coordinate.
For example, when I call the dictionary with longitude coordinates, this is what I get:
lng_dict
{'Sesimbra': '-7.84166667',
'Ciudad Juárez': '-7.84166667',
'31100 Treviso': '-7.84166667',
'Ramada Portugal': '-7.84166667',
'Olhão': '-7.84166667'}
What should I do to solve this?
Thanks in advance
I think the query returns only one result; it will take only the last city from your list (in your case, the "Olhão" coordinates).
You can check it by logging the DATA content.
I do not know the Wikipedia API well, but either your call lacks a parameter (the documentation should give you that information) or you have to call the API for each city, like this:
import pandas as pd
import requests

df = pd.DataFrame({'City': ['Sesimbra', 'Ciudad Juárez', '31100 Treviso', 'Ramada Portugal', 'Olhão'],
                   'Country': ['Portugal', 'México', 'Itália', 'Portugal', 'Portugal']})

lista_cidades = list(df['City'])

lng_dict = {}
lat_dict = {}

S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"

for city in lista_cidades:
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": city,
        "prop": "coordinates"
    }
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    PAGES = DATA['query']['pages']
    for k, v in PAGES.items():
        try:
            lat_dict[city] = str(v['coordinates'][0]['lat'])
            lng_dict[city] = str(v['coordinates'][0]['lon'])
        except:
            pass
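To get the coordinates back into the original dataframe afterwards, one possible follow-up (a sketch; the 'lat' and 'lon' column names are just examples) is:
# Map the collected coordinates back onto the dataframe; cities that returned
# no coordinates simply end up as NaN.
df['lat'] = df['City'].map(lat_dict)
df['lon'] = df['City'].map(lng_dict)
print(df)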

Conversion from nested json to csv with pandas

I am trying to convert a nested JSON file into a CSV file, but I am struggling with the logic needed for the structure of my file: it's a JSON with 2 objects, and I would like to convert only one of them (a nested list) into CSV.
I found some very helpful "flattening" JSON info in this blog post. I have basically been adapting it to my problem, but it is still not working for me.
My json file looks like this:
{
  "tickets": [
    {
      "Name": "Liam",
      "Location": {
        "City": "Los Angeles",
        "State": "CA"
      },
      "hobbies": [
        "Piano",
        "Sports"
      ],
      "year": 1985,
      "teamId": "ATL",
      "playerId": "barkele01",
      "salary": 870000
    },
    {
      "Name": "John",
      "Location": {
        "City": "Los Angeles",
        "State": "CA"
      },
      "hobbies": [
        "Music",
        "Running"
      ],
      "year": 1985,
      "teamId": "ATL",
      "playerId": "bedrost01",
      "salary": 550000
    }
  ],
  "count": 2
}
My code, so far, looks like this:
import json
from pandas.io.json import json_normalize
import argparse


def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Converting json files into csv for Tableau processing')
    parser.add_argument(
        "-j", "--json", dest="json_file", help="PATH/TO/json file to convert", metavar="FILE", required=True)
    args = parser.parse_args()

    with open(args.json_file, "r") as inputFile:  # open json file
        json_data = json.loads(inputFile.read())  # load json content
    flat_json = flatten_json(json_data)
    # normalizing flat json
    final_data = json_normalize(flat_json)
    with open(args.json_file.replace(".json", ".csv"), "w") as outputFile:  # open csv file
        # saving DataFrame to csv
        final_data.to_csv(outputFile, encoding='utf8', index=False)
What I would like to obtain is one line per ticket in the CSV, with the headings:
Name,Location_City,Location_State,Hobbies_0,Hobbies_1,Year,TeamId,PlayerId,Salary.
I would really appreciate anything that can make it click!
Thank you!
I actually wrote a package called cherrypicker recently to deal with this exact sort of thing since I had to do it so often!
I think the following code would give you exactly what you're after:
from cherrypicker import CherryPicker
import json
import pandas as pd

with open('file.json') as file:
    data = json.load(file)

picker = CherryPicker(data)
flat = picker['tickets'].flatten().get()
df = pd.DataFrame(flat)
print(df)
This gave me the output:
Location_City Location_State Name hobbies_0 hobbies_1 playerId salary teamId year
0 Los Angeles CA Liam Piano Sports barkele01 870000 ATL 1985
1 Los Angeles CA John Music Running bedrost01 550000 ATL 1985
You can install the package with:
pip install cherrypicker
...and there's more docs and guidance at https://cherrypicker.readthedocs.io.
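Since the end goal here is a CSV, you can then save the flattened dataframe in the usual pandas way (the filename is just an example):
# Write the flattened tickets to CSV, without the dataframe index.
df.to_csv('tickets.csv', index=False)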
As you already have a function to flatten a JSON object, you just have to flatten the tickets:
...
with open(args.json_file, "r") as inputFile:  # open json file
    json_data = json.loads(inputFile.read())  # load json content
final_data = pd.DataFrame([flatten_json(elt) for elt in json_data['tickets']])
...
With your sample data, final_data is as expected:
Location_City Location_State Name hobbies_0 hobbies_1 playerId salary teamId year
0 Los Angeles CA Liam Piano Sports barkele01 870000 ATL 1985
1 Los Angeles CA John Music Running bedrost01 550000 ATL 1985
There may be a simpler solution for this. But this should work!
import json

import pandas as pd

with open('file.json') as file:
    data = json.load(file)

df = pd.DataFrame(data['tickets'])

# Pull the nested Location dict out into flat columns
df['location_city'] = df['Location'].apply(lambda loc: loc['City'])
df['location_state'] = df['Location'].apply(lambda loc: loc['State'])

# Expand the hobbies list into one column per hobby
for i in range(df['hobbies'].str.len().max()):
    df['hobbies_{}'.format(i)] = df['hobbies'].apply(lambda h: h[i] if i < len(h) else None)

df = df.drop(['Location', 'hobbies'], axis=1)
print(df)

What is the data format returned by the AdWords API TargetingIdeaPage service?

When I query the AdWords API to get search volume data and trends through their TargetingIdeaSelector using the Python client library, the returned data looks like this:
(TargetingIdeaPage){
   totalNumEntries = 1
   entries[] =
      (TargetingIdea){
         data[] =
            (Type_AttributeMapEntry){
               key = "KEYWORD_TEXT"
               value =
                  (StringAttribute){
                     Attribute.Type = "StringAttribute"
                     value = "keyword phrase"
                  }
            },
            (Type_AttributeMapEntry){
               key = "TARGETED_MONTHLY_SEARCHES"
               value =
                  (MonthlySearchVolumeAttribute){
                     Attribute.Type = "MonthlySearchVolumeAttribute"
                     value[] =
                        (MonthlySearchVolume){
                           year = 2016
                           month = 2
                           count = 2900
                        },
                        ...
                        (MonthlySearchVolume){
                           year = 2015
                           month = 3
                           count = 2900
                        },
                  }
            },
      },
}
This isn't JSON and appears to just be a messy Python list. What's the easiest way to flatten the monthly data into a Pandas dataframe with a structure like this?
Keyword | Year | Month | Count
keyword phrase 2016 2 10
The output is a sudsobject. I found that this code does the trick:
import suds.sudsobject as sudsobject
import pandas as pd
a = [sudsobject.asdict(x) for x in output]
df = pd.DataFrame(a)
Addendum: This was once correct, but newer versions of the API (I tested 201802) now return zeep objects. However, zeep.helpers.serialize_object should do the same trick.
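For the newer zeep-based client, a rough equivalent might look like this. It is only a sketch, under the assumption that `output` holds the returned entries as in the snippet above:
from zeep.helpers import serialize_object
import pandas as pd

# serialize_object converts zeep objects into plain dicts,
# which pandas can consume directly.
a = [serialize_object(x) for x in output]
df = pd.DataFrame(a)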
Here's the complete code that I used to query the TargetingIdeaSelector with requestType STATS, and the method I used to parse the data into a usable dataframe; note the section starting "Parse results to pandas dataframe", as this takes the output given in the question above and converts it to a dataframe. Probably not the fastest or best, but it works! Tested with Python 2.7.
"""This code pulls trends for a set of keywords, and parses into a dataframe.
The LoadFromStorage method is pulling credentials and properties from a
"googleads.yaml" file. By default, it looks for this file in your home
directory. For more information, see the "Caching authentication information"
section of our README.
"""
from googleads import adwords
import pandas as pd

adwords_client = adwords.AdWordsClient.LoadFromStorage()

PAGE_SIZE = 10

# Initialize appropriate service.
targeting_idea_service = adwords_client.GetService(
    'TargetingIdeaService', version='v201601')

# Construct selector object and retrieve related keywords.
offset = 0
stats_selector = {
    'searchParameters': [
        {
            'xsi_type': 'RelatedToQuerySearchParameter',
            'queries': ['donald trump', 'bernie sanders']
        },
        {
            # Language setting (optional).
            # The ID can be found in the documentation:
            # https://developers.google.com/adwords/api/docs/appendix/languagecodes
            'xsi_type': 'LanguageSearchParameter',
            'languages': [{'id': '1000'}],
        },
        {
            # Location setting
            'xsi_type': 'LocationSearchParameter',
            'locations': [{'id': '1027363'}]  # Burlington, Vermont
        }
    ],
    'ideaType': 'KEYWORD',
    'requestType': 'STATS',
    'requestedAttributeTypes': ['KEYWORD_TEXT', 'TARGETED_MONTHLY_SEARCHES'],
    'paging': {
        'startIndex': str(offset),
        'numberResults': str(PAGE_SIZE)
    }
}

stats_page = targeting_idea_service.get(stats_selector)

##########################################################################
# Parse results to pandas dataframe
stats_pd = pd.DataFrame()
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        stats_attributes = {}
        for stats_attribute in stats_result['data']:
            # print(stats_attribute)
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for i, val in enumerate(stats_attribute['value'][1]):
                    data = {'keyword': kt,
                            'year': val['year'],
                            'month': val['month'],
                            'count': val['count']}
                    data = pd.DataFrame(data, index=[i])
                    stats_pd = stats_pd.append(data, ignore_index=True)

print(stats_pd)
