Big Query table create and load data via Python

Big Query table create and load data via Python - python

I'm trying to extract events from MixPanel, process it and then upload to BigQuery table(Creating a new table).
I googled for all the available resources but not useful in solving the issue.
Below is my code,
# Required modules import
import os
from mixpanel_api import Mixpanel
import collections
import json
from google.cloud import storage, bigquery
# Function to flatten exported file
def flatten(d, parent_key='', sep=''):
items = []
for k, v in d.items():
new_key = parent_key.replace("PROPERTIES","").replace("-","_").replace("[","").replace("]","").replace("/","").replace("\\","").replace("'","") + sep + k.replace(" ","").replace("_","").replace("$","").replace("-","_").replace("[","").replace("]","").replace("/","").replace("\\","").replace("'","") if parent_key else k
#new_key = parent_key.replace("PROPERTIES","").join(e for e in parent_key if e.isalnum()) + sep + k.join(e for e in k if e.isalnum()) if parent_key else k
if isinstance(v, collections.MutableMapping):
items.extend(flatten(v, new_key.upper(), sep=sep).items())
else:
items.append((new_key.upper().replace("-","_").replace("[","").replace("]","").replace("/","").replace("\\","").replace("'",""), v))
#items.append((new_key.upper().join(e for e in new_key if e.isalnum()), v))
#items.append(("ID","1"))
#items.append(("PROCESS_DATE",""))
#items.append(("DATA_DATE",""))
return dict(items)
# Start of execution point
if __name__ == '__main__':
# Secret and token to access API
api_sec = 'aa8af6b5ca5a5ed30e20f3af0acdfb2d'
api_tok = 'ad5234953e64b908bcd35388875324db'
# User input for date range and filename
start_date = str(input('Enter the start date(format: YYYY-MM-DD): '))
end_date = str(input('Enter the end date(format: YYYY-MM-DD): '))
file_name = str(input('Enter filename to store output: '))
file_formatter = str(input('Enter filename to store formatted output: '))
# Instantiating Mixpanel object
mpo = Mixpanel(api_sec,
api_tok
)
# Exporting events for the specified date range and storing in the filename provided, gunzip'ed file
mpo.export_events(file_name,
{'from_date':start_date,
'to_date':end_date
},
add_gzip_header=False,
raw_stream=True
)
# Dict for schema derived from file
schema_dict = {}
# Flatten file and write-out to another file
with open(file_name, 'r') as uf, open(file_formatter, 'a') as ff, open('schema_file', 'a') as sf:
#schema_list = []
for line in uf:
temp = flatten(json.loads(line))
for k in temp.keys():
if k not in schema_dict:
schema_dict[k] = "STRING"
#schema_list.append({"name" : k, "type" : "STRING"})
#ff.write(json.dumps(temp))
json.dump(temp, ff, indent = None, sort_keys = True) # Dumps each dictionary entry as a newline entry, even '{' '}' is on new lines
ff.write('\n') # Adds a new line after each object dump to file
#json.dump(schema_dict, sf, indent = None, sort_keys = True)
#json.dump(schema_list, sf, indent = None, sort_keys = True)
# Removing source file
if os.path.isfile(file_name):
sfr = os.remove(file_name)
if sfr == None:
print 'File ' +file_name+ ' removed from local storage'
else:
print 'File ' +file_name+ ' remove failed from local storage'
# Uploading file to Google bucket
client = storage.Client()
bucket = client.get_bucket('yathin-sample-bucket')
blob = bucket.blob(file_formatter)
status = blob.upload_from_filename(file_formatter)
if status == None:
print 'File ' +file_formatter+ ' upload success. Removing local copy.'
fr = os.remove(file_formatter)
if fr == None:
print 'File ' +file_formatter+ ' removed from local storage'
else:
print 'File ' +file_formatter+ ' remove failed from local storage'
# Loading file to BigQuery
client = bigquery.Client()
dataset_id = 'sample_dataset'
dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.schema = [ bigquery.SchemaField(k,v) for k,v in schema_dict.items() ]
#job_config.autodetect = True
#job_config.create_dsiposition = 'CREATE_IF_NEEDED'
#job_config.write_disposition = 'WRITE_APPEND'
job_config.source_format = 'NEWLINE_DELIMITED_JSON'
uri = 'gs://yathin-sample-bucket/'+file_formatter
load_job = client.load_table_from_uri(
uri,
dataset_ref.table('test_json'),
job_config=job_config) # API request
#assert load_job.job_type == 'load'
#load_job.result() # Waits for table load to complete.
This code isn't returning any error, but table isn't getting created.
Can someone please help here by pointing out what is wrong.

It's possible that there is an error, but youre not returning the results in your script. I am not sure why you commented out load_job.result() but that is probably necessary to make sure the job completes.
If there still isn't an error this script can give you a list of your last jobs and the result with any error codes. Just change the max_results kwarg.
client = biquery.Client()
for job in client.list_jobs(max_results=1, all_users=False):
jobid = job.job_id
job = client.get_job(jobid)
print("------BIG QUERY JOB ERROR REASON", job.errors)
Also, Per your question in the comments about how to check to see if a table exists...
from google.cloud.exceptions import NotFound
client = bigquery.Client()
try:
dataset = client.dataset('DatasetName')
table_ref = dataset.table('TableName')
client.get_table(table_ref)
except: NotFound:
print('Table Not Found')

Related

Python: Passing a variable to another function

I am really new to Python and was hoping someone could help me with this. I have a function (an AWS Lamdba function actually) that I need to pass a variable value after it is created. I'm doing this to create a campaign in Pinpoint when a segment is created.
import os
import time
import boto3
from botocore.exceptions import ClientError
from datetime import datetime,timedelta
AWS_REGION = os.environ['region']
projectId = os.environ['projectId']
importRoleArn = os.environ['importRoleArn']
def lambda_handler(event, context):
print("Received event: " + str(event))
for record in event['Records']:
# Assign some variables to make it easier to work with the data in the
# event recordi
bucket = record['s3']['bucket']['name']
key = record['s3']['object']['key']
folder = os.path.split(key)[0]
folder_path = os.path.join(bucket, folder)
full_path = os.path.join(bucket, key)
s3_url = "s3://" + folder_path
# print(full_path);
# Check to see if all file parts have been processed.
if all_files_processed(bucket, folder, full_path):
# If you haven't recently run an import job that uses a file stored in
# the specified S3 bucket, then create a new import job. This prevents
# the creation of duplicate segments.
if not (check_import_jobs(bucket, folder, s3_url, full_path)):
segmentID = create_import_job(s3_url, full_path)
create_campaign(segmentID)
else:
print("Import job found with URL s3://"
+ os.path.join(bucket,folder) + ". Aborting.")
else:
print("Parts haven't finished processing yet.")
# Determine if all of the file parts have been processed.
def all_files_processed(bucket, folder, full_path):
# Use the "__ofN" part of the file name to determine how many files there
# should be.
number_of_parts = int((full_path.split("__of")[1]).split("_processed")[0])
# Figure out how many keys contain the prefix for the current batch of
# folders (basically, how many files are in the appropriate "folder").
client = boto3.client('s3')
objs = client.list_objects_v2(Bucket=bucket,Prefix=folder)
file_count = objs['KeyCount']
ready_for_import = False
if file_count == number_of_parts:
ready_for_import = True
return ready_for_import
# Check Amazon Pinpoint to see if any import jobs have been created by using
# the same S3 folder.
def check_import_jobs(bucket, folder, s3_url, full_path):
url_list = []
print(s3_url);
# Retrieve a list of import jobs for the current project ID.
client = boto3.client('pinpoint')
try:
client_response = client.get_import_jobs(
ApplicationId=projectId
)
except ClientError as e:
print(e.response['Error']['Message'])
else:
segment_response = client_response['ImportJobsResponse']['Item']
#print(segment_response);
# Parse responses. Add all S3Url values to a list.
for item in segment_response:
#print(item);
s3_url_existing = full_path
url_list.append(s3_url_existing)
#print(url_list);
# Search for the current S3 URL in the list.
if s3_url in url_list:
found = True
else:
found = False
print(found);
return found
# Create the import job in Amazon Pinpoint.
def create_import_job(s3_url, full_path):
client = boto3.client('pinpoint')
segment_name = s3_url.split('/')[4]
try:
response = client.create_import_job(
ApplicationId=projectId,
ImportJobRequest={
'DefineSegment': True,
'Format': 'CSV',
'RegisterEndpoints': True,
'RoleArn': importRoleArn,
'S3Url': s3_url,
'SegmentName': segment_name
}
)
except ClientError as e:
print(e.response['Error']['Message'])
else:
print("Import job " + response['ImportJobResponse']['Id'] + " "
+ response['ImportJobResponse']['JobStatus'] + ".")
print("Segment ID: "
+ response['ImportJobResponse']['Definition']['SegmentId'])
print("Application ID: " + projectId)
return response['ImportJobResponse']['Definition']['SegmentId']
def create_campaign(segmentID):
client = boto3.client('pinpoint')
now = datetime.now()
dt_string = now.isoformat()
print(type(segmentID))
try:
response = client.create_campaign(
ApplicationId=projectId,
WriteCampaignRequest={
'Description': 'Test SMS Campaign 2',
'MessageConfiguration': {
'EmailMessage': {
'Body': 'This is a test 2',
'FromAddress': 'xxx#xxx.com',
'HtmlBody': '<p>Test 2</p>',
'Title': 'This is a test 2'
},
'SMSMessage': {
'Body': 'Thanks for your visit to {{Attributes.Provider_Name}} on {{Attributes.Clinical_Date_of_Service}}',
'MessageType': 'PROMOTIONAL',
'SenderId': 'XXX'
}
},
'Schedule': {
'Frequency': 'ONCE',
'IsLocalTime': True,
'StartTime': dt_string,
'Timezone': 'UTC'
},
'Name': 'Test Email Campaign 6',
'SegmentId': segmentID
}
)
except ClientError as e:
print(e.response['Error']['Message'])
else:
print('Campaign Created')
The issue comes up in create_campaign where I want to send the SegmentID to. I end up getting the following error...
"Segment specified in SegmentId is not found"
I can print the segmentID to the console no problem, it's just getting it to pass to the function is the roadblock I'm hitting. Thanks in advance!

TWS API - store company snapshot and financial statements

My goal is to use a list of tickers together with the TWS API to extract parts of the company snapshot (reqFundamentalData() -> "ReportSnapshot") and the financial statements (reqFundamentalData() -> "ReportsFinStatements") of these tickers, convert into a dataframe and store it as a parquet file.
I tried to merge solutions provided:
use a list of tickers
TWS API to download stock fundamental data runs only the first data entry and ignores the others. Whow to solve this?
store XML as dataframe
Converting XML to Pandas
store data
Save data from TWS API to csv file
Code:
from datetime import datetime
from bs4 import BeautifulSoup as bs
import pandas as pd
from ibapi.client import EClient
from ibapi.contract import Contract
from ibapi.wrapper import EWrapper
import logging
import random
import pathlib
import time
from datetime import date
import datetime
from pathlib import Path
class TestApp(EWrapper, EClient):
def __init__(self, addr, port, client_id):
EWrapper.__init__(self) # new - book
EClient.__init__(self, self)
self.firstReqId = 8001
self.contracts = {} # keep in dict so you can lookup
self.contNumber = self.firstReqId
# add dataframes to store the result
self.df_company_info = pd.DataFrame(data=None, index=None, columns=None)
self.df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
def addContracts(self, cont):
self.contracts[self.contNumber] = cont # add to dict using 8001 first time
self.contNumber += 1 # next id will be 8002 etc.
def nextValidId(self, orderId: int):
# now you are connected, ask for data, no need for sleeps
# this isn't the only way to know the api is started but it's what IB recommends
self.contNumber = self.firstReqId # start with first reqId
self.getNextData()
def error(self, reqId, errorCode, errorString):
print("Error: ", reqId, "", errorCode, "", errorString)
# if there was an error in one of your requests, just contimue with next id
if reqId > 0 and self.contracts.get(self.contNumber):
# err in reqFundametalData based on reqid existing in map
print('err in', self.contracts[reqId].symbol)
self.getNextData() # try next one
def fundamentalData(self, reqId, fundamental_data):
self.fundamentalData = fundamental_data
try:
if self.fundamentalData is not None:
# convert XML to dictionary entry
dict_company_info = self.CompanyInfoXMLtoDict(self.fundamentalData)
# add dict entry to dataframe
df_add_row = pd.DataFrame([dict_company_info])
self.df_company_info = self.df_company_info.append(df_add_row, ignore_index=True)
except KeyError:
print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
except TypeError:
print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
except ValueError:
print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
except IndexError:
print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
self.getNextData()
def getNextData(self):
if self.contracts.get(self.contNumber): # means a contract exists
# so req data
self.reqFundamentalData(self.contNumber, self.contracts[self.contNumber], "ReportSnapshot", [])
self.contNumber += 1 # now get ready for next request
else: # means no more sequentially numbered contracts
print('done')
self.disconnect() # just exit
def CompanyInfoXMLtoDict(self, fundamentals):
soup = bs(fundamentals, 'xml')
df_company_info = pd.DataFrame(data=None, index=None, columns=None)
ticker = ''
longName = ''
fullTimeEmployees = 0
# search for a tag e.g. </IssueID>
for issues in soup.find_all('IssueID'):
# within this tag -> search of unique ID e.g. IssueID type=...
if issues.get('Type') == "Ticker":
ticker = issues.get_text()
break
for coID_i in soup.find_all('CoID'):
if coID_i.get('Type') == "CompanyName":
longName = coID_i.get_text()
break
for employees_i in soup.find_all('Employees'):
fullTimeEmployees = employees_i.get_text()
break
# create result entry row
if ticker is not None and ticker != '':
new_row_dict = {'ticker': ticker, 'name': longName,
'num_employees': fullTimeEmployees}
else:
new_row_dict = {}
return new_row_dict
def FinStmtsXMLtoDF(self, fundamentals, ticker, stmts_type):
today = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
today_date = date.today().strftime("%Y-%m-%d")
if stmts_type == 'annual':
period_type = 'Annual'
else:
period_type = 'Interim'
soup = bs(fundamentals, 'xml')
# build dict
stmts_terms = {}
for terms in soup.find_all("mapItem"):
# add entry to dict -> dict for maping of code to description
stmts_terms[terms.get('coaItem')] = terms.get_text()
bal_l = []
inc_l = []
cas_l = []
for period in soup.find_all('FiscalPeriod'):
# quarterly vs. annually
if period.get('Type') == period_type:
for statement in period.find_all('Statement'):
if statement.find('UpdateType').get('Code') != 'CLA':
dic = {}
stmts_type = statement.get('Type')
# source_date = statement.find('Source').get('Date')
statement_date = statement.find('StatementDate').text
# dic['date'] = source_date
dic['rep_date'] = statement_date
for item in statement.find_all('lineItem'):
# dic[item.get('coaCode')] = item.text
dic[stmts_terms.get(item.get('coaCode'), 'DEFAULT')] = item.text
if stmts_type == 'BAL':
bal_l.append(dic)
# print(stmts_type, date, dic)
elif stmts_type == 'INC':
inc_l.append(dic)
elif stmts_type == 'CAS':
cas_l.append(dic)
df_balance_sheet = pd.DataFrame(bal_l).sort_values('rep_date')
df_income_statement = pd.DataFrame(inc_l).sort_values('rep_date')
df_cash_flow = pd.DataFrame(cas_l).sort_values('rep_date')
# merge all stmts for same rep_date
df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
df_fin_stmts = df_balance_sheet.merge(df_income_statement, how='left',
left_on=['rep_date'],
right_on=['rep_date'])
df_fin_stmts = df_fin_stmts.merge(df_cash_flow, how='left',
left_on=['rep_date'],
right_on=['rep_date'])
df_fin_stmts.insert(loc=0, column='ticker', value=ticker)
df_fin_stmts.insert(loc=1, column='date_updated', value=today_date)
return df_fin_stmts
def main():
# ----- config
project_data_folder = '/home/data/'
project_data_folder = Path(project_data_folder)
# ticker are stored in a csv file
csv_master_ticker = Path('home/data/ticker/ticker-list.csv')
# load list of tickers
df = pd.read_csv(csv_master_ticker)
list_master_ticker = df['ticker'].tolist()
fusion_company_info = pd.DataFrame(data=None, index=None, columns=None)
fusion_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
fusion_q_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
client = TestApp('127.0.0.1', 7496, 0)
for ticker in list_master_ticker:
# remove additional postfix for exchange e.g. XYZ.F -> XYZ
ticker_clean = ticker.rstrip('.')
contract = Contract()
contract.symbol = ticker_clean
contract.secType = 'STK'
contract.exchange = "SMART"
contract.currency = 'USD'
client.addContracts(contract)
client.connect('127.0.0.1', 7496, 0)
client.run()
if fusion_company_info.empty:
fusion_company_info = client.df_company_info
else:
fusion_company_info = pd.concat([fusion_company_info, client.df_company_info])
tws_company_info_file_name = 'tws_company_info.parquet'
file_name = project_data_folder / tws_company_info_file_name
try:
if fusion_company_info is not None:
if not fusion_company_info.empty:
fusion_company_info.to_parquet(file_name, engine='pyarrow')
# financial statements - annual
tws_fin_stmts_file_name = 'tws_fin_stmts.parquet'
file_name = project_data_folder / tws_fin_stmts_file_name
try:
if fusion_fin_stmts is not None:
if not fusion_fin_stmts.empty:
fusion_fin_stmts.to_parquet(file_name, engine='pyarrow')
I get an error message
Traceback (most recent call last):
File "...\ibapi\client.py", line 239, in run
self.decoder.interpret(fields)
File "...\ibapi\decoder.py", line 1278, in interpret
self.interpretWithSignature(fields, handleInfo)
File "...\ibapi\decoder.py", line 1259, in interpretWithSignature
method(*args)
TypeError: 'str' object is not callable
python-BaseException
Can someone help me with this error message?
If I remove the for loop and run it only for a single ticker e.g.
client.contracts = {}
contract = Contract()
contract.symbol = 'AMD'
contract.secType = 'STK'
contract.currency = 'USD'
contract.exchange = "SMART"
client.addContracts(contract)
client.connect('127.0.0.1', 7496, 0)
client.run()
I don't get a error message and the dataframe self.company_info get's populated with the correct data of AMD.
General questions:
Is it possible to get via reqFundamentalData() not only the company info "ReportSnapshot", but also the financial statements "ReportsFinStatements" (df_fin_stmts and the function "FinStmtsXMLtoDF") in one request/run ?
I new to python and would expect functions are only executed, if the functions is called within the code, but somehow with the the TWS API (socket, reqID) it seems to work different and it's not fully clear to me when which funciton is called after one and another.
e.g. how do I know that by executing reqFundamentalData() the function fundamentalData() is called. Or e.g. nextValidID() is somehow triggered, but not explicit called within the program. Is there a good tutorial to introduce the process of what functions are called in which order?
Thank you very much

Confused by a python type errror

I've been using python for a little while and have made some improvements but this a new error to me. I'm trying to learn social media analysis for my career and that's why I am trying out this set of code here.
I've de bugged one error but this one, which appears at line 81, has got me stumped as I can't see why the function "def get_user_objects(follower_ids):" returns none and what i'd need to change it in accordance with previous advice on other questions here.
Here's script to that point for simplicity. All help appreciated.
The error, to repeat is TypeError: object of type 'NoneType' has no len()
from tweepy import OAuthHandler
from tweepy import API
from collections import Counter
from datetime import datetime, date, time, timedelta
import sys
import json
import os
import io
import re
import time
# Helper functions to load and save intermediate steps
def save_json(variable, filename):
with io.open(filename, "w", encoding="utf-8") as f:
f.write(str(json.dumps(variable, indent=4, ensure_ascii=False)))
def load_json(filename):
ret = None
if os.path.exists(filename):
try:
with io.open(filename, "r", encoding="utf-8") as f:
ret = json.load(f)
except:
pass
return ret
def try_load_or_process(filename, processor_fn, function_arg):
load_fn = None
save_fn = None
if filename.endswith("json"):
load_fn = load_json
save_fn = save_json
else:
load_fn = load_bin
save_fn = save_bin
if os.path.exists(filename):
print("Loading " + filename)
return load_fn(filename)
else:
ret = processor_fn(function_arg)
print("Saving " + filename)
save_fn(ret, filename)
return ret
# Some helper functions to convert between different time formats and
perform date calculations
def twitter_time_to_object(time_string):
twitter_format = "%a %b %d %H:%M:%S %Y"
match_expression = "^(.+)\s(\+[0-9][0-9][0-9][0-9])\s([0-9][0-9][0-9]
[09])$"
match = re.search(match_expression, time_string)
if match is not None:
first_bit = match.group(1)
second_bit = match.group(2)
last_bit = match.group(3)
new_string = first_bit + " " + last_bit
date_object = datetime.strptime(new_string, twitter_format)
return date_object
def time_object_to_unix(time_object):
return int(time_object.strftime("%s"))
def twitter_time_to_unix(time_string):
return time_object_to_unix(twitter_time_to_object(time_string))
def seconds_since_twitter_time(time_string):
input_time_unix = int(twitter_time_to_unix(time_string))
current_time_unix = int(get_utc_unix_time())
return current_time_unix - input_time_unix
def get_utc_unix_time():
dts = datetime.utcnow()
return time.mktime(dts.timetuple())
# Get a list of follower ids for the target account
def get_follower_ids(target):
return auth_api.followers_ids(target)
# Twitter API allows us to batch query 100 accounts at a time
# So we'll create batches of 100 follower ids and gather Twitter User
objects for each batch
def get_user_objects(follower_ids):
batch_len = 100
num_batches = len(follower_ids)/100
batches = (follower_ids[i:i+batch_len] for i in range(0,
len(follower_ids), batch_len))
all_data = []
for batch_count, batch in enumerate(batches):
sys.stdout.write("\r")
sys.stdout.flush()
sys.stdout.write("Fetching batch: " + str(batch_count) + "/" +
str(num_batches))
sys.stdout.flush()
users_list = auth_api.lookup_users(user_ids=batch)
users_json = (map(lambda t: t._json, users_list))
all_data += users_json
return all_data
# Creates one week length ranges and finds items that fit into those range
boundaries
def make_ranges(user_data, num_ranges=20):
range_max = 604800 * num_ranges
range_step = range_max/num_ranges
# We create ranges and labels first and then iterate these when going
through the whole list
# of user data, to speed things up
ranges = {}
labels = {}
for x in range(num_ranges):
start_range = x * range_step
end_range = x * range_step + range_step
label = "%02d" % x + " - " + "%02d" % (x+1) + " weeks"
labels[label] = []
ranges[label] = {}
ranges[label]["start"] = start_range
ranges[label]["end"] = end_range
for user in user_data:
if "created_at" in user:
account_age = seconds_since_twitter_time(user["created_at"])
for label, timestamps in ranges.iteritems():
if account_age > timestamps["start"] and account_age <
timestamps["end"]:
entry = {}
id_str = user["id_str"]
entry[id_str] = {}
fields = ["screen_name", "name", "created_at",
"friends_count", "followers_count", "favourites_count", "statuses_count"]
for f in fields:
if f in user:
entry[id_str][f] = user[f]
labels[label].append(entry)
return labels
if __name__ == "__main__":
account_list = []
if (len(sys.argv) > 1):
account_list = sys.argv[1:]
if len(account_list) < 1:
print("No parameters supplied. Exiting.")
sys.exit(0)
consumer_key="XXXXXXX"
consumer_secret="XXXXXX"
access_token="XXXXXXX"
access_token_secret="XXXXXXXX"
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
auth_api = API(auth)
for target in account_list:
print("Processing target: " + target)
# Get a list of Twitter ids for followers of target account and save it
filename = target + "_follower_ids.json"
follower_ids = try_load_or_process(filename, get_follower_ids,
target)
# Fetch Twitter User objects from each Twitter id found and save the data
filename = target + "_followers.json"
user_objects = try_load_or_process(filename, get_user_objects,
follower_ids)
total_objects = len(user_objects)
# Record a few details about each account that falls between specified age
ranges
ranges = make_ranges(user_objects)
filename = target + "_ranges.json"
save_json(ranges, filename)
# Print a few summaries
print
print("\t\tFollower age ranges")
print("\t\t===================")
total = 0
following_counter = Counter()
for label, entries in sorted(ranges.iteritems()):
print("\t\t" + str(len(entries)) + " accounts were created
within " + label)
total += len(entries)
for entry in entries:
for id_str, values in entry.iteritems():
if "friends_count" in values:
following_counter[values["friends_count"]] += 1
print("\t\tTotal: " + str(total) + "/" + str(total_objects))
print
print("\t\tMost common friends counts")
print("\t\t==========================")
total = 0
for num, count in following_counter.most_common(20):
total += count
print("\t\t" + str(count) + " accounts are following " +
str(num) + " accounts")
print("\t\tTotal: " + str(total) + "/" + str(total_objects))
print
print

The immediate problem is in load_json: you assume its return value is a list or dict, or something that can be passed to len. However, it can return None in a number of circumstances:
The file to read from isn't found
There is some error reading from the file
There is a problem decoding the contents of the file
The file contains just the JSON value null.
At no point after you call load_json do you check its return value.
Worse, you catch and ignore any exception that might occur in load_json, causing it to silently return None with no indication that something went wrong.
The function would be better written like
def load_json(filename):
with io.open(filename, "r", encoding="utf-8") as f:
return json.load(f)
At least now, any errors will raise an uncaught exception, making it more obvious that there was a problem and providing a clue as to what the problem was. The golden rule of exception handling is to only catch the exceptions you can do something about, and if you can't do anything about a caught exception, re-raise it.

You could check for the resultant value and follow accordingly:
# Fetch Twitter User objects from each Twitter id found and save the data
filename = target + "_followers.json"
res_get_user_objects = get_user_objects()
if res_get_user_objects is not None:
user_objects = try_load_or_process(filename, get_user_objects,
follower_ids)
total_objects = len(user_objects)
else:
# handle it otherwise

How to check if a file has completed uploading into S3 Bucket using Boto in Python?

I'm trying to upload an image into S3 bucket using boto. After the image has successfully uploaded, I want to perform a certain operation using the file URL of the image in the S3 bucket. The problem is that sometimes the image doesn't upload fast enough and I end up with a server error when I want to perform the operation dependent on the file URL of the Image.
This is my source code. I'm using python flask.
def search_test(consumer_id):
consumer = session.query(Consumer).filter_by(consumer_id=consumer_id).one()
products = session.query(Product).all()
product_dictionary = {'Products': [p.serialize for p in products]}
if request.method == 'POST':
p_product_image_url = request.files['product_upload_url']
s3 = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = s3.get_bucket(AWS_BUCKET_NAME)
k = Key(bucket)
if p_product_image_url and allowed_file(p_product_image_url.filename):
# Read the contents of the file
file_content = p_product_image_url.read()
# Use Boto to upload the file to S3
k.set_metadata('Content-Type', mimetypes.guess_type(p_product_image_url.filename))
k.key = secure_filename(p_product_image_url.filename)
k.set_contents_from_string(file_content)
print ('consumer search upload successful')
new_upload = Uploads(picture_upload_url=k.key.replace(' ', '+'), consumer=consumer)
session.add(new_upload)
session.commit()
new_result = jsonify(Result=perform_actual_search(amazon_s3_base_url + k.key.replace(' ', '+'),
product_dictionary))
return new_result
else:
return render_template('upload_demo.html', consumer_id=consumer_id)
The jsonify method needs a valid image url to perform the operation. It works sometimes, sometimes it doesn't. The reason I suspect being due to the issue that the image would not have uploaded yet by the time it executes that line of code.
The perform_actual_search method is as follows:
def get_image_search_results(image_url):
global description
url = ('http://style.vsapi01.com/api-search/by-url/?apikey=%s&url=%s' % (just_visual_api_key, image_url))
h = httplib2.Http()
response, content = h.request(url,
'GET') # alternatively write content=h.request((url,'GET')[1]) ///Numbr 2 in our array
result = json.loads(content)
result_dictionary = []
for i in range(0, 10):
if result:
try:
if result['errorMessage']:
result_dictionary = []
except:
pass
if result['images'][i]:
images = result['images'][i]
jv_img_url = images['imageUrl']
title = images['title']
try:
if images['description']:
description = images['description']
else:
description = "no description"
except:
pass
# print("\njv_img_url: %s,\ntitle: %s,\ndescription: %s\n\n"% (
# jv_img_url, title, description))
image_info = {
'image_url': jv_img_url,
'title': title,
'description': description,
}
result_dictionary.append(image_info)
if result_dictionary != []:
# for i in range(len(result_dictionary)):
# print (result_dictionary[i])
# print("\n\n")
return result_dictionary
else:
return []
def performSearch(jv_input_dictionary, imagernce_products_dict):
print jv_input_dictionary
print imagernce_products_dict
global common_desc_ratio
global isReady
image_search_results = []
if jv_input_dictionary != []:
for i in range(len(jv_input_dictionary)):
print jv_input_dictionary[i]
for key in jv_input_dictionary[i]:
if key == 'description':
input_description = jv_input_dictionary[i][key]
s1w = re.findall('\w+', input_description.lower())
s1count = Counter(s1w)
print input_description
for j in imagernce_products_dict:
if j == 'Products':
for q in range(len(imagernce_products_dict['Products'])):
for key2 in imagernce_products_dict['Products'][q]:
if key2 == 'description':
search_description = imagernce_products_dict['Products'][q]['description']
print search_description
s2w = re.findall('\w+', search_description.lower())
s2count = Counter(s2w)
# Commonality magic
common_desc_ratio = difflib.SequenceMatcher(None, s1w, s2w).ratio()
print('Common ratio is: %.2f' % common_desc_ratio)
if common_desc_ratio > 0.09:
image_search_results.append(imagernce_products_dict['Products'][q])
if image_search_results:
print image_search_results
return image_search_results
else:
return {'404': 'No retailers registered with us currently own this product.'}
def perform_actual_search(image_url, imagernce_product_dictionary):
return performSearch(get_image_search_results(image_url), imagernce_product_dictionary)
Any help solving this would be greatly appreciated.

I would configure S3 to generate notifications on events such as s3:ObjectCreated:*
Notifications can be posted to an SNS topic, a SQS queue or directly trigger a lambda function.
More details about S3 notifications : http://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
You should rewrite your code to separate the upload part and the image processing part. The later can be implemented as a Lambda function in Python.
Working in an Asynchronous way is key here, writing blocking code is usually not scalable.

you can compare bytes written to s3 with file size. lets say you use following method to write to s3:
bytes_written = key.set_contents_from_file(file_binary, rewind=True)
in your case it's set_contents_from_string
then I would compare, bytes_written with p_product_image_url.seek(0, os.SEEK_END)
if they match. whole file has been uploaded to s3.

How to get the Worksheet ID from a Google Spreadsheet with python?

I'd like to identify a method to attain the Worksheet ID within the URL for each of the worksheets within a Google Spreadsheet Workbook. For example, the worksheet id for 'sheet2' of this workbook is '1244369280' , since it's url is https://docs.google.com/spreadsheets/d/1yd8qTYjRns4_OT8PbsZzH0zajvzguKS79dq6j--hnTs/edit#gid=1244369280
One method I've found is to pull the XML of a Google Spreadsheet, since according to this question, the only way to get the Worksheet ID is to stream down the XML of a worksheet, but the example is in Javascript and I need to do this in Python
This is the Javascript Code that I'd like to execute in Python:
Dim worksheetFeed As WorksheetFeed
Dim query As WorksheetQuery
Dim worksheet As WorksheetEntry
Dim output As New MemoryStream
Dim xml As String
Dim gid As String = String.Empty
Try
_service = New Spreadsheets.SpreadsheetsService("ServiceName")
_service.setUserCredentials(UserId, Password)
query = New WorksheetQuery(feedUrl)
worksheetFeed = _service.Query(query)
worksheet = worksheetFeed.Entries(0)
' Save worksheet feed to memory stream so we can
' get the xml returned from the feed url and look for
' the gid. Gid allows us to download the specific worksheet tab
Using output
worksheet.SaveToXml(output)
End Using
xml = Encoding.ASCII.GetString(output.ToArray())
It seems that the best way to get the XML from a Google Spreadsheet is using Gdata, so I've downloaded GData and tried the Google Spreadsheet example with my credentials.
See below
#!/usr/bin/python
#
# Copyright (C) 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__author__ = 'api.laurabeth#gmail.com (Laura Beth Lincoln)'
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import gdata.spreadsheet.service
import gdata.service
import atom.service
import gdata.spreadsheet
import atom
import getopt
import sys
import string
class SimpleCRUD:
def __init__(self, email, password):
self.gd_client = gdata.spreadsheet.service.SpreadsheetsService()
self.gd_client.email = 'chris#curalate.com'
self.gd_client.password = 'jkjkdioerzumawya'
self.gd_client.source = 'Spreadsheets GData Sample'
self.gd_client.ProgrammaticLogin()
self.curr_key = ''
self.curr_wksht_id = ''
self.list_feed = None
def _PromptForSpreadsheet(self):
# Get the list of spreadsheets
feed = self.gd_client.GetSpreadsheetsFeed()
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_key = id_parts[len(id_parts) - 1]
def _PromptForWorksheet(self):
# Get the list of worksheets
feed = self.gd_client.GetWorksheetsFeed(self.curr_key)
self._PrintFeed(feed)
input = raw_input('\nSelection: ')
id_parts = feed.entry[string.atoi(input)].id.text.split('/')
self.curr_wksht_id = id_parts[len(id_parts) - 1]
def _PromptForCellsAction(self):
print ('dump\n'
'update {row} {col} {input_value}\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ', 1)
if command[0] == 'dump':
self._CellsGetAction()
elif command[0] == 'update':
parsed = command[1].split(' ', 2)
if len(parsed) == 3:
self._CellsUpdateAction(parsed[0], parsed[1], parsed[2])
else:
self._CellsUpdateAction(parsed[0], parsed[1], '')
else:
self._InvalidCommandError(input)
def _PromptForListAction(self):
print ('dump\n'
'insert {row_data} (example: insert label=content)\n'
'update {row_index} {row_data}\n'
'delete {row_index}\n'
'Note: No uppercase letters in column names!\n'
'\n')
input = raw_input('Command: ')
command = input.split(' ' , 1)
if command[0] == 'dump':
self._ListGetAction()
elif command[0] == 'insert':
self._ListInsertAction(command[1])
elif command[0] == 'update':
parsed = command[1].split(' ', 1)
self._ListUpdateAction(parsed[0], parsed[1])
elif command[0] == 'delete':
self._ListDeleteAction(command[1])
else:
self._InvalidCommandError(input)
def _CellsGetAction(self):
# Get the feed of cells
feed = self.gd_client.GetCellsFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(feed)
def _CellsUpdateAction(self, row, col, inputValue):
entry = self.gd_client.UpdateCell(row=row, col=col, inputValue=inputValue,
key=self.curr_key, wksht_id=self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsCell):
print 'Updated!'
def _ListGetAction(self):
# Get the list feed
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self._PrintFeed(self.list_feed)
def _ListInsertAction(self, row_data):
entry = self.gd_client.InsertRow(self._StringToDictionary(row_data),
self.curr_key, self.curr_wksht_id)
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Inserted!'
def _ListUpdateAction(self, index, row_data):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
entry = self.gd_client.UpdateRow(
self.list_feed.entry[string.atoi(index)],
self._StringToDictionary(row_data))
if isinstance(entry, gdata.spreadsheet.SpreadsheetsList):
print 'Updated!'
def _ListDeleteAction(self, index):
self.list_feed = self.gd_client.GetListFeed(self.curr_key, self.curr_wksht_id)
self.gd_client.DeleteRow(self.list_feed.entry[string.atoi(index)])
print 'Deleted!'
def _StringToDictionary(self, row_data):
dict = {}
for param in row_data.split():
temp = param.split('=')
dict[temp[0]] = temp[1]
return dict
def _PrintFeed(self, feed):
for i, entry in enumerate(feed.entry):
if isinstance(feed, gdata.spreadsheet.SpreadsheetsCellsFeed):
print '%s %s\n' % (entry.title.text, entry.content.text)
elif isinstance(feed, gdata.spreadsheet.SpreadsheetsListFeed):
print '%s %s %s' % (i, entry.title.text, entry.content.text)
# Print this row's value for each column (the custom dictionary is
# built using the gsx: elements in the entry.)
print 'Contents:'
for key in entry.custom:
print ' %s: %s' % (key, entry.custom[key].text)
print '\n',
else:
print '%s %s\n' % (i, entry.title.text)
def _InvalidCommandError(self, input):
print 'Invalid input: %s\n' % (input)
def Run(self):
self._PromptForSpreadsheet()
self._PromptForWorksheet()
input = raw_input('cells or list? ')
if input == 'cells':
while True:
self._PromptForCellsAction()
elif input == 'list':
while True:
self._PromptForListAction()
def main():
# parse command line options
try:
opts, args = getopt.getopt(sys.argv[1:], "", ["user=", "pw="])
except getopt.error, msg:
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
user = 'fake#gmail.com'
pw = 'fakepassword'
key = ''
# Process options
for o, a in opts:
if o == "--user":
user = a
elif o == "--pw":
pw = a
if user == '' or pw == '':
print 'python spreadsheetExample.py --user [username] --pw [password] '
sys.exit(2)
sample = SimpleCRUD(user, pw)
sample.Run()
if __name__ == '__main__':
main()
However this returns the following error:
Traceback (most recent call last):
File "/Users/Chris/Desktop/gdata_test.py", line 200, in <module>
main()
File "/Users/Chris/Desktop/gdata_test.py", line 196, in main
sample.Run()
File "/Users/Chris/Desktop/gdata_test.py", line 162, in Run
self._PromptForSpreadsheet()
File "/Users/Chris/Desktop/gdata_test.py", line 49, in _PromptForSpreadsheet
feed = self.gd_client.GetSpreadsheetsFeed()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/service.py", line 99, in GetSpreadsheetsFeed
converter=gdata.spreadsheet.SpreadsheetsSpreadsheetsFeedFromString)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/service.py", line 1074, in Get
return converter(result_body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/gdata/spreadsheet/__init__.py", line 395, in SpreadsheetsSpreadsheetsFeedFromString
xml_string)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 93, in optional_warn_function
return f(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/atom/__init__.py", line 127, in CreateClassFromXMLString
tree = ElementTree.fromstring(xml_string)
File "<string>", line 125, in XML
cElementTree.ParseError: no element found: line 1, column 0
[Finished in 0.3s with exit code 1]
[shell_cmd: python -u "/Users/Chris/Desktop/gdata_test.py"]
[dir: /Users/Chris/Desktop]
[path: /usr/bin:/bin:/usr/sbin:/sbin]
I should also mention that I've been using Gspread as a method to interact with Google Spreadsheets, but when I run the below code, I get the gid, but I need to have the worksheet id.
gc = gspread.authorize(credentials)
sh = gc.open_by_url('google_spreadsheet_url')
sh.get_id_fields()
>> {'spreadsheet_id': '1BgCEn-3Nor7UxOEPwD-qv8qXe7CaveJBrn9_Lcpo4W4','worksheet_id': 'oqitk0d'}

See the self.gd_client.ProgrammaticLogin() call - this is causing the major problem since it uses the "ClientLogin" authorization method which was first deprecated and later removed on April 20, 2015.
I would actually look into the more fresh and actively developed gspread module instead.
Here is a, somewhat insane, example demonstrating how to extract the actual "gid" value for a given spreadsheet and worksheet name. Note that you would first need to generate the JSON file with the OAuth credentials (I'm assuming you've already done that).
The code (added comments that would hopefully help to understand it):
import urlparse
import xml.etree.ElementTree as ET
import gspread
from oauth2client.service_account import ServiceAccountCredentials
SPREADSHEET_NAME = 'My Test Spreadsheet'
WORKSHEET_NAME = "Sheet2"
PATH_TO_JSON_KEYFILE = '/path/to/json/key/file.json'
NAMESPACES = {'ns0': 'http://www.w3.org/2005/Atom'}
SCOPES = ['https://spreadsheets.google.com/feeds']
# log in
credentials = ServiceAccountCredentials.from_json_keyfile_name(PATH_TO_JSON_KEYFILE, SCOPES)
gss_client = gspread.authorize(credentials)
# open spreadsheet
gss = gss_client.open(SPREADSHEET_NAME)
# extract the full feed url
root = gss._feed_entry
full_feed_url = next(elm.attrib["href"] for elm in root.findall("ns0:link", namespaces=NAMESPACES) if "full" in elm.attrib["href"])
# get the feed and extract the gid value for a given sheet name
response = gss_client.session.get(full_feed_url)
root = ET.fromstring(response.content)
sheet_entry = next(elm for elm in root.findall("ns0:entry", namespaces=NAMESPACES)
if elm.find("ns0:title", namespaces=NAMESPACES).text == WORKSHEET_NAME)
link = next(elm.attrib["href"] for elm in sheet_entry.findall("ns0:link", namespaces=NAMESPACES)
if "gid=" in elm.attrib["href"])
# extract "gid" from URL
gid = urlparse.parse_qs(urlparse.urlparse(link).query)["gid"][0]
print(gid)
It also looks like there is a way to convert the worksheet ID to a gid value, see:
How to convert Google spreadsheet's worksheet string id to integer index (GID)?

Jan 2017
You can use the new google spreadsheet api v4. You could take look at pygsheets library which uses api v4.
import pygsheets
#authorize the pygsheets
gc = pygsheets.authorize()
#open the spreadsheet
sh = gc.open('my new ssheet')
# get the worksheet and its id
print sh.worksheet_by_title("my test sheet").id

this seems to work for me using gspread
given a spreadsheet's worksheet url named 'mysheet1' that looks like this:
https://docs.google.com/spreadsheets/d/xxxxxf435454xxkjkjk23232325/edit#gid=645031900
this could be use to retrieve the gid value (aka: worksheet id or sheetid)
ss_key = xxxxxf435454xxkjkjk23232325
wks_name = mysheet1
gc.open_by_key('xxxxxf435454xxkjkjk23232325').worksheet('mysheet1').id
result:
645031900

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Big Query table create and load data via Python - python

Related

Python: Passing a variable to another function

TWS API - store company snapshot and financial statements

Confused by a python type errror

How to check if a file has completed uploading into S3 Bucket using Boto in Python?

How to get the Worksheet ID from a Google Spreadsheet with python?

Categories

Resources