Passing in 'date' as a runtime argument in Google Dataflow Template - python

I'm currently trying to generate a Google Dataflow custom template, that will call an API when run, and write the results to a BigQuery table.
However the issue I'm encountering is that the API requires a date parameter 'YYYY-MM-DD' to be passed in for it to work.
Unfortunately, it seems that when constructing a template, Dataflow requires that you use a ValueProvider (as described here) for any variable that is relative to when the job is run (i.e. today's date). Otherwise it will just carry on using the same date that was generated when the template was originally created (i.e. with dt.date.today() etc. - h/t to this post).
Therefore, with the code I've got, is there any way to generate the template so that it uses today's date correctly as an argument at runtime, rather than using the same static date indefinitely, or, as is currently the case, not converting to a template at all?
from __future__ import print_function, absolute_import
import argparse
import logging
import sys
import apache_beam as beam
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.metrics.metric import Metrics
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
from apache_beam.options.value_provider import ValueProvider
import datetime as dt
from datetime import timedelta, date
import time
import re
logging.getLogger().setLevel(logging.INFO)
class GetAPI():
    def __init__(self, data={}, date=None):
        self.num_api_errors = Metrics.counter(self.__class__, 'num_api_errors')
        self.data = data
        self.date = date

    def get_job(self):
        import requests
        endpoint = f'https://www.rankranger.com/api/v2/?rank_stats&key={self.data.api_key}&date={self.date}'\
                   f'&campaign_id={self.data.campaign}&se_id={self.data.se}&domain={self.data.domain}&output=json'
        logging.info("Endpoint: {}".format(str(endpoint)))
        try:
            res = requests.get(endpoint)
            if res.status_code == 200:
                # logging.info("Response: {}".format(str(res.text)))
                json_data = res.json()
                ## Store the API response
                if 'result' in json_data:
                    response = json_data.get('result')
                    return response
        except Exception as e:
            self.num_api_errors.inc()
            logging.error(f'Exception: {e}')
            logging.error('Extract error on "%s"', 'Rank API')
def format_dates(api):
    api['date'] = dt.datetime.strptime(api['date'], "%m/%d/%Y").strftime("%Y-%m-%d")
    return api
# Class to pass in date generated at runtime to template
class UserOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        ## Special runtime argument e.g. date
        parser.add_value_provider_argument('--date',
                                           type=str,
                                           default=(dt.date.today()).strftime("%Y-%m-%d"),
                                           help='Run date in YYYY-MM-DD format.')
def run(argv=None):
    """
    Main entry point; defines the static arguments to be passed in.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--api_key',
                        type=str,
                        default=API_KEY,
                        help='API key for Rank API.')
    parser.add_argument('--campaign',
                        type=str,
                        default=CAMPAIGN,
                        help='Campaign ID for Rank API')
    parser.add_argument('--se',
                        type=str,
                        default=SE,
                        help='Search Engine ID for Rank API')
    parser.add_argument('--domain',
                        type=str,
                        default=DOMAIN,
                        help='Domain for Rank API')
    parser.add_argument('--dataset',
                        type=str,
                        default=DATASET,
                        help='BigQuery Dataset to write tables to. Must already exist.')
    parser.add_argument('--table_name',
                        type=str,
                        default=TABLE_NAME,
                        help='The BigQuery table name. Should not already exist.')
    parser.add_argument('--project',
                        type=str,
                        default=PROJECT,
                        help='Your GCS project.')
    parser.add_argument('--runner',
                        type=str,
                        default="DataflowRunner",
                        help='Type of DataFlow runner.')
    args, pipeline_args = parser.parse_known_args(argv)

    # Create and set your PipelineOptions.
    options = PipelineOptions(pipeline_args)
    user_options = options.view_as(UserOptions)
    pipeline = beam.Pipeline(options=options)

    # Gets data from Rank Ranger API
    api = (
        pipeline
        | 'create' >> beam.Create(GetAPI(data=args, date=user_options.date).get_job())
        | 'format dates' >> beam.Map(format_dates)
    )

    # Write to bigquery based on specified schema
    BQ = (api | "WriteToBigQuery" >> beam.io.WriteToBigQuery(args.table_name, args.dataset, SCHEMA))

    pipeline.run()


if __name__ == '__main__':
    run()
As you can see from the error output below, rather than passing in a neatly formatted 'YYYY-MM-DD' parameter, it's instead passing in the full ValueProvider object, which stops the API call from working and leads to the NoneType error.
(Apache) C:\Users\user.name\Documents\Alchemy\Dataflow\production_pipeline\templates>python main.py --runner DataflowRunner --project <PROJECT> --staging_location gs://<STORAGE-BUCKET>/staging --temp_location gs://<STORAGE-BUCKET>/temp --template_location gs://<STORAGE-BUCKET>/template/<TEMPLATE> --region europe-west2
INFO:root:Endpoint: https://www.rankranger.com/api/v2/?rank_stats&key=<API_KEY>&date=RuntimeValueProvider(option: date, type: str, default_value: '2020-08-25')&campaign_id=<CAMPAIGN>&se_id=<SE>&domain=<DOMAIN>&output=json
Traceback (most recent call last):
  File "main.py", line 267, in <module>
    run()
  File "main.py", line 257, in run
    | 'format dates' >> beam.Map(format_dates)
  File "C:\Users\user.name\Anaconda3\envs\Apache\lib\site-packages\apache_beam\transforms\core.py", line 2590, in __init__
    self.values = tuple(values)
TypeError: 'NoneType' object is not iterable
Any help would be hugely appreciated!

You are correct in your diagnosis. You should consider migrating to Flex Templates which solve this (and other) issues and provide much more flexibility.
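For completeness, if you stay on classic templates, the usual workaround (a sketch adapted from the question's code, not something this answer prescribes) is to stop calling the API at graph-construction time. beam.Create(GetAPI(...).get_job()) runs while the template is being built, before any runtime value exists; moving the call into a DoFn defers it to job execution, where ValueProvider.get() is allowed. The FetchFromAPI name below is made up for the example and it reuses the question's GetAPI class.
import apache_beam as beam


class FetchFromAPI(beam.DoFn):
    """Hypothetical DoFn that resolves the runtime date only when the job runs."""

    def __init__(self, data, date_vp):
        self.data = data
        self.date_vp = date_vp  # a ValueProvider, not a plain string

    def process(self, _):
        # .get() is only valid at execution time, which is why it must live
        # inside process() rather than in the pipeline-construction code.
        date = self.date_vp.get()
        results = GetAPI(data=self.data, date=date).get_job() or []
        for row in results:
            yield row


# In run(), the source becomes a single dummy element feeding the DoFn:
# api = (
#     pipeline
#     | 'start' >> beam.Create(['start'])
#     | 'fetch' >> beam.ParDo(FetchFromAPI(data=args, date_vp=user_options.date))
#     | 'format dates' >> beam.Map(format_dates)
# )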

Related

TypeError: DriverAction() got an unexpected keyword argument 'nargs'

I am currently working on a CLI tool using Python's argparse module. It throws this error every time I run the program. I tried removing both the help and nargs arguments, but nothing seems to work.
Full Error traceback
Traceback (most recent call last):
  File "/home/sharhan/.local/share/virtualenvs/backupdb-CXGj3PlQ/bin/backupdb", line 11, in <module>
    load_entry_point('backupdb', 'console_scripts', 'backupdb')()
  File "/home/sharhan/DEV/PYTHON/CLI-TOOLS/backupdb/src/backupdb/cli.py", line 42, in main
    args = create_parser().parse_args()
  File "/home/sharhan/DEV/PYTHON/CLI-TOOLS/backupdb/src/backupdb/cli.py", line 27, in create_parser
    parser.add_argument(
  File "/usr/lib/python3.9/argparse.py", line 1428, in add_argument
    action = action_class(**kwargs)
TypeError: DriverAction() got an unexpected keyword argument 'nargs'
This is the code:
from argparse import Action, ArgumentParser
from os import pardir


def DriverAction(Action):
    """
    Namespace is a class that stores the arguments passed in the CLI.
    collect driver & destination from values and store them in the namespace
    using the dot notation appropriately
    """
    def __call__(self, parser, namespace, values, option_string=None):
        driver, destination = values
        namespace.driver = driver.lower()
        namespace.destination = destination


def create_parser():
    parser = ArgumentParser(description="""
    Back up PostgreSQl databases Locally or to AWS S3
    """)
    parser.add_argument(
        "url",
        help="URL of the database to backup"
    )
    parser.add_argument(
        "--driver",
        nargs=2,
        action=DriverAction,
        required=True,
        help="How and where to store the backup",
    )
    return parser


def main():
    import boto3
    from backupdb import pgdump, storage

    args = create_parser().parse_args()
    dump = pgdump.dump(args.url)
    if args.driver == 's3':
        client = boto3.client('s3')
        storage.s3(client, dump.stdout, args.destination, 'mybackupdb.sql')
    else:
        with open(args.destination, 'wb') as outfile:
            storage.local(dump.stdout, outfile)
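No answer is attached above, but for context: argparse instantiates whatever object is passed as action with keyword arguments such as nargs, so a custom action has to be a class derived from argparse.Action; defining DriverAction with def makes argparse call the plain function with those keywords, which matches the traceback. A minimal, self-contained sketch reusing the question's names (my illustration, not the original poster's fix):
from argparse import Action, ArgumentParser


class DriverAction(Action):
    """Custom action: splits the two --driver values into driver/destination."""

    def __call__(self, parser, namespace, values, option_string=None):
        driver, destination = values
        namespace.driver = driver.lower()
        namespace.destination = destination


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--driver", nargs=2, action=DriverAction, required=True)
    args = parser.parse_args(["--driver", "s3", "my-bucket"])
    print(args.driver, args.destination)  # -> s3 my-bucket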

ModuleNotFoundError: No module named 'common.config' oanda api python

I am trying to connect to the Oanda REST API from a Jupyter notebook with the following code:
#!/usr/bin/env python

import sys
import select
import argparse

import common.config

from .account import Account


def main():
    """
    Create an API context, and use it to fetch an Account state and then
    continually poll for changes to it.
    The configuration for the context and Account to fetch is parsed from the
    config file provided as an argument.
    """
    parser = argparse.ArgumentParser()

    #
    # The config object is initialized by the argument parser, and contains
    # the REST APID host, port, accountID, etc.
    #
    common.config.add_argument(parser)

    parser.add_argument(
        "--poll-interval",
        type=int,
        default=5,
        help="The number of seconds between polls for Account changes"
    )

    args = parser.parse_args()

    account_id = args.config.active_account

    #
    # The v20 config object creates the v20.Context for us based on the
    # contents of the config file.
    #
    api = args.config.create_context()

    #
    # Fetch the details of the Account found in the config file
    #
    response = api.account.get(account_id)

    #
    # Extract the Account representation from the response and use
    # it to create an Account wrapper
    #
    account = Account(
        response.get("account", "200")
    )

    def dump():
        account.dump()

    print("Press <ENTER> to see current state for Account {}".format(
        account.details.id
    ))

    dump()

    while True:
        i, _, _ = select.select([sys.stdin], [], [], args.poll_interval)

        if i:
            sys.stdin.readline()
            dump()

        #
        # Poll for all changes to the account since the last
        # Account Transaction ID that was seen
        #
        response = api.account.changes(
            account_id,
            sinceTransactionID=account.details.lastTransactionID
        )

        account.apply_changes(
            response.get(
                "changes",
                "200"
            )
        )
        account.apply_state(
            response.get(
                "state",
                "200"
            )
        )
        account.details.lastTransactionID = response.get(
            "lastTransactionID",
            "200"
        )


if __name__ == "__main__":
    main()
It is showing this error:
ModuleNotFoundError                       Traceback (most recent call last)
----> 1 import common.view
      2 from position.view import print_positions_map
      3 from order.view import print_orders_map
      4 from trade.view import print_trades_map
ModuleNotFoundError: No module named 'common.view'
I added these two lines at the top of the code and it then ran correctly. I think the error was caused by the module search path.
import sys
sys.path.append('/Users/apple/Documents/code/PythonX86/OandaAPI/example/v20-python-samples/src')
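A slightly more portable variant (an assumption about where the notebook sits relative to the v20-python-samples checkout, not part of the original answer) is to build the src path relative to the working directory instead of hard-coding an absolute path:
import os
import sys

# Assumed layout: the notebook runs from somewhere inside the
# v20-python-samples checkout; adjust the relative hops to reach its src/.
SRC_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)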

Attribute error while creating custom template using python in Google Cloud DataFlow

I am facing an issue while creating a custom template for Cloud Dataflow.
It's simple code that takes data from an input bucket and loads it into BigQuery.
We want to load many tables, so we are trying to create a custom template.
Once this works, the next step would be passing the dataset as a parameter as well.
Error message :
AttributeError: 'StaticValueProvider' object has no attribute 'datasetId'
Code
class ContactUploadOptions(PipelineOptions):
    """
    Runtime Parameters given during template execution
    path and organization parameters are necessary for execution of pipeline
    campaign is optional for committing to bigquery
    """
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--input',
            type=str,
            help='Path of the file to read from'
        )
        parser.add_value_provider_argument(
            '--output',
            type=str,
            help='Output BQ table for the pipeline')


def run(argv=None):
    """The main function which creates the pipeline and runs it."""
    global PROJECT
    from google.cloud import bigquery

    # Retrieve project Id and append to PROJECT from GoogleCloudOptions
    # Initialize runtime parameters as object
    contact_options = PipelineOptions().view_as(ContactUploadOptions)
    PROJECT = PipelineOptions().view_as(GoogleCloudOptions).project
    client = bigquery.Client(project=PROJECT)
    dataset = client.dataset('pharma')
    data_ingestion = DataIngestion()
    pipeline_options = PipelineOptions()

    # Save main session state so pickled functions and classes
    # defined in __main__ can be unpickled
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Parse arguments from command line.
    # data_ingestion = DataIngestion()

    # Instantiate pipeline
    options = PipelineOptions()
    p = beam.Pipeline(options=options)

    (p
     | 'Read from a File' >> beam.io.ReadFromText(contact_options.input, skip_header_lines=0)
     | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s))
     | 'Write to BigQuery' >> beam.io.Write(
         beam.io.BigQuerySink(
             contact_options.output,
             schema='assetid:INTEGER,assetname:STRING,prodcd:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
    )
My command is as below :
python3 -m pharm_template --runner DataflowRunner --project jupiter-120 --staging_location gs://input-cdc/temp/staging --temp_location gs://input-cdc/temp/ --template_location gs://code-cdc/temp/templates/jupiter_pipeline_template
What I tried:
I tried passing --input and --output.
I also tried --experiment=use_beam_bq_sink, but to no avail.
I also tried passing the dataset ID:
datasetId = StaticValueProvider(str, 'pharma')
but no luck.
If anyone has created a template that loads into BigQuery, I could take a cue from it and fix this issue.
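No answer is shown here, but the error is consistent with the old native BigQuerySink resolving its table spec at graph-construction time, so the ValueProvider ends up being treated as a table reference (hence the missing datasetId attribute). A hedged sketch, assuming a reasonably recent Beam SDK, is to write through beam.io.WriteToBigQuery instead, which accepts ValueProvider arguments for the table:
# Sketch only: swap the native sink for WriteToBigQuery so that the runtime
# --output value (e.g. 'project:pharma.assets') is resolved at execution time.
(p
 | 'Read from a File' >> beam.io.ReadFromText(contact_options.input, skip_header_lines=0)
 | 'String To BigQuery Row' >> beam.Map(lambda s: data_ingestion.parse_method(s))
 | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
       table=contact_options.output,   # ValueProvider from --output
       schema='assetid:INTEGER,assetname:STRING,prodcd:INTEGER',
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))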

Python Dataflow Template, making Runtime Parameters globally accessible

The aim of the pipeline is to use runtime variables to open a CSV file and name a BigQuery table.
All I need is to be able to access these variables globally, or within a ParDo, such as by passing them into the function.
I have tried creating a dummy string and then running a FlatMap to access the runtime parameters and make them global, although it returns nothing.
eg.
class CustomPipelineOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--path',
            type=str,
            help='csv storage path')
        parser.add_value_provider_argument(
            '--table_name',
            type=str,
            help='Table Id')


def run():
    def rewrite_values(element):
        """ Rewrite default env values"""
        # global project_id
        # global campaign_id
        # global organization_id
        # global language
        # global file_path
        try:
            logging.info("File Path with str(): {}".format(str(custom_options.path)))
            logging.info("----------------------------")
            logging.info("element: {}".format(element))
            project_id = str(cloud_options.project)
            file_path = custom_options.path.get()
            table_name = custom_options.table_name.get()
            logging.info("project: {}".format(project_id))
            logging.info("File path: {}".format(file_path))
            logging.info("language: {}".format(table_name))
            logging.info("----------------------------")
        except Exception as e:
            logging.info("Error format----------------------------")
            raise KeyError(e)
        return file_path

    pipeline_options = PipelineOptions()
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    custom_options = pipeline_options.view_as(CustomPipelineOptions)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Beginning of the pipeline
    p = beam.Pipeline(options=pipeline_options)

    init_data = (p
                 | beam.Create(["Start"])
                 | beam.FlatMap(rewrite_values))

    # pipeline processing, running pipeline etc.
I can access the project variable with no problem, although everything else comes back blank.
If I make the custom_options variable global, or when I pass a specific custom options object into a function, such as: | 'Read data' >> beam.ParDo(ReadGcsBlobs(path_file=custom_options.path)), it only returns something like RuntimeValueProvider(option: path, type: str, default_value: None).
If I use | 'Read data' >> beam.ParDo(ReadGcsBlobs(path_file=custom_options.path.get())), the variable is an empty string.
So in essence, I just need to access these variables globally. Is that possible?
Finally, to clarify, I do not want to use the ReadFromText method. I can use the runtime variable there, but incorporating the runtime options into the dict created from the CSV file would be too costly, as I am working with huge CSV files.
For me it worked by declaring cloud_options and custom_options as global:
import argparse, logging

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


class CustomPipelineOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--path',
            type=str,
            help='csv storage path')
        parser.add_value_provider_argument(
            '--table_name',
            type=str,
            help='Table Id')


def rewrite_values(element):
    """ Rewrite default env values"""
    # global project_id
    # global campaign_id
    # global organization_id
    # global language
    # global file_path
    try:
        logging.info("File Path with str(): {}".format(str(custom_options.path.get())))
        logging.info("----------------------------")
        logging.info("element: {}".format(element))
        project_id = str(cloud_options.project)
        file_path = custom_options.path.get()
        table_name = custom_options.table_name.get()
        logging.info("project: {}".format(project_id))
        logging.info("File path: {}".format(file_path))
        logging.info("language: {}".format(table_name))
        logging.info("----------------------------")
    except Exception as e:
        logging.info("Error format----------------------------")
        raise KeyError(e)
    return file_path


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    global cloud_options
    global custom_options

    pipeline_options = PipelineOptions(pipeline_args)
    cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    custom_options = pipeline_options.view_as(CustomPipelineOptions)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Beginning of the pipeline
    p = beam.Pipeline(options=pipeline_options)

    init_data = (p
                 | beam.Create(["Start"])
                 | beam.FlatMap(rewrite_values))

    result = p.run()
    # result.wait_until_finish


if __name__ == '__main__':
    run()
After setting the PROJECT and BUCKET variables I staged the template with:
python script.py \
--runner DataflowRunner \
--project $PROJECT \
--staging_location gs://$BUCKET/staging \
--temp_location gs://$BUCKET/temp \
--template_location gs://$BUCKET/templates/global_options
And execute it with providing path and table_name options:
gcloud dataflow jobs run global_options \
--gcs-location gs://$BUCKET/templates/global_options \
--parameters path=test_path,table_name=test_table
And the runtime parameters are logged fine inside the FlatMap.

executescript in python not passing flow file to next processor

I am trying to run an ExecuteScript processor in Apache NiFi using Python, but I'm having a problem passing the flow file to the next processor in my data flow.
If I run the standalone flow-file creation and writing snippet it works and I can read the flow file in the next processor, but when I try to enrich it, it simply does not pass the flow file on. In fact no error is generated, and I have no clue how to proceed. I am a bit new to Python and NiFi and would appreciate your help with this particular issue.
Below is the code I am using, and as you can see it's very simple. I just want to create a flow file and write some string to it using some logic. But no luck so far.
import urllib2
import json
import datetime
import csv
import time
import sys
import traceback

from org.apache.nifi.processor.io import OutputStreamCallback
from org.python.core.util import StringUtil


class WriteContentCallback(OutputStreamCallback):
    def __init__(self, content):
        self.content_text = content

    def process(self, outputStream):
        try:
            outputStream.write(StringUtil.toBytes(self.content_text))
        except:
            traceback.print_exc(file=sys.stdout)
            raise


page_id = "dsssssss"
access_token = "sdfsdfsf%sdfsdf"


def scrapeFacebookPageFeedStatus(page_id, access_token):
    flowFile = session.create()
    flowFile = session.write(flowFile, WriteContentCallback("Hello there this is my data"))
    flowFile = session.write()
    session.transfer(flowFile, REL_SUCCESS)
    print "\nDone!\n%s Statuses Processed in %s" % \
        (num_processed, datetime.datetime.now() - scrape_starttime)


if __name__ == '__main__':
    scrapeFacebookPageFeedStatus(page_id, access_token)
I believe the problem is the check for __main__:
if __name__ == '__main__':
scrapeFacebookPageFeedStatus(page_id, access_token)
__builtin__ was the actual module name in my experiment. You could either remove that check, or add a different one if you want to preserve your separate testing path.
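A minimal sketch of that second option, reusing the question's function name (my illustration, based on the answer above): inside NiFi's ExecuteScript the script module is not '__main__' ('__builtin__' in the answerer's experiment), so the original guard never fires. A guard that covers both names keeps a local testing path working while still running inside NiFi:
if __name__ in ('__main__', '__builtin__'):
    scrapeFacebookPageFeedStatus(page_id, access_token)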
