gae mapreduce generator error no attribute validate_bucket_name - python

This is my first GAE project. I got my serial code to work on the dev server (I am using the GoogleAppEngineLauncher on Mac). Since my code takes too long to finish, I am trying to use mapreduce to speed up the process. I tried the code below but keep getting the following error. I am not sure whether this is caused by a mistake in my code or by something missing from my *.yaml files. Kindly help!
class ShuffleDictPipeline(base_handler.PipelineBase):
    def run(self, *args, **kwargs):
        """ run """
        mapper_params = {
            "entity_kind": "coremic.RandomDict",
            "batch_size": 500,
            "filters": [("idx", "=", ndb_custom_key)]
        }
        reducer_params = {
            "mime_type": "text/plain"
        }
        output = yield mapreduce_pipeline.MapreducePipeline(
            "calc_shuff_core_microb",
            mapper_spec="coremic.shuffle_dict_coremic_map",
            mapper_params=mapper_params,
            reducer_spec="coremic.shuffle_dict_coremic_reduce",
            reducer_params=reducer_params,
            input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
            output_writer_spec="mapreduce.output_writers.BlobstoreOutputWriter",
            shards=16)
        yield StoreOutput(output)
Error:
ERROR 2016-03-05 20:03:21,706 pipeline.py:2432]
Generator mapreduce.mapper_pipeline.MapperPipeline(*(u'calc_shuff_core_microb-map', u'coremic.shuffle_dict_coremic_map', u'mapreduce.input_readers.DatastoreInputReader'), **{'output_writer_spec': u'mapreduce.output_writers._GoogleCloudStorageKeyValueOutputWriter', 'params': {u'batch_size': 500, u'bucket_name': u'app_default_bucket', u'entity_kind': u'coremic.RandomDict',... (324 bytes))#b96dd511c0454fd99413d267b7388857 raised exception. AttributeError: 'NoneType' object has no attribute 'validate_bucket_name'
Traceback (most recent call last):
File "/Users/rr/GAE/coremic/pipeline/pipeline.py", line 2156, in evaluate
self, pipeline_key, root_pipeline_key, caller_output)
File "/Users/rr/GAE/coremic/pipeline/pipeline.py", line 1110, in _run_internal
return self.run(*self.args, **self.kwargs)
File "/Users/rr/GAE/coremic/mapreduce/mapper_pipeline.py", line 102, in run
queue_name=self.queue_name,
File "/Users/rr/GAE/coremic/mapreduce/control.py", line 125, in start_map
in_xg_transaction=in_xg_transaction)
File "/Users/rr/GAE/coremic/mapreduce/handlers.py", line 1730, in _start_map
mapper_output_writer_class.validate(mapper_spec)
File "/Users/rr/GAE/coremic/mapreduce/output_writers.py", line 1075, in validate
return cls.WRITER_CLS.validate(mapper_spec)
File "/Users/rr/GAE/coremic/mapreduce/output_writers.py", line 723, in validate
super(_GoogleCloudStorageOutputWriter, cls).validate(mapper_spec)
File "/Users/rr/GAE/coremic/mapreduce/output_writers.py", line 604, in validate
cloudstorage.validate_bucket_name(
AttributeError: 'NoneType' object has no attribute 'validate_bucket_name'

I am still working on getting everything to work, but a couple of things helped.
1.1 Install the Google Cloud Storage client library in the SDK so the bucket can be accessed: https://cloud.google.com/appengine/docs/python/googlecloudstorageclient
1.2 Set up (create) the bucket.
Then follow steps from https://plus.google.com/+EmlynORegan/posts/6NPaRKxMkf3
Note how the mapper params has changed.
2 - In mapreduce pipelines, replace
"mapreduce.output_writers.BlobstoreOutputWriter"
with
"mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter"
3 - Update the reducer params to the following (the bucket names are placeholders for your own buckets; a combined sketch of steps 2 and 3 follows):
{
    "mime_type": "text/plain",
    "output_writer": {
        "bucket_name": "<your bucket name>",
        "tmp_bucket_name": "<your temp bucket name>"
    }
}
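Putting steps 2 and 3 together, the pipeline call from the question might look roughly like the sketch below. The bucket names are placeholders; everything else follows the question's original code.
mapper_params = {
    "entity_kind": "coremic.RandomDict",
    "batch_size": 500,
    "filters": [("idx", "=", ndb_custom_key)],
    "bucket_name": "<your-gcs-bucket>"           # placeholder
}
reducer_params = {
    "mime_type": "text/plain",
    "output_writer": {
        "bucket_name": "<your-gcs-bucket>",      # placeholder
        "tmp_bucket_name": "<your-tmp-bucket>"   # placeholder
    }
}
output = yield mapreduce_pipeline.MapreducePipeline(
    "calc_shuff_core_microb",
    mapper_spec="coremic.shuffle_dict_coremic_map",
    mapper_params=mapper_params,
    reducer_spec="coremic.shuffle_dict_coremic_reduce",
    reducer_params=reducer_params,
    input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
    output_writer_spec="mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter",
    shards=16)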
Other very useful link:
https://gist.github.com/nlathia/ab670053ed460c4ca02f/89178e132b894fe5467c09164d3827f70e4ae2f8

You can do one of two things. Either:
Create a google cloud storage bucket associated with your project, because at the moment none is associated with it, hence the NoneType. Once done, you can add that to your mapper_params.
mapper_params = {
    ...
    "bucket_name": "<your google cloud storage bucket name>",
    ...
}
OR
Create a default bucket by visiting your App Engine application settings in the Cloud Console: https://console.cloud.google.com/appengine/settings?project=
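If you go the default-bucket route, one way to avoid hard-coding the name is to look it up at runtime with app_identity. This is only a sketch, and it assumes the default bucket has already been created for the project:
from google.appengine.api import app_identity

# Returns the project's default GCS bucket name, or None if no default
# bucket exists yet (in which case the writer validation would still fail).
bucket_name = app_identity.get_default_gcs_bucket_name()

mapper_params = {
    # ... the other params from the question ...
    "bucket_name": bucket_name,
}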

Install GoogleAppEngineCloudStorageClient in your project.
output_writers.py does the following:
try:
    # Check if the full cloudstorage package exists. The stub part is in runtime.
    cloudstorage = None
    import cloudstorage
    if hasattr(cloudstorage, "_STUB"):
        cloudstorage = None
    # "if" is needed because apphosting/ext/datastore_admin:main_test fails.
    if cloudstorage:
        from cloudstorage import cloudstorage_api
        from cloudstorage import errors as cloud_errors
except ImportError:
    pass  # CloudStorage library not available
So, when importing cloudstorage fails, the cloudstorage variable stays None, and that is what causes the exception later.
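A quick way to confirm which situation you are in is to try the same import yourself with your app's directory on the path. A sketch; the path is taken from the traceback above and may differ for your project:
import sys

# Assumption: GoogleAppEngineCloudStorageClient was vendored into the app
# directory shown in the traceback; adjust the path for your own layout.
sys.path.insert(0, "/Users/rr/GAE/coremic")

try:
    import cloudstorage
    print("cloudstorage loaded from:", getattr(cloudstorage, "__file__", "?"))
    print("has validate_bucket_name:", hasattr(cloudstorage, "validate_bucket_name"))
except ImportError as exc:
    print("cloudstorage is not importable:", exc)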

Related

Flink Python Datastream API Kafka Consumer

I'm new to PyFlink. I'm trying to write a Python program that reads data from a Kafka topic and prints it to stdout. I followed the link Flink Python Datastream API Kafka Producer Sink Serialization, but I keep seeing a NoSuchMethodError due to a version mismatch. I have added the flink-sql-connector-kafka jar available at https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka_2.11/1.13.0/flink-sql-connector-kafka_2.11-1.13.0.jar. Can someone help me with a proper example of how to do this? The following is my code.
import json
import os

from pyflink.common import SimpleStringSchema
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer
from pyflink.common.typeinfo import Types


def my_map(obj):
    json_obj = json.loads(json.loads(obj))
    return json.dumps(json_obj["name"])


def kafkaread():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///automation/flink/flink-sql-connector-kafka_2.11-1.10.1.jar")
    deserialization_schema = SimpleStringSchema()
    kafkaSource = FlinkKafkaConsumer(
        topics='test',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': '10.234.175.22:9092', 'group.id': 'test'}
    )
    ds = env.add_source(kafkaSource).print()
    env.execute('kafkaread')


if __name__ == '__main__':
    kafkaread()
But Python doesn't recognise the jar file and throws the following error.
Traceback (most recent call last):
File "flinkKafka.py", line 31, in <module>
kafkaread()
File "flinkKafka.py", line 20, in kafkaread
kafkaSource = FlinkKafkaConsumer(
File "/automation/flink/venv/lib/python3.8/site-packages/pyflink/datastream/connectors.py", line 186, in __init__
j_flink_kafka_consumer = _get_kafka_consumer(topics, properties, deserialization_schema,
File "/automation/flink/venv/lib/python3.8/site-packages/pyflink/datastream/connectors.py", line 336, in _get_kafka_consumer
j_flink_kafka_consumer = j_consumer_clz(topics,
File "/automation/flink/venv/lib/python3.8/site-packages/pyflink/util/exceptions.py", line 185, in wrapped_call
raise TypeError(
TypeError: Could not found the Java class 'org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer'. The Java dependencies could be specified via command line argument '--jarfile' or the config option 'pipeline.jars'
What is the correct location to add the jar file?
I see that you downloaded flink-sql-connector-kafka_2.11-1.13.0.jar, but the code loads flink-sql-connector-kafka_2.11-1.10.1.jar.
Maybe you can check that.
You just need to check the path to the flink-sql-connector jar.
You should add the flink-sql-connector-kafka jar that matches your PyFlink and Scala versions. If the versions are correct, check that the path passed to the add_jars function actually points to where the jar file is.
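For example, a small sanity check along these lines catches both problems. The jar path here is an assumption based on the question; adjust it to wherever you actually saved the connector:
import os

from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()

# Use the jar you actually downloaded (1.13.0 in the question's link), and
# keep its Scala/Flink version consistent with your PyFlink installation.
jar_path = "/automation/flink/flink-sql-connector-kafka_2.11-1.13.0.jar"
if not os.path.exists(jar_path):
    raise FileNotFoundError("connector jar not found at %s" % jar_path)

env.add_jars("file://" + jar_path)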

google ads api - "argument should be integer or bytes-like object, not 'str'"

I've been trying to follow the examples and documentation for the Python ad_manager library for the Google Ads API, but I haven't been able to complete a successful request. I currently have my developer token, client_id, client_secret, and refresh_token in my googleads YAML file, but I constantly get the error "argument should be integer or bytes-like object, not 'str'" when calling WaitForReport in the example code below. I was wondering if anyone had any advice on how I could tackle this issue.
import tempfile

# Import appropriate modules from the client library.
from googleads import ad_manager
from googleads import errors


def main(client):
    # Initialize a DataDownloader.
    report_downloader = client.GetDataDownloader(version='v202111')

    # Create report job.
    report_job = {
        'reportQuery': {
            'dimensions': ['COUNTRY_NAME', 'LINE_ITEM_ID', 'LINE_ITEM_NAME'],
            'columns': ['UNIQUE_REACH_FREQUENCY', 'UNIQUE_REACH_IMPRESSIONS',
                        'UNIQUE_REACH'],
            'dateRangeType': 'REACH_LIFETIME'
        }
    }

    try:
        # Run the report and wait for it to finish.
        report_job_id = report_downloader.WaitForReport(report_job)
    except errors.AdManagerReportError as e:
        print('Failed to generate report. Error was: %s' % e)

    # Change to your preferred export format.
    export_format = 'CSV_DUMP'

    report_file = tempfile.NamedTemporaryFile(suffix='.csv.gz', delete=False)

    # Download report data.
    report_downloader.DownloadReportToFile(
        report_job_id, export_format, report_file)

    report_file.close()

    # Display results.
    print('Report job with id "%s" downloaded to:\n%s' % (
        report_job_id, report_file.name))


if __name__ == '__main__':
    # Initialize client object.
    ad_manager_client = ad_manager.AdManagerClient.LoadFromStorage()
    main(ad_manager_client)
Edit:
Below is the stack trace:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/googleads/common.py", line 984, in MakeSoapRequest
return soap_service_method(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/zeep/proxy.py", line 46, in __call__
return self._proxy._binding.send(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/zeep/wsdl/bindings/soap.py", line 135, in send
return self.process_reply(client, operation_obj, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/zeep/wsdl/bindings/soap.py", line 229, in process_reply
return self.process_error(doc, operation)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/zeep/wsdl/bindings/soap.py", line 317, in process_error
raise Fault(
zeep.exceptions.Fault: Unknown fault occured
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "google_ads.py", line 72, in <module>
main(ad_manager_client)
File "google_ads.py", line 33, in main1
report_job_id = report_downloader.WaitForReport(report_job)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/googleads/ad_manager.py", line 784, in WaitForReport
report_job_id = service.runReportJob(report_job)['id']
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/googleads/common.py", line 989, in MakeSoapRequest
underlying_exception = e.detail.find(
TypeError: argument should be integer or bytes-like object, not 'str'
In your YAML file, do you have your account number in quotes? (either single or double?)
Additionally, I would highly recommend not going with this API if you have the option. It will be sunset in April and will no longer work. The newer google ads API (as opposed to the AdWords API) is available, stable and much easier to work with. The ad manager examples are good too.
The problem seems to be that zeep raises a Fault which includes the returned XML response in zeep.Fault.detail.
Somewhat counter-intuitively, this attribute is not a string but a bytes sequence, because zeep.wsdl.utils.etree_to_string calls etree.tostring() with encoding="utf-8" instead of encoding="unicode"; the latter would make sure it is a proper str.
googleads then tries to look for specific error strings inside the XML using find(), but even though find() is defined both on str and bytes, the type of the substring to look for needs to match.
Thus, in
underlying_exception = e.detail.find(
    '{%s}ApiExceptionFault' % self._GetBindingNamespace())
bytes.find() is called with a str argument, causing the TypeError you see.
I'd argue that zeep.wsdl.utils.etree_to_string() should be adjusted to actually return a str instead of bytes. You could try opening an issue on Zeep's GitHub repository.
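The mismatch is easy to reproduce outside of googleads and zeep. A minimal illustration; the fault payload here is a made-up stand-in:
# e.detail is effectively bytes, e.g.:
detail = b"<soap:Fault>...ApiExceptionFault...</soap:Fault>"

# bytes.find() with a str needle raises the error from the question:
#   detail.find("ApiExceptionFault")
#   TypeError: argument should be integer or bytes-like object, not 'str'

# Workarounds: decode first, or search with a bytes needle.
idx_str = detail.decode("utf-8").find("ApiExceptionFault")
idx_bytes = detail.find(b"ApiExceptionFault")
print(idx_str, idx_bytes)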

Is there a particular syntax for using python relative imports alongside connexion?

I am currently trying to build an api using connexion. However, I am having some issues using relative local module imports through the connexion module, which modifies the underlying flask app. Here is a simplified overview of my file structure:
hall_of_fame_api
    controller
        __init__.py
        routes.py
    model
        __init__.py
        routes.py
    __init__.py
    config.py
    create_db.py
    swagger.yml
I am getting an error when I try to run 'python config.py' in my terminal. Here is config.py:
import os
import connexion
from flask_sqlalchemy import SQLAlchemy
from flask_marshmallow import Marshmallow
basedir = os.path.abspath(os.path.dirname(__file__))
# Create the Connexion application instance
connex_app = connexion.App(__name__, specification_dir=basedir)
# Get the underlying Flask app instance
app = connex_app.app
connex_app.add_api('swagger.yml')
# Configure the SQLAlchemy part of the app instance
app.config['SQLALCHEMY_ECHO'] = True
app.config['SQLALCHEMY_DATABASE_URI'] = 'postgresql://doadmin:password@nba-player-db-do-user-7027314-0.db.ondigitalocean.com:25060/nba_test_1?sslmode=require'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# Create the SQLAlchemy db instance
db = SQLAlchemy(app)
# Initialize Marshmallow
ma = Marshmallow(app)
And here is the error it gives:
Failed to add operation for GET /api/players
Failed to add operation for GET /api/players
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/connexion/apis/abstract.py", line 209, in add_paths
self.add_operation(path, method)
File "/usr/local/lib/python3.7/site-packages/connexion/apis/abstract.py", line 173, in add_operation
pass_context_arg_name=self.pass_context_arg_name
File "/usr/local/lib/python3.7/site-packages/connexion/operations/__init__.py", line 8, in make_operation
return spec.operation_cls.from_spec(spec, *args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/connexion/operations/swagger2.py", line 137, in from_spec
**kwargs
File "/usr/local/lib/python3.7/site-packages/connexion/operations/swagger2.py", line 96, in __init__
pass_context_arg_name=pass_context_arg_name
File "/usr/local/lib/python3.7/site-packages/connexion/operations/abstract.py", line 96, in __init__
self._resolution = resolver.resolve(self)
File "/usr/local/lib/python3.7/site-packages/connexion/resolver.py", line 40, in resolve
return Resolution(self.resolve_function_from_operation_id(operation_id), operation_id)
File "/usr/local/lib/python3.7/site-packages/connexion/resolver.py", line 66, in resolve_function_from_operation_id
raise ResolverError(str(e), sys.exc_info())
connexion.exceptions.ResolverError: <ResolverError: module 'controller.routes' has no attribute 'read_all'>
This error comes specifically from line 12, where connexion tries to add the swagger.yml file; here it is for reference as well:
swagger: "2.0"
info:
description: This is the swagger file that goes with our server code
version: "1.0.0"
title: Swagger REST Article
consumes:
- "application/json"
produces:
- "application/json"
basePath: "/api"
# Paths supported by the server application
paths:
/players:
get:
operationId: "controller.routes.read_all"
tags:
- "People"
summary: "The people data structure supported by the server application"
description: "Read the list of people"
responses:
200:
description: "Successful read people list operation"
schema:
type: "array"
items:
properties:
fname:
type: "string"
lname:
type: "string"
timestamp:
type: "string"
Now here is where I am confused, because my routes.py file does have a function named read_all(); here is that file:
from model.models import Regseason, RegSchema, Playoffs, PlayoffSchema


def read_all():
    return Regseason.query.all()
I have been racking my brain over this bug for almost 24 hours, any guidance would be greatly appreciated. Thanks in advance!
Please add an extra field, x-openapi-router-controller, below operationId. Connexion uses it to work out which module to route requests to; it is combined with the operationId to resolve the correct module and function.
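For example, the path in the question's swagger.yml could be adjusted roughly as sketched below. Note that for a swagger: "2.0" spec the extension is usually spelled x-swagger-router-controller, while x-openapi-router-controller is the OpenAPI 3 spelling:
paths:
  /players:
    get:
      # Tells Connexion which module the handler lives in ...
      x-swagger-router-controller: "controller.routes"
      # ... and which function inside that module to call.
      operationId: "read_all"
      tags:
        - "People"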

Pandas / Google Analytics API authentication attempt throws me a weird python error

During my pandas/Google Analytics API setup, I basically did everything as described in this link:
http://blog.yhathq.com/posts/pandas-google-analytics.html
The client_secrets.json is in the pandas/io folder. When I now try to execute a statement of the form
>>>from pandas.io import ga
>>>df = ga.read_ga(metrics, dimensions, start_date)
the following error occurs:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "\Anaconda\lib\site-packages\pandas\io\ga.py", line 110, in read_ga
reader = GAnalytics(**reader_kwds)
File "\Anaconda\lib\site-packages\pandas\io\ga.py", line 179, in __init__
self._service = self._init_service(secrets)
File "\Anaconda\lib\site-packages\pandas\io\ga.py", line 191, in _init_service
http = self.authenticate(secrets)
File "\Anaconda\lib\site-packages\pandas\io\ga.py", line 151, in authenticate
return auth.authenticate(flow, self.token_store)
File "\Anaconda\lib\site-packages\pandas\io\auth.py", line 108, in authenticate
credentials = tools.run(flow, storage)
AttributeError: 'module' object has no attribute 'run'
According to the yhat link, my browser should open for authentication.
Note: I did not create the Client ID for "installed application", since I did not have this choice in the menu when creating the ID. Instead, I chose "other". This shouldn't be the cause of the error, though.
Second Note: I recently updated my pandas to 0.17.1. When importing pandas.io.ga, I got the message that the .ga module is deprecated. Furthermore, I manually installed the gflags module, because it was needed when I tried to import .io.ga the first time.
Either file a ticket with the owners of pandas to change (currently) line 108 of pandas/io/auth.py from run() to run_flow(), or make the fix yourself and file a PR. (Yes, it would've been nice if Google had just made run_flow() an alias of run(), but as you can imagine, this is not how this change evolved, so we have to live with it.)
For other developers running into this error: if you have the latest version (as of Feb 2016) of the Google APIs Client Library for Python, just rename your call from tools.run() to tools.run_flow(), and you should be good to go. There is more about this change in a PSA (public service announcement) blog post I wrote back in mid-2015 and update periodically to stay current.
The fastest way to upgrade your Client Library is with:
pip install -U google-api-python-client # or pip3 for 3.x
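If you would rather patch your local copy than wait for a pandas release, the change inside pandas/io/auth.py's authenticate() is roughly the following. This is only a sketch: flow and storage are the local variables already present in that function, and older oauth2client versions require the flags argument to be passed explicitly.
from oauth2client import tools

# Old call (removed from oauth2client):
#     credentials = tools.run(flow, storage)
# New call:
flags = tools.argparser.parse_args(args=[])
credentials = tools.run_flow(flow, storage, flags)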

How to integrate APScheduler and Imp?

I have built a plugin-based application where "plugins" (Python modules) can be loaded by imp and then scheduled for later execution by APScheduler. I was able to integrate the two successfully, but I want to add persistence in case of crashes or application restarts, so I changed the default memory job store to SQLAlchemyJobStore. It works quite well the first time you run the program: tasks are loaded, scheduled, saved to the database and executed at the right time.
The problem is that when I load the application again I get this traceback:
ERROR:apscheduler.jobstores.default:Unable to restore job "d3e0f0068df54d15986e9b7b6757f665" -- removing it
Traceback (most recent call last):
File "/home/jesus/.local/lib/python2.7/site-packages/apscheduler/jobstores/sqlalchemy.py", line 126, in _get_jobs
jobs.append(self._reconstitute_job(row.job_state))
File "/home/jesus/.local/lib/python2.7/site-packages/apscheduler/jobstores/sqlalchemy.py", line 114, in _reconstitute_job
job.__setstate__(job_state)
File "/home/jesus/.local/lib/python2.7/site-packages/apscheduler/job.py", line 228, in __setstate__
self.func = ref_to_obj(self.func_ref)
File "/home/jesus/.local/lib/python2.7/site-packages/apscheduler/util.py", line 257, in ref_to_obj
raise LookupError('Error resolving reference %s: could not import module' % ref)
LookupError: Error resolving reference __init__:run: could not import module
So it is obvious that there is a problem when attempting to import the function again.
Here is my scheduler initialization:
executors = {'default': ThreadPoolExecutor(5)}
jobstores = {'default': SQLAlchemyJobStore(url='sqlite:///jobs.sqlite')}
self.scheduler = BackgroundScheduler(executors=executors, jobstores=jobstores)
I have a "tests" dictionary containing the "plugins" that should be loaded and some parameters, "load_plugin" uses imp to load a plugin by it's name.
for test, parameters in tests.items():
    if test in pluggins:
        module = load_plugin(pluggins[test])
        self.jobs[test] = self.scheduler.add_job(module.run, "interval",
                                                 seconds=parameters["interval"],
                                                 name=test)
Any idea about how can I handle reconstituting jobs?
Something in the automatic detection of the module name is going wrong. Hard to say what, but the alternative is to manually give it the proper lookup path as a string (e.g. "package.module:function"). If you can do this, you can avoid this problem.
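Adapting the add_job call from the question, that means passing the textual reference instead of the imp-loaded function object. A sketch; "plugins.my_plugin" is a hypothetical importable module path that exposes a module-level run():
# The job store only has to persist the string, and APScheduler can
# re-import "plugins.my_plugin" and look up run() after a restart.
self.jobs[test] = self.scheduler.add_job(
    "plugins.my_plugin:run",   # textual "package.module:function" reference
    "interval",
    seconds=parameters["interval"],
    name=test,
)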
