Streaming results with Blaze and SQLAlchemy - python

I am trying to use Blaze/Odo to read a large (~70M row) result set from Redshift. By default SQLAlchemy will try to read the whole result into memory before starting to process it. This can be prevented with either
execution_options(stream_results=True) on the engine/session or yield_per(sane_number) on the query. When working from Blaze, the SQLAlchemy queries are generated behind the covers, which leaves only the execution_options approach. Unfortunately, the following throws an error.
from sqlalchemy import create_engine
from blaze import Data

redshift_params = (redshift_user, redshift_pass, redshift_endpoint, port, dbname)
engine_string = "redshift+psycopg2://%s:%s@%s:%d/%s" % redshift_params
engine = create_engine(engine_string,
                       execution_options=dict(stream_results=True))
db = Data(engine)
The exception is:
...
/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.pyc in __buffer_rows(self)
1124 return
1125 size = getattr(self, '_bufsize', 1)
-> 1126 self.__rowbuffer = collections.deque(self.cursor.fetchmany(size))
1127 self._bufsize = self.size_growth.get(size, size)
1128 if self._max_row_buffer is not None:
InternalError: (psycopg2.InternalError) opening multiple cursors from within the same client connection is not allowed.
If I leave out the execution_options=dict(stream_results=True) then the above works, but doing something like
odo(db.mytable, 'mytable.bcolz')
will run out of memory for large tables.
Using execution_options(stream_results=True) does work with pandas.read_sql_query. The following code works fine, using only moderate amounts of memory:
from sqlalchemy import create_engine
import pandas as pd

redshift_params = (redshift_user, redshift_pass, redshift_endpoint, port, dbname)
engine_string = "postgresql+psycopg2://%s:%s@%s:%d/%s" % redshift_params
engine = create_engine(engine_string,
                       execution_options=dict(stream_results=True))
compression = 'bz2'
res = pd.read_sql_query(queryString,
                        engine,
                        chunksize=2**20)
for i, df in enumerate(res):
    df.to_csv('results-%s.csv.%s' % (i, compression), compression=compression)
This is the complete stack trace:
...
Data(engine)
No handlers could be found for logger "sqlalchemy.pool.QueuePool"
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/mahler/anaconda/lib/python2.7/site-packages/blaze/interactive.py", line 122, in Data
dshape = discover(data)
File "/home/mahler/anaconda/lib/python2.7/site-packages/multipledispatch/dispatcher.py", line 164, in __call__
return func(*args, **kwargs)
File "/home/mahler/anaconda/lib/python2.7/site-packages/odo/backends/sql.py", line 242, in discover
return discover(metadata)
File "/home/mahler/anaconda/lib/python2.7/site-packages/multipledispatch/dispatcher.py", line 164, in __call__
return func(*args, **kwargs)
File "/home/mahler/anaconda/lib/python2.7/site-packages/odo/backends/sql.py", line 248, in discover
metadata.reflect(views=metadata.bind.dialect.supports_views)
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/sql/schema.py", line 3623, in reflect
bind.dialect.get_view_names(conn, schema)
File "<string>", line 2, in get_view_names
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/reflection.py", line 42, in cache
return fn(self, con, *args, **kw)
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/dialects/postgresql/base.py", line 2347, in get_view_names
for row in connection.execute(s)]
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.py", line 713, in __iter__
row = self.fetchone()
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.py", line 1026, in fetchone
self.cursor, self.context)
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1341, in _handle_dbapi_exception
exc_info
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/util/compat.py", line 200, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb)
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.py", line 1017, in fetchone
row = self._fetchone_impl()
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.py", line 1139, in _fetchone_impl
self.__buffer_rows()
File "/home/mahler/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/result.py", line 1126, in __buffer_rows
self.__rowbuffer = collections.deque(self.cursor.fetchmany(size))
sqlalchemy.exc.InternalError: (psycopg2.InternalError) opening multiple cursors from within the same client connection is not allowed.
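One workaround to try (a sketch, not verified against Redshift) is to leave the engine at its defaults, so the reflection queries that Data(engine) runs behave normally, and to enable stream_results only on the connection that executes the large read. Connection.execution_options() is standard SQLAlchemy; the table name and process() below are placeholders:
from sqlalchemy import create_engine, text
from blaze import Data

engine = create_engine(engine_string)  # no engine-wide streaming
db = Data(engine)                      # reflection works as usual

with engine.connect() as conn:
    # stream only this query through a server-side cursor
    streaming = conn.execution_options(stream_results=True)
    result = streaming.execute(text("SELECT * FROM mytable"))
    while True:
        rows = result.fetchmany(100000)  # bounded chunks instead of fetchall
        if not rows:
            break
        process(rows)  # placeholder: append to bcolz/CSV, etc.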

Related

Error shows up when using df.to_parquet("filename")

I want to save the dataset as a Parquet file called power.parquet, and I use df.to_parquet(<filename>). But it gives me this error: "ValueError: Error converting column "Global_reactive_power" to bytes using encoding UTF8. Original error: bad argument type for built-in operation". And I installed the fastparquet package.
from fastparquet import write, ParquetFile
dat.to_parquet("power.parquet")
df_parquet = ParquetFile("power.parquet").to_pandas()
df_parquet.head() # Test your final value
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 259, in convert
out = array_encode_utf8(data)
File "fastparquet/speedups.pyx", line 50, in fastparquet.speedups.array_encode_utf8
TypeError: bad argument type for built-in operation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/folders/4f/bm2th1p56tz4rq_zffc8g3940000gn/T/ipykernel_85477/3080656655.py", line 1, in <module>
dat.to_parquet("power.parquet", compression="GZIP")
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/core.py", line 4560, in to_parquet
return to_parquet(self, path, *args, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py", line 732, in to_parquet
return compute_as_if_collection(
File "/opt/anaconda3/lib/python3.9/site-packages/dask/base.py", line 315, in compute_as_if_collection
return schedule(dsk2, keys, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/threaded.py", line 79, in get
results = get_async(
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 507, in get_async
raise_exception(exc, tb)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 315, in reraise
raise exc
File "/opt/anaconda3/lib/python3.9/site-packages/dask/local.py", line 220, in execute_task
result = _execute_task(task, data)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/opt/anaconda3/lib/python3.9/site-packages/dask/utils.py", line 35, in apply
return func(*args, **kwargs)
File "/opt/anaconda3/lib/python3.9/site-packages/dask/dataframe/io/parquet/fastparquet.py", line 1167, in write_partition
rg = make_part_file(
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 716, in make_part_file
rg = make_row_group(f, data, schema, compression=compression,
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 701, in make_row_group
chunk = write_column(f, coldata, column,
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 554, in write_column
repetition_data, definition_data, encode[encoding](data, selement), 8 * b'\x00'
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 354, in encode_plain
out = convert(data, se)
File "/opt/anaconda3/lib/python3.9/site-packages/fastparquet/writer.py", line 284, in convert
raise ValueError('Error converting column "%s" to bytes using '
ValueError: Error converting column "Global_reactive_power" to bytes using encoding UTF8. Original error: bad argument type for built-in operation
I tried adding object_encoding="bytes". I want to solve this problem.
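This fastparquet error usually means the column has object dtype but contains values that are not all strings, so the UTF8 encoder is handed something it cannot encode. If the column is meant to be numeric (as Global_reactive_power presumably is), a sketch of one fix is to coerce it before writing; dat is the dataframe from the question:
import pandas as pd

# Coerce to a numeric dtype; unparseable entries (e.g. '?') become NaN.
dat["Global_reactive_power"] = pd.to_numeric(dat["Global_reactive_power"], errors="coerce")

# If dat is a dask dataframe (as the traceback suggests), the same idea
# per partition:
# dat["Global_reactive_power"] = dat["Global_reactive_power"].map_partitions(
#     pd.to_numeric, errors="coerce")

dat.to_parquet("power.parquet")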

sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL error: wrong version number

I have a Flask-based backend application. When running the code locally this error does not happen, but it does when the code is deployed to my server in a Kubernetes pod.
Interestingly enough, the first time you run the code it fails instantly (when running the Future), and when you run it again the error still happens, but the code continues (and completes successfully). Pretty bizarre.
The error I am getting is:
2021-07-30 14:59:23,537 - mit_backend.logic.optimize - INFO - Inside first pass run code
2021-07-30 14:59:23,587 - mit_backend.logic.optimize - INFO - Optimising region ML10F
2021-07-30 14:59:23,588 - mit_backend.logic.optimize - INFO - Optimising region ML47F
2021-07-30 14:59:23,589 - mit_backend.logic.optimize - INFO - Optimising region ML40F
--- Logging error ---
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1277, in _execute_context
cursor, statement, parameters, context
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/default.py", line 593, in do_execute
cursor.execute(statement, parameters)
psycopg2.OperationalError: SSL error: wrong version number
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/lib64/python3.6/concurrent/futures/process.py", line 175, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/code/mit_backend/logic/optimize.py", line 241, in optimize_first_pass_run
config = get_csv_config(user_identity)
File "/code/mit_backend/modules/v1/__init__.py", line 623, in get_csv_config
res = CSVConfig.query.filter(CSVConfig.user_id == user_identity).first()
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3429, in first
ret = list(self[0:1])
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3203, in __getitem__
return list(res)
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__
return self._execute_and_instances(context)
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances
result = conn.execute(querycontext.statement, self._params)
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1011, in execute
return meth(self, multiparams, params)
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1130, in _execute_clauseelement
distilled_params,
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1317, in _execute_context
e, statement, parameters, cursor, context
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1511, in _handle_dbapi_exception
sqlalchemy_exception, with_traceback=exc_info[2], from_=e
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/util/compat.py", line 182, in raise_
raise exception
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1277, in _execute_context
cursor, statement, parameters, context
File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/default.py", line 593, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL error: wrong version number
[SQL: SELECT csv_config.user_id AS csv_config_user_id, csv_config."csvHeaders" AS "csv_config_csvHeaders", csv_config."deliveryName" AS "csv_config_deliveryName", csv_config.id AS csv_config_id, csv_config.line1 AS csv_config_line1, csv_config.line2 AS csv_config_line2, csv_config.quantity AS csv_config_quantity, csv_config."routeNumber" AS "csv_config_routeNumber", csv_config.suburb AS csv_config_suburb, csv_config.weight AS csv_config_weight
FROM csv_config
WHERE csv_config.user_id = %(user_id_1)s
LIMIT %(param_1)s]
[parameters: {'user_id_1': 'sftp', 'param_1': 1}]
(Background on this error at: http://sqlalche.me/e/13/e3q8)
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/code/mit_backend/logic/optimize.py", line 300, in first_pass_run
future_df_result = future.result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 425, in result
return self.__get_result()
File "/usr/lib64/python3.6/concurrent/futures/_base.py", line 384, in __get_result
raise self._exception
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL error: wrong version number
[SQL: SELECT csv_config.user_id AS csv_config_user_id, csv_config."csvHeaders" AS "csv_config_csvHeaders", csv_config."deliveryName" AS "csv_config_deliveryName", csv_config.id AS csv_config_id, csv_config.line1 AS csv_config_line1, csv_config.line2 AS csv_config_line2, csv_config.quantity AS csv_config_quantity, csv_config."routeNumber" AS "csv_config_routeNumber", csv_config.suburb AS csv_config_suburb, csv_config.weight AS csv_config_weight
FROM csv_config
WHERE csv_config.user_id = %(user_id_1)s
LIMIT %(param_1)s]
[parameters: {'user_id_1': 'sftp', 'param_1': 1}]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib64/python3.6/logging/__init__.py", line 994, in emit
msg = self.format(record)
File "/usr/lib64/python3.6/logging/__init__.py", line 840, in format
return fmt.format(record)
File "/usr/lib64/python3.6/logging/__init__.py", line 577, in format
record.message = record.getMessage()
File "/usr/lib64/python3.6/logging/__init__.py", line 338, in getMessage
msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
File "/usr/lib64/python3.6/threading.py", line 884, in _bootstrap
self._bootstrap_inner()
File "/usr/lib64/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/usr/lib64/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib64/python3.6/socketserver.py", line 654, in process_request_thread
self.finish_request(request, client_address)
File "/usr/lib64/python3.6/socketserver.py", line 364, in finish_request
self.RequestHandlerClass(request, client_address, self)
File "/usr/lib64/python3.6/socketserver.py", line 724, in __init__
self.handle()
File "/usr/local/lib/python3.6/site-packages/werkzeug/serving.py", line 345, in handle
BaseHTTPRequestHandler.handle(self)
File "/usr/lib64/python3.6/http/server.py", line 418, in handle
self.handle_one_request()
File "/usr/local/lib/python3.6/site-packages/werkzeug/serving.py", line 379, in handle_one_request
return self.run_wsgi()
File "/usr/local/lib/python3.6/site-packages/werkzeug/serving.py", line 323, in run_wsgi
execute(self.server.app)
File "/usr/local/lib/python3.6/site-packages/werkzeug/serving.py", line 312, in execute
application_iter = app(environ, start_response)
File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 2464, in __call__
return self.wsgi_app(environ, start_response)
File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 2447, in wsgi_app
response = self.full_dispatch_request()
File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 1950, in full_dispatch_request
rv = self.dispatch_request()
File "/usr/local/lib/python3.6/site-packages/flask/app.py", line 1936, in dispatch_request
return self.view_functions[rule.endpoint](**req.view_args)
File "/usr/local/lib/python3.6/site-packages/flask_restx/api.py", line 375, in wrapper
resp = resource(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/flask/views.py", line 89, in view
return self.dispatch_request(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/flask_restx/resource.py", line 44, in dispatch_request
resp = meth(*args, **kwargs)
File "/code/mit_backend/modules/v1/controllers/manifest.py", line 161, in post
files, df, sftp_date_latest = do_optimize(addresses, user_identity)
File "/code/mit_backend/logic/optimize.py", line 38, in do_optimize
df = first_pass_run(df, user_identity)
File "/code/mit_backend/logic/optimize.py", line 307, in first_pass_run
LOG.error("Exception is ", e.__cause__)
Message: 'Exception is '
Arguments: (_RemoteTraceback('\n"""\nTraceback (most recent call last):\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1277, in _execute_context\n cursor, statement, parameters, context\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/default.py", line 593, in do_execute\n cursor.execute(statement, parameters)\npsycopg2.OperationalError: SSL error: decryption failed or bad record mac\n\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/usr/lib64/python3.6/concurrent/futures/process.py", line 175, in _process_worker\n r = call_item.fn(*call_item.args, **call_item.kwargs)\n File "/code/mit_backend/logic/optimize.py", line 241, in optimize_first_pass_run\n config = get_csv_config(user_identity)\n File "/code/mit_backend/modules/v1/__init__.py", line 623, in get_csv_config\n res = CSVConfig.query.filter(CSVConfig.user_id == user_identity).first()\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3429, in first\n ret = list(self[0:1])\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3203, in __getitem__\n return list(res)\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3535, in __iter__\n return self._execute_and_instances(context)\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/orm/query.py", line 3560, in _execute_and_instances\n result = conn.execute(querycontext.statement, self._params)\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1011, in execute\n return meth(self, multiparams, params)\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/sql/elements.py", line 298, in _execute_on_connection\n return connection._execute_clauseelement(self, multiparams, params)\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1130, in _execute_clauseelement\n distilled_params,\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1317, in _execute_context\n e, statement, parameters, cursor, context\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1511, in _handle_dbapi_exception\n sqlalchemy_exception, with_traceback=exc_info[2], from_=e\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/util/compat.py", line 182, in raise_\n raise exception\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/base.py", line 1277, in _execute_context\n cursor, statement, parameters, context\n File "/usr/local/lib64/python3.6/site-packages/sqlalchemy/engine/default.py", line 593, in do_execute\n cursor.execute(statement, parameters)\nsqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL error: decryption failed or bad record mac\n\n[SQL: SELECT csv_config.user_id AS csv_config_user_id, csv_config."csvHeaders" AS "csv_config_csvHeaders", csv_config."deliveryName" AS "csv_config_deliveryName", csv_config.id AS csv_config_id, csv_config.line1 AS csv_config_line1, csv_config.line2 AS csv_config_line2, csv_config.quantity AS csv_config_quantity, csv_config."routeNumber" AS "csv_config_routeNumber", csv_config.suburb AS csv_config_suburb, csv_config.weight AS csv_config_weight \nFROM csv_config \nWHERE csv_config.user_id = %(user_id_1)s \n LIMIT %(param_1)s]\n[parameters: {\'user_id_1\': \'sftp\', \'param_1\': 1}]\n(Background on this error at: http://sqlalche.me/e/13/e3q8)\n"""',),)
The error happens inside a futures ProcessPoolExecutor:
def first_pass_run(df, user_identity):
    LOG.info("Inside first pass run code")
    first_json = True
    df_final = None
    df_grouped = df.groupby('Route Number')
    auth_claim = get_authorization_claims_from_header()
    with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
        _futures = []
        for region_name, _df in df_grouped:
            _futures.append(
                executor.submit(
                    optimize_first_pass_run, region_name, _df, user_identity, auth_claim
                ))
        for future in concurrent.futures.as_completed(_futures):
            try:
                future_df_result = future.result()
                if first_json:
                    df_final = future_df_result
                    first_json = False
                else:
                    df_final = df_final.append(future_df_result, ignore_index=True)
            except Exception as e:
                LOG.error("Exception is ", e.__cause__)
I start my SQLAlchemy engine in a pessimistic fashion:
pg_db = SQLAlchemy(app)
engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'], pool_pre_ping=True)
The code is not running on WSGI or gunicorn, it's just as simple as:
if __name__ == '__main__':
    LOG.info('running environment: %s', os.environ.get('ENV', 'production'))
    app.config['DEBUG'] = os.environ.get('ENV') == 'development'
    app.run(host='0.0.0.0', port=5001, debug=False, threaded=True)
Any clues why this might be happening? Thank you.
I haven't been able to find a proper fix for this, but I implemented the following things; although the exception is still being thrown, the code retries and ends up succeeding:
Removing the connection pool
engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'], poolclass=NullPool)
Event listeners
@event.listens_for(engine, "connect")
def connect(dbapi_connection, connection_record):
    connection_record.info['pid'] = os.getpid()

@event.listens_for(engine, "checkout")
def checkout(dbapi_connection, connection_record, connection_proxy):
    pid = os.getpid()
    if connection_record.info['pid'] != pid:
        connection_record.connection = connection_proxy.connection = None
        raise exc.DisconnectionError(
            "Connection record belongs to pid %s, "
            "attempting to check out in pid %s" %
            (connection_record.info['pid'], pid)
        )
but, as per the documentation, the two listeners above relate to an optimistic way of handling the connection and using pools
Reconnecting before initializing the Process Pool
engine.dispose()
with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:
    _futures = []
    for region_name, _df in df_grouped:
        _futures.append(
            executor.submit(
                optimize_first_pass_run, region_name, _df, user_identity, auth_claim, True
            ))
    for future in concurrent.futures.as_completed(_futures):
        future_df_result = future.result()
Adding retries when trying to insert objects in the database
def add_new_record(row):
    attempts = 0
    while attempts <= 5:
        attempts = attempts + 1
        try:
            pg_db.session.add(row)
            pg_db.session.commit()
            return True
        except (SQLAlchemyError, psycopg2.OperationalError, sqlalchemy.exc.OperationalError) as e:
            if attempts < 5:
                LOG.warn("Attempt failed. Trying rollback")
                pg_db.session.rollback()
                LOG.warn(e.__cause__)
                LOG.warn(e.__str__())
                time.sleep(5)
            else:
                LOG.error("Maximum number of retries reached. Raising an error")
                raise e
    return False
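For what it's worth, these SSL errors ("wrong version number", "decryption failed or bad record mac") are the classic symptom of a psycopg2 SSL connection being shared across forked processes: a pooled connection created in the parent is inherited by the ProcessPoolExecutor workers, and parent and child then write to the same encrypted stream. The SQLAlchemy pooling documentation recommends giving each child process its own engine, or at least calling engine.dispose() in the child right after the fork. A minimal sketch, assuming optimize_first_pass_run is the entry point each worker runs:
def optimize_first_pass_run(region_name, df, user_identity, auth_claim):
    # Discard pooled connections inherited from the parent so this
    # child opens a fresh, private connection on first use.
    engine.dispose()
    ...  # the real work, unchanged
Separately, the "--- Logging error ---" block in the output comes from LOG.error("Exception is ", e.__cause__), which passes the cause as a stray formatting argument; LOG.error("Exception is %s", e.__cause__) avoids that secondary TypeError.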

python2.7 throwing error when calling bigquery api

I am using google-api-python-client for inserting a JSON record into BigQuery, and when I try to unit-test the method using Python's unittest, I get an error on exactly the line marked below.
The code is as follows:
def write_to_bigquery(self, timeseries, metadata):
    response = {}
    json_msg_list = []
    stats = {}
    if not timeseries or "points" not in timeseries:
        logging.debug("No timeseries data to write to BigQuery")
        msgs_written = 0
        metadata["msg_without_timeseries"] = 1
        error_msg_cnt = 0
    else:
        rows = build_rows(timeseries, metadata)
        print("rows", rows)  # This gets printed
        bigquery = build('bigquery', 'v2', cache_discovery=False)
        print("after rows", rows)  # Control does not reach here
        body = {
            "kind": "bigquery#tableDataInsertAllRequest",
            "skipInvalidRows": "false",
            "rows": json_row_list
        }
        logging.debug('body: {}'.format(json.dumps(body, sort_keys=True, indent=4)))
        response = bigquery.tabledata().insertAll(
            projectId=app_identity.get_application_id(),
            datasetId=config.BIGQUERY_DATASET,
            tableId=config.BIGQUERY_STATS_TABLE,
            body=body
        ).execute()
        logging.debug("BigQuery said... = {}".format(response))
and this is the error I get
Traceback (most recent call last):
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1535, in __call__
rv = self.handle_exception(request, response, e)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1529, in __call__
rv = self.router.dispatch(request, response)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1278, in default_dispatcher
return route.handler_adapter(request, response)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1102, in __call__
return handler.dispatch()
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 572, in dispatch
return self.handle_exception(e, self.app.debug)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 570, in dispatch
return method(*args, **kwargs)
File "main.py", line 422, in post
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 570, in dispatch
return method(*args, **kwargs)
File "main.py", line 422, in post
self.write_to_bigquery(data, metadata)
File "main.py", line 296, in write_to_bigquery
bigquery = build('bigquery', 'v2', cache_discovery=False)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/discovery.py", line 258, in build
adc_key_path=adc_key_path,
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/discovery.py", line 423, in build_from_document
credentials = _auth.default_credentials()
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_auth.py", line 44, in default_credentials
credentials, project_id = checker()
File "/usr/local/lib/python2.7/dist-packages/google/auth/_default.py", line 186, in _get_gae_credentials
project_id = app_engine.get_project_id()
File "/usr/local/lib/python2.7/dist-packages/google/auth/app_engine.py", line 77, in get_project_id
return app_identity.get_application_id()
File "/usr/lib/google-cloud-sdk/platform/google_appengine/google/appengine/api/app_identity/app_identity.py", line 455, in get_application_id
_, domain_name, display_app_id = _ParseFullAppId(full_app_id)
File "/usr/lib/google-cloud-sdk/platform/google_appengine/google/appengine/api/app_identity/app_identity.py", line 436, in _ParseFullAppId
psep = app_id.find(_PARTITION_SEPARATOR)
AttributeError: 'NoneType' object has no attribute 'find'
I am new to Python and BigQuery, so any help is appreciated. Thanks.
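A note on why this fails under unittest: the traceback bottoms out in app_identity.get_application_id(), which returns None when the code runs outside the App Engine runtime, so _ParseFullAppId ends up calling .find on None. A sketch of one way around this in tests, assuming the App Engine SDK's testbed module (google.appengine.ext.testbed) is available to stub the identity service:
import unittest
from google.appengine.ext import testbed

class WriteToBigQueryTest(unittest.TestCase):
    def setUp(self):
        # Stand up a fake App Engine environment so app_identity
        # returns a stub application id instead of None.
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_app_identity_stub()

    def tearDown(self):
        self.testbed.deactivate()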
I would recommend using the BigQuery Python SDK.
For that, you first need to install it in your Python environment. You can do that by running:
pip install google-cloud-bigquery
After that, you can use code like this to insert JSON records into your table:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
table_id = "project_id.dataset.table"
# Your JSON keys must correspond to your table column names
json_list = [{"your": "json", "data":"here"},{"your": "json", "data":"here"},{"your": "json", "data":"here"}, ...]
# Get table reference
table = client.get_table(table_id)
rows_to_insert = json_list
# Insert the data into your table
errors = client.insert_rows(table, rows_to_insert)
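A small variant worth mentioning (hedged: insert_rows_json is part of the same google-cloud-bigquery client, not taken from the question) lets you skip the get_table call by passing the table id and the list of dicts directly:
# Stream the dicts straight to the table by id; returns a list of
# per-row errors, empty on success.
errors = client.insert_rows_json(table_id, json_list)
if errors:
    print("Encountered errors while inserting rows: {}".format(errors))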
Finally, I'd like to say that Python 2 is already considered deprecated. If possible, update to Python 3.

How to Push data from xlsx excel sheet to sqlalchemy database using Transpose and pandas

I am trying to push an Excel file using the to_sql function and a dataframe, after transposing the dataframe.
This is what my code looks like:
import pandas as pd
import os
import sqlalchemy

# MySQL Connection
MYSQL_USER = 'xxxxx'
MYSQL_PASSWORD = 'xxxxxxxx'
MYSQL_HOST_IP = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'xlsx_test_db'

# connect db
engine = sqlalchemy.create_engine('mysql+mysqlconnector://' + MYSQL_USER + ':' + MYSQL_PASSWORD + '@' + MYSQL_HOST_IP + ':' + str(
    MYSQL_PORT) + '/' + MYSQL_DATABASE, echo=False)
engine.connect()

mydir = (os.getcwd()).replace('\\', '/') + '/'
raw_lte = pd.read_excel(r'' + mydir + 'MNM_Rotterdam_5_Daily_Details-20191216081027.xlsx', sheet_name='raw_4G')
dft = raw_lte.T
dft.columns = dft.iloc[0]
dft = dft.iloc[1:]

# reading and insert one file at a time
for file in os.listdir('.'):
    # only process excel files
    file_basename, extension = file.split('.')
    if extension == 'xlsx':
        dft.to_sql(file_basename.lower(), con=engine, if_exists='replace')
and this is the error:
Traceback (most recent call last):
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1193, in _execute_context
context)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\default.py", line 507, in do_execute
cursor.execute(statement, parameters)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\cursor.py", line 551, in execute
self._handle_result(self._connection.cmd_query(stmt))
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\connection.py", line 490, in cmd_query
result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\connection.py", line 395, in _handle_result
raise errors.get_exception(packet)
mysql.connector.errors.ProgrammingError: 1170 (42000): BLOB/TEXT column 'index' used in key specification without a key length
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/DELL/PycharmProjects/automateDB/swap.py", line 36, in <module>
dft.to_sql(file_basename.lower(), con=engine, if_exists='replace')
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\pandas\core\generic.py", line 2532, in to_sql
dtype=dtype, method=method)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\pandas\io\sql.py", line 460, in to_sql
chunksize=chunksize, dtype=dtype, method=method)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\pandas\io\sql.py", line 1173, in to_sql
table.create()
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\pandas\io\sql.py", line 585, in create
self._execute_create()
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\pandas\io\sql.py", line 569, in _execute_create
self.table.create()
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\schema.py", line 778, in create
checkfirst=checkfirst)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1940, in _run_visitor
conn._run_visitor(visitorcallable, element, **kwargs)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1549, in _run_visitor
**kwargs).traverse_single(element)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 121, in traverse_single
return meth(obj, **kw)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\ddl.py", line 796, in visit_table
self.traverse_single(index)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\visitors.py", line 121, in traverse_single
return meth(obj, **kw)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\ddl.py", line 823, in visit_index
self.connection.execute(CreateIndex(index))
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 948, in execute
return meth(self, multiparams, params)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\sql\ddl.py", line 68, in _execute_on_connection
return connection._execute_ddl(self, multiparams, params)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1009, in _execute_ddl
compiled
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1200, in _execute_context
context)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1413, in _handle_dbapi_exception
exc_info
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\util\compat.py", line 203, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\util\compat.py", line 186, in reraise
raise value.with_traceback(tb)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\base.py", line 1193, in _execute_context
context)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\sqlalchemy\engine\default.py", line 507, in do_execute
cursor.execute(statement, parameters)
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\cursor.py", line 551, in execute
self._handle_result(self._connection.cmd_query(stmt))
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\connection.py", line 490, in cmd_query
result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
File "C:\Users\DELL\PycharmProjects\MyALLRefProf\venv\lib\site-packages\mysql\connector\connection.py", line 395, in _handle_result
raise errors.get_exception(packet)
sqlalchemy.exc.ProgrammingError: (mysql.connector.errors.ProgrammingError) 1170 (42000): BLOB/TEXT column 'index' used in key specification without a key length [SQL: 'CREATE INDEX `ix_mnm_rotterdam_5_daily_details-20191216081027_index` ON `mnm_rotterdam_5_daily_details-20191216081027` (`index`)'] (Background on this error at: http://sqlalche.me/e/f405)
I think there's a problem with the Excel format, but I don't know how to solve this issue.
Note:
I tried to use this
raw_4G.to_sql(file_basename.lower(), con=engine, if_exists='replace')
instead of
dft.to_sql(file_basename.lower(), con=engine, if_exists='replace')
It works, but it gives me a runtime error:
Traceback (most recent call last):
File "C:/Users/DELL/PycharmProjects/automateDB/swap.py", line 34, in <module>
file_basename, extension = file.split('.')
ValueError: not enough values to unpack (expected 2, got 1)
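Two observations based on the tracebacks above, with a sketch. The 1170 error happens because to_sql also writes the dataframe's index and asks MySQL to index it; after the transpose that index is a text column, and MySQL cannot index TEXT without a key length, so passing index=False sidesteps it. The ValueError happens because file.split('.') assumes exactly one dot in every filename; os.path.splitext handles arbitrary names:
import os

for file in os.listdir('.'):
    # os.path.splitext copes with names that have no dot or several dots
    file_basename, extension = os.path.splitext(file)
    # only process excel files (note splitext keeps the leading dot)
    if extension == '.xlsx':
        # index=False stops pandas from writing (and indexing) the
        # transposed text index that MySQL refuses to index
        dft.to_sql(file_basename.lower(), con=engine, if_exists='replace', index=False)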

SQLAlchemy PostgreSQL UPSERT array of values raises UnsupportedCompilationError

Trying to perform an UPSERT of an array of values into PostgreSQL > 9.5.
Trying to build the statement as in the SQLAlchemy docs, but there is no explanation of how to do it for an array instead of a single row. The insert statement builds properly, so I suppose it's possible with the on_conflict_do_update function.
Having this code:
stock_table = Table("stock_history", metadata,
                    Column('date', sqlalchemy.types.NVARCHAR(length=255), primary_key=True),
                    Column('product_id', sqlalchemy.types.INTEGER(), primary_key=True),
                    Column('product_sku', sqlalchemy.types.NVARCHAR(length=255)),
                    Column('on_hand_qty', sqlalchemy.dialects.postgresql.DOUBLE_PRECISION()),
                    Column('available_qty', sqlalchemy.dialects.postgresql.DOUBLE_PRECISION()),
                    Column('output_qty', sqlalchemy.dialects.postgresql.DOUBLE_PRECISION()))
stock_today = pandas.read_sql_query(queryStock, odoo_engine)
insert_stmt = sqlalchemy.dialects.postgresql.insert(stock_table).values(stock_today)
upser_stmt = insert_stmt.on_conflict_do_update(
    index_elements=['date', 'product_id'],
    set_=stock_today.to_dict(orient='dict')
)
I'm getting the following error:
AttributeError: 'StrSQLCompiler' object has no attribute 'visit_on_conflict_do_update'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pompeiiETL.py", line 15, in <module>
pompeiiJobs.runStockJob(dwh_engine, odoo_prod_engine)
File "/Users/alex/Development/DataLab/pompeii-datalab/pompeiiETL/jobs.py", line 54, in runStockJob
print(upser_stmt)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/elements.py", line 446, in __str__
return str(self.compile())
File "<string>", line 1, in <lambda>
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/elements.py", line 436, in compile
return self._compiler(dialect, bind=bind, **kw)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/elements.py", line 442, in _compiler
return dialect.statement_compiler(dialect, self, **kw)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/compiler.py", line 435, in __init__
Compiled.__init__(self, dialect, statement, **kwargs)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/compiler.py", line 216, in __init__
self.string = self.process(self.statement, **compile_kwargs)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/compiler.py", line 242, in process
return obj._compiler_dispatch(self, **kwargs)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/visitors.py", line 81, in _compiler_dispatch
return meth(self, **kw)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/compiler.py", line 2041, in visit_insert
insert_stmt._post_values_clause, **kw)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/compiler.py", line 242, in process
return obj._compiler_dispatch(self, **kwargs)
File "/Users/alex/Development/DataLab/ETLenv/lib/python3.6/site-packages/sqlalchemy/sql/visitors.py", line 79, in _compiler_dispatch
raise exc.UnsupportedCompilationError(visitor, cls)
sqlalchemy.exc.UnsupportedCompilationError: Compiler <sqlalchemy.sql.compiler.StrSQLCompiler object at 0x105b55be0> can't render element of type <class 'sqlalchemy.dialects.postgresql.dml.OnConflictDoUpdate'>
What am I doing wrong? Is there a better way to do an upsert?
Thanks!
You are trying to get the string representation of an Insert object that has no proper bind. If we write
db_uri = make_url('your-postgres-db-uri-here')
engine = create_engine(db_uri)
upser_stmt.bind = engine
print(upser_stmt)
it works.
We can also create the insert statement with the bind specified:
insert_stmt = sqlalchemy.dialects.postgresql.insert(stock_table,
                                                    bind=engine).values(stock_today)
upser_stmt = insert_stmt.on_conflict_do_update(
    index_elements=['date', 'product_id'],
    set_=stock_today.to_dict(orient='dict')
)
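Another option (a standard SQLAlchemy technique, not specific to this code) is to compile the statement against the PostgreSQL dialect explicitly, so no bind is needed just to print it:
from sqlalchemy.dialects import postgresql

# The generic string compiler cannot render ON CONFLICT clauses;
# the PostgreSQL compiler can.
print(upser_stmt.compile(dialect=postgresql.dialect()))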
