How to minimize code when using sqlalchemy query and pandas dataframes

How to minimize code when using sqlalchemy query and pandas dataframes - python

I am trying to upgrade my query code by modernizing it.
My old code (bellow). First query joins two tables and selects the rating for each song title together with artist for title and the second gets the genres for each title (association table is used):
items = []
query = db.session.query(Rating, Song).filter(Rating.id==Song.id).all()
for x in query:
dic = {
"rating": x[0],
"title": x[1].title,
"artist": x[1].artist,
"genre": Genre.query.filter(Genre.songs.any(title=x[1].title)).all(),
}
items.append(dic)
My cleaner code. I use pandas dataframes now instead of dictionaries. This gives me the error ArgumentError: SQL expression element or literal value expected, got somethingsomething
query = db.session.query(Rating, Song).filter(Rating.id==Song.id).all()
df = pd.DataFrame(query, columns=["rating", "title"])
for item in df.title:
df['genre'] = (Genre.query.filter(Genre.songs.any(title=item)).all())
How do I get this to work?
Are there more effiecient ways of coding this?
Complete error produced
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Programs\Python\Python37\lib\site-packages\sqlalchemy\engine\base.py in execute(self, statement, *multiparams, **params)
1194 try:
-> 1195 meth = statement._execute_on_connection
1196 except AttributeError as err:
AttributeError: 'BaseQuery' object has no attribute '_execute_on_connection'
The above exception was the direct cause of the following exception:
ObjectNotExecutableError Traceback (most recent call last)
<ipython-input-4-99ffacdf2d91> in <module>
1 query = db.session.query(Rating, Song).filter(Rating.id==Song.id)
----> 2 df = pd.read_sql_query(query, db.engine)
3 df
~\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
381 coerce_float=coerce_float,
382 parse_dates=parse_dates,
--> 383 chunksize=chunksize,
384 )
385
~\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
1292 args = _convert_params(sql, params)
1293
-> 1294 result = self.execute(*args)
1295 columns = result.keys()
1296
~\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
1160 def execute(self, *args, **kwargs):
1161 """Simple passthrough to SQLAlchemy connectable"""
-> 1162 return self.connectable.execution_options().execute(*args, **kwargs)
1163
1164 def read_table(
<string> in execute(self, statement, *multiparams, **params)
~\AppData\Local\Programs\Python\Python37\lib\site-packages\sqlalchemy\util\deprecations.py in warned(fn, *args, **kwargs)
388 if not skip_warning:
389 _warn_with_version(message, version, wtype, stacklevel=3)
--> 390 return fn(*args, **kwargs)
391
392 doc = func.__doc__ is not None and func.__doc__ or ""
~\AppData\Local\Programs\Python\Python37\lib\site-packages\sqlalchemy\engine\base.py in execute(self, statement, *multiparams, **params)
3036 """
3037 connection = self.connect(close_with_result=True)
-> 3038 return connection.execute(statement, *multiparams, **params)
3039
3040 #util.deprecated_20(
~\AppData\Local\Programs\Python\Python37\lib\site-packages\sqlalchemy\engine\base.py in execute(self, statement, *multiparams, **params)
1196 except AttributeError as err:
1197 util.raise_(
-> 1198 exc.ObjectNotExecutableError(statement), replace_context=err
1199 )
1200 else:
~\AppData\Local\Programs\Python\Python37\lib\site-packages\sqlalchemy\util\compat.py in raise_(***failed resolving arguments***)
209
210 try:
--> 211 raise exception
212 finally:
213 # credit to
ObjectNotExecutableError: Not an executable object: <flask_sqlalchemy.BaseQuery object at 0x000001CBC5F14A48>

First, create just one query to return all the data you need in one go, where the grouping of Genres is done uisng the GROUP_CONCAT function:
query = (
db.session
.query(
Rating.rating,
Song.title,
Song.artist,
db.func.GROUP_CONCAT(Genre.category, ", ").label("genres")
)
.select_from(Song)
.where(Rating.id == Song.id)
.join(Genre, Song.genres)
.group_by(
Rating.rating,
Song.title,
Song.artist,
)
)
Then use the pandas method to get it into a dataframe:
df = pd.read_sql_query(query.statement, db.engine)
Where print(df) should produce something like this:
rating title artist genres
0 2.0 title 2 art-2 pop
1 3.0 title 3 art-3 rock, pop
2 4.0 title 4 art-3 other
3 5.0 title 5 art-4 rock, pop, other

Related

trying to create a pivot table with pandas and numpy

I'm using the following to try and get a pivot table that multiplies the quantity times the effective price and groups it by Product name:
df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
This is the stack trace - not sure why this isn't working.
ValueError Traceback (most recent call last)
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
970 try:
--> 971 result = self._aggregate_multiple_funcs([func], _axis=self.axis)
972
/usr/local/lib/python3.9/site-packages/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _axis)
544 if not len(results):
--> 545 raise ValueError("no results")
546
ValueError: no results
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-35-f616d7b46a13> in <module>
----> 1 df_sorted.pivot_table(values=['Quantity', 'EffectivePrice'], index=['ProductName'], aggfunc=np.multiply )
/usr/local/lib/python3.9/site-packages/pandas/core/frame.py in pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
6824 from pandas.core.reshape.pivot import pivot_table
6825
-> 6826 return pivot_table(
6827 self,
6828 values=values,
/usr/local/lib/python3.9/site-packages/pandas/core/reshape/pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
110
111 grouped = data.groupby(keys, observed=observed)
--> 112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
114 agged = agged.dropna(how="all")
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
981 # raised directly by _aggregate_multiple_funcs
982 raise
--> 983 result = self._aggregate_frame(func)
984 except AttributeError:
985 # catch exception from line 969
/usr/local/lib/python3.9/site-packages/pandas/core/groupby/generic.py in _aggregate_frame(self, func, *args, **kwargs)
1173 if axis != obj._info_axis_number:
1174 for name, data in self:
-> 1175 fres = func(data, *args, **kwargs)
1176 result[name] = fres
1177 else:
ValueError: invalid number of arguments```

My understanding is you cannot apply multi-column operation in pivot table. Maybe using groupby + transform can do. Based on the explanation I recommend this code (I am not sure the expected result is what you want):
df_sorted['TotalPrice'] = df_sorted['Quantity']*df_sorted['EffectivePrice']
result = df_sorted.groupby('ProductName')['TotalPrice'].sum()
For this sample dataframe:
Quantity EffectivePrice ProductName
0 1 12 A
1 1 13 B
2 2 14 A
The output is like this:
ProductName
A 40
B 13

Python Multiprocessing Parallel Insert Into Oracle SQL

I'm currently trying to create a table and insert values in an Oracle SQL database.
I managed to make it work using df.to_sql(name=table_name, con=conn, if_exists='append', index=False) but it took 1h30m to upload a DataFrame of only 10000 rows * 5 columns.
This made me look into Multiprocessing, so I tried following the answer Siddhi Kiran Bajracharya gave in this thread
Which turned out like this:
import pandas as pd
from sqlalchemy import create_engine
import config
LOCATION = r"C:\Oracle\instantclient_19_6"
os.environ["PATH"] = LOCATION + ";" + os.environ["PATH"]
conn = create_engine('oracle+cx_oracle://' + config.user + ':' + config.pw +
'#' + config.host + ':' + config.port + '/?service_name=' + config.db +'?charset=latin-1')
import math
from multiprocessing.dummy import Pool as ThreadPool
def insert_df(df, *args, **kwargs):
nworkers = 4 # number of workers that executes insert in parallel fashion
chunk = math.floor(df.shape[0] / nworkers) # number of chunks
chunks = [(chunk * i, (chunk * i) + chunk) for i in range(nworkers)]
chunks.append((chunk * nworkers, df.shape[0]))
pool = ThreadPool(nworkers)
def worker(chunk):
i, j = chunk
df.iloc[i:j, :].to_sql(*args, **kwargs)
pool.map(worker, chunks)
pool.close()
pool.join()
insert_df(df, f'{table_name}', conn, if_exists='append', index=False)
The problem is that this last code runs for 20mins, only inserts 9 rows into the Table, and then raises the following error DatabaseError: (cx_Oracle.DatabaseError) ORA-00955: name is already used by an existing object
Full Traceback:
---------------------------------------------------------------------------
DatabaseError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1248 self.dialect.do_execute(
-> 1249 cursor, statement, parameters, context
1250 )
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
579 def do_execute(self, cursor, statement, parameters, context=None):
--> 580 cursor.execute(statement, parameters)
581
DatabaseError: ORA-00955: name is already used by an existing object
The above exception was the direct cause of the following exception:
DatabaseError Traceback (most recent call last)
<ipython-input-73-b50275447767> in <module>
20
21
---> 22 insert_df(df, f'{table_name}', conn, if_exists='append', index=False)
<ipython-input-73-b50275447767> in insert_df(df, *args, **kwargs)
14 df.iloc[i:j, :].to_sql(*args, **kwargs)
15
---> 16 pool.map(worker, chunks)
17 pool.close()
18 pool.join()
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
266 in a list that is returned.
267 '''
--> 268 return self._map_async(func, iterable, mapstar, chunksize).get()
269
270 def starmap(self, func, iterable, chunksize=None):
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py in mapstar(args)
42
43 def mapstar(args):
---> 44 return list(map(*args))
45
46 def starmapstar(args):
<ipython-input-73-b50275447767> in worker(chunk)
12 def worker(chunk):
13 i, j = chunk
---> 14 df.iloc[i:j, :].to_sql(*args, **kwargs)
15
16 pool.map(worker, chunks)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in to_sql(self, name, con, schema, if_exists, index, index_label, chunksize, dtype, method)
2710 chunksize=chunksize,
2711 dtype=dtype,
-> 2712 method=method,
2713 )
2714
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in to_sql(frame, name, con, schema, if_exists, index, index_label, chunksize, dtype, method)
516 chunksize=chunksize,
517 dtype=dtype,
--> 518 method=method,
519 )
520
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype, method)
1317 dtype=dtype,
1318 )
-> 1319 table.create()
1320 table.insert(chunksize, method=method)
1321 if not name.isdigit() and not name.islower():
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in create(self)
654 )
655 else:
--> 656 self._execute_create()
657
658 def _execute_insert(self, conn, keys, data_iter):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\sql.py in _execute_create(self)
636 # Inserting table into database, add to MetaData object
637 self.table = self.table.tometadata(self.pd_sql.meta)
--> 638 self.table.create()
639
640 def create(self):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\sql\schema.py in create(self, bind, checkfirst)
868 if bind is None:
869 bind = _bind_or_error(self)
--> 870 bind._run_visitor(ddl.SchemaGenerator, self, checkfirst=checkfirst)
871
872 def drop(self, bind=None, checkfirst=False):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _run_visitor(self, visitorcallable, element, connection, **kwargs)
2044 ):
2045 with self._optional_conn_ctx_manager(connection) as conn:
-> 2046 conn._run_visitor(visitorcallable, element, **kwargs)
2047
2048 class _trans_ctx(object):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _run_visitor(self, visitorcallable, element, **kwargs)
1613
1614 def _run_visitor(self, visitorcallable, element, **kwargs):
-> 1615 visitorcallable(self.dialect, self, **kwargs).traverse_single(element)
1616
1617
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\sql\visitors.py in traverse_single(self, obj, **kw)
136 meth = getattr(v, "visit_%s" % obj.__visit_name__, None)
137 if meth:
--> 138 return meth(obj, **kw)
139
140 def iterate(self, obj):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\sql\ddl.py in visit_table(self, table, create_ok, include_foreign_key_constraints, _is_metadata_operation)
824 table,
825 include_foreign_key_constraints= # noqa
--> 826 include_foreign_key_constraints,
827 )
828 # fmt: on
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in execute(self, object_, *multiparams, **params)
986 raise exc.ObjectNotExecutableError(object_)
987 else:
--> 988 return meth(self, multiparams, params)
989
990 def _execute_function(self, func, multiparams, params):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\sql\ddl.py in _execute_on_connection(self, connection, multiparams, params)
70
71 def _execute_on_connection(self, connection, multiparams, params):
---> 72 return connection._execute_ddl(self, multiparams, params)
73
74 def execute(self, bind=None, target=None):
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_ddl(self, ddl, multiparams, params)
1048 compiled,
1049 None,
-> 1050 compiled,
1051 )
1052 if self._has_events or self.engine._has_events:
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1251 except BaseException as e:
1252 self._handle_dbapi_exception(
-> 1253 e, statement, parameters, cursor, context
1254 )
1255
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1471 util.raise_from_cause(newraise, exc_info)
1472 elif should_wrap:
-> 1473 util.raise_from_cause(sqlalchemy_exception, exc_info)
1474 else:
1475 util.reraise(*exc_info)
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in raise_from_cause(exception, exc_info)
396 exc_type, exc_value, exc_tb = exc_info
397 cause = exc_value if exc_value is not exception else None
--> 398 reraise(type(exception), exception, tb=exc_tb, cause=cause)
399
400
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause)
150 value.__cause__ = cause
151 if value.__traceback__ is not tb:
--> 152 raise value.with_traceback(tb)
153 raise value
154
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1247 if not evt_handled:
1248 self.dialect.do_execute(
-> 1249 cursor, statement, parameters, context
1250 )
1251 except BaseException as e:
C:\ProgramData\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
578
579 def do_execute(self, cursor, statement, parameters, context=None):
--> 580 cursor.execute(statement, parameters)
581
582 def do_execute_no_params(self, cursor, statement, context=None):
DatabaseError: (cx_Oracle.DatabaseError) ORA-00955: name is already used by an existing object
[SQL:
CREATE TABLE "TEST_TABLE_DELETE" (
"id" CLOB,
"name" CLOB,
"var1" CLOB,
"var2" CLOB,
"var3" CLOB,
"var4" CLOB,
"var5" CLOB,
"var6" CLOB,
"var7" CLOB,
"var8" CLOB,
"var9" CLOB,
"var10" CLOB,
"var11" CLOB,
"var12" FLOAT,
"var13" CLOB,
"var14" CLOB
)
]
(Background on this error at: http://sqlalche.me/e/4xp6)
Any pointers to help me solve this issue would be greatly appreciated.
Thanks!
Luti

if you're using to_sql, with string columns in your dataframe, you better do something like this:
dtyp = {c:types.VARCHAR(data[c].str.len().max()) for c in data.columns[data.dtypes == 'object'].tolist()}
data.to_sql('table_name.....',con=...,if_exists='append'
, index=False
, dtype = dtyp)
For 10k rows, it should be very fast.

Python Sql code error - sqlite3.OperationalError: too many SQL variables

I am trying the below code but i am getting error
if not os.path.isfile('train.db'):
disk_engine = create_engine('sqlite:///train.db')
start = dt.datetime.now()
chunksize = 15000
j = 0
index_start = 1
for df in pd.read_csv('final_features.csv', names=['Unnamed: 0','id','is_duplicate','cwc_min','cwc_max','csc_min','csc_max','ctc_min','ctc_max','last_word_eq','first_word_eq','abs_len_diff','mean_len','token_set_ratio','token_sort_ratio','fuzz_ratio','fuzz_partial_ratio','longest_substr_ratio','freq_qid1','freq_qid2','q1len','q2len','q1_n_words','q2_n_words','word_Common','word_Total','word_share','freq_q1+q2','freq_q1-q2','0_x','1_x','2_x','3_x','4_x','5_x','6_x','7_x','8_x','9_x','10_x','11_x','12_x','13_x','14_x','15_x','16_x','17_x','18_x','19_x','20_x','21_x','22_x','23_x','24_x','25_x','26_x','27_x','28_x','29_x','30_x','31_x','32_x','33_x','34_x','35_x','36_x','37_x','38_x','39_x','40_x','41_x','42_x','43_x','44_x','45_x','46_x','47_x','48_x','49_x','50_x','51_x','52_x','53_x','54_x','55_x','56_x','57_x','58_x','59_x','60_x','61_x','62_x','63_x','64_x','65_x','66_x','67_x','68_x','69_x','70_x','71_x','72_x','73_x','74_x','75_x','76_x','77_x','78_x','79_x','80_x','81_x','82_x','83_x','84_x','85_x','86_x','87_x','88_x','89_x','90_x','91_x','92_x','93_x','94_x','95_x','0_y','1_y','2_y','3_y','4_y','5_y','6_y','7_y','8_y','9_y','10_y','11_y','12_y','13_y','14_y','15_y','16_y','17_y','18_y','19_y','20_y','21_y','22_y','23_y','24_y','25_y','26_y','27_y','28_y','29_y','30_y','31_y','32_y','33_y','34_y','35_y','36_y','37_y','38_y','39_y','40_y','41_y','42_y','43_y','44_y','45_y','46_y','47_y','48_y','49_y','50_y','51_y','52_y','53_y','54_y','55_y','56_y','57_y','58_y','59_y','60_y','61_y','62_y','63_y','64_y','65_y','66_y','67_y','68_y','69_y','70_y','71_y','72_y','73_y','74_y','75_y','76_y','77_y','78_y','79_y','80_y','81_y','82_y','83_y','84_y','85_y','86_y','87_y','88_y','89_y','90_y','91_y','92_y','93_y','94_y','95_y'], chunksize=chunksize, iterator=True, encoding='utf-8', ):
df.index += index_start
j+=1
print('{} rows'.format(j*chunksize))
df.to_sql('data', disk_engine, if_exists='append')
index_start = df.index[-1] + 1
This is the o/p that i am getting
15000 rows
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1192 parameters,
-> 1193 context)
1194 except BaseException as e:
~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
506 def do_execute(self, cursor, statement, parameters, context=None):
--> 507 cursor.execute(statement, parameters)
508
OperationalError: too many SQL variables
The above exception was the direct cause of the following exception:
OperationalError Traceback (most recent call last)
<ipython-input-83-b376654c990a> in <module>()
14 j+=1
15 print('{} rows'.format(j*chunksize))
---> 16 df.to_sql('data', disk_engine, if_exists='append')
17 index_start = df.index[-1] + 1
~\Anaconda3\lib\site-packages\pandas\core\generic.py in to_sql(self, name, con, schema, if_exists, index, index_label, chunksize, dtype)
2125 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2126
-> 2127 ExcelWriter can also be used to append to an existing Excel file:
2128
2129 >>> with pd.ExcelWriter('output.xlsx',
~\Anaconda3\lib\site-packages\pandas\io\sql.py in to_sql(frame, name, con, schema, if_exists, index, index_label, chunksize, dtype)
448 index=True,
449 index_label=None,
--> 450 chunksize=None,
451 dtype=None,
452 method=None,
~\Anaconda3\lib\site-packages\pandas\io\sql.py in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype)
1147
1148 #staticmethod
-> 1149 def _query_iterator(
1150 result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None
1151 ):
~\Anaconda3\lib\site-packages\pandas\io\sql.py in insert(self, chunksize)
661 ----------
662 conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
--> 663 keys : list of str
664 Column names
665 data_iter : generator of list
~\Anaconda3\lib\site-packages\pandas\io\sql.py in _execute_insert(self, conn, keys, data_iter)
636 return str(CreateTable(self.table).compile(self.pd_sql.connectable))
637
--> 638 def _execute_create(self):
639 # Inserting table into database, add to MetaData object
640 self.table = self.table.tometadata(self.pd_sql.meta)
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in execute(self, object, *multiparams, **params)
946 raise exc.ObjectNotExecutableError(object)
947 else:
--> 948 return meth(self, multiparams, params)
949
950 def _execute_function(self, func, multiparams, params):
~\Anaconda3\lib\site-packages\sqlalchemy\sql\elements.py in _execute_on_connection(self, connection, multiparams, params)
267 def _execute_on_connection(self, connection, multiparams, params):
268 if self.supports_execution:
--> 269 return connection._execute_clauseelement(self, multiparams, params)
270 else:
271 raise exc.ObjectNotExecutableError(self)
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_clauseelement(self, elem, multiparams, params)
1058 compiled_sql,
1059 distilled_params,
-> 1060 compiled_sql, distilled_params
1061 )
1062 if self._has_events or self.engine._has_events:
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1198 parameters,
1199 cursor,
-> 1200 context)
1201
1202 if self._has_events or self.engine._has_events:
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1411 util.raise_from_cause(
1412 sqlalchemy_exception,
-> 1413 exc_info
1414 )
1415 else:
~\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in raise_from_cause(exception, exc_info)
201 exc_type, exc_value, exc_tb = exc_info
202 cause = exc_value if exc_value is not exception else None
--> 203 reraise(type(exception), exception, tb=exc_tb, cause=cause)
204
205 if py3k:
~\Anaconda3\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause)
184 value.__cause__ = cause
185 if value.__traceback__ is not tb:
--> 186 raise value.with_traceback(tb)
187 raise value
188
~\Anaconda3\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1191 statement,
1192 parameters,
-> 1193 context)
1194 except BaseException as e:
1195 self._handle_dbapi_exception(
~\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
505
506 def do_execute(self, cursor, statement, parameters, context=None):
--> 507 cursor.execute(statement, parameters)
508
509 def do_execute_no_params(self, cursor, statement, context=None):
I have already tried with various chunksize values but it won't worked. Can anyone please suggest me to fix this error. I am running this code in jupyter notebook. I already have updated versions of pandas and other libraries so there is no compatability issue.

This error is related to the number of parameters being passed to sqlite3. In essence, what's happening behind the scenes is that there's a SQL query being issued to the db engine: INSERT INTO myTable (col1, col2, col3,..., col_n) VALUES (?, ?, ?,..., ?), where the ? are the values from your dataframe being passed to the database.
This error occurred because your dataframe is very wide (has lots of columns), so during insertion, many parameters are being passed into the SQL statement. You can actually see in the error stack that justifies my explanation:
--> 507 cursor.execute(statement, parameters)
Simply, SQLite can only handle a limited number of parameters being passed. This is simply a limitation of SQLite. You can scroll down to #9 in this page for more info on this.
Setting chunksize will not resolve your problem. My suggestion is using another db like postgres or mysql.

"not all arguments converted during string formatting" when to_sql

I'm trying the following piece of code, which I found in a 2016 book:
import MySQLdb
import pandas as pd
# database setup omitted for the sake of brevity
nr_customers = 100
colnames = ["movie%i" %i for i in range(1, 33)]
pd.np.random.seed(2015)
generated_customers = pd.np.random.randint(0,2,32 * nr_customers).reshape(nr_customers,32)
data = pd.DataFrame(generated_customers, columns = list(colnames))
data.to_sql('cust',mc,index=True,if_exists='replace',index_label='cust_id')
And it's just giving me the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/MySQLdb/cursors.py in execute(self, query, args)
242 try:
--> 243 query = query % args
244 except TypeError as m:
TypeError: not all arguments converted during string formatting
During handling of the above exception, another exception occurred:
ProgrammingError Traceback (most recent call last)
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1430 else:
-> 1431 cur.execute(*args)
1432 return cur
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/MySQLdb/cursors.py in execute(self, query, args)
244 except TypeError as m:
--> 245 self.errorhandler(self, ProgrammingError, str(m))
246
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/MySQLdb/connections.py in defaulterrorhandler(***failed resolving arguments***)
51 if errorclass is not None:
---> 52 raise errorclass(errorvalue)
53 else:
ProgrammingError: not all arguments converted during string formatting
During handling of the above exception, another exception occurred:
DatabaseError Traceback (most recent call last)
<ipython-input-24-125bb185f2f4> in <module>
4 generated_customers = pd.np.random.randint(0,2,32 * nr_customers).reshape(nr_customers,32)
5 data = pd.DataFrame(generated_customers, columns = list(colnames))
----> 6 data.to_sql('cust',mc,index=True,if_exists='replace',index_label='cust_id')
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/core/generic.py in to_sql(self, name, con, schema, if_exists, index, index_label, chunksize, dtype, method)
2529 sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
2530 index=index, index_label=index_label, chunksize=chunksize,
-> 2531 dtype=dtype, method=method)
2532
2533 def to_pickle(self, path, compression='infer',
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in to_sql(frame, name, con, schema, if_exists, index, index_label, chunksize, dtype, method)
458 pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
459 index_label=index_label, schema=schema,
--> 460 chunksize=chunksize, dtype=dtype, method=method)
461
462
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype, method)
1544 if_exists=if_exists, index_label=index_label,
1545 dtype=dtype)
-> 1546 table.create()
1547 table.insert(chunksize, method)
1548
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in create(self)
570
571 def create(self):
--> 572 if self.exists():
573 if self.if_exists == 'fail':
574 raise ValueError(
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in exists(self)
558
559 def exists(self):
--> 560 return self.pd_sql.has_table(self.name, self.schema)
561
562 def sql_schema(self):
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in has_table(self, name, schema)
1556 "WHERE type='table' AND name={wld};").format(wld=wld)
1557
-> 1558 return len(self.execute(query, [name, ]).fetchall()) > 0
1559
1560 def get_table(self, table_name, schema=None):
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1443 "Execution failed on sql '{sql}': {exc}".format(
1444 sql=args[0], exc=exc))
-> 1445 raise_with_traceback(ex)
1446
1447 #staticmethod
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/compat/__init__.py in raise_with_traceback(exc, traceback)
418 if traceback == Ellipsis:
419 _, _, traceback = sys.exc_info()
--> 420 raise exc.with_traceback(traceback)
421 else:
422 # this version of raise is a syntax error in Python 3
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
1429 cur.execute(*args, **kwargs)
1430 else:
-> 1431 cur.execute(*args)
1432 return cur
1433 except Exception as exc:
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/MySQLdb/cursors.py in execute(self, query, args)
243 query = query % args
244 except TypeError as m:
--> 245 self.errorhandler(self, ProgrammingError, str(m))
246
247 if isinstance(query, unicode):
~/anaconda3/envs/TestEnv/lib/python3.7/site-packages/MySQLdb/connections.py in defaulterrorhandler(***failed resolving arguments***)
50 raise errorvalue
51 if errorclass is not None:
---> 52 raise errorclass(errorvalue)
53 else:
54 raise Exception(errorvalue)
DatabaseError: Execution failed on sql 'SELECT name FROM sqlite_master WHERE type='table' AND name=?;': not all arguments converted during string formatting
Which I can resume in "DatabaseError: Execution failed on sql 'SELECT name FROM sqlite_master WHERE type='table' AND name=?;': not all arguments converted during string formatting"
I've tried different approaches, like using f"${}" and so on, but the error is the same.
The code isn't completely the same as in the book, since I had to remove the flavor = 'mysql' argument used in to_sql.
I'm using:
mysql Ver 8.0.15 for osx10.13 on x86_64 (Homebrew)
Python 3.7.2
conda 4.6.7
pandas 0.24.2 py37h0a44026_0
mysql-connector-c 6.1.11 hccea1a4_0
mysqlclient 1.3.14 py37h1de35cc_0

Nevermind. Just changed to use sqlalchemy with pymysql and saved a lot of time and LOCs:
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://user:password#localhost/database')
...
data.to_sql(table, con = engine)

Reading SQLite DB table into Python

I am having trouble reading data into Python from a sqlite database located in the same directory as the Jupyter notebook where I am doing my work.
The error message (below) led me to believe that the table Player_Attributes does not exist in the database, but after exploring it with DB Browser for SQLite, I see that is is indeed there.
Any guidance would be appreciated.
Code:
cnx = sqlite3.connect('database.sqlite')
df = pd.read_sql_query('SELECT * FROM Player_Attributes', cnx)
Error message:
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) /home/captain/anaconda3/lib/python3.5/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs) 1403 else:
-> 1404 cur.execute(*args) 1405 return cur
OperationalError: no such table: Player_Attributes
During handling of the above exception, another exception occurred:
DatabaseError Traceback (most recent call last) <ipython-input-4-c51fe6ea9537> in <module>()
5
6 cnx = sqlite3.connect('database.sqlite')
----> 7 df = pd.read_sql_query('SELECT * FROM Player_Attributes', cnx)
/home/captain/anaconda3/lib/python3.5/site-packages/pandas/io/sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
330 return pandas_sql.read_query(
331 sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 332 parse_dates=parse_dates, chunksize=chunksize)
333
334
/home/captain/anaconda3/lib/python3.5/site-packages/pandas/io/sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize) 1437 1438 args = _convert_params(sql, params)
-> 1439 cursor = self.execute(*args) 1440 columns = [col_desc[0] for col_desc in cursor.description] 1441
/home/captain/anaconda3/lib/python3.5/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs) 1414 ex = DatabaseError( 1415 "Execution failed on sql '%s': %s" % (args[0], exc))
-> 1416 raise_with_traceback(ex) 1417 1418 #staticmethod
/home/captain/anaconda3/lib/python3.5/site-packages/pandas/compat/__init__.py in raise_with_traceback(exc, traceback)
342 if traceback == Ellipsis:
343 _, _, traceback = sys.exc_info()
--> 344 raise exc.with_traceback(traceback)
345 else:
346 # this version of raise is a syntax error in Python 3
/home/captain/anaconda3/lib/python3.5/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs) 1402 cur.execute(*args, **kwargs) 1403 else:
-> 1404 cur.execute(*args) 1405 return cur 1406 except Exception as exc:
DatabaseError: Execution failed on sql 'SELECT * FROM Player_Attributes': no such table: Player_Attributes

cnx = sqlite3.connect('database.sqlite')
In this line, instead of just specifying the name of the document, you have to provide the complete path of the file.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to minimize code when using sqlalchemy query and pandas dataframes - python

Related

trying to create a pivot table with pandas and numpy

Python Multiprocessing Parallel Insert Into Oracle SQL

Python Sql code error - sqlite3.OperationalError: too many SQL variables

"not all arguments converted during string formatting" when to_sql

Reading SQLite DB table into Python

Categories

Resources