How can I get keyword arguments in sqlalchemy.FunctionElement? - python

I am trying to create a function which would produce statement equivalent to datetime.utcnow() + timedelta(days=5, minutes=4). I want to be able to call it like utc_after(days=5, minutes=4).
It should be similar to utcnow(), as described in SQLAlchemy documentation.
Here is an example of what I got working so far (one dialect only for brevity):
from sqlalchemy.sql.expression import FunctionElement
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.types import DateTime
class utc_after(FunctionElement):
    """SQL function element meaning "current UTC time plus an offset".

    The offsets are passed positionally (days, then hours) and are read
    back from ``element.clauses`` by the dialect compiler below.
    """

    type = DateTime()
    name = 'utc_after'


@compiles(utc_after, 'sqlite')
def sqlite_utc_after(element, compiler, **kwargs):
    """Render ``utc_after(days, hours)`` as a SQLite ``datetime()`` call."""
    # Exactly two positional clauses are expected; anything else fails to
    # unpack here.
    days, hours = list(element.clauses)
    return "datetime('now', '+%s day', '+%s hours')" % (days.value, hours.value)
It works. I can use it as in:
week_after_submission = db.Column(db.DateTime, default=utc_after(7, 0))
Obviously this is only a stub of the final code (it needs +/- formatting, minutes, seconds and so on).
The question is: how can I use keyword arguments from FunctionElement so I could have:
next_week = db.Column(db.DateTime, default=utc_after(days=7))
When I specify utc_after(days=7), element.clauses is empty.
So far I tried using kwargs from sqlite_utc_after (these are empty), digging through element properties, and searching clues in documentation, without results.

Set the keyword argument as a property.
from sqlalchemy.sql.functions import GenericFunction
from sqlalchemy.ext.compiler import compiles
from sqlalchemy import Table, Column, String, MetaData, func, select


class last_value(GenericFunction):
    """``last_value`` SQL function with an optional ``ignore nulls`` flag.

    The keyword argument is stored as a plain attribute on the element so
    that the compiler hook can read it back later.
    """

    def __init__(self, *clauses, ignore_nulls=False, **kwargs):
        # Keep the flag on the instance; only the positional clauses and
        # remaining keyword arguments are forwarded to GenericFunction.
        self.ignore_nulls = ignore_nulls
        super().__init__(*clauses, **kwargs)


@compiles(last_value)
def visit_last_value(element, compiler, **kwargs):
    """Render ``last_value(<clauses>[ ignore nulls])``."""
    clauses = element.clauses
    ignore_nulls_clause = ' ignore nulls' if element.ignore_nulls else ''
    return 'last_value(%s%s)' % (compiler.process(clauses), ignore_nulls_clause)


def test_last_value():
    """The keyword argument must show up in the compiled SQL."""
    m = MetaData()
    t = Table('test', m, Column('a', String), Column('b', String))
    c = func.last_value(t.c.a, ignore_nulls=True)
    s = select([c])
    # Collapse whitespace so line breaks in the rendered SQL do not matter.
    assert ' '.join(str(s).split()) == 'SELECT last_value(test.a ignore nulls) AS last_value_1 FROM test'

Related

Mocking sqlobject function call for test db

I am trying to mock sqlbuilder.func for test cases with pytest
I successfully mocked sqlbuilder.func.TO_BASE64 with correct output but when I tried mocking sqlbuilder.func.FROM_UNIXTIME I didn't get any error but the resulted output is incorrect with the generated query. Below is the minimal working example of the problem.
models.py
from sqlobject import (
sqlbuilder,
sqlhub,
SQLObject,
StringCol,
BLOBCol,
TimestampCol,
)
class Store(SQLObject):
name = StringCol()
sample = BLOBCol()
createdAt = TimestampCol()
DATE_FORMAT = "%Y-%m-%d"
def retrieve(name):
query = sqlbuilder.Select([
sqlbuilder.func.TO_BASE64(Store.q.sample),
],
sqlbuilder.AND(
Store.q.name == name,
sqlbuilder.func.FROM_UNIXTIME(Store.q.createdAt, DATE_FORMAT) >= sqlbuilder.func.FROM_UNIXTIME("2018-10-12", DATE_FORMAT)
)
)
connection = sqlhub.getConnection()
query = connection.sqlrepr(query)
print(query)
queryResult = connection.queryAll(query)
return queryResult
conftest.py
import pytest
from models import Store
from sqlobject import sqlhub
from sqlobject.sqlite import sqliteconnection
@pytest.fixture(autouse=True, scope="session")
def sqlite_db_session(tmpdir_factory):
    """Session-wide fixture: open a SQLite file database and create the tables.

    The connection is installed as ``sqlhub.processConnection``, yielded to
    the tests, and closed once the session is over.
    """
    db_file = tmpdir_factory.mktemp("db").join("sqlite.db")
    conn = sqliteconnection.SQLiteConnection(str(db_file))
    sqlhub.processConnection = conn
    init_tables()
    yield conn
    conn.close()


def init_tables():
    """Create the ``Store`` table unless it already exists."""
    Store.createTable(ifNotExists=True)
test_ex1.py
import pytest
from sqlobject import sqlbuilder
from models import retrieve
try:
import mock
from mock import MagicMock
except ImportError:
from unittest import mock
from unittest.mock import MagicMock
def TO_BASE64(x):
    """Stand-in for ``func.TO_BASE64``: hand the argument back unchanged."""
    return x


def FROM_UNIXTIME(x, y):
    """Stand-in for ``func.FROM_UNIXTIME`` that returns a plain string.

    The second argument (the date format) is accepted but not used.
    """
    return 'strftime("%Y%m%d", datetime({},"unixepoch", "localtime"))'.format(x)


# Earlier attempts, kept for reference:
# @mock.patch("sqlobject.sqlbuilder.func.TO_BASE64")
# @mock.patch("sqlobject.sqlbuilder.func.TO_BASE64", MagicMock(side_effect=lambda x: x))
# @mock.patch("sqlobject.sqlbuilder.func.TO_BASE64", new_callable=MagicMock(side_effect=lambda x: x))
@mock.patch("sqlobject.sqlbuilder.func.TO_BASE64", TO_BASE64)
@mock.patch("sqlobject.sqlbuilder.func.FROM_UNIXTIME", FROM_UNIXTIME)
def test_retrieve():
    """Run ``retrieve`` with both SQL functions patched; the table is empty."""
    result = retrieve('Some')
    assert result == []
Current SQL:
SELECT store.sample FROM store WHERE (((store.name) = ('Some')) AND (1))
Expected SQL:
SELECT
store.sample
FROM
store
WHERE
store.name = 'Some'
AND
strftime(
'%Y%m%d',
datetime(store.created_at, 'unixepoch', 'localtime')
) >= strftime(
'%Y%m%d',
datetime('2018-10-12', 'unixepoch', 'localtime')
)
Edit Example
#! /usr/bin/env python
from sqlobject import *
__connection__ = "sqlite:/:memory:?debug=1&debugOutput=1"
try:
import mock
from mock import MagicMock
except ImportError:
from unittest import mock
from unittest.mock import MagicMock
class Store(SQLObject):
name = StringCol()
sample = BLOBCol()
createdAt = TimestampCol()
Store.createTable()
DATE_FORMAT = "%Y-%m-%d"
def retrieve(name):
query = sqlbuilder.Select([
sqlbuilder.func.TO_BASE64(Store.q.sample),
],
sqlbuilder.AND(
Store.q.name == name,
sqlbuilder.func.FROM_UNIXTIME(Store.q.createdAt, DATE_FORMAT) >= sqlbuilder.func.FROM_UNIXTIME("2018-10-12", DATE_FORMAT)
)
)
connection = Store._connection
query = connection.sqlrepr(query)
queryResult = connection.queryAll(query)
return queryResult
def TO_BASE64(x):
return x
def FROM_UNIXTIME(x, y):
return 'strftime("%Y%m%d", datetime({},"unixepoch", "localtime"))'.format(x)
for p in [
mock.patch("sqlobject.sqlbuilder.func.TO_BASE64",TO_BASE64),
mock.patch("sqlobject.sqlbuilder.func.FROM_UNIXTIME",FROM_UNIXTIME),
]:
p.start()
retrieve('Some')
mock.patch.stopall()
By default, sqlbuilder.func is an SQLExpression that passes its attribute (sqlbuilder.func.datetime, e.g.) to the SQL backend as a constant (sqlbuilder.func actually is an alias for sqlbuilder.ConstantSpace). See the docs about SQLExpression, the FAQ and the code for func.
When you mock an attribute in func namespace it's evaluated by SQLObject and passed to the backend in reduced form. If you want to return a string literal from the mocking function you need to tell SQLObject it's a value that has to be passed to the backend as is, unevaluated. The way to do it is to wrap the literal in SQLConstant like this:
def FROM_UNIXTIME(x, y):
    """Mock for ``func.FROM_UNIXTIME`` that emits raw SQL text.

    Wrapping the string in ``SQLConstant`` makes SQLObject pass it to the
    backend as is. The format argument ``y`` is accepted but not used.
    """
    return sqlbuilder.SQLConstant('strftime("%Y%m%d", datetime({},"unixepoch", "localtime"))'.format(x))
See SQLConstant.
The entire test script now looks like this:
#! /usr/bin/env python3.7
from sqlobject import *

__connection__ = "sqlite:/:memory:?debug=1&debugOutput=1"

try:
    import mock
    from mock import MagicMock
except ImportError:
    from unittest import mock
    from unittest.mock import MagicMock


class Store(SQLObject):
    name = StringCol()
    sample = BLOBCol()
    createdAt = TimestampCol()


Store.createTable()

DATE_FORMAT = "%Y-%m-%d"


def retrieve(name):
    """Select the samples of the stores called ``name`` created since 2018-10-12."""
    query = sqlbuilder.Select(
        [
            sqlbuilder.func.TO_BASE64(Store.q.sample),
        ],
        sqlbuilder.AND(
            Store.q.name == name,
            sqlbuilder.func.FROM_UNIXTIME(Store.q.createdAt, DATE_FORMAT) >= sqlbuilder.func.FROM_UNIXTIME("2018-10-12", DATE_FORMAT)
        )
    )
    connection = Store._connection
    # Render the expression tree to an SQL string before running it.
    query = connection.sqlrepr(query)
    queryResult = connection.queryAll(query)
    return queryResult


def TO_BASE64(x):
    """Mock for ``func.TO_BASE64``: return the argument unchanged."""
    return x


def FROM_UNIXTIME(x, y):
    """Mock for ``func.FROM_UNIXTIME``: raw SQL wrapped in ``SQLConstant``.

    The format argument ``y`` is accepted but not used.
    """
    return sqlbuilder.SQLConstant('strftime("%Y%m%d", datetime({},"unixepoch", "localtime"))'.format(x))


for p in [
    mock.patch("sqlobject.sqlbuilder.func.TO_BASE64", TO_BASE64),
    mock.patch("sqlobject.sqlbuilder.func.FROM_UNIXTIME", FROM_UNIXTIME),
]:
    p.start()

try:
    retrieve('Some')
finally:
    # Undo the patches even when the query fails.
    mock.patch.stopall()
The output is:
1/Query : CREATE TABLE store (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT,
sample TEXT,
created_at TIMESTAMP
)
1/QueryR : CREATE TABLE store (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT,
sample TEXT,
created_at TIMESTAMP
)
2/QueryAll: SELECT store.sample FROM store WHERE (((store.name) = ('Some')) AND ((strftime("%Y%m%d", datetime(store.created_at,"unixepoch", "localtime"))) >= (strftime("%Y%m%d", datetime(2018-10-12,"unixepoch", "localtime")))))
2/QueryR : SELECT store.sample FROM store WHERE (((store.name) = ('Some')) AND ((strftime("%Y%m%d", datetime(store.created_at,"unixepoch", "localtime"))) >= (strftime("%Y%m%d", datetime(2018-10-12,"unixepoch", "localtime")))))
2/QueryAll-> []
PS. Full disclosure: I'm the current maintainer of SQLObject.
As @phd pointed out, SQLObject evaluates the expression before passing it to the backend in reduced form.
So, instead of returning a string literal, we can also return an expression built with sqlbuilder.func, which SQLObject will evaluate itself:
def FROM_UNIXTIME(x, y):
    """Mock for ``func.FROM_UNIXTIME`` built from ``sqlbuilder.func`` calls.

    Returns an expression rather than a string, so SQLObject renders it
    itself. The format argument ``y`` is accepted but not used.
    """
    return sqlbuilder.func.strftime("%Y%m%d", sqlbuilder.func.datetime(x, "unixepoch", "localtime"))
Output:
SELECT store.sample FROM store WHERE (((store.name) = ('Some')) AND ((strftime("%Y%m%d", datetime(store.created_at,"unixepoch", "localtime"))) >= (strftime("%Y%m%d", datetime(2018-10-12,"unixepoch", "localtime")))))

Reset index name in elasticsearch dsl

I'm trying to create an ETL that extracts from mongo, process the data and loads into elastic. I will do a daily load so I thought of naming my index with the current date. This will help me for a later processing I need to do with this first index.
I used elasticsearch dsl guide: https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html
The problem that I have comes from my little experience with working with classes. I don't know how to reset the Index name from the class.
Here is my code for the class (custom_indices.py):
from elasticsearch_dsl import Document, Date, Integer, Keyword, Text
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Search
import datetime
class News(Document):
    """Processed news item stored in a per-day index."""

    title = Text(analyzer='standard', fields={'raw': Keyword()})
    manual_tagging = Keyword()

    class Index:
        # This expression runs once, when the class body is executed, so
        # the date in the name is fixed from then on.
        name = 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")

    def save(self, **kwargs):
        """Persist the document; simply defers to ``Document.save``."""
        return super(News, self).save(**kwargs)

    def is_published(self):
        """Return whether the ``processed`` timestamp is not in the future."""
        # The module is imported as ``import datetime``, so the class has to
        # be reached as ``datetime.datetime``.
        # NOTE(review): ``processed`` is not declared as a field on this
        # class - confirm where it is set.
        return datetime.datetime.now() >= self.processed
And this is the part of the code where I create the instance to that class:
from custom_indices import News
import elasticsearch
import elasticsearch_dsl
from elasticsearch_dsl.connections import connections
import pandas as pd
import datetime
connections.create_connection(hosts=['localhost'])
News.init()
for index, doc in df.iterrows():
new_insert = News(meta={'id': doc.url_hashed},
title = doc.title,
manual_tagging = doc.customTags,
)
new_insert.save()
Every time I call the "News" class I would expect to have a new name. However, the name doesn't change even if I load the class again (from custom_indices import News). I know this is only a problem I have when testing but I'd like to know how to force that "reset". Actually, I originally wanted to change the name outside the class with this line right before the loop:
News.Index.name = "NEW_NAME"
However, that didn't work. I was still seeing the name defined on the class.
Could anyone please assist?
Many thanks!
PS: this must be just an object oriented programming issue. Apologies for my ignorance on the subject.
Maybe you could take advantage of the fact that Document.init() accepts an index keyword argument. If you want the index name to get set automatically, you could implement init() in the News class and call super().init(...) in your implementation.
A simplified example (python 3.x):
from elasticsearch_dsl import Document
from elasticsearch_dsl.connections import connections
import datetime
class News(Document):
    """Document whose ``init()`` defaults to an index named after today."""

    @classmethod
    def init(cls, index=None, using=None):
        """Call ``Document.init`` with ``index``, or with today's name if none is given."""
        index_name = index or 'processed_news_' + datetime.datetime.now().strftime("%Y%m%d")
        return super().init(index=index_name, using=using)
You can override the index when you call save() .
new_insert.save('processed_news_' + datetime.datetime.now().strftime("%Y%m%d"))
Example as following.
# coding: utf-8
import datetime
from elasticsearch_dsl import Keyword, Text, \
Index, Document, Date
from elasticsearch_dsl.connections import connections
HOST = "localhost:9200"
index_names = [
"foo-log-",
"bar-log-",
]
default_settings = {"number_of_shards": 4, "number_of_replicas": 1}
index_settings = {
"foo-log-": {
"number_of_shards": 40,
"number_of_replicas": 1
}
}
class LogDoc(Document):
level = Keyword(ignore_above=256)
date = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")
hostname = Text(fields={'fields': Keyword(ignore_above=256)})
message = Text()
createTime = Date(format="yyyy-MM-dd'T'HH:mm:ss.SSS")
def auto_create_index():
    """Automatically create the ES indices.

    For today and the following two days, one index per prefix in
    ``index_names`` is created, named ``<prefix><YYYY-MM-DD>``. A prefix
    without an entry in ``index_settings`` gets ``default_settings``.
    """
    connections.create_connection(hosts=[HOST])
    for day in range(3):
        dt = datetime.datetime.now() + datetime.timedelta(days=day)
        for index in index_names:
            name = index + dt.strftime("%Y-%m-%d")
            settings = index_settings.get(index, default_settings)
            idx = Index(name=name)
            idx.document(LogDoc)
            idx.settings(**settings)
            try:
                idx.create()
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # index.
                print(e)
                continue
            print("create index %s" % name)
if __name__ == '__main__':
auto_create_index()

Error while calling method defined inside a class

I am trying to create a class and I can't seem to get it to work? I'm fairly new to Python, so any assistance would be appreciated. Also, not sure if this is the most efficient way to create and use an object. I am trying to build a well model and this is one piece of that model, once I get this simple issue figured out the rest should be fairly easy. Thanks.
import sys
import os
import csv
import pyodbc
import pandas as pd
import pandas.io.sql as psql
from pandas import Series, DataFrame
from time import gmtime, strftime
#Drill Pipe Class
class DP:
#Properties
DP_ID = 1.00
DP_OD = 1.00
DP_Name = 'Drill Pipe'
#test global
idwel = '6683AFCEA5DF429CAC123213F85EB9B3'
#Constructor <- Accepts idwell to get info
def __init__(self,idwell):
self.id = idwell
#..
#WV DB connecton Function -> return as dataframe -Updated 7/5/17
def WV_Read_Query(_query):
try:
cnxn = pyodbc.connect("DSN=SQL_R_WV")
cur = cnxn.cursor()
df = psql.read_sql(_query, cnxn)
cnxn.close()
#print(df)
return df
except "Error":
return "Query Error...!"
#..
def get_DP_Data(_id):
_id = str(_id)
DP_Query = """Select Top 1
DS.des as 'dp_name',DS.SZIDNOM as 'dp_id',
DS.SZODNOM as 'dp_od',DS.SYSCREATEDATE as 'date'
From [dbo].[US_WVJOBDRILLSTRINGCOMP] DS
Where IDWELL = '""" + _id +"""'
AND Des = 'Drill Pipe' Order by SYSCREATEDATE Desc"""
mud_Data = WV_Read_Query(DP_Query)
return mud_Data
#..
DP_Table = get_DP_Data(id)
def get_DP_ID(self, DP_Table):
dp_id = DP_Table['dp_id']
return dp_id
#..
def get_DP_OD(self, DP_Table):
dp_od = DP_Table['dp_od']
return dp_od
#..
def get_Date(self, DP_Table):
u_date = DP_Table['date']
return u_date
#..
def get_Des(self, DP_Table):
des = DP_Table['dp_name']
return des
#..
#Print DP Info
def DP_Info(self):
Des = get_Des()
ID = get_DP_ID()
OD = get_DP_OD()
Updated = strftime("%Y-%m-%d %H:%M:%S", gmtime())
return Des + "\nDP Id:\t" + ID + "\nDP Id:\t" + OD + "\nUpdated:\t" + Updated
#..
#...
dp = DP('6683AFCEA5DF429CAC123213F85EB9B3')
dp_info = dp.DP_Info()
print(dp_info)
Traceback (most recent call last): File "u:\Development\Python
Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 71, in
class DP: File "u:\Development\Python Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 108, in DP
DP_Table = get_DP_Data(id) File "u:\Development\Python Scripts\HCP\CUC Export Files 8_7_17\Well_Model.py", line 104, in
get_DP_Data
mud_Data = WV_Read_Query(DP_Query) NameError: name 'WV_Read_Query' is not defined
If you are defining non-static, non-class methods within a class, the first argument is always an instance of that class. We usually call this argument self:
def WV_Read_Query(self, _query):
...
And,
def get_DP_Data(self, _id):
Furthermore, you call these methods on the object self:
self.WV_Read_Query(DP_Query)
You might wonder why the function is defined with 2 arguments, but only 1 is passed to it. That's because the instance is implicitly passed as the first parameter, automatically.
This is equivalent to
DP.WV_Read_Query(self, DP_Query)
Where you call the method on the class, but explicitly pass the instance to it.
Further reading:
Python classes
What is the difference between class and instance methods?
You need to access it with self. So
def get_DP_Data(self, _id):
_id = str(_id)
DP_Query = """Select Top 1
DS.des as 'dp_name',DS.SZIDNOM as 'dp_id',
DS.SZODNOM as 'dp_od',DS.SYSCREATEDATE as 'date'
From [dbo].[US_WVJOBDRILLSTRINGCOMP] DS
Where IDWELL = '""" + _id +"""'
AND Des = 'Drill Pipe' Order by SYSCREATEDATE Desc"""
mud_Data = self.WV_Read_Query(DP_Query)
return mud_Data
You also need to add self to several of your methods. The class instance will always be the first parameter in a method unless you define it as a staticmethod using the decorator staticmethod.

cx_Oracle: How can I receive each row as a dictionary?

By default, cx_Oracle returns each row as a tuple.
>>> import cx_Oracle
>>> conn=cx_Oracle.connect('scott/tiger')
>>> curs=conn.cursor()
>>> curs.execute("select * from foo");
>>> curs.fetchone()
(33, 'blue')
How can I return each row as a dictionary?
You can override the cursor's rowfactory method. You will need to do this each time you perform the query.
Here's the results of the standard query, a tuple.
curs.execute('select * from foo')
curs.fetchone()
(33, 'blue')
Returning a named tuple:
def makeNamedTupleFactory(cursor):
    """Return a ``Row`` namedtuple class for the cursor's current result set.

    Field names are the lower-cased column names from
    ``cursor.description``. The class itself is the row factory: calling
    it with one row's column values builds a ``Row``.
    """
    import collections

    columnNames = [d[0].lower() for d in cursor.description]
    return collections.namedtuple('Row', columnNames)
curs.rowfactory = makeNamedTupleFactory(curs)
curs.fetchone()
Row(x=33, y='blue')
Returning a dictionary:
def makeDictFactory(cursor):
    """Return a row factory mapping column names to the row's values.

    The column names are read from ``cursor.description`` once, when the
    factory is built, and keep the case reported by the cursor.
    """
    columnNames = [d[0] for d in cursor.description]

    def createRow(*args):
        return dict(zip(columnNames, args))

    return createRow
curs.rowfactory = makeDictFactory(curs)
curs.fetchone()
{'Y': 'brown', 'X': 1}
Credit to Amaury Forgeot d'Arc:
http://sourceforge.net/p/cx-oracle/mailman/message/27145597
A very short version:
curs.rowfactory = lambda *args: dict(zip([d[0] for d in curs.description], args))
Tested on Python 3.7.0 & cx_Oracle 7.1.2
Old question but adding some helpful links with a Python recipe
According to cx_Oracle documentation:
Cursor.rowfactory
This read-write attribute specifies a method to call for each row that
is retrieved from the database. Ordinarily a tuple is returned for
each row but if this attribute is set, the method is called with the
tuple that would normally be returned, and the result of the method is
returned instead.
The cx_Oracle - Python Interface for Oracle Database Also points to GitHub repository for lots of helpful sample examples. Please check GenericRowFactory.py.
Googled: This PPT can be further helpful: [PDF]CON6543 Python and Oracle Database - RainFocus
Recipe
Django database backend for Oracle under the hood uses cx_Oracle. In earlier versions ( Django 1.11- ) they have written _rowfactory(cursor, row) That also cast cx_Oracle's numeric data types into relevant Python data and strings into unicode.
If you have installed Django Please check base.py as follows:
$ DJANGO_DIR="$(python -c 'import django, os; print(os.path.dirname(django.__file__))')"
$ vim $DJANGO_DIR/db/backends/oracle/base.py
One can borrow _rowfactory() from $DJANGO_DIR/db/backends/oracle/base.py and can apply below decorator naming to make it return namedtuple instead of simple tuple.
mybase.py
import functools
from itertools import izip, imap
from operator import itemgetter
from collections import namedtuple
import cx_Oracle as Database
import decimal
def naming(rename=False, case=None):
    """Decorate a row factory so that it returns namedtuples.

    Args:
        rename: forwarded to ``namedtuple``; when true, column names that
            are not valid field names are replaced with positional names.
        case: optional callable applied to each column name (for example
            ``str.lower``); ``None`` keeps the names as reported.

    The wrapped factory is called as ``rowfactory(cursor, row)`` and must
    return the row's values in column order.
    """
    def decorator(rowfactory):
        # Building a namedtuple class is expensive, so build one per
        # (typename, column names) combination instead of one per row.
        row_types = {}

        @functools.wraps(rowfactory)
        def decorated_rowfactory(cursor, row, typename="GenericRow"):
            # A comprehension instead of itertools.imap keeps this working
            # on both Python 2 and Python 3.
            field_names = tuple(
                column[0] if case is None else case(column[0])
                for column in cursor.description
            )
            key = (typename, field_names)
            row_type = row_types.get(key)
            if row_type is None:
                row_type = namedtuple(typename, field_names, rename=rename)
                row_types[key] = row_type
            return row_type._make(rowfactory(cursor, row))

        return decorated_rowfactory

    return decorator
use it as:
#naming(rename=False, case=str.lower)
def rowfactory(cursor, row):
casted = []
....
....
return tuple(casted)
oracle.py
import cx_Oracle as Database
from cx_Oracle import *
import mybase
class Cursor(Database.Cursor):
    """cx_Oracle cursor that installs ``mybase.rowfactory`` on its results."""

    def execute(self, statement, args=None):
        """Execute ``statement`` and install the row factory if it is new.

        The factory is set only when a different statement was executed
        and it produced a result set (``description`` is truthy).
        """
        # Must be evaluated before executing: it compares against the
        # statement the cursor held previously.
        prepareNested = (statement is not None and self.statement != statement)
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        result = super(Cursor, self).execute(statement, args or [])
        if prepareNested and self.description:
            self.rowfactory = lambda *row: mybase.rowfactory(self, row)
        return result

    def close(self):
        """Close the cursor, ignoring the error raised if it is already closed."""
        try:
            super(Cursor, self).close()
        except Database.InterfaceError:
            pass  # already closed
class Connection(Database.Connection):
    """cx_Oracle connection whose cursors are ``Cursor`` instances."""

    def cursor(self):
        """Return a new ``Cursor`` bound to this connection."""
        return Cursor(self)
connect = Connection
Now, instead of importing cx_Oracle, import oracle in the user script:
user.py
import oracle

dsn = oracle.makedsn('HOSTNAME', 1521, service_name='dev_server')
# Only the module name ``oracle`` is imported, so ``connect`` has to be
# reached through it.
db = oracle.connect('username', 'password', dsn)
cursor = db.cursor()
cursor.execute("""
SELECT 'Grijesh' as FirstName,
'Chauhan' as LastName,
CAST('10560.254' AS NUMBER(10, 2)) as Salary
FROM DUAL
""")
row = cursor.fetchone()
print ("First Name is %s" % row.firstname) # => Grijesh
print ("Last Name is %s" % row.lastname) # => Chauhan
print ("Salary is %r" % row.salary) # => Decimal('10560.25')
Give it a Try!!
Building on the answer by @maelcum73:
curs.rowfactory = lambda *args: dict(zip([d[0] for d in curs.description], args))
The issue with this solution is that you need to re-set this after every execution.
Going one step further, you can create a shell around the cursor object like so:
class dictcur(object):
    """Thin wrapper around a cursor that makes fetched rows dictionaries.

    ``execute`` is intercepted to re-install a dict-building ``rowfactory``
    after every execution; every other attribute is looked up on the
    wrapped cursor.
    """

    def __init__(self, cursor):
        self._original_cursor = cursor

    def execute(self, *args, **kwargs):
        """Execute on the wrapped cursor, then set its ``rowfactory``.

        Returns the wrapped cursor.
        """
        wrapped = self._original_cursor
        # rowfactory needs to be set AFTER EACH execution!
        wrapped.execute(*args, **kwargs)

        def row_as_dict(*values):
            # The description is read when a row is built, not here.
            return dict(zip([d[0] for d in wrapped.description], values))

        wrapped.rowfactory = row_as_dict
        # Hand the real cursor back, as the wrapped execute method is
        # expected to do.
        return wrapped

    def __getattr__(self, attr):
        # Anything other than execute goes straight to the wrapped cursor.
        return getattr(self._original_cursor, attr)
dict_cursor = dictcur(cursor=conn.cursor())
Using this dict_cursor, the rows fetched after every subsequent dict_cursor.execute() call are returned as dictionaries. Note: I tried monkeypatching the execute method directly; however, that was not possible because it is a built-in method.

Results from an sqlalchemy query as iterator

I am struggling to create an iterator from a query from sqlalchemy.
Here is what I tried so far
create a table
from sqlalchemy import create_engine, Column, MetaData, Table , Integer, String
engine = create_engine('sqlite:///test90.db')
conn = engine.connect()
metadata = MetaData()
myTable = Table('myTable', metadata,
Column('Doc_id', Integer, primary_key=True),
Column('Doc_Text', String))
metadata.create_all(engine)
conn.execute(myTable.insert(), [{'Doc_id': 1, 'Doc_Text' : 'first sentence'},
{'Doc_id': 2, 'Doc_Text' : 'second sentence'},
{'Doc_id': 3, 'Doc_Text' : 'third sentence'},
{'Doc_id': 4, 'Doc_Text' : 'fourth sentence'}
])
I read everything I could on iterator but do not get it.
Here the class I created to get an iterator but it does not work
(it overflows although I specify a break)
from sqlalchemy import create_engine
class RecordsIterator:
def __init__(self, xDB, xSQL):
self.engine = create_engine(xDB)
self.conn = self.engine.connect()
self.xResultCollection = self.conn.execute(xSQL)
def __iter__(self):
return self
def next (self):
while self.xResultCollection.closed is False:
xText = (self.xResultCollection.fetchone())[1]
xText = xText.encode('utf-8')
yield xText.split()
if not self.xResultCollection:
break
x1 = RecordsIterator(xDB = 'sqlite:///test91.db', xSQL = 'select * from myTable')
In case you are wondering why I am not just using a generator .
I need to feed the iterator in gensim.Word2Vec and unfortunately, it does not take a generator
import gensim
gensim.models.Word2Vec(x1)
Thanks in advance
Your check if not self.xResultCollection will always return False, as the truth value of the result object will always be True.
In your next method you have a for and a while loop, which shouldn't really be needed, the next method should just return one element, there's no need for a loop there.
As self.xResultCollection is itself an iterable you could just do:
class RecordsIterator:
    """Iterator over the tokenised second column of a query result.

    Each step returns column 1 of the next row, UTF-8 encoded and split on
    whitespace.
    """

    def __init__(self, xDB, xSQL):
        self.engine = create_engine(xDB)
        self.conn = self.engine.connect()
        self.resultIterator = iter(self.conn.execute(xSQL))

    def __iter__(self):
        return self

    def next(self):
        """Return the tokens of the next row; raise StopIteration at the end."""
        return next(self.resultIterator)[1].encode('utf-8').split()

    # Python 3 looks for __next__, Python 2 for next; provide both.
    __next__ = next
For those interested in a using this with gensim.
It turns out that the problem was that gensim wants an iterable that can be iterated over more than once (iterating over the results of a query cursor consumes it).
see discussions here
this is what seems to work for me
import gensim
from sqlalchemy import create_engine
xDB = 'sqlite:///test91.db'
xSQL = 'select * from myTable'
engine = create_engine(xDB)
conn = engine.connect()
xResultIterator = conn.execute(xSQL)
class MyIterator(object):
    """Re-iterable view over one text column of a query result.

    NOTE: ``__iter__`` relies on the module-level names ``conn`` and
    ``xSQL`` to re-run the query once a pass is finished.
    """

    def __init__(self, xResults, xNrCol):
        self.xResults = xResults
        self.xNrCol = xNrCol

    def __iter__(self):
        """Yield the lower-cased, UTF-8 encoded tokens of every record."""
        for xRecord in self.xResults:
            xText = (xRecord[self.xNrCol]).lower().encode('utf8')
            xToken = xText.split()
            if not xToken:
                # Skip records whose text contains no tokens.
                continue
            yield xToken
        # A finished pass has consumed the result set, so run the query
        # again to give the next pass fresh rows.
        self.xResults = conn.execute(xSQL)  ### THIS SEEMS TO FIX IT
#to use
q1 = MyIterator(xResultIterator, xNrCol = 1)
model = gensim.models.Word2Vec(sentences = q1 , min_count = 1)
and here the vocabulary
model.vocab.keys()
I run this on a postgresql with 1 Million entries (titles of scientific papers) in about 90 seconds without problem
I hope this will help someone else

Categories

Resources