spaCy 3.0: can't add custom lookups for Lemmatizer - python

I used the code below to add custom Lookups to a custom Language class:
def create_lookups():
    """Build a Lookups object holding the four lemmatizer tables.

    LOOKUP is the in-memory lookup table; the rules/index/exceptions
    tables are loaded from JSON files that sit next to this module.
    """
    lookups = Lookups()
    lookups.add_table("lemma_lookup", LOOKUP)
    lookups.add_table("lemma_rules", json_to_dict('lemma_rules.json'))
    lookups.add_table("lemma_index", json_to_dict('lemma_index.json'))
    lookups.add_table("lemma_exc", json_to_dict('lemma_exc.json'))
    return lookups


def json_to_dict(filename):
    """Load *filename* from this module's directory and return the parsed JSON."""
    # Resolve relative to this file, not the process working directory.
    location = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    with open(os.path.join(location, filename)) as f_in:
        return json.load(f_in)
# NOTE(review): the original post showed "#CustomeLanguage.factory(" — the "#"
# is a mangled "@" decorator marker, and "Custome" a typo for "CustomLanguage".
@CustomLanguage.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "lookup", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool
):
    """Factory: build a Lemmatizer and attach the custom lookup tables to it."""
    lemmatizer = Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
    lemmatizer.lookups = create_lookups()
    return lemmatizer
But when I instantiate the CustomLanguage there is no lookup table in nlp.vocab.lookups. What is the problem and how can I solve it?

The lemmatizer lookups are no longer in the vocab. They're stored in the lemmatizer component under nlp.get_pipe("lemmatizer").lookups instead.
If your lemmatizer factory creates the lemmatizer like this, anyone loading the model will need to have these JSON files available or the model won't load. (The lookup tables are saved in the model, but your make_lemmatizer method just hasn't been written with this in mind.)
Instead, create a custom lemmatizer class that loads these tables in its initialize method and then your code would look like this to add a lemmatizer and load its tables once.
nlp = spacy.blank("lg")
nlp.add_pipe("lemmatizer").initialize()
nlp.to_disk("/path/to/model")
Once you've run initialize() once for the lemmatizer, the tables are saved with the model directory and you don't need to run it again when you reload the model.
It could look something like this, which would also allow you to pass in a Lookups object to initialize instead if you'd prefer:
class CustomLemmatizer(Lemmatizer):
    """Lemmatizer that loads its lookup tables in initialize().

    Once initialize() has run, the tables are serialized with the pipeline,
    so reloading the saved model does not need the JSON source files.
    """

    def initialize(
        self,
        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
        *,
        nlp: Optional[Language] = None,
        lookups: Optional[Lookups] = None,
    ):
        # Fall back to the bundled JSON tables when the caller supplies none.
        if lookups is None:
            self.lookups = create_lookups()
        else:
            self.lookups = lookups

Related

How to create a dataclass with optional fields that outputs field in json only if the field is not None

I am unclear about how to use a @dataclass to convert a mongo doc into a python dataclass. With my NoSQL documents they may or may not contain some of the fields. I only want to output a field (using asdict) from the dataclass if that field was present in the mongo document.
Is there a way to create a field that will be output with dataclasses.asdict only if it exists in the mongo doc?
I have tried using __post_init__ but have not figured out a solution.
# in this example I want to output the 'author' field ONLY if it is present in the mongo document
import json
from dataclasses import InitVar, asdict, dataclass


@dataclass  # original post showed "#dataclass": a mangled "@dataclass" decorator
class StoryTitle:
    _id: str
    title: str
    # InitVar is accepted by __init__ but is NOT a real field, so asdict()
    # never emits it — hence 'author' is always missing from the JSON output.
    author: InitVar[str] = None
    dateOfPub: int = None

    def __post_init__(self, author):
        print(f'__post_init__ got called....with {author}')
        if author is not None:
            # Stored under a different attribute name; still invisible to asdict().
            self.newauthor = author
            print(f'self.author is now {self.newauthor}')


# foo and bar approximate documents in mongodb
foo = dict(_id='b23435xx3e4qq', title='goldielocks and the big bears', author='mary', dateOfPub=220415)
newFoo = StoryTitle(**foo)
json_foo = json.dumps(asdict(newFoo))
print(json_foo)
bar = dict(_id='b23435xx3e4qq', title='War and Peace', dateOfPub=220415)
newBar = StoryTitle(**bar)
json_bar = json.dumps(asdict(newBar))
print(json_bar)
My output json does not (of course) have the 'author' field. Anyone know how to accomplish this? I suppose I could just create my own asdict method ...
The dataclasses.asdict helper function doesn't offer a way to exclude fields with default or un-initialized values unfortunately -- however, the dataclass-wizard library does.
The dataclass-wizard is a (de)serialization library I've created, which is built on top of dataclasses module. It adds no extra dependencies outside of stdlib, only the typing-extensions module for compatibility reasons with earlier Python versions.
To skip dataclass fields with default or un-initialized values in serialization for ex. with asdict, the dataclass-wizard provides the skip_defaults option. However, there is also a minor issue I noted with your code above. If we set a default for the author field as None, that means that we won't be able to distinguish between null values and also the case when author field is not present when de-serializing the json data.
So in below example, I've created a CustomNull object similar to the None singleton in python. The name and implementation doesn't matter overmuch, however in our case we use it as a sentinel object to determine if a value for author is passed in or not. If it is not present in the input data when from_dict is called, then we simply exclude it when serializing data with to_dict or asdict, as shown below.
from __future__ import annotations # can be removed in Python 3.10+
from dataclasses import dataclass
from dataclass_wizard import JSONWizard
# create our own custom `NoneType` class
class CustomNullType:
# these methods are not really needed, but useful to have.
def __repr__(self):
return '<null>'
def __bool__(self):
return False
# this is analogous to the builtin `None = NoneType()`
CustomNull = CustomNullType()
# in this example I want to output the 'author' field ONLY if it is present in the mongo document
#dataclass
class StoryTitle(JSONWizard):
class _(JSONWizard.Meta):
# skip default values for dataclass fields when `to_dict` is called
skip_defaults = True
_id: str
title: str
# note: we could also define it like
# author: str | None = None
# however, using that approach we won't know if the value is
# populated as a `null` when de-serializing the json data.
author: str | None = CustomNull
# by default, the `dataclass-wizard` library uses regex to case transform
# json fields to snake case, and caches the field name for next time.
# dateOfPub: int = None
date_of_pub: int = None
# foo and bar approximate documents in mongodb
foo = dict(_id='b23435xx3e4qq', title='goldielocks and the big bears', author='mary', dateOfPub=220415)
new_foo = StoryTitle.from_dict(foo)
json_foo = new_foo.to_json()
print(json_foo)
bar = dict(_id='b23435xx3e4qq', title='War and Peace', dateOfPub=220415)
new_bar = StoryTitle.from_dict(bar)
json_bar = new_bar.to_json()
print(json_bar)
# lastly, we try de-serializing with `author=null`. the `author` field should still
# be populated when serializing the instance, as it was present in input data.
bar = dict(_id='b23435xx3e4qq', title='War and Peace', dateOfPub=220415, author=None)
new_bar = StoryTitle.from_dict(bar)
json_bar = new_bar.to_json()
print(json_bar)
Output:
{"_id": "b23435xx3e4qq", "title": "goldielocks and the big bears", "author": "mary", "dateOfPub": 220415}
{"_id": "b23435xx3e4qq", "title": "War and Peace", "dateOfPub": 220415}
{"_id": "b23435xx3e4qq", "title": "War and Peace", "author": null, "dateOfPub": 220415}
Note: the dataclass-wizard can be installed with pip:
$ pip install dataclass-wizard

getattr() without 'class object'

How can I use getattr without "Class" per se ?
So I have this situation: I have 'columns' that are asking mysql for specific data in a specific order. The data is printed via flask/apache so that the user has the ability to manipulate it. Now, from flask's POST method, I'm receiving changed(?) values and I am storing them in python attributes. I need to check if the values within those attributes are the same as in the original data. Sure, I could hardcode it but I would like to keep the possibility of changing columns dynamically.
columns = ["username", "email", "admin"]
data = ("john", "john#snow.com", "True")
username = "john"
email = "different#email.com"
admin = False
Not sure how can I approach it ?
for i in data:
if i == getattr(???, 'username'):
print("it's the same")
or something like this?:
for i in data:
if i == getattr(data, '?????'):
print("it's the same")
Everything is within flask, I cannot embed it into the Class per se. So I don't have 'self' etc.
If I could create class I would probably make something like
class Myclass:
    """Compare each value in `data` against the attribute named by the
    matching entry in `columns`, recording 'same' or 'different' per slot."""

    def __init__(self):
        self.columns = ["username", "email", "admin"]
        self.data = ("john", "john#snow.com", "True")
        self.result = []
        self.username = "john"
        self.email = "different#email.com"
        self.admin = False

    def test(self):
        # data.index(i) maps each value back to its position so the
        # corresponding column name can be read via getattr().
        # NOTE(review): index() finds the *first* occurrence, so duplicate
        # values in `data` would be compared against the wrong column.
        for i in self.data:
            if i == getattr(self, self.columns[self.data.index(i)]):
                self.result.append("same")
            else:
                self.result.append("different")
        return self.result


Myclass().test()
['same', 'different', 'different']
It turned out that I was looking for simple eval(). getattr() is designed for different purposes.
so simple:
# NOTE(review): the original snippet used `cols`, which is never defined; the
# question defines the list as `columns`, so that name is used here.
# WARNING: eval() executes arbitrary code — only tolerable here because
# `columns` holds known hard-coded names. Never eval() user-supplied input.
for i in data:
    if i == eval(columns[data.index(i)]):
        print("it's the same")
Flask is just Python code. You can create a class and use that if that fits your use-case. Or, if you used Flask-SQLAlchemy to manage database-backed data you'd have classes and instances anyway (and get easier data updates to boot).
And classes and instances are not the only objects with attributes; modules and functions have attributes too (although you wouldn't store your data as attributes on either of those), and when you look up methods on anything, you are looking up attributes too.
Pick a storage, then either wrap that storage with an instance of a class, and use getattr(), or pick a different data structure and use the methods for that data structure to get at the different fields. A dictionary, for instance, would make it trivial to get the current value for a given name.
If you do stick to instances, then note that in your loop you'd want to zip your columns and data values together:
# Pair each column name with its value directly instead of re-deriving
# the position with data.index(i).
for name, value in zip(columns, data):
    if getattr(self, name) == value:
        self.result.append("same")
    else:
        self.result.append("different")
Note that you do not have to add "self." in front, the whole point of getattr() is do the same work the . syntax does.
You probably want to put your columns and data lists together as a dictionary:
self.data = {'username': 'john', 'email': 'john#snow.com', 'admin': 'True'}
because that's how you'd process POST data from a form anyway; that way you can iterate over the dict.items() pairs, or use just the columns list to access values:
for name, value in self.data.items():
# ...
or use dict.get() to retrieve values, allowing for missing entries:
for name in self.columns:
if getattr(self, name) == self.data.get(name):
# ...

Python Metaprogramming. Generating massive amount of similar tests where a variable value is changed

We have a massive amount of test cases classes to perform some checks. I need to generate all those same tests but change the value of a variable.
For example:
class DynamicPivotSeriesTestCase(AuthenticatedApiTestCase):
    '''
    Test all series against a fixed category trying every format
    '''
    # binding
    ftn = utils.format_test_name
    # Test Params
    base_url, formats = API_FORMATS['pivot']
    base_test_name = 'test_pivot_series_{series}'
    # Test vars
    series = VARS
    # Loop runs at class-creation time: each iteration injects one
    # dynamically named test method into the class namespace.
    for srs in series:
        params = {'series': srs, 'cats': 'campaign', 'from': '2013-07-31', 'to': '2013-07-31'}
        test_name = ftn(base_test_name, params)
        locals()[test_name] = utils.make_check_code(200, base_url, params, formats)


class DynamicPivotDerivedSeriesTestCase(AuthenticatedApiTestCase):
    # binding
    ftn = utils.format_test_name
    # Test Params
    base_url, formats = API_FORMATS['pivot']
    base_test_name = 'test_pivot_derived_series_{series}'
    # Test vars
    series = DERIVED_VARS
    for srs in series:
        params = {'series': srs, 'cats': 'campaign', 'from': '2013-07-31', 'to': '2013-07-31'}
        test_name = ftn(base_test_name, params)
        locals()[test_name] = utils.make_check_code(200, base_url, params, formats)
There are like 150 tests like that, and I can't copy-paste the code. I need to iterate over globals, access every (class_name, class object) pair, check whether the class is a test class, and if so instantiate a new test class that has the same body as the test class currently being accessed, but with a different value assigned to the base_url variable. This is what I don't understand how to achieve.
simple update of object attribute:
if __name__ == '__main__':
    # NOTE(review): dict.iteritems() is Python 2 only — use items() on Python 3.
    # Also guard with isinstance(thing, type): issubclass() raises TypeError
    # for non-class globals (modules, functions, plain values).
    for name, thing in list(globals().items()):
        if isinstance(thing, type) and issubclass(thing, AuthenticatedApiTestCase):
            obj = thing()
            obj.base_url = something_new
if you can't instantiate then you can do
new_classes = []
if issubclass(thing, AuthenticatedApiTestCase):
    # Fixed typo: the class was declared "NewClasS" but appended as
    # "NewClass", which would raise NameError at runtime.
    class NewClass(thing):
        base_url = something_new
    new_classes.append(NewClass)
ok that's probably not exactly what you want - you'll want to dynamically assign the class name etc... but maybe this solves your initial problem of dynamically generating new classes with modified class vars
there are other ways - class decorators, metaclasses - it really depends on other details about what you are trying to do

Peewee - How to Convert a Dict into a Model

Lets say I have
import peewee
# NOTE(review): Model and CharField are used unqualified below, so they must
# actually be imported from peewee for this snippet to run.
from peewee import CharField, Model


class Foo(Model):
    name = CharField()
I would like to do the following:
f = {id:1, name:"bar"}
foo = Foo.create_from_dict(f)
Is this native in Peewee? I was unable to spot anything in the source code.
I've written this function which works, but would rather use the native function if it exists:
# clazz is a string for the name of the Model, i.e. 'Foo'
def model_from_dict(clazz, dictionary):
    """Instantiate the Model class named by `clazz` and populate its raw
    field data from `dictionary`."""
    # Convert the (possibly dotted) string into the actual model class.
    # NOTE(review): on Python 3, `reduce` must be imported from functools.
    clazz = reduce(getattr, clazz.split("."), sys.modules[__name__])
    model = clazz()
    for key in dictionary.keys():
        # Set the attributes of the model by writing straight into peewee's
        # internal _data dict (peewee 2.x internals — fragile across versions).
        model.__dict__['_data'][key] = dictionary[key]
    return model
I have a web page that displays all the foos and allows the user to edit them. I would like to be able to pass a JSON string to the controller, where I would convert it to a dict and then make Foos out of it, so I can update as necessary.
If you have a dict, you can simply:
class User(Model):
    name = CharField()
    email = CharField()


# peewee's create() accepts field values as keyword arguments, so a dict
# can simply be unpacked into it.
d = {'name': 'Charlie', 'email': 'foo#bar.com'}
User.create(**d)
You could use PickledKeyStore which allows you to save any value as a python dict and it works like Python's pickle library.

How would you inherit from and override the django model classes to create a listOfStringsField?

I want to create a new type of field for django models that is basically a ListOfStrings. So in your model code you would have the following:
models.py:
from django.db import models
class ListOfStringsField(???):
???
class myDjangoModelClass():
myName = models.CharField(max_length=64)
myFriends = ListOfStringsField() #
other.py:
myclass = myDjangoModelClass()
myclass.myName = "bob"
myclass.myFriends = ["me", "myself", "and I"]
myclass.save()
id = myclass.id
loadedmyclass = myDjangoModelClass.objects.filter(id__exact=id)
myFriendsList = loadedclass.myFriends
# myFriendsList is a list and should equal ["me", "myself", "and I"]
How would you go about writing this field type, with the following stipulations?
We don't want to create a field which just crams all the strings together and separates them with a token in one field like this. It is a good solution in some cases, but we want to keep the string data normalized so tools other than django can query the data.
The field should automatically create any secondary tables needed to store the string data.
The secondary table should ideally have only one copy of each unique string. This is optional, but would be nice to have.
Looking in the Django code it looks like I would want to do something similar to what ForeignKey is doing, but the documentation is sparse.
This leads to the following questions:
Can this be done?
Has it been done (and if so where)?
Is there any documentation on Django about how to extend and override their model classes, specifically their relationship classes? I have not seen a lot of documentation on that aspect of their code, but there is this.
This is comes from this question.
There's some very good documentation on creating custom fields here.
However, I think you're overthinking this. It sounds like you actually just want a standard foreign key, but with the additional ability to retrieve all the elements as a single list. So the easiest thing would be to just use a ForeignKey, and define a get_myfield_as_list method on the model:
class Friends(models.Model):  # fixed typo: original said "model.Model"
    name = models.CharField(max_length=100)
    my_items = models.ForeignKey(MyModel)


class MyModel(models.Model):
    ...

    def get_my_friends_as_list(self):
        # NOTE(review): the original joined the names into one string with
        # ', '.join(...), yet the method name (and the answer text) promise
        # a list of strings — return the actual list instead.
        return list(self.friends_set.values_list('name', flat=True))
Now calling get_my_friends_as_list() on an instance of MyModel will return you a list of strings, as required.
What you have described sounds to me really similar to the tags.
So, why not using django tagging?
It works like a charm, you can install it independently from your application and its API is quite easy to use.
I also think you're going about this the wrong way. Trying to make a Django field create an ancillary database table is almost certainly the wrong approach. It would be very difficult to do, and would likely confuse third party developers if you are trying to make your solution generally useful.
If you're trying to store a denormalized blob of data in a single column, I'd take an approach similar to the one you linked to, serializing the Python data structure and storing it in a TextField. If you want tools other than Django to be able to operate on the data then you can serialize to JSON (or some other format that has wide language support):
from django.db import models
from django.utils import simplejson


class JSONDataField(models.TextField):
    # Transparently (de)serialize Python data to/from JSON text in the DB.
    # NOTE(review): this is Django 1.x-era code — __metaclass__/SubfieldBase,
    # django.utils.simplejson and basestring are Python 2 / old-Django
    # constructs that no longer exist in modern releases.
    __metaclass__ = models.SubfieldBase

    def to_python(self, value):
        # value may already be deserialized (SubfieldBase calls this often).
        if value is None:
            return None
        if not isinstance(value, basestring):
            return value
        return simplejson.loads(value)

    def get_db_prep_save(self, value):
        if value is None:
            return None
        return simplejson.dumps(value)
If you just want a django Manager-like descriptor that lets you operate on a list of strings associated with a model then you can manually create a join table and use a descriptor to manage the relationship. It's not exactly what you need, but this code should get you started.
Thanks for all those that answered. Even if I didn't use your answer directly the examples and links got me going in the right direction.
I am not sure if this is production ready, but it appears to be working in all my tests so far.
class ListValueDescriptor(object):

    def __init__(self, lvd_parent, lvd_model_name, lvd_value_type, lvd_unique, **kwargs):
        """
        This descriptor object acts like a django field, but it will accept
        a list of values, instead a single value.

        For example:
            # define our model
            class Person(models.Model):
                name = models.CharField(max_length=120)
                friends = ListValueDescriptor("Person", "Friend", "CharField", True, max_length=120)

            # Later in the code we can do this
            p = Person("John")
            p.save() # we have to have an id
            p.friends = ["Jerry", "Jimmy", "Jamail"]
            ...
            p = Person.objects.get(name="John")
            friends = p.friends
            # and now friends is a list.

        lvd_parent - The name of our parent class
        lvd_model_name - The name of our new model
        lvd_value_type - The value type of the value in our new model
                         This has to be the name of one of the valid django
                         model field types such as 'CharField', 'FloatField',
                         or a valid custom field name.
        lvd_unique - Set this to true if you want the values in the list to
                     be unique in the table they are stored in. For
                     example if you are storing a list of strings and
                     the strings are always "foo", "bar", and "baz", your
                     data table would only have those three strings listed in
                     it in the database.
        kwargs - These are passed to the value field.
        """
        self.related_set_name = lvd_model_name.lower() + "_set"
        self.model_name = lvd_model_name
        self.parent = lvd_parent
        self.unique = lvd_unique

        # only set this to true if they have not already set it.
        # this helps speed up the searchs when unique is true.
        kwargs['db_index'] = kwargs.get('db_index', True)

        filter = ["lvd_parent", "lvd_model_name", "lvd_value_type", "lvd_unique"]

        # NOTE(review): building a model class with exec() on an assembled
        # string is fragile and unsafe if any argument could ever come from
        # untrusted data.
        evalStr = """class %s (models.Model):\n""" % (self.model_name)
        evalStr += """    value = models.%s(""" % (lvd_value_type)
        evalStr += self._params_from_kwargs(filter, **kwargs)
        evalStr += ")\n"
        if self.unique:
            evalStr += """    parent = models.ManyToManyField('%s')\n""" % (self.parent)
        else:
            evalStr += """    parent = models.ForeignKey('%s')\n""" % (self.parent)
        evalStr += "\n"
        evalStr += """self.innerClass = %s\n""" % (self.model_name)
        print(evalStr)  # was the Python 2 statement form "print evalStr"
        exec (evalStr)  # build the inner class

    def __get__(self, instance, owner):
        # Read every related row's `value` into a plain list.
        value_set = instance.__getattribute__(self.related_set_name)
        l = []
        for x in value_set.all():
            l.append(x.value)
        return l

    def __set__(self, instance, values):
        value_set = instance.__getattribute__(self.related_set_name)
        for x in values:
            value_set.add(self._get_or_create_value(x))

    def __delete__(self, instance):
        pass  # I should probably try and do something here.

    def _get_or_create_value(self, x):
        if self.unique:
            # Try and find an existing value
            try:
                return self.innerClass.objects.get(value=x)
            except django.core.exceptions.ObjectDoesNotExist:
                pass
        v = self.innerClass(value=x)
        v.save()  # we have to save to create the id.
        return v

    def _params_from_kwargs(self, filter, **kwargs):
        """Given a dictionary of arguments, build a string which
        represents it as a parameter list, and filter out any
        keywords in filter."""
        params = ""
        for key in kwargs:
            if key not in filter:
                value = kwargs[key]
                params += "%s=%s, " % (key, value.__repr__())
        return params[:-2]  # chop off the last ', '
class Person(models.Model):
    name = models.CharField(max_length=120)
    # Descriptor instance replaces a normal field; it builds the backing
    # "Friend" model itself (see ListValueDescriptor above in the answer).
    friends = ListValueDescriptor("Person", "Friend", "CharField", True, max_length=120)
Ultimately I think this would still be better if it were pushed deeper into the django code and worked more like the ManyToManyField or the ForeignKey.
I think what you want is a custom model field.

Categories

Resources