I receive too many DNA objects in my read DNA function - python

class DnaSeq:
    def __init__(self, accession, seq):
        self.accession = accession
        self.seq = seq

    def __len__(self):
        if self.seq == None:
            raise ValueError
        elif self.seq == '':
            raise ValueError
        else:
            return len(self.seq)

    def __str__(self):
        if self.accession == '':
            raise ValueError
        elif self.accession == None:
            raise ValueError
        else:
            return f"<DnaSeq accession='{self.accession}'>"

def read_dna(filename):
    DnaSeq_objects = []
    new_dna_seq = DnaSeq("s1", "AAA")
    with open(filename, 'r') as seq:
        for line in seq.readlines():
            if line.startswith('>'):
                new_dna_seq.accession = line
            else:
                new_dna_seq.seq = line.strip()
            DnaSeq_objects.append(new_dna_seq)
    return DnaSeq_objects
This is the .fa file I tried to read:
> s0
> ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGT GTTAATCTTACAACCAGAACTCAAT
> s1
> GTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTC
> s2
> ACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTAC
> s3
> TCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCT
> s4
> AGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTT
> s5
> TTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGA
It's supposed to return 6 DNA objects but I received too many.
read_dna('ex1.fa')
[<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>,
<__main__.DnaSeq object at 0x000001C67208F820>
]
How can I fix this so that it returns the right number?

Your code is reading every line beginning with > as an accession, but it's not populating the .seq attribute because it's not finding any sequences. In the FASTA format, only the header/description/accession ID line begins with >. The sequence line(s) don't have any prefix, they're just single-letter bases or amino acids.
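For reference, the first record of your file as well-formed FASTA would look like this (only the header carries the > prefix; a long sequence may wrap over several bare lines):

>s0
ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGT
GTTAATCTTACAACCAGAACTCAAT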
There's actually a lot more you need to do. You need to have a default value for self.seq, you need to parse the sequences for spaces and other irrelevant characters, and you need to be able to concatenate multiple sequence lines. Instead of rolling your own code, I highly recommend checking out Biopython.
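With Biopython, the whole parsing job shrinks to a few lines of Bio.SeqIO (a sketch, assuming a well-formed FASTA file):

from Bio import SeqIO

# SeqIO handles headers, multi-line sequences and whitespace for you;
# each record exposes .id (the accession) and .seq (the sequence)
records = list(SeqIO.parse("ex1.fa", "fasta"))
for record in records:
    print(record.id, len(record.seq))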
I decided to give you some example code that will help you on your way, using a couple of neat Python constructs to condense things down a bit and clean up your original code. Please don't use this exact code as your assignment! It may contain concepts that you haven't learned about yet, or that you don't fully understand, and your professor will quickly be able to see it's not your original work. Play around with the code, make sure you understand what it does, try to think of any edge cases where it might not work as expected (such as having an accession without a sequence, or having a sequence spread over multiple lines). Then, come up with your own algorithm and submit that.
class DnaSeq:
    def __init__(self, accession, seq):
        self.accession = accession
        self.seq = seq

    def __len__(self):
        if self.seq:
            return len(self.seq)
        else:
            raise ValueError("Sequence missing")

    def __repr__(self):
        if self.accession and self.seq:
            return f"<DnaSeq accession='{self.accession}', seq='{self.seq[:15]}...'>"
        else:
            raise ValueError("Accession ID or sequence missing")

def read_dna(filename):
    DnaSeq_objects = []
    with open(filename, 'r') as f:
        # get rid of any whitespace on either end of the line
        contents = [line.strip() for line in f.readlines()]
    while len(contents):  # while there are lines left to process
        if len(contents[0]) == 0:  # there was just whitespace and now it's an empty string
            contents.pop(0)  # pull the first item off the list
            continue  # go to the next line in the list
        # no point in creating dummy values when we can use the real thing
        new_dna_seq = DnaSeq(contents.pop(0).lstrip("> "), contents.pop(0))
        DnaSeq_objects.append(new_dna_seq)
    return DnaSeq_objects
results = [str(seq_obj) for seq_obj in read_dna("ex1.fa")]
print("\n".join(results))
# "<DnaSeq accession='s0', seq='ATGTTTGTTTTTC...'>",
# "<DnaSeq accession='s1', seq='GTTAATCTTACAA...'>",
# "<DnaSeq accession='s2', seq='ACTCAGGACTTGT...'>",
# "<DnaSeq accession='s3', seq='TCTGGGACCAATG...'>",
# "<DnaSeq accession='s4', seq='AGACCCAGTCCCT...'>",
# "<DnaSeq accession='s5', seq='TTTGTAATGATCC...'>"

In your loop, you should change the condition:
for line in seq.readlines():
    if line.startswith('>'):
        new_dna_seq.accession = line
    else:
        new_dna_seq.seq = line.strip()
    DnaSeq_objects.append(new_dna_seq)
To:
for line in seq.readlines():
    if line.startswith('> s'):
        new_dna_seq.accession = line.strip().replace('> ', '')
    else:
        new_dna_seq.seq = line.strip().replace('> ', '')
        DnaSeq_objects.append(new_dna_seq)
The if statement now checks whether the line starts with '> s', and the appending of the object is indented so that it happens inside the else block.
I have also removed the '> ' prefix from your accession and sequence, as it seems unnecessary.

Since DnaSeq_objects is a list, just return DnaSeq_objects[:6]. Even if the list contains fewer than 6 elements, this syntax will not throw an error and will just return all elements.
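For example, slicing past the end of a list is safe:

>>> [1, 2, 3][:6]
[1, 2, 3]
>>> [0, 1, 2, 3, 4, 5, 6, 7][:6]
[0, 1, 2, 3, 4, 5]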

Related

I am trying to import an XML file with some empty attributes into a table, and I'm getting this error: AttributeError: 'NoneType' object has no attribute 'strip'

def XML_get_fields_and_types_and_data_levels_3(xml_file_name):
    data_2d = []
    for child in root:
        grandchildren = child.findall(".//")
        fields = []
        types = []
        data_1d = []
        data_2d.append(data_1d)
        for grandchild in grandchildren:
            data_1d.append(convert_string_to_type(grandchild.text))
            if grandchild.tag not in fields:
                fields.append(grandchild.tag)
                types.append(get_type_of_string(grandchild.text))
    return (fields, types, data_2d)
def get_type_of_string(string):
    clean_string = string.strip()
    try:
        if clean_string is not None:
            clean_string = string.strip()
            return string.strip()
        if "." in clean_string:
            clean_string = string.split()
            if isinstance(clean_string, list):
                point_or_segment = [float(i) for i in clean_string]
                if len(point_or_segment) == 2:
                    return 'POINT'
                else:
                    return 'LSEG'
            else:
                val = float(clean_string)
                return 'REAL'
        else:
            val = int(clean_string)
            return 'INTEGER'
    except ValueError:
        return 'TEXT'
The issue is the line of code right after your method definition:

def get_type_of_string(string):
    clean_string = string.strip()

There, string might be None, so the exception is raised. Instead of re-writing the method for you, which would be easy for me but not very helpful for you, I suggest you re-design this method. Here are my hints:

there is duplicated code
the split() method always returns a list whether or not the separator is found, so the line if isinstance(clean_string, list) has no reason to exist
why is the conversion val = float(clean_string) there if val is never used afterwards? The easiest way to evaluate a variable's type is the isinstance() builtin, as you did a few lines above
try to split this method into simpler and smaller methods, or try to simplify its logic; a rough sketch along these lines is shown below
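For example, a redesign along those lines might look like this (an untested sketch, not a drop-in replacement; the returned type names follow your original code):

def get_type_of_string(string):
    # guard against None before any strip() call
    if string is None:
        return 'TEXT'
    clean_string = string.strip()
    try:
        int(clean_string)
        return 'INTEGER'
    except ValueError:
        pass
    try:
        floats = [float(part) for part in clean_string.split()]
    except ValueError:
        return 'TEXT'
    if len(floats) == 1:
        return 'REAL'
    if len(floats) == 2:
        return 'POINT'
    if len(floats) > 2:
        return 'LSEG'
    return 'TEXT'  # empty string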
Hope these hints help you through. Enjoy coding.

Python equivalent of Fortran list-directed input

I'd like to be able to read data from an input file in Python, similar to the way that Fortran handles a list-directed read (i.e. read (file, *) char_var, float_var, int_var).
The tricky part is that the way Fortran handles a read statement like this is very "forgiving" as far as the input format is concerned. For example, using the previous statement, this:
"some string" 10.0, 5
would be read the same as:
"some string", 10.0
5
and this:
"other string", 15.0 /
is read the same as:
"other string"
15
/
with the value of int_var retaining the same value as before the read statement. And trickier still this:
"nother string", , 7
will assign the values to char_var and int_var but float_var retains the same value as before the read statement.
Is there an elegant way to implement this?
That is indeed tricky - I found it easier to write a pure-Python state-based tokenizer than to think up a regular expression to parse each line (though it is possible).
I've used the link provided by Vladimir as the spec - the tokenizer has some doctests that pass.
def tokenize(line, separator=',', whitespace="\t\n\x20", quote='"'):
    """
    >>> tokenize('"some string" 10.0, 5')
    ['some string', '10.0', '5']
    >>> tokenize(' "other string", 15.0 /')
    ['other string', '15.0', '/']
    >>> tokenize('"nother string", , 7')
    ['nother string', '', '7']
    """
    inside_str = False
    token_started = False
    token = ""
    tokens = []
    separated = False
    just_added = False
    for char in line:
        if char in quote:
            if not inside_str:
                inside_str = True
            else:
                inside_str = False
                tokens.append(token)
                token = ""
                just_added = True
            continue
        if char in (whitespace + separator) and not inside_str:
            if token:
                tokens.append(token)
                token = ""
                just_added = True
            elif char in separator:
                if not just_added:
                    tokens.append("")
                just_added = False
            continue
        token += char
    if token:
        tokens.append(token)
    return tokens
class Character(object):
    def __init__(self, length=None):
        self.length = length

    def __call__(self, text):
        if self.length is None:
            return text
        if len(text) > self.length:
            return text[:self.length]
        return "{{:{}}}".format(self.length).format(text)
def make_types(types, default_value):
    # len(types), not len[types]: len is called, not subscripted
    return types, [default_value] * len(types)
def fortran_reader(file, types, default_char="/", default_value=None, **kw):
    types, results = make_types(types, default_value)
    tokens = []
    while True:
        tokens = []
        while len(tokens) < len(results):
            try:
                line = next(file)
            except StopIteration:
                return  # inside a generator, return ends iteration (PEP 479)
            tokens += tokenize(line, **kw)
        for i, (type_, token) in enumerate(zip(types, tokens)):
            if not token or token in default_char:
                continue
            results[i] = type_(token)
        changed_types = yield(results)
        if changed_types:
            types, results = make_types(changed_types, default_value)
I have not tested this thoroughly - but for the tokenizer -
it is designed to work in a Python for statement if the same fields are repeated over and over again - or it can be used with the generator's send() method to change the values to be read on each iteration.
Please test, and e-mail me (address at my profile) some testing file. If there is indeed nothing similar, maybe this deserves some polishing and publishing on PyPI.
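For what it's worth, driving the generator might look like this (my own untested sketch; the file name and the field spec are made up):

# each record holds a string, a float and an integer
with open("data.txt") as f:
    for char_var, float_var, int_var in fortran_reader(f, [Character(), float, int]):
        print(char_var, float_var, int_var)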
Since I was not able to find a solution to this problem, I decided to write my own solution.
The main drivers are a reader class, and a tokenizer. The reader gets one line at a time from the file, passes it to the tokenizer, and assigns to the variables it is given, getting the next line as necessary.
class FortranAsciiReader(file):

    def read(self, *args):
        """
        Read from file into the given objects
        """
        num_args = len(args)
        num_read = 0
        encountered_slash = False
        # If line contained '/' or read into all variables, we're done
        while num_read < num_args and not encountered_slash:
            line = self.readline()
            if not line:
                raise Exception()
            values = tokenize(line)
            # Assign elements one-by-one into args, skipping empty fields
            # and stopping at a '/'
            for val in values:
                if val == '/':
                    encountered_slash = True
                    break
                elif val == '':
                    num_read += 1
                else:
                    args[num_read].assign(val)
                    num_read += 1
                if num_read == num_args:
                    break
The tokenizer splits the line into tokens in accordance with the way that Fortran performs list directed reads, where ',' and white space are separators, tokens may be "repeated" via 4*token, and a / terminates input.
My implementation of the tokenizer is a bit long to reproduce here, and I also included classes to transparently provide the functionality of the basic Fortran intrinsic types (i.e. Real, Character, Integer, etc.). The whole project can be found on my github account, currently at https://github.com/bprichar/PyLiDiRe. Thanks jsbueno for inspiration for the tokenizer.

TypeError: 'top_list' object does not support indexing

I would like to print out the list sorted by the second element.
TypeError: 'top_list' object does not support indexing
Is there anyone that can help me?
class top_list(object):
    def __init__(self, name, hit_rate):
        self.name = name
        self.hit_rate = float(hit_rate)

    def __str__(self):
        return "{0} {1}".format(self.name, self.hit_rate)

def top_ten():
    """Prints out the list"""
    top10 = []
    file = open("high_score.txt")
    for i in range(0, 1):
        x = file.readlines()
        for line in x:
            line = line.split(",")
            lista = top_list(line[0], float(line[1]))
            top10.append(lista)
    a = sorted(top10, key=lambda line: line[1])
    print(a)
In your code
a = sorted(top10, key=lambda line: line[1])
you are trying to access the top_list element using subscript notation. If that is what you want to do, implement a __getitem__ method. __getitem__ allows you to use the subscript operator - list[1] translates to list.__getitem__(1).
def __getitem__(self, key):
    if key == 0:
        return self.name
    else:
        return self.hit_rate
Or modify the lambda function to access the element you want without using a subscript:
a = sorted(top10, key=lambda line: line.hit_rate)
Also note that using a context manager for the file is safer and more pythonic. You can also read the lines by iterating over the Python file object:

with open('high_score.txt', 'r') as file:
    for line in file:
        ...

but extra caution needs to be taken to handle newlines (possibly by stripping them).
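Putting both suggestions together, top_ten might look like this (a sketch, assuming each line of high_score.txt looks like name,hit_rate):

import operator

def top_ten():
    """Prints the scores sorted by hit rate."""
    top10 = []
    with open("high_score.txt") as f:
        for line in f:
            # top_list's __init__ converts hit_rate to float itself
            name, hit_rate = line.strip().split(",")
            top10.append(top_list(name, hit_rate))
    for entry in sorted(top10, key=operator.attrgetter("hit_rate")):
        print(entry)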

Python recursive setattr()-like function for working with nested dictionaries [duplicate]

This question already has answers here:
Is it possible to index nested lists using tuples in python?
(7 answers)
Closed 7 months ago.
There are a lot of good getattr()-like functions for parsing nested dictionary structures, such as:
Finding a key recursively in a dictionary
Suppose I have a python dictionary, many nests
https://gist.github.com/mittenchops/5664038
I would like to make a parallel setattr(). Essentially, given:
cmd = 'f[0].a'
val = 'whatever'
x = {"a":"stuff"}
I'd like to produce a function such that I can assign:
x['f'][0]['a'] = val
More or less, this would work the same way as:
setattr(x,'f[0].a',val)
to yield:
>>> x
{"a":"stuff","f":[{"a":"whatever"}]}
I'm currently calling it setByDot():
setByDot(x,'f[0].a',val)
One problem with this is that if a key in the middle doesn't exist, you need to check for it and create an intermediate key, i.e., for the above:
>>> x = {"a":"stuff"}
>>> x['f'][0]['a'] = val
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
KeyError: 'f'
So, you first have to make:
>>> x['f']=[{}]
>>> x
{'a': 'stuff', 'f': [{}]}
>>> x['f'][0]['a']=val
>>> x
{'a': 'stuff', 'f': [{'a': 'whatever'}]}
Another is that keying for when the next item is a list will be different from the keying when the next item is a string, i.e.:
>>> x = {"a":"stuff"}
>>> x['f']=['']
>>> x['f'][0]['a']=val
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'str' object does not support item assignment
...fails because the assignment was for a null string instead of a null dict. The null dict will be the right assignment for every non-list in dict until the very last one---which may be a list, or a value.
A second problem, pointed out in the comments below by @TokenMacGuy, is that when you have to create a list that does not exist, you may have to create an awful lot of blank values. So,
setattr(x,'f[10].a',val)
---may mean the algorithm will have to make an intermediate like:
>>> x['f']=[{},{},{},{},{},{},{},{},{},{},{}]
>>> x['f'][10]['a']=val
to yield
>>> x
{"a":"stuff","f":[{},{},{},{},{},{},{},{},{},{},{"a":"whatever"}]}
such that this is the setter associated with the getter...
>>> getByDot(x,"f[10].a")
"whatever"
More importantly, the intermediates should /not/ overwrite values that already exist.
Below is the junky idea I have so far---I can identify the lists versus dicts and other data types, and create them where they do not exist. However, I don't see (a) where to put the recursive call, or (b) how to 'build' the deep object as I iterate through the list, and (c) how to distinguish the /probing/ I'm doing as I construct the deep object from the /setting/ I have to do when I reach the end of the stack.
def setByDot(obj, ref, newval):
    ref = ref.replace("[", ".[")
    cmd = ref.split('.')
    numkeys = len(cmd)
    count = 0
    for c in cmd:
        count = count + 1
        while count < numkeys:
            if c.find("["):
                idstart = c.find("[")
                numend = c.find("]")
                try:
                    deep = obj[int(idstart+1:numend-1)]
                except:
                    obj[int(idstart+1:numend-1)] = []
                    deep = obj[int(idstart+1:numend-1)]
            else:
                try:
                    deep = obj[c]
                except:
                    if obj[c] isinstance(dict):
                        obj[c] = {}
                    else:
                        obj[c] = ''
                    deep = obj[c]
            setByDot(deep, c, newval)
This seems very tricky because you kind of have to look-ahead to check the type of the /next/ object if you're making place-holders, and you have to look-behind to build a path up as you go.
UPDATE
I recently had this question answered, too, which might be relevant or helpful.
I have separated this out into two steps. In the first step, the query string is broken down into a series of instructions. This way the problem is decoupled, we can view the instructions before running them, and there is no need for recursive calls.
def build_instructions(obj, q):
    """
    Breaks down a query string into a series of actionable instructions.

    Each instruction is a (_type, arg) tuple.

    arg -- The key used for the __getitem__ or __setitem__ call on
           the current object.
    _type -- Used to determine the data type for the value of
             obj.__getitem__(arg)

    If a key/index is missing, _type is used to initialize an empty value.
    In this way _type provides the ability to fill in missing structures.
    """
    arg = []
    _type = None
    instructions = []
    for i, ch in enumerate(q):
        if ch == "[":
            # Begin list query
            if _type is not None:
                arg = "".join(arg)
                if _type == list and arg.isalpha():
                    _type = dict
                instructions.append((_type, arg))
                _type, arg = None, []
            _type = list
        elif ch == ".":
            # Begin dict query
            if _type is not None:
                arg = "".join(arg)
                if _type == list and arg.isalpha():
                    _type = dict
                instructions.append((_type, arg))
                _type, arg = None, []
            _type = dict
        elif ch == "]":
            # Closing bracket just ends the list index; nothing to do
            pass
        elif ch.isalnum():
            if i == 0:
                # Query begins with alphanum, assume dict access
                _type = type(obj)
            # Fill out args
            arg.append(ch)
        else:
            raise TypeError("Unrecognized character: {}".format(ch))
    if _type is not None:
        # Finish up last query
        instructions.append((_type, "".join(arg)))
    return instructions
For your example
>>> x = {"a": "stuff"}
>>> print(build_instructions(x, "f[0].a"))
[(<type 'dict'>, 'f'), (<type 'list'>, '0'), (<type 'dict'>, 'a')]
The expected return value is simply the _type (first item) of the next tuple in the instructions. This is very important because it allows us to correctly initialize/reconstruct missing keys.
This means that our first instruction operates on a dict, either sets or gets the key 'f', and is expected to return a list. Similarly, our second instruction operates on a list, either sets or gets the index 0 and is expected to return a dict.
Now let's create our _setattr function. This gets the proper instructions and goes through them, creating key-value pairs as necessary. Finally, it also sets the val we give it.
def _setattr(obj, query, val):
    """
    This is a special setattr function that will take in a string query,
    interpret it, add the appropriate data structure to obj, and set val.

    We only define two actions that are available in our query string:
    .x -- dict.__setitem__(x, ...)
    [x] -- list.__setitem__(x, ...) OR dict.__setitem__(x, ...)
           the calling context determines how this is interpreted.
    """
    instructions = build_instructions(obj, query)
    for i, (_, arg) in enumerate(instructions[:-1]):
        _type = instructions[i + 1][0]
        obj = _set(obj, _type, arg)
    _type, arg = instructions[-1]
    _set(obj, _type, arg, val)
def _set(obj, _type, arg, val=None):
    """
    Helper function for calling obj.__setitem__(arg, val or _type()).
    """
    if isinstance(obj, dict):
        if val is not None:
            # Time to set our value
            obj[arg] = val
        elif arg not in obj:
            # If key isn't in obj, initialize it with _type()
            obj[arg] = _type()
        obj = obj[arg]
    elif isinstance(obj, list):
        n = len(obj)
        arg = int(arg)
        if n <= arg:
            # Need to amplify our list, initialize empty values with _type()
            obj.extend([_type() for x in range(arg - n + 1)])
        if val is not None:
            # Time to set our value
            obj[arg] = val
        # when only traversing (val is None), existing items are left intact
        obj = obj[arg]
    return obj
And just because we can, here's a _getattr function.

def _getattr(obj, query):
    """
    Very similar to _setattr. Instead of setting attributes they will be
    returned. As expected, an error will be raised if a __getitem__ call
    fails.
    """
    instructions = build_instructions(obj, query)
    for i, (_, arg) in enumerate(instructions[:-1]):
        _type = instructions[i + 1][0]
        obj = _get(obj, _type, arg)
    _type, arg = instructions[-1]
    return _get(obj, _type, arg)

def _get(obj, _type, arg):
    """
    Helper function for calling obj.__getitem__(arg).
    """
    if isinstance(obj, dict):
        obj = obj[arg]
    elif isinstance(obj, list):
        arg = int(arg)
        obj = obj[arg]
    return obj
In action:
>>> x = {"a": "stuff"}
>>> _setattr(x, "f[0].a", "test")
>>> print x
{'a': 'stuff', 'f': [{'a': 'test'}]}
>>> print _getattr(x, "f[0].a")
"test"
>>> x = ["one", "two"]
>>> _setattr(x, "3[0].a", "test")
>>> print x
['one', 'two', [], [{'a': 'test'}]]
>>> print _getattr(x, "3[0].a")
"test"
Now for some cool stuff. Unlike python, our _setattr function can set unhashable dict keys.
>>> x = []
>>> _setattr(x, "1.4", "asdf")
>>> print x
[{}, {'4': 'asdf'}] # A list, which isn't hashable
>>> y = {"a": "stuff"}
>>> _setattr(y, "f[1.4]", "test") # We're indexing f with 1.4, which is a list!
>>> print y
{'a': 'stuff', 'f': [{}, {'4': 'test'}]}
>>> print _getattr(y, "f[1.4]") # Works for _getattr too
"test"
We aren't really using unhashable dict keys, but it looks like we are in our query language so who cares, right!
Finally, you can run multiple _setattr calls on the same object, just give it a try yourself.
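For instance (a quick check of repeated calls on one object; the keys here are my own):

>>> x = {"a": "stuff"}
>>> _setattr(x, "f[0].a", "one")
>>> _setattr(x, "f[0].b", "two")   # reuses the existing f[0] dict
>>> print _getattr(x, "f[0].a")
one
>>> print _getattr(x, "f[0].b")
two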
>>> class D(dict):
...     def __missing__(self, k):
...         ret = self[k] = D()
...         return ret
...
>>> x = D()
>>> x['f'][0]['a'] = 'whatever'
>>> x
{'f': {0: {'a': 'whatever'}}}
You can hack something together by fixing two problems:
List that automatically grows when accessed out of bounds (PaddedList)
A way to delay the decision of what to create (list or dict) until you access it for the first time (DictOrList)
So the code will look like this:
import collections

class PaddedList(list):
    """ List that grows automatically up to the max index ever passed"""
    def __init__(self, padding):
        self.padding = padding

    def __getitem__(self, key):
        if isinstance(key, int) and len(self) <= key:
            self.extend(self.padding() for i in xrange(key + 1 - len(self)))
        return super(PaddedList, self).__getitem__(key)

class DictOrList(object):
    """ Object proxy that delays the decision of being a List or Dict """
    def __init__(self, parent):
        self.parent = parent

    def __getitem__(self, key):
        # Type of the structure depends on the type of the key
        if isinstance(key, int):
            obj = PaddedList(MyDict)
        else:
            obj = MyDict()

        # Update parent references with the selected object
        parent_seq = (self.parent if isinstance(self.parent, dict)
                      else xrange(len(self.parent)))
        for i in parent_seq:
            if self == parent_seq[i]:
                parent_seq[i] = obj
                break

        return obj[key]

class MyDict(collections.defaultdict):
    def __missing__(self, key):
        ret = self[key] = DictOrList(self)
        return ret

def pprint_mydict(d):
    """ Helper to print MyDict as dicts """
    print d.__str__().replace('defaultdict(None, {', '{').replace('})', '}')

x = MyDict()
x['f'][0]['a'] = 'whatever'

y = MyDict()
y['f'][10]['a'] = 'whatever'

pprint_mydict(x)
pprint_mydict(y)
And the output of x and y will be:
{'f': [{'a': 'whatever'}]}
{'f': [{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {'a': 'whatever'}]}
The trick consists of creating a defaultdict of objects that can be either a dict or a list depending on how you access them.
So when you have the assignment x['f'][10]['a'] = 'whatever' it will work the following way:
Get x['f']. It won't exist, so it will return a DictOrList object for the index 'f'
Get x['f'][10]. DictOrList.__getitem__ will be called with an integer index. The DictOrList object will replace itself in the parent collection with a PaddedList
Accessing the 11th element in the PaddedList will grow it by 11 elements and will return the MyDict element in that position
Assign "whatever" to x['f'][10]['a']
Both PaddedList and DictOrList are a bit hacky, but after all the assignments there is no more magic; you have a structure of dicts and lists.
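You can convince yourself of that last point interactively (a quick check; PaddedList subclasses list and MyDict subclasses dict):

>>> x = MyDict()
>>> x['f'][0]['a'] = 'whatever'
>>> isinstance(x['f'], list)   # the DictOrList has been replaced by a PaddedList
True
>>> x['f'][0]['a']
'whatever'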
It is possible to synthesize recursively setting items/attributes by overriding __getitem__ to return a proxy that can set a value in the original object.
I happen to be working on a library that does a few things similar to this, so I was working on a class that can dynamically assign its own subclasses at instantiation. It makes working with this sort of thing easier, but if that kind of hacking makes you squeamish, you can get similar behavior by creating a ProxyObject similar to the one I create, and by creating the individual classes used by the ProxyObject dynamically in a function. Something like
class ProxyObject(object):
    ...  # see below

def instantiateProxyObject(val):
    class ProxyClassForVal(ProxyObject, val.__class__):
        pass
    return ProxyClassForVal(val)
Using a dictionary like I've used in FlexibleObject below would make that implementation significantly more efficient, if this is the way you implement it. The code I will be providing uses the FlexibleObject, though. Right now it only supports classes that, like almost all of Python's builtin classes, are capable of being generated by taking an instance of themselves as the sole argument to their __init__/__new__. In the next week or two, I'll add support for anything pickleable, and link to a github repository that contains it. Here's the code:
class FlexibleObject(object):
    """ A FlexibleObject is a baseclass for allowing type to be declared
        at instantiation rather than in the declaration of the class.

        Usage:
        class DoubleAppender(FlexibleObject):
            def append(self, x):
                super(self.__class__, self).append(x)
                super(self.__class__, self).append(x)

        instance1 = DoubleAppender(list)
        instance2 = DoubleAppender(bytearray)
    """
    classes = {}

    def __new__(cls, supercls, *args, **kws):
        if isinstance(supercls, type):
            supercls = (supercls,)
        else:
            supercls = tuple(supercls)
        if (cls, supercls) in FlexibleObject.classes:
            return FlexibleObject.classes[(cls, supercls)](*args, **kws)
        superclsnames = tuple([c.__name__ for c in supercls])
        name = '%s%s' % (cls.__name__, superclsnames)
        d = dict(cls.__dict__)
        d['__class__'] = cls
        if cls == FlexibleObject:
            d.pop('__new__')
        try:
            d.pop('__weakref__')
        except:
            pass
        d['__dict__'] = {}
        newcls = type(name, supercls, d)
        FlexibleObject.classes[(cls, supercls)] = newcls
        return newcls(*args, **kws)
Then to use this to synthesize looking up attributes and items of a dictionary-like object you can do something like this:
class ProxyObject(FlexibleObject):
    @classmethod
    def new(cls, obj, quickrecdict, path, attribute_marker):
        self = ProxyObject(obj.__class__, obj)
        self.__dict__['reference'] = quickrecdict
        self.__dict__['path'] = path
        self.__dict__['attr_mark'] = attribute_marker
        return self

    def __getitem__(self, item):
        path = self.__dict__['path'] + [item]
        ref = self.__dict__['reference']
        return ref[tuple(path)]

    def __setitem__(self, item, val):
        path = self.__dict__['path'] + [item]
        ref = self.__dict__['reference']
        ref.dict[tuple(path)] = ProxyObject.new(val, ref,
                                                path, self.__dict__['attr_mark'])

    def __getattribute__(self, attr):
        if attr == '__dict__':
            return object.__getattribute__(self, '__dict__')
        path = self.__dict__['path'] + [self.__dict__['attr_mark'], attr]
        ref = self.__dict__['reference']
        return ref[tuple(path)]

    def __setattr__(self, attr, val):
        path = self.__dict__['path'] + [self.__dict__['attr_mark'], attr]
        ref = self.__dict__['reference']
        ref.dict[tuple(path)] = ProxyObject.new(val, ref,
                                                path, self.__dict__['attr_mark'])
class UniqueValue(object):
    pass

class QuickRecursiveDict(object):
    def __init__(self, dictionary={}):
        self.dict = dictionary
        self.internal_id = UniqueValue()
        self.attr_marker = UniqueValue()

    def __getitem__(self, item):
        if item in self.dict:
            val = self.dict[item]
            try:
                if val.__dict__['path'][0] == self.internal_id:
                    return val
                else:
                    raise TypeError
            except:
                return ProxyObject.new(val, self, [self.internal_id, item],
                                       self.attr_marker)
        try:
            if item[0] == self.internal_id:
                return ProxyObject.new(KeyError(), self, list(item),
                                       self.attr_marker)
        except TypeError:
            pass  # Item isn't iterable
        return ProxyObject.new(KeyError(), self, [self.internal_id, item],
                               self.attr_marker)

    def __setitem__(self, item, val):
        self.dict[item] = val
The particulars of the implementation will vary depending on what you want. It's obviously significantly easier to just override __getitem__ in the proxy than it is to override both __getitem__ and __getattribute__ or __getattr__. The syntax you are using in setByDot makes it look like you would be happiest with some solution that overrides a mixture of the two.
If you are just using the dictionary to compare values with =, <=, >=, etc., overriding __getattribute__ works really nicely. If you want to do something more sophisticated, you will probably be better off overriding __getattr__ and doing some checks in __setattr__ to determine whether you want to synthesize setting the attribute by setting a value in the dictionary, or whether you want to actually set the attribute on the item you've obtained. Or you might want to handle it so that if your object has an attribute, __getattribute__ returns a proxy to that attribute and __setattr__ always just sets the attribute in the object (in which case, you can completely omit it). All of these things depend on exactly what you are trying to use the dictionary for.
You also may want to create __iter__ and the like. It takes a little bit of effort to make them, but the details should follow from the implementation of __getitem__ and __setitem__.
Finally, I'm going to briefly summarize the behavior of the QuickRecursiveDict in case it's not immediately clear from inspection. The try/excepts are just shorthand for checking to see whether the ifs can be performed. The one major defect of synthesizing the recursive setting, rather than finding a way to do it directly, is that you can no longer raise KeyErrors when you try to access a key that hasn't been set. However, you can come pretty close by returning a subclass of KeyError, which is what I do in the example. I haven't tested it so I won't add it to the code, but you may want to pass in some human-readable representation of the key to KeyError.
But aside from all that it works rather nicely.
>>> qrd = QuickRecursiveDict()
>>> qrd[0][13] # returns an instance of a subclass of KeyError
>>> qrd[0][13] = 9
>>> qrd[0][13] # 9
>>> qrd[0][13]['forever'] = 'young'
>>> qrd[0][13] # 9
>>> qrd[0][13]['forever'] # 'young'
>>> qrd[0] # returns an instance of a subclass of KeyError
>>> qrd[0] = 0
>>> qrd[0] # 0
>>> qrd[0][13]['forever'] # 'young'
One more caveat: the things being returned are not quite what they look like. They're proxies to what they look like. If you want the int 9, you need int(qrd[0][13]), not qrd[0][13]. For ints this doesn't matter much, since +, -, = and all that bypass __getattribute__, but for lists you would lose attributes like append if you didn't recast them. (You'd keep len and other builtin methods, just not attributes of list. You lose __len__.)
So that's it. The code's a little bit convoluted, so let me know if you have any questions. I probably can't answer them until tonight unless the answer's really brief. I wish I'd seen this question sooner - it's a really cool question, and I'll try to update a cleaner solution soon. I had fun trying to code a solution into the wee hours of last night. :)

'MarkovGenerator' object has no attribute 'together'

I've run into a problem with my class and I don't know the reason; can anyone help me out?
The problem is in def together(); here is my code.
class MarkovGenerator(object):

    def __init__(self, n, max):
        self.n = n # order (length) of ngrams
        self.max = max # maximum number of elements to generate
        self.ngrams = dict() # ngrams as keys; next elements as values
        beginning = tuple(["That", "is"]) # beginning ngram of every line
        beginning2 = tuple(["on", "the"])
        self.beginnings = list()
        self.beginnings.append(beginning)
        self.beginnings.append(beginning2)
        self.sentences = list()

    def tokenize(self, text):
        return text.split(" ")

    def feed(self, text):
        tokens = self.tokenize(text)
        # discard this line if it's too short
        if len(tokens) < self.n:
            return
        # store the first ngram of this line
        #beginning = tuple(tokens[:self.n])
        #self.beginnings.append(beginning)
        for i in range(len(tokens) - self.n):
            gram = tuple(tokens[i:i+self.n])
            next = tokens[i+self.n] # get the element after the gram
            # if we've already seen this ngram, append; otherwise, set the
            # value for this key as a new list
            if gram in self.ngrams:
                self.ngrams[gram].append(next)
            else:
                self.ngrams[gram] = [next]

    # called from generate() to join together generated elements
    def concatenate(self, source):
        return " ".join(source)

    # generate a text from the information in self.ngrams
    def generate(self, i):
        from random import choice
        # get a random line beginning; convert to a list.
        #current = choice(self.beginnings)
        current = self.beginnings[i]
        output = list(current)
        for i in range(self.max):
            if current in self.ngrams:
                possible_next = self.ngrams[current]
                next = choice(possible_next)
                output.append(next)
                # get the last N entries of the output; we'll use this to look up
                # an ngram in the next iteration of the loop
                current = tuple(output[-self.n:])
            else:
                break
        output_str = self.concatenate(output)
        return output_str

        def together(self):
            return "lalala"

if __name__ == '__main__':
    import sys
    import random

    generator = MarkovGenerator(n=2, max=16)
    for line in open("us"):
        line = line.strip()
        generator.feed(line)
    for i in range(2):
        print generator.generate(i)
    print generator.together()
But I got the error saying:
Traceback (most recent call last):
File "markovoo2.py", line 112, in <module>
print generator.together()
AttributeError: 'MarkovGenerator' object has no attribute 'together'
Does anyone know the reason?
You have indented the def together() function definition too far; it is part of the def generate() function body.
Un-indent it to match the other functions in the class body.
It looks like your def together is indented too deeply. It is inside the generate method. Move it out one indentation level.
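In outline, the fix looks like this (method bodies elided):

class MarkovGenerator(object):
    ...
    def generate(self, i):
        ...
        return output_str

    def together(self):   # now at class level, aligned with generate()
        return "lalala"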
