I've been struggling a bit to build this piece of code, and I was wondering if there is a simpler or more efficient way of doing it:
fsSchema = {'published': {'renders': {'SIM': ('fold1', 'fold2'), 'REN': ('fold1', 'fold2')}}}
def __buildPathFromSchema(self, schema, root=''):
    metaDirs = []
    for dir_ in schema.keys():
        root = os.path.join(root, dir_)
        if isinstance(schema[dir_], dict):
            return self.__buildPathFromSchema(schema[dir_], root)
        if isinstance(schema[dir_], tuple):
            for i in schema[dir_]:
                bottom = os.path.join(root, i)
                metaDirs.append(bottom)
            root = os.sep.join(os.path.split(root)[:-1])
    return metaDirs
Basically, what I want to do is generate paths from a predefined structure like fsSchema. Note that the last level is always a tuple.
The output looks like:
['published\\renders\\REN\\fold1',
 'published\\renders\\REN\\fold2',
 'published\\renders\\SIM\\fold1',
 'published\\renders\\SIM\\fold2']
Thanks!
You can use a recursive function to generate all the paths:
def flatten(data):
    if isinstance(data, tuple):
        for v in data:
            yield v
    else:
        for k in data:
            for v in flatten(data[k]):
                yield k + '\\' + v
This should be able to handle any kind of nested dictionaries:
>>> fsSchema = {'published': {'renders': {'SIM': ('fold1', 'fold2'), 'REN': ('fold1', 'fold2')}}}
>>> list(flatten(fsSchema))
['published\\renders\\REN\\fold1', 'published\\renders\\REN\\fold2', 'published\\renders\\SIM\\fold1', 'published\\renders\\SIM\\fold2']
Note that the paths are generated in arbitrary order, since dictionaries (before Python 3.7) don't preserve any ordering.
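If a deterministic order matters, here's a small variation (my own sketch, not part of the original answer) that walks the keys in sorted order:
def flatten_sorted(data):
    # same idea as flatten(), but visits dictionary keys in sorted
    # order so the generated paths always come out the same way
    if isinstance(data, tuple):
        for v in data:
            yield v
    else:
        for k in sorted(data):
            for v in flatten_sorted(data[k]):
                yield k + '\\' + v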
Instead of:
for dir_ in schema.keys():
    ...
    if isinstance(schema[dir_], dict):
you can do:
for dir_name, dir_content in schema.iteritems():
    ...
    if isinstance(dir_content, tuple):
It's both faster and more readable. (In Python 3, iteritems() is gone; use items() instead.)
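Applied to your method, the loop would read roughly like this (a sketch that keeps your original logic, including the early return):
for dir_name, dir_content in schema.iteritems():
    root = os.path.join(root, dir_name)
    if isinstance(dir_content, dict):
        return self.__buildPathFromSchema(dir_content, root)
    if isinstance(dir_content, tuple):
        # each tuple entry is a leaf folder under the current root
        for leaf in dir_content:
            metaDirs.append(os.path.join(root, leaf))
        root = os.sep.join(os.path.split(root)[:-1])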
I would keep doing it recursively like you already are but split the walker off from the path generator:
def walk(data):
    if hasattr(data, 'items'):
        for outer_piece, subdata in data.items():
            for inner_piece in walk(subdata):
                yield (outer_piece, ) + inner_piece
    else:
        for piece in data:
            yield (piece, )

def paths(data):
    for path in walk(data):
        yield os.sep.join(path)
The reason is that these are really two separate pieces of functionality; implementing them as separate functions makes them easier to debug, maintain, and generally think about.
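For example (output shown assuming Windows, where os.sep is a backslash; I've sorted it since dict order is arbitrary):
>>> fsSchema = {'published': {'renders': {'SIM': ('fold1', 'fold2'), 'REN': ('fold1', 'fold2')}}}
>>> sorted(paths(fsSchema))
['published\\renders\\REN\\fold1', 'published\\renders\\REN\\fold2', 'published\\renders\\SIM\\fold1', 'published\\renders\\SIM\\fold2']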
Related
I've been trying to turn the output of my list comprehension into a variable. Quite silly, but no matter what I try, I seem to end up with an empty list (or NoneType variable).
I'm guessing it has something to do with the generator that it uses, but I'm not sure how to get around it as I need the generator to retrieve the desired results from my JSON file. (And I'm too much of a list comprehension and generator newbie to see how).
This is the working piece of code (originally posted as an answer to these questions: here and here).
I'd like the output of the print() part to be written to a list.
def item_generator(json_Response_GT, identifier):
    if isinstance(json_Response_GT, dict):
        for k, v in json_Response_GT.items():
            if k == identifier:
                yield v
            else:
                yield from item_generator(v, identifier)
    elif isinstance(json_Response_GT, list):
        for item in json_Response_GT:
            yield from item_generator(item, identifier)

res = item_generator(json_Response_GT, "identifier")
print([x for x in res])
Any help would be greatly appreciated!
A generator keeps its state, so after you iterate through it once (in order to print), any further iteration picks up at the end and yields nothing.
print([x for x in res])  # res is used up here
a = [x for x in res]     # nothing left in res
Instead, do this:
a = [x for x in res]  # or a = list(res)
# now res is used up, but a is a fixed list - it can be read and
# accessed as many times as you want without changing its state
print(a)
res = [x for x in item_generator(json_Response_GT, "identifier")] should do the trick.
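Here is a minimal, self-contained illustration of that exhaustion behaviour:
def gen():
    yield 1
    yield 2

g = gen()
print(list(g))  # [1, 2]
print(list(g))  # [] - the generator is already exhausted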
There is a part of this class I did not understand, namely what this code does:
for file in os.listdir(path):
    if os.path.isfile(os.path.join(path, file)) and select in file:
        temp = scipy.io.loadmat(os.path.join(path, file))
        temp = {k: v for k, v in temp.items() if k[0] != '_'}
        for i in range(len(temp[patch_type + "_patches"])):
            self.tensors.append(temp[patch_type + "_patches"][i])
            self.labels.append(temp[patch_type + "_labels"][0][i])
self.tensors = np.array(self.tensors)
self.labels = np.array(self.labels)
especially this line:
temp = {k:v for k, v in temp.items() if k[0] != '_'}
The whole class is as follows:
class Datasets(Dataset):
    def __init__(self, path, train, transform=None):
        if train:
            select = "Training"
            patch_type = "train"
        else:
            select = "Testing"
            patch_type = "testing"
        self.tensors = []
        self.labels = []
        self.transform = transform
        for file in os.listdir(path):
            if os.path.isfile(os.path.join(path, file)) and select in file:
                temp = scipy.io.loadmat(os.path.join(path, file))
                temp = {k: v for k, v in temp.items() if k[0] != '_'}
                for i in range(len(temp[patch_type + "_patches"])):
                    self.tensors.append(temp[patch_type + "_patches"][i])
                    self.labels.append(temp[patch_type + "_labels"][0][i])
        self.tensors = np.array(self.tensors)
        self.labels = np.array(self.labels)

    def __len__(self):
        try:
            if len(self.tensors) != len(self.labels):
                raise Exception("Lengths of the tensor and labels list are not the same")
        except Exception as e:
            print(e.args[0])
        return len(self.tensors)

    def __getitem__(self, idx):
        sample = (self.tensors[idx], self.labels[idx])
        # print(self.labels)
        sample = (torch.from_numpy(self.tensors[idx]),
                  torch.from_numpy(np.array(self.labels[idx])).long())
        return sample  # tuple containing the image patch and its corresponding label
It's a dict comprehension; in this particular case, it creates a new dict from an existing dict temp, but only for items for which the key k does not start with an underscore. That check is performed by the if ... part.
It is equivalent to
new = {}
for k, v in temp.items():
    if k[0] != '_':
        new[k] = v
temp = new
or, slightly different:
new = {}
for key, value in temp.items():
    if not key.startswith('_'):
        new[key] = value
temp = new
You can see that it reads a bit nicer as a single line, since it avoids the named temporary dict (new); under the hood, a nameless new dict is still created, though.
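A quick demonstration with made-up data:
>>> temp = {'__header__': 'metadata', 'train_patches': [1, 2], 'train_labels': [0, 1]}
>>> {k: v for k, v in temp.items() if k[0] != '_'}
{'train_patches': [1, 2], 'train_labels': [0, 1]}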
It is filtering out the underscore-prefixed variables from the loaded MATLAB file. Per the scipy documentation, scipy.io.loadmat returns a dictionary whose keys are the variable names from the loaded file and whose values are the matrices. The line of code you reference is a dictionary comprehension that clones the dictionary minus the variables that fail the conditional check.
Update
What happens here is roughly this:
1. Load a MATLAB file (file in your code) as a hashmap (dictionary) where the keys are the variable names from the file and the values are the matrices, and assign it to temp.
2. Iterate through those key/value pairs, drop the underscore-prefixed ones, and reassign the result to temp.
3. Profit.
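For concreteness, here is a sketch of what that filtering removes (the file name and variable names are hypothetical; loadmat always injects the double-underscore metadata entries):
import scipy.io

temp = scipy.io.loadmat('Training_data.mat')  # hypothetical file
print(sorted(temp.keys()))
# e.g. ['__globals__', '__header__', '__version__',
#       'train_labels', 'train_patches']
temp = {k: v for k, v in temp.items() if k[0] != '_'}
print(sorted(temp.keys()))
# ['train_labels', 'train_patches']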
I have a deeply nested JSON-esque structure that I need to search for a given key at all levels (up to 7), for all occurrences. There is data always present at level 0 that I need to associate with each occurrence of the search_key found at any level. I've tried pushing this data through the recursive calls and appending it on return, but I have run into heap and unhashable-type issues when I move this from standard Python to a PySpark RDD.
My search function is below:
def search(input, search_key, results):
    if input:
        for i in input:
            if isinstance(i, list):
                search(i, search_key, results)
            elif isinstance(i, dict):
                for k, v in i.iteritems():
                    if k == search_key:
                        results.append(i)
                        continue
                    elif isinstance(v, list):
                        search(v, search_key, results)
                    elif isinstance(v, dict):
                        search(v, search_key, results)
    return results
And I have been calling it with:
origin_rdd = sc.parallelize(origin)
concept_lambda = lambda c: search(c, term, [])
results = origin_rdd.flatMap(concept_lambda)
Can anybody suggest a way to capture the top-level data and have it as part of each object in results? Results could be 0 to n, so each output row would combine the 7 keys always present at the top level with one occurrence of the search term. I then want to transform each row in the resulting RDD into a PySpark Row for use with a PySpark DataFrame. I have not found a good way to start with a DataFrame instead of an RDD, or to apply the search function to a DataFrame column, because the structure's schema is highly dynamic, but I'm happy to hear suggestions if somebody thinks that's a better route.
I was able to solve my problem by slicing the row and passing the base fields through, using deepcopy whenever my search got a hit. Somebody else trying to do something similar could tweak the slices below.
origin_rdd = sc.parallelize(origin)
concept_lambda = lambda r: search(r[-1], r[0:9], term, [])
results = origin_rdd.flatMap(concept_lambda)
The search function:
def search(input, row_base, search_key, results):
    if input:
        for i in input:
            if isinstance(i, list):
                search(i, row_base, search_key, results)
            if isinstance(i, dict):
                for k, v in iteritems(i):
                    if k == search_key:
                        row = copy.deepcopy(row_base)
                        row.append(i)
                        results.append(row)
                        continue
                    elif isinstance(v, list):
                        search(v, row_base, search_key, results)
                    elif isinstance(v, dict):
                        search(v, row_base, search_key, results)
    return results
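For illustration, here is how that function behaves on plain Python data, outside Spark (the record layout and values are made up, and I define an iteritems shim since the original presumably imports it from six or future.utils):
import copy

def iteritems(d):  # shim for the helper the original code imports
    return d.items()

record = ['id-1', 'source-a',
          [{'term': 'x', 'other': 1}, {'nested': [{'term': 'y'}]}]]
rows = search(record[-1], record[0:2], 'term', [])
# each hit is a copy of the base fields plus the dict containing the key:
# [['id-1', 'source-a', {'term': 'x', 'other': 1}],
#  ['id-1', 'source-a', {'term': 'y'}]]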
I am parsing an unknown nested JSON object; I do not know the structure nor the depth ahead of time. I am trying to search through it to find a value. This is what I came up with, but I find it fugly. Could anybody let me know how to make this look more Pythonic and cleaner?
def find(d, key):
    if isinstance(d, dict):
        for k, v in d.iteritems():
            try:
                if key in str(v):
                    return 'found'
            except:
                continue
            if isinstance(v, dict):
                for key, value in v.iteritems():
                    try:
                        if key in str(value):
                            return "found"
                    except:
                        continue
                    if isinstance(v, dict):
                        find(v)
                    elif isinstance(v, list):
                        for x in v:
                            find(x)
    if isinstance(d, list):
        for x in d:
            try:
                if key in x:
                    return "found"
            except:
                continue
            if isinstance(v, dict):
                find(v)
            elif isinstance(v, list):
                for x in v:
                    find(x)
    else:
        if key in str(d):
            return "found"
        else:
            return "Not Found"
It is generally more "Pythonic" to use duck typing; i.e., just try to search for your target rather than using isinstance. See What are the differences between type() and isinstance()?
However, your need for recursion makes it necessary to recurse the values of the dictionaries and the elements of the list. (Do you also want to search the keys of the dictionaries?)
The in operator can be used with strings, lists, and dictionaries alike, so there is no need to separate the dictionaries from the lists when testing for membership. Assuming you don't want to test for the target as a substring, do use an isinstance check against basestring per the previous link. To test whether your target is among the values of a dictionary, test for membership in your_dictionary.values(). See Get key by value in dictionary
Because the dictionary values might be lists or dictionaries, I still might test for dictionary and list types the way you did, but I mention that you can cover both list elements and dictionary keys with a single statement because you ask about being Pythonic, and using an overloaded operator like in across two types is typical of Python.
Your idea to use recursion is necessary, but I wouldn't name the function find, because it echoes the familiar str.find method and makes the recursive call less readable; another programmer might mistakenly think you're calling that.
To test for numeric types, use numbers.Number as described at How can I check if my python object is a number?
Also, there is a solution to a variation of your problem at https://gist.github.com/douglasmiranda/5127251 . I found that before posting because ColdSpeed's regex suggestion in the comment made me wonder if I were leading you down the wrong path.
So something like
import numbers

def recursively_search(object_from_json, target):
    if isinstance(object_from_json, (basestring, numbers.Number)):
        return object_from_json == target  # the recursion base cases
    elif isinstance(object_from_json, list):
        for element in object_from_json:  # iterate the list itself, not the list type
            if recursively_search(element, target):
                return True  # quit at first match
    elif isinstance(object_from_json, dict):
        if target in object_from_json:
            return True  # among the keys
        else:
            for value in object_from_json.values():
                if recursively_search(value, target):
                    return True  # quit upon finding first match
    else:
        print("recursively_search() did not anticipate type ", type(object_from_json))
        return False
    return False  # no match found among the list elements, dict keys, nor dict values
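A quick check (Python 2, since the code relies on basestring; the sample document is made up):
>>> doc = {'a': [{'b': 'target'}, 3], 'c': {'target': 1}}
>>> recursively_search(doc, 'target')
True
>>> recursively_search(doc, 'missing')
False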
I want to be able to transform a dictionary into a list of options that can be set (with the full path); for example, this should pass:
def test_dic_to_args(self):
    dic = {"x1": {"x2": "val1"}, "x2": "val3"}
    des = ["x1.x2:val1", "x2:val3"]
    self.assertEqual(conf.dict_to_args(dic), des)
Now, I started to write it thinking it was easy, but it's trickier than I expected, with queues, type checking and so on...
Is there a smart way to solve this problem?
Maybe the best option is still a recursive DFS, what do you think?
If the dictionary is supposed to be arbitrarily nested, a recursive approach is most probably easiest.
def dict_to_args(d, prefix=()):
    for k, v in d.iteritems():
        if isinstance(v, dict):
            for x in dict_to_args(v, prefix + (k,)):
                yield x
        else:
            yield ".".join(prefix + (k,)) + ":" + v
Example:
>>> list(dict_to_args(dic))
['x2:val3', 'x1.x2:val1']
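One caveat (my note, not part of the original answer): because the iteration order is arbitrary here, the unit test from the question should compare order-insensitively, e.g.:
self.assertEqual(sorted(conf.dict_to_args(dic)), sorted(des))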