I couldn't understand what these lines of code do - python

I did not understand what this part of the class does:
for file in os.listdir(path):
if(os.path.isfile(os.path.join(path,file)) and select in file):
temp = scipy.io.loadmat(os.path.join(path,file))
temp = {k:v for k, v in temp.items() if k[0] != '_'}
for i in range(len(temp[patch_type+"_patches"])):
self.tensors.append(temp[patch_type+"_patches"][i])
self.labels.append(temp[patch_type+"_labels"][0][i])
self.tensors = np.array(self.tensors)
self.labels = np.array(self.labels)
especially this line :
temp = {k:v for k, v in temp.items() if k[0] != '_'}
the whole class is as follow :
class Datasets(Dataset):
    """Dataset of patches and labels read from MATLAB ``.mat`` files.

    Every regular file in ``path`` whose name contains ``"Training"`` (when
    ``train`` is true) or ``"Testing"`` (otherwise) is loaded with
    ``scipy.io.loadmat``.  From each file the variables
    ``train_patches`` / ``train_labels`` (or ``testing_patches`` /
    ``testing_labels``) are read; the label variable is indexed as
    ``labels[0][i]``, i.e. it is expected to be a single row.

    Args:
        path: Directory containing the ``.mat`` files.
        train: Selects the training files when true, the testing files
            otherwise.
        transform: Stored on the instance as ``self.transform``; it is not
            applied anywhere in this class.
    """

    def __init__(self,path,train,transform=None):
        if train:
            select = "Training"
            patch_type = "train"
        else:
            select = "Testing"
            patch_type = "testing"
        self.tensors = []
        self.labels = []
        self.transform = transform
        # Sorted so the sample order does not depend on the arbitrary order
        # in which os.listdir returns directory entries.
        for file in sorted(os.listdir(path)):
            full_path = os.path.join(path,file)
            if not (os.path.isfile(full_path) and select in file):
                continue
            temp = scipy.io.loadmat(full_path)
            # Keep only real MATLAB variables: drop the entries whose key
            # starts with an underscore (loadmat's metadata entries).
            temp = {k:v for k, v in temp.items() if k[0] != '_'}
            patches = temp[patch_type+"_patches"]
            labels = temp[patch_type+"_labels"]
            for i, patch in enumerate(patches):
                self.tensors.append(patch)
                self.labels.append(labels[0][i])
        # Convert once, after every file has been read, so that the lists
        # can still be appended to while looping over the files.
        self.tensors = np.array(self.tensors)
        self.labels = np.array(self.labels)

    def __len__(self):
        """Return the number of patches; print a warning on a length mismatch."""
        if len(self.tensors) != len(self.labels):
            print("Lengths of the tensor and labels list are not the same")
        return len(self.tensors)

    def __getitem__(self,idx):
        """Return a tuple containing the image patch and its corresponding label.

        The patch is converted with ``torch.from_numpy``; the label is
        converted the same way and cast to a 64-bit integer tensor.
        """
        patch = torch.from_numpy(self.tensors[idx])
        label = torch.from_numpy(np.array(self.labels[idx])).long()
        return (patch, label)

It's a dict comprehension; in this particular case, it creates a new dict from an existing dict temp, but only for items for which the key k does not start with an underscore. That check is performed by the if ... part.
It is equivalent to
# Explicit-loop equivalent of the dict comprehension: copy every item whose
# key does not start with an underscore into a fresh dict, then rebind temp.
new = {}
for k, v in temp.items():
    if k[0] != '_':
        new[k] = v
temp = new
or, slightly different:
new = {}
for key, value in temp.items():
if not key.startswith('_'):
new[key] = value
temp = new
You can see that it looks a bit nicer as a single line, since it avoids a temporary dict (new; under the hood, it still creates a nameless temporary dict though).

It is filtering out the underscore-prefixed variables from the loaded MATLAB file. According to the scipy documentation, the function scipy.io.loadmat returns a dictionary containing the variable names from the loaded file as keys and the matrices as values. The line of code you reference is a dictionary comprehension that clones the dictionary minus the variables that fail the conditional check.
Update
What happens here is roughly this:
Load a MATLAB file (file in your code) as a hashmap (dictionary) where the keys are the variable names from the file and the values are the matrices, and assign it to temp.
Iterate through those key/value pairs and drop the underscore-prefixed ones and reassign the results of that iteration to temp.
Profit

Related

Creating dataframe from xml

I have an xml that I want to parse out and create a dataframe. What I have been trying so far is something like this:
all_dicts = []
fields = ['f1','f2','f3','f4','f5','f6','f7']
for i in root.findall('.//item'):
d = {}
for j in product.findall('.//subitems'):
for k in j.findall('.//subitem'):
if k.attrib['name'] in fields:
d[k.attrib['name']] = k.text
all_dicts.append(d)
This gives me a list of dictionaries that I can easily pass to pd.DataFrame(all_dicts) to get what I want. However, the subitems tend to have multiple sub-elements with the same name. For example, a subitem could contain several elements where k.attrib['name'] == 'f1', so each one is written to the dictionary under the same key and overwrites the previous value, when I need all of them. Is there a way to create such a data frame easily?
Use dict.get to check if the key exists
If the key does not exist, add it as a list
If the key does exist, append to the list
Without a comprehensive example of the xml, I can't offer a more detailed example.
# Build one dict per <item>; each wanted field maps to the list of all texts
# found under that name, so repeated names no longer overwrite each other.
all_dicts = []
fields = ['f1','f2','f3','f4','f5','f6','f7']
for i in root.findall('.//item'):
    d = dict()
    # NOTE(review): `product` is not defined in this snippet (the loop
    # variable is `i`); presumably the current item was intended - confirm.
    for j in product.findall('.//subitems'):
        for k in j.findall('.//subitem'):
            n = k.attrib['name']
            if n in fields:
                if d.get(n) is None:  # check if key exist
                    d[n] = [k.text]  # add key as a list
                else:
                    d[n].append(k.text)  # append to list
    all_dicts.append(d)
Alternatively, only add the dict value as a list, if the field is 'f1'.
# Variant: only 'f1' is collected into a list; every other wanted field keeps
# just the text of its last occurrence.
all_dicts = []
fields = ['f1','f2','f3','f4','f5','f6','f7']
for i in root.findall('.//item'):
    d = dict()
    # NOTE(review): `product` is not defined in this snippet (the loop
    # variable is `i`); presumably the current item was intended - confirm.
    for j in product.findall('.//subitems'):
        for k in j.findall('.//subitem'):
            n = k.attrib['name']
            if n not in fields:
                continue
            if n == 'f1':  # if field is 'f1' add list
                if d.get(n) is None:  # check if key exist
                    d[n] = [k.text]  # add key as a list
                else:
                    d[n].append(k.text)  # append to list
            else:  # if field isn't 'f1' just add the text
                d[n] = k.text
    all_dicts.append(d)

how can I create nested dictionary keys and assign them values from a list of namespaced key value pairs?

I have env vars that looks like this:
CONFIG-SOMEKEY-SOMEOTHERKEY = val345
CONFIG-SOMEKEY-SOMEOTHEROTHERKEY = val678
CONFIG-ANOTHERKEY = val222
I want to create a dictionary out of them that would look like:
{
'SOMEKEY': {
'SOMEOTHERKEY': 'val345',
'SOMEOTHEROTHERKEY': 'val678'
},
'ANOTHERKEY': 'val222'
}
"CONFIG-" is a prefix to denote which vars this should be done with- so I can filter them easily like this:
config_fields = [i for i in os.environ if i.startswith("CONFIG-")]
But I'm unsure of how to loop over the string, split on "-" and build a dict.
While looping I was thinking I could check if its the last item and assign the value but how would it know the full path of keys it's on?
I suspect this is a job for recursion; I'm just not sure exactly how to implement it.
You could do:
data = ['CONFIG-SOMEKEY-SOMEOTHERKEY = val345',
        'CONFIG-SOMEKEY-SOMEOTHEROTHERKEY = val678',
        'CONFIG-ANOTHERKEY = val222']
result = {}
for e in data:
    # split into key and value; maxsplit=1 keeps values that themselves
    # contain " = " intact
    key, value = e.split(" = ", 1)
    path = key.split("-")  # split the key into parts
    ref = result
    # path[0] is the "CONFIG" prefix and path[-1] the leaf key, so only the
    # parts in between become nested dictionaries.
    for part in path[1:-1]:
        ref[part] = part in ref and ref[part] or {}
        ref = ref[part]
    ref[path[-1]] = value  # take the last part of key and set the value
print(result)
Output
{'SOMEKEY': {'SOMEOTHERKEY': 'val345', 'SOMEOTHEROTHERKEY': 'val678'}, 'ANOTHERKEY': 'val222'}
This part:
ref = result
for part in path[1:-1]:
ref[part] = part in ref and ref[part] or {}
ref = ref[part]
ref[path[-1]] = value
will create the nested dictionaries, is equivalent to:
for part in path[1:-1]:
if part not in ref:
ref[part] = {}
ref = ref[part]
So if the part is in the dictionary you set ref as the value corresponding to part otherwise you create a new dictionary.
You can use the assoc_in function from toolz. Split the name on - and slice off the prefix.
import os
from toolz.dicttoolz import assoc_in

# Collect every CONFIG-* environment variable into a nested dict keyed by the
# dash-separated parts of its name (without the CONFIG prefix).
CONFIG = {}
for k, v in os.environ.items():
    if k.startswith("CONFIG-"):
        # toolz's assoc_in returns a new dict instead of mutating its
        # argument, so the result has to be assigned back.
        CONFIG = assoc_in(CONFIG, k.split('-')[1:], v)
If you don't want to add a dependency, you can see the implementation of assoc_in here. A simpler substitute might be something like
def assoc_in(d, ks, v):
    """Set ``v`` at the nested key path ``ks`` inside ``d``, in place.

    Args:
        d: Dictionary to update; intermediate dictionaries are created on
            demand.
        ks: Non-empty sequence of keys; every key but the last names a
            nested dictionary, the last one receives the value.
        v: Value to store.

    Raises:
        IndexError: If ``ks`` is empty.
    """
    for k in ks[:-1]:
        # setdefault returns the existing nested dict or inserts a new one.
        d = d.setdefault(k, {})
    d[ks[-1]] = v
This uses the .setdefault() method to get the nested dicts, which will add a new one if it doesn't exist yet.
You can get your environment variables like so:
import os

# One "NAME = value" string per environment variable that starts with CONFIG-.
text = [f"{k} = {v}" for k, v in os.environ.items() if k.startswith("CONFIG-")]
print(text)
(inspired by How to access environment variable values? - especially this answer)
Then you can use dicts to iteratively split your values:
text = """CONFIG-SOMEKEY-SOMEOTHERKEY = val345
CONFIG-SOMEKEY-SOMEOTHEROTHERKEY = val678
CONFIG-ANOTHERKEY = val222"""
text = text.split("\n")
d = {}
for line in text:
    # Separate name and value first so a "-" inside the value is never
    # mistaken for a key separator; strip the blanks around "=".
    name, value = line.split("=", 1)
    # Drop the leading CONFIG prefix; the last part is the leaf key and
    # everything before it names nested dictionaries.
    *parents, leaf = name.strip().split("-")[1:]
    curr_d = d
    for part in parents:
        curr_d = curr_d.setdefault(part, {})
    curr_d[leaf] = value.strip()
print(d)
# Output:
# {'SOMEKEY': {'SOMEOTHERKEY': 'val345', 'SOMEOTHEROTHERKEY': 'val678'},
#  'ANOTHERKEY': 'val222'}

Tying multiple dictionary entries to one key

I have some code that I am writing, and it is trying to imitate what an API call does. However there are multiple entries that can be put in the API call that have the same keys. For example, if you look at my call to the method, there are multiple names[] that are passed as part of the API call.
names[]": ["System/CPU/User/percent", "System/CPU/System/percent"]
Here is the code that have -
def new_relic_api(api_key, query_function, datapoints):
temp = {}
if (datapoints != None):
for k, v in datapoints.iteritems():
if isinstance(v, list):
for s in v:
print (k)
print s
temp[k] = s
else:
print k
print v
temp[k] = v
r = requests.get(url, headers=headers, data=temp)
d = {"names[]": ["System/CPU/User/percent", "System/CPU/System/percent"], "values[]": "average_value", 'from': '2016-11-30T18:31:00+00:00', 'to': '2016-11-30T19:01:00+00:00', 'summarize': 'true'}
new_relic_api("${api_key}", "/servers/{server_id}/metrics/data.json", d)
However, the actual dictionary is only printing out the second names[] value inside of the requests call. How can I fix this?
Thanks
That's because you write into dictionary temp in line temp[k] = s different values from "names[]" but with the same key:
items = {}
k = "names[]"
for s in ["System/CPU/User/percent", "System/CPU/System/percent"]:
items[k] = s
# items == {"names[]": "System/CPU/System/percent"}
It means that last value of s rewrites items[k] and items will always keep only one (last) value of names[].
Another thing is it's better to check if a value equals None with is operator:
if value is None: pass
if value is not None: pass
To pass both names[] values simultaneously you don't need to split its values, use requests.get and pass names[] as a list:
temp = {"names[]": ["System/CPU/User/percent", "System/CPU/System/percent"]}
r = requests.get(url, headers=headers, data=temp)
It will be requested as smth similar to:
url?names[]=System/CPU/User/percent&names[]=System/CPU/System/percent

Using decorators vs iteration to set values?

So I have to loop through a list of objects, using some of their values to do computation, and then assign them new values.
Because many of the items in the list will be assigned the same new value, I used a dictionary to hold the list of items that will require the same value. For example:
item_dict = {}
for item in list:
value = item.value
if value not in item_dict:
item_dict[value] = [item]
else:
item_dict[value].append(item)
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
items = item_dict[value]
for item in items:
item.value = new_value
I was thinking about replacing the for item in items loop with a decorator, since all the new_value(s) for that list are the same. For example:
def dec(item):
def wrap(value):
item.value = value
return wrap
def rec(item, func):
def wrap(value):
item.value = value
func(value)
return wrap
item_dict = {}
for item in list:
value = item.value
if value not in item_dict:
item_dict[value] = dec(item)
else:
item_dict[value] = rec(item, item_dict[value])
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
items = item_dict[value]
items(new_value)
Would the decorator fashion be more efficient and how much of a memory impact will it have? Are there any better ways of doing this?
A defaultdict works well here:
from collections import defaultdict
item_dict = defaultdict(list)
for item in value_list:
item_dict[item.value].append(item)
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
for item in item_dict[value]:
item.value = new_value
I struggle to think of a way the decorator version could be better - for one thing, you have to worry about the recursion limit.
The get method works well in the first case.
item_dict = {}
for item in list:
item_dict[item.value] = item_dict.get(item.value, []) + [item]
The key to making this work is to use list addition instead of append, as append returns None.

Finding matching keys in two large dictionaries and doing it fast

I am trying to find corresponding keys in two different dictionaries. Each has about 600k entries.
Say for example:
myRDP = { 'Actinobacter': 'GATCGA...TCA', 'subtilus sp.': 'ATCGATT...ACT' }
myNames = { 'Actinobacter': '8924342' }
I want to print out the value for Actinobacter (8924342) since it matches a value in myRDP.
The following code works, but is very slow:
for key in myRDP:
for jey in myNames:
if key == jey:
print key, myNames[key]
I've tried the following but it always results in a KeyError:
for key in myRDP:
print myNames[key]
Is there perhaps a function implemented in C for doing this? I've googled around but nothing seems to work.
Thanks.
Use sets, because they have a built-in intersection method which ought to be quick:
myRDP = { 'Actinobacter': 'GATCGA...TCA', 'subtilus sp.': 'ATCGATT...ACT' }
myNames = { 'Actinobacter': '8924342' }
rdpSet = set(myRDP)
namesSet = set(myNames)
for name in rdpSet.intersection(namesSet):
print name, myNames[name]
# Prints: Actinobacter 8924342
You could do this:
for key in myRDP:
if key in myNames:
print key, myNames[key]
Your first attempt was slow because you were comparing every key in myRDP with every key in myNames. In algorithmic jargon, if myRDP has n elements and myNames has m elements, then that algorithm would take O(n×m) operations. For 600k elements each, this is 360,000,000,000 comparisons!
But testing whether a particular element is a key of a dictionary is fast -- in fact, this is one of the defining characteristics of dictionaries. In algorithmic terms, the key in dict test is O(1), or constant-time. So my algorithm will take O(n) time, which is one 600,000th of the time.
in python 3 you can just do
myNames.keys() & myRDP.keys()
for key in myRDP:
name = myNames.get(key, None)
if name:
print key, name
dict.get returns the default value you give it (in this case, None) if the key doesn't exist.
You could start by finding the common keys and then iterating over them. Set operations should be fast because they are implemented in C, at least in modern versions of Python.
common_keys = set(myRDP).intersection(myNames)
for key in common_keys:
print key, myNames[key]
Best and easiest way would be simply perform common set operations(Python 3).
a = {"a": 1, "b":2, "c":3, "d":4}
b = {"t1": 1, "b":2, "e":5, "c":3}
res = a.items() & b.items() # {('b', 2), ('c', 3)} For common Key and Value
res = {i[0]:i[1] for i in res} # In dict format
common_keys = a.keys() & b.keys() # {'b', 'c'}
Cheers!
Use the get method instead:
for key in myRDP:
value = myNames.get(key)
if value != None:
print key, "=", value
You can simply write this code and it will save the common key in a list.
# Test membership against the dict itself: that is a hash lookup, whereas
# `in myNames.keys()` scans a freshly built list on Python 2.
common = [i for i in myRDP if i in myNames]
Copy both dictionaries into one dictionary/array. This makes sense as you have 1:1 related values. Then you need only one search, no comparison loop, and can access the related value directly.
Example Resulting Dictionary/Array:
[Name][Value1][Value2]
[Actinobacter][GATCGA...TCA][8924342]
[XYZbacter][BCABCA...ABC][43594344]
...
Here is my code for doing intersections, unions, differences, and other set operations on dictionaries:
class DictDiffer(object):
    """
    Calculate the difference between two dictionaries as:
    (1) items added
    (2) items removed
    (3) keys same in both but changed values
    (4) keys same in both and unchanged values

    All four methods return a set of keys.
    """

    def __init__(self, current_dict, past_dict):
        self.current_dict, self.past_dict = current_dict, past_dict
        # Iterating a dict yields its keys, so no .keys() call is needed.
        self.set_current, self.set_past = set(current_dict), set(past_dict)
        self.intersect = self.set_current.intersection(self.set_past)

    def added(self):
        """Return the keys present only in the current dict."""
        return self.set_current - self.intersect

    def removed(self):
        """Return the keys present only in the past dict."""
        return self.set_past - self.intersect

    def changed(self):
        """Return the shared keys whose values differ between the dicts."""
        return {o for o in self.intersect
                if self.past_dict[o] != self.current_dict[o]}

    def unchanged(self):
        """Return the shared keys whose values are equal in both dicts."""
        return {o for o in self.intersect
                if self.past_dict[o] == self.current_dict[o]}
if __name__ == '__main__':
    import unittest

    class TestDictDifferNoChanged(unittest.TestCase):
        """Overlapping keys (3 and 4) carry identical values in both dicts."""

        def setUp(self):
            self.past = dict((k, 2*k) for k in range(5))
            self.current = dict((k, 2*k) for k in range(3,8))
            self.d = DictDiffer(self.current, self.past)

        def testAdded(self):
            self.assertEqual(self.d.added(), set((5,6,7)))

        def testRemoved(self):
            self.assertEqual(self.d.removed(), set((0,1,2)))

        def testChanged(self):
            self.assertEqual(self.d.changed(), set())

        def testUnchanged(self):
            self.assertEqual(self.d.unchanged(), set((3,4)))

    class TestDictDifferNoCUnchanged(unittest.TestCase):
        """Overlapping keys (3 and 4) carry different values in the two dicts."""

        def setUp(self):
            self.past = dict((k, 2*k) for k in range(5))
            self.current = dict((k, 2*k+1) for k in range(3,8))
            self.d = DictDiffer(self.current, self.past)

        def testAdded(self):
            self.assertEqual(self.d.added(), set((5,6,7)))

        def testRemoved(self):
            self.assertEqual(self.d.removed(), set((0,1,2)))

        def testChanged(self):
            self.assertEqual(self.d.changed(), set((3,4)))

        def testUnchanged(self):
            self.assertEqual(self.d.unchanged(), set())

    unittest.main()
def combine_two_json(json_request, json_request2):
    """Return the entries of ``json_request2`` whose keys are in both dicts.

    Args:
        json_request: Dictionary whose keys select the entries to keep.
        json_request2: Dictionary that supplies the values.

    Returns:
        A new dict mapping every key found in both inputs to its value in
        ``json_request2``, in the iteration order of ``json_request``.
    """
    return {
        item: json_request2[item]
        for item in json_request
        if item in json_request2
    }

Categories

Resources