Combine adjacent features in a list of linear features - python

Python 3.6
Task:
Given a sorted list of linear features (like in a linear referencing system),
combine adjacent linear features belonging to the same key (linear_feature[0]['key'] == linear_feature[1]['key'] and linear_feature[0]['end'] == linear_feature[1]['start'])
until the combined linear feature has (end - start) ≥ THRESHOLD.
If feature cannot be combined with subsequent adjacent features such that (end - start) ≥ THRESHOLD, combine with previous adjacent feature of the same key, or return self.
EDIT: Added a solution below in an answer post.
THRESHOLD = 3
linear_features = sorted([
{'key': 1, 'start': 0, 'end': 2, 'count': 1},
{'key': 1, 'start': 2, 'end': 4, 'count': 1},
{'key': 1, 'start': 4, 'end': 5, 'count': 1},
{'key': 2, 'start': 0, 'end': 3, 'count': 1},
{'key': 2, 'start': 3, 'end': 4, 'count': 1},
{'key': 2, 'start': 4, 'end': 5, 'count': 1},
{'key': 3, 'start': 0, 'end': 1, 'count': 1},
], key=lambda x: (x['key'], x['start']))
# This isn't necessarily an intermediate step, just here for visualization
intermediate = [
{'key': 1, 'start': 0, 'end': 4, 'count': 2}, # Adjacent features combined
{'key': 1, 'start': 4, 'end': 5, 'count': 1}, # This can't be made into a feature with (end - start) gte THRESHOLD; combine with previous
{'key': 2, 'start': 0, 'end': 3, 'count': 1},
{'key': 2, 'start': 3, 'end': 5, 'count': 2}, # This can't be made into a feature with (end - start) gte THRESHOLD; combine with previous
{'key': 3, 'start': 0, 'end': 1, 'count': 1}, # This can't be made into a new feature, and there is no previous, so self
]
desired_output = [
{'key': 1, 'start': 0, 'end': 5, 'count': 3},
{'key': 2, 'start': 0, 'end': 5, 'count': 3},
{'key': 3, 'start': 0, 'end': 1, 'count': 1},
]

I figured out a solution:
def reducer(x, THRESHOLD):
x = add_until(x, THRESHOLD)
if len(x) == 1:
return x
if len(x) == 2:
if length(x[1]) < THRESHOLD:
x[0]['end'] = x[1]['end']
x[0]['count'] += x[1]['count']
return [x[0]]
else:
return x
first, rest = x[0], x[1:]
return [first] + reducer(rest, THRESHOLD)
def add_until(x, THRESHOLD):
if len(x) == 1:
return x
first, rest = x[0], x[1:]
if length(first) >= THRESHOLD:
return [first] + add_until(rest, THRESHOLD)
else:
rest[0]['start'] = first['start']
rest[0]['count'] += first['count']
return add_until(rest, THRESHOLD)
from itertools import groupby
THRESHOLD = 3
linear_features = sorted([
{'key': 1, 'start': 0, 'end': 2, 'count': 1},
{'key': 1, 'start': 2, 'end': 4, 'count': 1},
{'key': 1, 'start': 4, 'end': 5, 'count': 1},
{'key': 2, 'start': 0, 'end': 3, 'count': 1},
{'key': 2, 'start': 3, 'end': 4, 'count': 1},
{'key': 2, 'start': 4, 'end': 5, 'count': 1},
{'key': 3, 'start': 0, 'end': 1, 'count': 1},
{'key': 4, 'start': 0, 'end': 3, 'count': 1},
{'key': 4, 'start': 3, 'end': 4, 'count': 1},
{'key': 4, 'start': 4, 'end': 5, 'count': 1},
{'key': 4, 'start': 5, 'end': 6, 'count': 1},
{'key': 4, 'start': 6, 'end': 9, 'count': 1},
], key=lambda x: (x['key'], x['start']))
def length(x):
"""x is a dict with a start and end property"""
return x['end'] - x['start']
results = []
for key, sites in groupby(linear_features, lambda x: x['key']):
sites = list(sites)
results += reducer(sites, 3)
print(results)
[
{'key': 1, 'start': 0, 'end': 5, 'count': 3},
{'key': 2, 'start': 0, 'end': 5, 'count': 3},
{'key': 3, 'start': 0, 'end': 1, 'count': 1},
{'key': 4, 'start': 0, 'end': 3, 'count': 1},
{'key': 4, 'start': 3, 'end': 6, 'count': 3},
{'key': 4, 'start': 6, 'end': 9, 'count': 1}
]

You want something like the this:
PSEUDOCODE
while f=1 < max = count of features:
if features[f-1]['key'] == features[f]['key'] and
features[f-1]['end'] == features[f]['start']:
#combine
features[f-1]['end'] = features[f]['end']
features[f-1]['count'] += 1
del features[f]; max -= 1
else:
f += 1

Related

Pandas, get first and last column index for row value

I have the following dataframe:
columns = pd.date_range(start="2022-05-21", end="2022-06-30")
data = [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
]
df = pd.DataFrame(data, columns=columns)
2022-05-21 2022-05-22 2022-05-23 ... 2022-06-28 2022-06-29 2022-06-30
0 0 0 0 ... 5 5 5
1 5 5 5 ... 1 1 1
2 5 5 5 ... 5 5 5
I have to take the first and last column index for every distinct value in the order they are. The correct output for this dataframe will be:
[
[
{'value': 0, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-06-20', 'end': '2022-06-30'}
],
[
{'value': 5, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 1, 'start': '2022-06-20', 'end': '2022-06-30'}
],
[
{'value': 5, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-06-20', 'end': '2022-06-30'}
]
]
My best approach for the moment is:
series_set = df.apply(frozenset, axis=1)
container = []
for index in range(len(df.index)):
row = df.iloc[[index]]
values = series_set.iloc[[index]]
inner_container = []
for value in values[index]:
single_value_series = row[row.columns[row.isin([value]).all()]]
dates = single_value_series.columns
result = dict(value=value, start=dates[0].strftime("%Y-%m-%d"), end=dates[-1].strftime("%Y-%m-%d"))
inner_container.append(result)
container.append(inner_container)
The result is:
[
[
{'value': 0, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-06-20', 'end': '2022-06-30'}
],
[
{'value': 1, 'start': '2022-06-20', 'end': '2022-06-30'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-05-21', 'end': '2022-05-31'}
],
[
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-05-21', 'end': '2022-06-30'}
]
]
It has several problems, only the first array is correct :)
When I convert dataframe to frozenset it is sorted and order is changed and also if some value appears more than once it is removed.
I will appreciate any idea and guidance. What I want to avoid is iterating the dataframe.
Thank you!
You can first transpose DataFrame by DataFrame.T and then aggregate minimal and maximal index with convert values to strings by Series.dt.strftime, last convert to dictionaries by DataFrame.to_dict.
For get consecutive groups is compared shifted values with Series.cumsum.
df1 = df.T.reset_index()
L = [df1.groupby(df1[x].ne(df1[x].shift()).cumsum())
.agg(value=(x, 'first'),
start=('index', 'min'),
end=('index', 'max'))
.assign(start=lambda x: x['start'].dt.strftime('%Y-%m-%d'),
end=lambda x: x['end'].dt.strftime('%Y-%m-%d'))
.to_dict(orient='records') for x in df1.columns.drop('index')]
print (L)
[[{'value': 0, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-06-20', 'end': '2022-06-30'}],
[{'value': 5, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 1, 'start': '2022-06-20', 'end': '2022-06-30'}],
[{'value': 5, 'start': '2022-05-21', 'end': '2022-05-31'},
{'value': 2, 'start': '2022-06-01', 'end': '2022-06-19'},
{'value': 5, 'start': '2022-06-20', 'end': '2022-06-30'}]]

Couldn't rename field in python list (key, value structure)

I have below list with nested lists (sort of key,values)
inp1=[{'id': 0, 'name': 98, 'value': 9}, {'id': 1, 'name': 66, 'value': 8}, {'id': 2, 'name': 29, 'value': 5}, {'id': 3, 'name': 99, 'value': 3}, {'id': 4, 'name': 15, 'value': 9}]
Am trying to replace 'name' with 'wid' and 'value' with 'wrt', how can I do it on same list?
My output should be like
inp1=[{'id': 0, 'wid': 98, 'wrt': 9}, {'id': 1, 'wid': 66, 'wrt': 8}, {'id': 2, 'wid': 29, 'wrt': 5}, {'id': 3, 'wid': 99, 'wrt': 3}, {'id': 4, 'wid': 15, 'wrt': 9}]
I tried below, but it doesn't work as list cannot be indexed with string but integer
inp1['name'] = inp1['wid']
inp1['value'] = inp1['wrt']
I tried if I can find any examples, but mostly I found only this for dictionary and not list.
You need to iterate each item, and remove the old entry (dict.pop is handy for this - it removes an entry and return the value) and assign to new keyes:
>>> inp1 = [
... {'id': 0, 'name': 98, 'value': 9},
... {'id': 1, 'name': 66, 'value': 8},
... {'id': 2, 'name': 29, 'value': 5},
... {'id': 3, 'name': 99, 'value': 3},
... {'id': 4, 'name': 15, 'value': 9}
... ]
>>>
>>> for d in inp1:
... d['wid'] = d.pop('name')
... d['wrt'] = d.pop('value')
...
>>> inp1
[{'wid': 98, 'id': 0, 'wrt': 9},
{'wid': 66, 'id': 1, 'wrt': 8},
{'wid': 29, 'id': 2, 'wrt': 5},
{'wid': 99, 'id': 3, 'wrt': 3},
{'wid': 15, 'id': 4, 'wrt': 9}]
def f(item):
if(item.has_key('name') and not item.has_key('wid')):
item['wid']=item.pop('name')
if(item.has_key('value') and not item.has_key('wrt')):
item['wrt']=item.pop('value')
map(f,inp1)
Output:
[{'wrt': 9, 'wid': 98, 'id': 0}, {'wrt': 8, 'wid': 66, 'id': 1}, {'wrt': 5, 'wid': 29, 'id': 2}, {'wrt': 3, 'wid': 99, 'id': 3}, {'wrt': 9, 'wid': 15, 'id': 4}]

Pythonic sort a list of dictionaries in a tricky order

I have a list of id's sorted in a proper oder:
ids = [1, 2, 4, 6, 5, 0, 3]
I also have a list of dictionaries, sorted in some random way:
rez = [{'val': 7, 'id': 1}, {'val': 8, 'id': 2}, {'val': 2, 'id': 3}, {'val': 0, 'id': 4}, {'val': -1, 'id': 5}, {'val': -4, 'id': 6}, {'val': 9, 'id': 0}]
My intention is to sort rez list in a way that corresponds to ids:
rez = [{'val': 7, 'id': 1}, {'val': 8, 'id': 2}, {'val': 0, 'id': 4}, {'val': -4, 'id': 6}, {'val': -1, 'id': 5}, {'val': 9, 'id': 0}, {'val': 2, 'id': 3}]
I tried:
rez.sort(key = lambda x: ids.index(x['id']))
However that way is too slow for me, as len(ids) > 150K, and each dict actually had a lot of keys (some values there are strings). Any suggestion how to do it in the most pythonic, but still fastest way?
You don't need to sort because ids specifies the entire ordering of the result. You just need to pick the correct elements by their ids:
rez_dict = {d['id']:d for d in rez}
rez_ordered = [rez_dict[id] for id in ids]
Which gives:
>>> rez_ordered
[{'id': 1, 'val': 7}, {'id': 2, 'val': 8}, {'id': 4, 'val': 0}, {'id': 6, 'val': -4}, {'id': 5, 'val': -1}, {'id': 0, 'val': 9}, {'id': 3, 'val': 2}]
This should be faster than sorting because it can be done in linear time on average, while sort is O(nlogn).
Note that this assumes that there will be one entry per id, as in your example.
I think you are on the right track. If you need to speed it up, because your list is too long and you are having quadratic complexity, you can turn the list into a dictionary first, mapping the ids to their respective indices.
indices = {id_: pos for pos, id_ in enumerate(ids)}
rez.sort(key = lambda x: indices[x['id']])
This way, indices is {0: 5, 1: 0, 2: 1, 3: 6, 4: 2, 5: 4, 6: 3}, and rez is
[{'id': 1, 'val': 7},
{'id': 2, 'val': 8},
{'id': 4, 'val': 0},
{'id': 6, 'val': -4},
{'id': 5, 'val': -1},
{'id': 0, 'val': 9},
{'id': 3, 'val': 2}]

Summarize a list of dictionaries based on common key values

I have a list of dictionaries like so:
dictlist = [{'day': 0, 'start': '8:00am', 'end': '5:00pm'},
{'day': 1, 'start': '10:00am', 'end': '7:00pm'},
{'day': 2, 'start': '8:00am', 'end': '5:00pm'},
{'day': 3, 'start': '10:00am', 'end': '7:00pm'},
{'day': 4, 'start': '8:00am', 'end': '5:00pm'},
{'day': 5, 'start': '11:00am', 'end': '1:00pm'}]
I want to summarize days that share the same 'start' and 'end' times.
For example,
summarylist = [([0,2, 4], '8:00am', '5:00pm'),
([1, 3], '10:00am', '7:00pm')
([5], '11:00am', '1:00pm')]
I have tried to adapt some other StackOverflow solutions re: sets and intersections to achieve this with no luck. I was trying to re-purpose the solution to this question to no avail. Hoping someone can point me in the right direction.
If you don't need the exact format that you provide you could use defaultdict
dictlist = [{'day': 0, 'start': '8:00am', 'end': '5:00pm'},
{'day': 1, 'start': '10:00am', 'end': '7:00pm'},
{'day': 2, 'start': '8:00am', 'end': '5:00pm'},
{'day': 3, 'start': '10:00am', 'end': '7:00pm'},
{'day': 4, 'start': '8:00am', 'end': '5:00pm'},
{'day': 5, 'start': '11:00am', 'end': '1:00pm'}]
from collections import defaultdict
dd = defaultdict(list)
for d in dictlist:
dd[(d['start'],d['end'])].append(d['day'])
Result:
>>> dd
defaultdict(<type 'list'>, {('11:00am', '1:00pm'): [5], ('10:00am', '7:00pm'): [1, 3], ('8:00am', '5:00pm'): [0, 2, 4]})
And if format is important to you could do:
>>> my_list = [(v, k[0], k[1]) for k,v in dd.iteritems()]
>>> my_list
[([5], '11:00am', '1:00pm'), ([1, 3], '10:00am', '7:00pm'), ([0, 2, 4], '8:00am', '5:00pm')]
>>> # If you need the output sorted:
>>> sorted_my_list = sorted(my_list, key = lambda k : len(k[0]), reverse=True)
>>> sorted_my_list
[([0, 2, 4], '8:00am', '5:00pm'), ([1, 3], '10:00am', '7:00pm'), ([5], '11:00am', '1:00pm')]
With itertools.groupby:
In [1]: %paste
dictlist = [{'day': 0, 'start': '8:00am', 'end': '5:00pm'},
{'day': 1, 'start': '10:00am', 'end': '7:00pm'},
{'day': 2, 'start': '8:00am', 'end': '5:00pm'},
{'day': 3, 'start': '10:00am', 'end': '7:00pm'},
{'day': 4, 'start': '8:00am', 'end': '5:00pm'},
{'day': 5, 'start': '11:00am', 'end': '1:00pm'}]
## -- End pasted text --
In [2]: from itertools import groupby
In [3]: tuplist = [(d['day'], (d['start'], d['end'])) for d in dictlist]
In [4]: key = lambda x: x[1]
In [5]: summarylist = [(sorted(e[0] for e in g),) + k
...: for k, g in groupby(sorted(tuplist, key=key), key=key)]
In [6]: summarylist
Out[6]:
[([1, 3], '10:00am', '7:00pm'),
([5], '11:00am', '1:00pm'),
([0, 2, 4], '8:00am', '5:00pm')]
You can use itertools.groupby like this.
source code:
from itertools import groupby
for k, grp in groupby(sorted(dictlist, key=lambda x:(x['end'], x['start'])), key=lambda x:(x['start'], x['end'])):
print [i['day'] for i in grp], k
output:
[5] ('11:00am', '1:00pm')
[0, 2, 4] ('8:00am', '5:00pm')
[1, 3] ('10:00am', '7:00pm')
But I think using defaultdict(#Akavall answer) is the right way in this particular case.

python: iterate through list and replace elements with corresponding dictionary values

I am trying to replace list element value with value looked up in dictionary how do I do that?
list = [1, 3, 2, 10]
d = {'id': 1, 'val': 30},{'id': 2, 'val': 53}, {'id': 3, 'val': 1}, {'id': 4, 'val': 9}, {'id': 5, 'val': 2}, {'id': 6, 'val': 6}, {'id': 7, 'val': 11}, {'id': 8, 'val': 89}, {'id': 9, 'val': 2}, {'id': 10, 'val': 4}
for i in list:
for key, v in d.iteritems():
???
???
so at the end I am expecting:
list = [30, 1, 53, 4]
thank you
D2 = dict((x['id'], x['val']) for x in D)
L2 = [D2[x] for x in L]
td = (
{'val': 30, 'id': 1},
{'val': 53, 'id': 2},
{'val': 1, 'id': 3},
{'val': 9, 'id': 4},
{'val': 2, 'id': 5},
{'val': 6, 'id': 6},
{'val': 11, 'id': 7},
{'val': 89, 'id': 8},
{'val': 2, 'id': 9},
{'val': 4, 'id': 10}
)
source_list = [1, 3, 2, 10]
final_list = []
for item in source_list:
for d in td:
if d['id'] == item:
final_list.append(d['val'])
print('Source : ', source_list)
print('Final : ', final_list)
Result
Source : [1, 3, 2, 10]
Final : [30, 1, 53, 4]

Categories

Resources