I have tuples in a list of lists and would like to extract only some of the elements of each tuple. A sample of the input data is below.
# input
[[('ab', 0.026412873688749918), ('dc', 0.016451082731822664), ('on', 0.014278088125928066),
('qc', 0.009752817881775656), ('mn', 0.008332886637563352), ('nt', 0.008250535392602258),
('nsw', 0.006874273287824427), ('bar', 0.005878684829852004), ('tor', 0.005741627328513831),
('wds', 0.004119216502907735)],
[('nb', 0.03053649661493629), ('ns', 0.01925207174326825), ('ham', 0.016207228280183325),
('bra', 0.013390785663058102), ('nia', 0.00878166482558038), ('knxr', 0.004648856466085521),
('nwm', 0.004463444159552605), ('md', 0.004377821331080258), ('ut', 0.004165890522922745),
('va', 0.0037484060754341083)]]
What I am trying to do is get the first items in the tuples.
# output
[['ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'],
['nb', 'ns', 'ham', 'bra', 'nia', 'knxr', 'nwm', 'md', 'ut', 'va']]
# Sample data: a list of lists of (label, value) tuples.
# NOTE: the name `input` shadows Python's built-in input() from here on.
input = [
[('ab', 0.026412873688749918), ('dc', 0.016451082731822664), ('on', 0.014278088125928066),
('qc', 0.009752817881775656), ('mn', 0.008332886637563352), ('nt', 0.008250535392602258),
('nsw', 0.006874273287824427), ('bar', 0.005878684829852004), ('tor', 0.005741627328513831),
('wds', 0.004119216502907735)],
[('nb', 0.03053649661493629), ('ns', 0.01925207174326825), ('ham', 0.016207228280183325),
('bra', 0.013390785663058102), ('nia', 0.00878166482558038), ('knxr', 0.004648856466085521),
('nwm', 0.004463444159552605), ('md', 0.004377821331080258), ('ut', 0.004165890522922745),
('va', 0.0037484060754341083)]
]
As illustrated in the comments you can use list comprehensions to achieve this:
# Unpack every 2-tuple and keep only its first item, one output list per inner list.
[[first for first, _ in row] for row in input]
# Result
[['ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'],
['nb', 'ns', 'ham', 'bra', 'nia', 'knxr', 'nwm', 'md', 'ut', 'va']]
A more complex way to achieve this would be to use zip() to separate the first elements from the second elements of the tuples as shown below:
[('ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'),
(0.026412873688749918,0.016451082731822664,0.014278088125928066,0.009752817881775656,0.008332886637563352,0.008250535392602258,0.006874273287824427,0.005878684829852004,0.005741627328513831,0.004119216502907735)]
This approach can be done using:
# zip(*row) regroups the tuples of one inner list by position;
# element 0 of that regrouping holds all the first items.
[list(list(zip(*row))[0]) for row in input]
# Result
[['ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'],
['nb', 'ns', 'ham', 'bra', 'nia', 'knxr', 'nwm', 'md', 'ut', 'va']]
You can use a loop or a list comprehension to do this.
The input data is a list of lists that contain tuples. Access the first element of each tuple with tuple[0] and append it to an initially empty list, like this:
# Sample data: a list of lists of (label, value) tuples.
input_data = [
    [('ab', 0.026412873688749918), ('dc', 0.016451082731822664), ('on', 0.014278088125928066),
     ('qc', 0.009752817881775656), ('mn', 0.008332886637563352), ('nt', 0.008250535392602258),
     ('nsw', 0.006874273287824427), ('bar', 0.005878684829852004), ('tor', 0.005741627328513831),
     ('wds', 0.004119216502907735)],
    [('nb', 0.03053649661493629), ('ns', 0.01925207174326825), ('ham', 0.016207228280183325),
     ('bra', 0.013390785663058102), ('nia', 0.00878166482558038), ('knxr', 0.004648856466085521),
     ('nwm', 0.004463444159552605), ('md', 0.004377821331080258), ('ut', 0.004165890522922745),
     ('va', 0.0037484060754341083)]
]

# Build one output list per inner list, holding the first element of each tuple.
data_list = []
for x in input_data:
    d_list = []  # first elements of the tuples in the current inner list
    for y in x:
        d_list.append(y[0])
    # Append only after the inner list has been fully processed.
    data_list.append(d_list)
# Result...
[['ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'],
['nb', 'ns', 'ham', 'bra', 'nia', 'knxr', 'nwm', 'md', 'ut', 'va']]
Using a list comprehension:
It is a shorthand for the for loop above that removes the need for the append() method and the initial empty lists.
# Same result as the explicit nested loop: first element of every tuple, grouped per inner list.
data_list = [[pair[0] for pair in row] for row in input_data]
# Result...
[['ab', 'dc', 'on', 'qc', 'mn', 'nt', 'nsw', 'bar', 'tor', 'wds'],
['nb', 'ns', 'ham', 'bra', 'nia', 'knxr', 'nwm', 'md', 'ut', 'va']]
Related
Given a list of strings L and a sublist M of L, I want an efficient way to find the index span (index_a, index_b) in L such that L[index_a:index_b] == M.
For example,
L = ['Clement', 'Joseph', 'Man', '##ey', 'was', 'an', 'American', 'businessman', 'from', 'Boston', 'who', 'was', 'president', 'of', 'a', 'contracting', 'company', 'and', 'a', 'minority', 'owner', 'and', 'treasurer', 'of', 'the', 'Boston', 'Braves', 'baseball', 'team', '.']
M = ['Clement', 'Joseph', 'Man', '##ey']
Expected_result = (0,4) # since L[0:4] = M
L = ['Clement', 'Joseph', 'Man', '##ey', 'was', 'an', 'American', 'businessman', 'from', 'Boston', 'who', 'was', 'president', 'of', 'a', 'contracting', 'company', 'and', 'a', 'minority', 'owner', 'and', 'treasurer', 'of', 'the', 'Boston', 'Braves', 'baseball', 'team', '.']
M = ['Boston']
Expected_result = (9,9) # both (9,9) and (25,25) are correct since L[9]=L[25]=M
Use a generator expression, and check the length of the sub-list when returning the value:
def getIndex(lst1, lst2):
    """Return the index span of the first occurrence of ``lst2`` inside ``lst1``.

    Args:
        lst1: The list to search in.
        lst2: The contiguous sub-list to look for.

    Returns:
        ``(start, start)`` when ``lst2`` has exactly one element, otherwise
        ``(start, start + len(lst2))`` so that ``lst1[start:stop] == lst2``.
        ``None`` when ``lst2`` does not occur in ``lst1``.
    """
    width = len(lst2)
    # The "+ 1" makes the scan include the last possible window, i.e. a match
    # that ends exactly at the end of lst1.
    index1 = next(
        (i for i in range(len(lst1) - width + 1) if lst1[i:i + width] == lst2),
        None,
    )
    if index1 is None:  # the sub-list does not exist in the other list
        return None
    return (index1, index1) if width == 1 else (index1, index1 + width)
# Case 1: the sub-list sits at the very start of the list.
L1 = ['Clement', 'Joseph', 'Man', '##ey', 'was', 'an', 'American', 'businessman', 'from', 'Boston', 'who', 'was',
'president', 'of', 'a', 'contracting', 'company', 'and', 'a', 'minority', 'owner', 'and', 'treasurer', 'of', 'the',
'Boston', 'Braves', 'baseball', 'team', '.']
M1 = ['Clement', 'Joseph', 'Man', '##ey']
# Case 2: a single-element sub-list that occurs more than once in the list.
L2 = ['Clement', 'Joseph', 'Man', '##ey', 'was', 'an', 'American', 'businessman', 'from', 'Boston', 'who', 'was', 'president', 'of', 'a', 'contracting', 'company', 'and', 'a', 'minority', 'owner', 'and', 'treasurer', 'of', 'the', 'Boston', 'Braves', 'baseball', 'team', '.']
M2 = ['Boston']
# Print the span found for each case.
print(getIndex(L1, M1))
print(getIndex(L2, M2))
Result:
(0, 4)
(9, 9)
Or one line:
def getIndex(lst1, lst2):
    """Return the index span of the first occurrence of ``lst2`` in ``lst1``.

    Single-expression variant: yields ``(start, start)`` for a one-element
    ``lst2``, ``(start, start + len(lst2))`` for longer ones, and ``None``
    when ``lst2`` does not occur in ``lst1``.
    """
    return next(
        (
            (i, i + (len(lst2) if len(lst2) > 1 else 0))
            # "+ 1" so that a match ending at the very end of lst1 is also checked.
            for i in range(len(lst1) - len(lst2) + 1)
            if lst1[i:i + len(lst2)] == lst2
        ),
        None,
    )
def getIndices(L, M):
    """Return every ``(start, stop)`` span for which ``L[start:stop] == M``.

    Args:
        L: The list to search in.
        M: The contiguous sub-list to look for.

    Returns:
        A list of ``(start, start + len(M))`` tuples in ascending order of
        ``start``; empty when ``M`` does not occur in ``L``.
    """
    width = len(M)
    return [
        (i, i + width)
        # "+ 1" so the final window (ending at the end of L) is compared too.
        for i in range(len(L) - width + 1)
        if L[i:i + width] == M
    ]
This traverses the list L in windows of length len(M) and checks whether each window is equal to the list M.
I want to create a nested dictionary from list of lists. Following is the list
ls3 = [['YOU', 'HE', 'EST8'],
['YOU', 'HE', 'OLM6'],
['YOU', 'SLO', 'WLR8'],
['ARE', 'KLP', 'EST6'],
['ARE', 'POL', 'WLR4'],
['DOING', 'TIS', 'OIL8'],
['GREAT', 'POL', 'EOL6'],
['WORK', 'KOE', 'RIW8'],
['WORK', 'KOE', 'PNG4'],
['WORK', 'ROE', 'ERC8'],
['WORK', 'ROE', 'WQD6'],
['KEEP', 'PAR', 'KOM8'],
['KEEP', 'PAR', 'RTW6'],
['KEEP', 'PIL', 'XCE4'],
['KEEP', 'PIL', 'ACE8'],
['ROCKING', 'OUL', 'AZS6'],
['ROCKING', 'OUL', 'RVX8']]
below is my code and so far I was able to create this:
di = {}
di2 = {}
# Group by the first column. j and k are appended to one flat list per key,
# so the pairing between a second-column value and its third-column value is lost.
for i, j, k in ls3:
    di.setdefault(i, []).extend([j, k])
# Only the first entry of each flat list becomes the inner key; everything after
# it (including later second-column values) ends up in the value list.
for i, j in di.items():
    di2.update({i: {j[0]: j[1:]}})
My output :
{'YOU': {'HE': ['EST8', 'HE', 'OLM6', 'SLO', 'WLR8']},
'ARE': {'KLP': ['EST6', 'POL', 'WLR4']},
'DOING': {'TIS': ['OIL8']},
'GREAT': {'POL': ['EOL6']},
'WORK': {'KOE': ['RIW8', 'KOE', 'PNG4', 'ROE', 'ERC8', 'ROE', 'WQD6']},
'KEEP': {'PAR': ['KOM8', 'PAR', 'RTW6', 'PIL', 'XCE4', 'PIL', 'ACE8']},
'ROCKING': {'OUL': ['AZS6', 'OUL', 'RVX8']}}
Expected output :
{{'YOU':{'HE':{'EST':8,'OLM':6},'SLO':{'WLR':8}}},
{'ARE':{'KLP':{'EST':6},'POL':{'WLR':4}}}, and so on}
The double setdefault is for the 2 nested dicts.
The last part is for splitting the digit. If it's more than a single digit, you should use a regex.
di = {}
for i, j, k in ls3:
    # The first setdefault creates/fetches the dict for i, the second the dict
    # for j inside it. k[:-1] is the key and the single trailing character of k
    # is stored as an int.
    di.setdefault(i, {}).setdefault(j, {})[k[:-1]] = int(k[-1])
In a pandas column I have lists of POS tags stored as strings. I concluded they must be strings because print(dataset['text_posTagged'][0][0]) prints [.
dataset['text_posTagged']
['VBP', 'JJ', 'NNS', 'VBP', 'JJ', 'IN', 'PRP', 'VBP', 'TO', 'VB', 'PRP', 'RB', 'VBZ', 'DT', 'JJ', 'PRP$', 'NN', 'NN', 'NN', 'NN', 'VBZ', 'JJ']
['UH', 'DT', 'VB', 'VB', 'PRP$', 'NN', 'TO', 'JJ', 'IN', 'PRP', 'MD', 'VB', 'DT', 'VBZ', 'DT', 'NN', 'NN']
['NN', 'VBD', 'NN', 'NN', 'NN', 'DT', 'IN', 'IN', 'NN', 'IN', 'NN', 'NN', 'VBD', 'IN', 'JJ', 'NN', 'NN']
To convert this to an actual list I used the following.
# Parse each cell's string back into a Python object (raises ValueError on NaN cells).
dataset['text_posTagged'] = dataset.text_posTagged.apply(lambda x: literal_eval(x))
However, this gives ValueError: malformed node or string: nan
When I applied the same in a column that has list of words, it works fine.
dataset['text']
['are', 'red', 'violets', 'are', 'blue', 'if', 'you', 'want', 'to', 'buy', 'us', 'here', 'is', 'a', 'clue', 'our', 'eye', 'amp', 'cheek', 'palette', 'is', 'al']
['is', 'it', 'too', 'late', 'now', 'to', 'say', 'sorry']
['our', 'amazonian', 'clay', 'full', 'coverage', 'foundation', 'comes', 'in', '40', 'shades', 'of', 'creamy', 'goodness']
The following prints are
# Parse each cell's string back into a Python list, then show the first word of the first row.
dataset['text'] = dataset.text.apply(lambda x: literal_eval(x))
print(dataset['text'][0][0])
What is wrong with applying literal_eval on list of POS tags? How to do it properly?
Parse only non-null rows. You can drop the lambda.
# Boolean mask of the rows that hold a value; the NaN rows are what triggered
# "ValueError: malformed node or string: nan".
m = dataset['text_posTagged'].notna()
# Parse only the masked rows and write the parsed objects back in place.
dataset.loc[m, 'text_posTagged'] = (
dataset.loc[m, 'text_posTagged'].apply(literal_eval))
If you have 100 rows or fewer, you can also use pd.eval:
dataset.loc[m, 'text_posTagged'] = pd.eval(dataset.loc[m, 'text_posTagged'])
I am performing topic modelling and using functions to get the top keywords in the topic models as shown below.
def getTopKWords(self, K):
    """Print the top K discriminative words for every topic.

    For each topic t these are the words v for which p(v|t) is maximum.
    ``self.n_vt`` is copied and each of its columns is divided by the column
    sum; rows are then ranked per column and mapped to words through
    ``self.vectorizer.get_feature_names()``.

    Args:
        K: Number of words to report per topic.

    Returns:
        ``results``, which is still an empty list: the per-topic words are
        only printed, not stored.
    """
    results = []
    # Work on a copy so that self.n_vt itself is left untouched.
    pseudocounts = np.copy(self.n_vt)
    normalizer = np.sum(pseudocounts, (0))
    pseudocounts /= normalizer[np.newaxis, :]
    # The vocabulary does not depend on the topic, so fetch it once.
    vocab = self.vectorizer.get_feature_names()
    for t in range(self.numTopics):
        # argsort is ascending; the reversed slice takes the K largest entries,
        # highest first.
        topWordIndices = pseudocounts[:, t].argsort()[-1:-(K + 1):-1]
        print(t, [vocab[i] for i in topWordIndices])
        ## Code for storing the values in a single list
    return results
The above function gives me the output shown below:
0 ['computer', 'laptop', 'mac', 'use', 'bought', 'like', 'warranty', 'screen', 'way', 'just']
1 ['laptop', 'computer', 'use', 'just', 'like', 'time', 'great', 'windows', 'macbook', 'months']
2 ['computer', 'great', 'laptop', 'mac', 'buy', 'just', 'macbook', 'use', 'pro', 'windows']
3 ['laptop', 'computer', 'great', 'time', 'battery', 'use', 'apple', 'love', 'just', 'work']
This output comes from the 4 iterations of the loop, each of which prints the index and all the keywords in the corresponding vocab list.
Now, I want to return a single list from the function which returns me the following output.
return [keyword1, keyword2, keyword3, keyword4]
where keyword1/2/3/4 are the words that occur most often across the vocab lists with index 0, 1, 2, 3 in the output.
You can use collections.Counter:
from collections import Counter
a = ['computer', 'laptop', 'mac', 'use', 'bought', 'like',
'warranty', 'screen', 'way', 'just']
b = ['laptop', 'computer', 'use', 'just', 'like', 'time',
'great', 'windows', 'macbook', 'months']
c = ['computer', 'great', 'laptop', 'mac', 'buy', 'just',
'macbook', 'use', 'pro', 'windows']
d = ['laptop', 'computer', 'great', 'time', 'battery', 'use',
'apple', 'love', 'just', 'work']
def get_most_common(*iterables):
    """Count the elements of all given iterables in one Counter.

    Args:
        *iterables: Any number of iterables, passed positionally; each one
            is fed into the same Counter.

    Returns:
        The Counter instance holding the combined counts (empty when no
        iterable is passed).
    """
    counts = Counter()
    for items in iterables:
        counts.update(items)
    return counts
# Rank every keyword by how often it occurs across the four lists.
mc = get_most_common(a, b, c, d).most_common()
# Keep just the keywords of the four highest-ranked entries.
top4 = [keyword for keyword, _ in mc[:4]]
print(top4)
Output:
['computer', 'laptop', 'use', 'just']
some_results = [] # store stuff
for t in range(self.numTopics):
topWordIndices = pseudocounts[:, t].argsort()[-1:-(K+1):-1]
vocab = self.vectorizer.get_feature_names()
print (t, [vocab[i] for i in topWordIndices])
some_results.append( [vocab[i] for i in topWordIndices] )
mc = get_most_common(*some_results).most_common()
return [k for k,v in mc[0:4]]
I have some "big" lists, some of which fit inside others in the same list of "big" lists, and I would like to nest the sublists into one another to get the one list containing all the others. The list is sorted by the length of the items first.
lists = [['Linden'], ['Linden', 'N.J.'], ['chemical', 'plants'], ['some', 'federal', 'officials'], ['the', 'other', 'side', 'of', 'the', 'country'], ['the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['an', 'industrial', 'corridor', 'of', 'chemical', 'plants', 'that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['At', 'the', 'other', 'side', 'of', 'the', 'country', 'Linden', 'N.J.', 'is', 'part', 'of', 'an', 'industrial', 'corridor', 'of', 'chemical', 'plants', 'and', 'oil', 'refineries', 'that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America']]
and the result should be:
['At', ['the', 'other', 'side', 'of', 'the', 'country'], [['Linden'], 'N.J.'], 'is', 'part', 'of', ['an', 'industrial', 'corridor', 'of', ['chemical', 'plants'], 'and', 'oil', 'refineries', 'that', ['some', 'federal', 'officials'], 'refer', 'to', 'as', ['the', 'most', 'dangerous', 'two', 'miles', 'in', 'America']]]
How to solve that?
I found my own solution after some thwarted attempts to ask this on Stack Overflow. Here it is:
If you have ideas for optimizing it, I would be happy to hear them.
I would like to use some iterator instead of the recursive function, like here (iterate python nested lists efficiently), but in my attempt at that, the nesting of the list was flattened.
import collections
import collections.abc

from more_itertools import replace
nestlist = [['Linden'],['Linden', 'N.J.'], ['chemical', 'plants'], ['some', 'federal', 'officials'], ['the', 'other', 'side', 'of', 'the', 'country'], ['the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['an', 'industrial', 'corridor', 'of', 'chemical', 'plants', 'that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America'], ['At', 'the', 'other', 'side', 'of', 'the', 'country', 'Linden', 'N.J.', 'is', 'part', 'of', 'an', 'industrial', 'corridor', 'of', 'chemical', 'plants', 'and', 'oil', 'refineries', 'that', 'some', 'federal', 'officials', 'refer', 'to', 'as', 'the', 'most', 'dangerous', 'two', 'miles', 'in', 'America']]
#nestlist = [['Linden', 'N.J.'], ['chemical', 'plants'], ['country', 'Linden', 'N.J.', 'of', 'chemical', 'plants']]
#nestlist = [['Linden', 'N.J.'], ['Linden', 'N.J.', 'of'], ['country', 'Linden', 'N.J.', 'of', 'chemical', 'plants']]
def flatten(iterable):
    """Yield the leaf elements of an arbitrarily nested iterable, depth-first.

    Strings are treated as leaves and are yielded whole rather than being
    split into characters.

    Args:
        iterable: The (possibly nested) iterable to walk.

    Yields:
        Every element that is a string or is not iterable, in order.
    """
    for el in iterable:
        # Use collections.abc.Iterable: the collections.Iterable alias was
        # removed in Python 3.10.
        if isinstance(el, collections.abc.Iterable) and not isinstance(el, str):
            yield from flatten(el)
        else:
            yield el
def on_each_iterable(lol, fun):
    """Apply ``fun`` to every nested iterable of ``lol``, innermost first.

    Args:
        lol: A (possibly nested) iterable, or a leaf value.
        fun: Callable that receives a list of already-processed children and
            returns the replacement for that level.

    Returns:
        ``lol`` unchanged when it is a string or not iterable; otherwise the
        result of ``fun`` applied to the list of its processed elements.
    """
    # Use collections.abc.Iterable: the collections.Iterable alias was removed
    # in Python 3.10.
    if isinstance(lol, collections.abc.Iterable) and not isinstance(lol, str):
        # Children are transformed first, so fun always sees processed items.
        return fun([on_each_iterable(x, fun) for x in lol])
    return lol
def stack_matrjoshka(nesting_list):
    """Nest the shorter lists of ``nesting_list`` inside the longer ones.

    The lists are sorted by length. Each list in turn is then searched for in
    every list that follows it; wherever a window flattens to the same items,
    that window is replaced by the shorter list itself as one nested element.

    Args:
        nesting_list: A list of lists (not modified; a sorted copy is used).

    Returns:
        The last list of the sorted copy after all replacements.
    """
    nesting_list = sorted(nesting_list, key=len)
    # NOTE(review): the bound leaves out the second-to-last list as well as the
    # last one, so the second-to-last list is never nested into the last one;
    # kept exactly as in the original. Confirm whether this is intended.
    for n in range(len(nesting_list) - 2):
        to_fit_there = nesting_list[n]
        # Compare on flattened content, because to_fit_there may already hold
        # nested lists from earlier rounds.
        flatted_to_fit_there = list(flatten(to_fit_there))

        def is_fitting(*xs):
            """Tell whether the window ``xs`` flattens to the list being fitted."""
            return list(flatten(xs)) == flatted_to_fit_there

        def replacing_fun(x):
            """Replace each matching window of ``x`` by the nested list."""
            return list(replace(list(x), is_fitting, [to_fit_there],
                                window_size=len(to_fit_there)))

        for m in range(n + 1, len(nesting_list)):
            through = list(nesting_list[m])
            nesting_list[m] = on_each_iterable(through, replacing_fun)
    return nesting_list[-1]
nested_list = stack_matrjoshka(nestlist)
print (nested_list)
makes
['At', ['the', 'other', 'side', 'of', 'the', 'country'], [['Linden'], 'N.J.'], 'is', 'part', 'of', 'an', 'industrial', 'corridor', 'of', ['chemical', 'plants'], 'and', 'oil', 'refineries', ['that', ['some', 'federal', 'officials'], 'refer', 'to', 'as', ['the', 'most', 'dangerous', 'two', 'miles', 'in', 'America']]]