How do I solve this: I have a table of letters, laid out as a grid with one letter at each grid point.
(0,0)(0,1)(0,2)(0,3) o l n c
(1,0)(1,1)(1,2)(1,3) e t e a
(2,0)(2,1)(2,2)(2,3) i b t a
(3,0)(3,1)(3,2)(3,3) o m m f
I am trying to find all possible combinations going through the grid creating lines of 3,4,5 length.
ie: PossibleSolutions = [[(0,0),(0,1),(0,2)],[(0,0),(1,1),(2,2)],[(0,0),(1,0),(2,0)]]
each of these representing:[[o,l,n],[o,t,t],[o,e,i]]
All possible combinations but keeping within the grid layout.
from itertools import combinations
def PossibleWords(possible, board):
    """Map a path of grid coordinates to the letters stored at them.

    Parameters
    ----------
    possible : sequence of (row, col) tuples, e.g. [(0, 0), (0, 1)]
    board : dict mapping (row, col) -> single-letter string

    Returns
    -------
    list of the letters along the path, in path order.
    """
    # A comprehension replaces the index-based append loop.
    return [board[coordinate] for coordinate in possible]
def Combinations(board):
    """Return every 3-letter string formed from 3 distinct board cells.

    Coordinates are taken from the board itself (in key insertion order)
    instead of a hard-coded 4x4 list, so any grid size works; for the
    original 4x4 board the output is identical. The dead `coord` variable
    and the redundant form -> temp copy loops are removed.

    NOTE(review): itertools.combinations ignores grid adjacency — paths are
    not restricted to straight or contiguous lines. Enforcing "lines within
    the grid" needs an extra adjacency/collinearity filter.

    Parameters
    ----------
    board : dict mapping (row, col) -> single-letter string

    Returns
    -------
    list of strings, one per 3-cell combination, in combinations() order.
    """
    coords = list(board.keys())
    return [''.join(board[cell] for cell in path)
            for path in combinations(coords, 3)]
output = ['ole', 'ole', 'one', 'one', 'oaf', 'oaf', 'let', 'lee', 'lei', 'let', 'lei', 'let', 'lab', 'lam', 'lam', 'lit', 'lam', 'lam', 'net', 'nee', 'net', 'net', 'nam', 'nam', 'nam', 'nam', 'cee', 'cab', 'cat', 'cam', 'cam', 'cam', 'cam', 'eta', 'eta', 'eat', 'eta', 'tea', 'tet', 'tea', 'tab', 'tat', 'tit', 'tom', 'tom', 'eat', 'eta', 'aim', 'aim', 'bam', 'bam', 'tom', 'tom']
I've tried combinations() but since my grid is in a list it doesn't follow the grid boundaries. Any guidance would be helpful, thank you.
Related
I have an array called "data" which contains the following information.
[['amazon',
'phone',
'serious',
'mind',
'blown',
'serious',
'enjoy',
'use',
'applic',
'full',
'blown',
'websit',
'allow',
'quick',
'track',
'packag',
'descript',
'say'],
['would',
'say',
'app',
'real',
'thing',
'show',
'ghost',
'said',
'quot',
'orang',
'quot',
'ware',
'orang',
'cloth',
'app',
'adiquit',
'would',
'recsmend',
'want',
'talk',
'ghost'],
['love',
'play',
'backgammonthi',
'game',
'offer',
'varieti',
'difficulti',
'make',
'perfect',
'beginn',
'season',
'player'],
The case is that I would like to save, in a list, the values that appear at least 1% of the time in this array.
The closest approximation I have found is the following but it does not return what I need. Any ideas?
import numpy as np  # fix: np was used below but never imported
import numpy_indexed as npi

# One group id per sub-list so every token is labelled with its row.
idx = [np.ones(len(a)) * i for i, a in enumerate(tokens_list_train)]
# Cross-tabulate row id x token into a contingency table.
(rows, cols), table = npi.count_table(np.concatenate(idx), np.concatenate(tokens_list_train))
# Convert counts to per-row fractions.
table = table / table.sum(axis=1, keepdims=True)
print(table * 100)  # fix: stray trailing backtick removed
Let's see: we can remove the nesting with itertools.chain.from_iterable, but we also need the total length, which we can compute with another generator to avoid looping twice, and we can count the repetitions with a Counter.
from collections import Counter
from itertools import chain

# Flatten the nested list once and count every word.
counts = Counter(chain.from_iterable(my_list))
# Total word count equals the sum of all counts, so there is no need for
# the original global-mutating generator (which was only correct if the
# generator happened to be fully consumed).
total_length = sum(counts.values())
# Keep words that make up at least 1% of all words.
items = [item for item in counts if counts[item] / total_length >= 0.01]
print(items)
['amazon', 'phone', 'serious', 'mind', 'blown', 'enjoy', 'use', 'applic', 'full', 'websit', 'allow', 'quick', 'track', 'packag', 'descript', 'say', 'would', 'app', 'real', 'thing', 'show', 'ghost', 'said', 'quot', 'orang', 'ware', 'cloth', 'adiquit', 'recsmend', 'want', 'talk', 'love', 'play', 'backgammonthi', 'game', 'offer', 'varieti', 'difficulti', 'make', 'perfect', 'beginn', 'season', 'player']
Here's another way to generate a list of elements that appear 1% or more of the time, using pandas.DataFrame:
import numpy as np
import pandas as pd
# == Define `flatten` function to combine objects with multi-level nesting =======
def flatten(iterable, base_type=None, levels=None):
    """Flatten an iterable with multiple levels of nesting.

    >>> iterable = [(1, 2), ([3, 4], [[5], [6]])]
    >>> list(flatten(iterable))
    [1, 2, 3, 4, 5, 6]

    Binary and text strings are never collapsed. Pass *base_type* to keep
    additional types intact:

    >>> list(flatten(['ab', ('cd', 'ef'), ['gh', 'ij']], base_type=tuple))
    ['ab', ('cd', 'ef'), 'gh', 'ij']

    Pass *levels* to stop flattening after that many levels:

    >>> list(flatten([('a', ['b']), ('c', ['d'])], levels=1))
    ['a', ['b'], 'c', ['d']]
    """
    def _descend(node, depth):
        # A node is emitted as-is when we are past the requested depth or
        # when it is an "atomic" value (strings/bytes, or a base_type).
        too_deep = levels is not None and depth > levels
        atomic = isinstance(node, (str, bytes)) or (
            base_type is not None and isinstance(node, base_type)
        )
        if too_deep or atomic:
            yield node
            return
        try:
            children = iter(node)
        except TypeError:
            # Not iterable at all: emit the value itself.
            yield node
        else:
            for child in children:
                yield from _descend(child, depth + 1)

    yield from _descend(iterable, 0)
# == Problem Solution ==========================================================
# Flatten `array` into a single-level list of elements and wrap it in a Series.
flat_series = pd.Series(list(flatten(array)))
# Total number of elements in the flattened list.
total_elements = len(flat_series)
# Relative frequency of every distinct element.
frequencies = flat_series.value_counts() / total_elements
# Boolean-mask selection — equivalent to `.loc[lambda value: value >= 0.01]`:
# keep only elements appearing at least 1% of the time, as a plain list.
elements = frequencies[frequencies >= 0.01].index.to_list()
print(elements)
['would', 'serious', 'blown', 'quot', 'orang', 'app', 'ghost', 'say', 'use', 'adiquit', 'enjoy', 'said', 'cloth', 'thing', 'applic', 'talk', 'player', 'track', 'recsmend', 'beginn', 'packag', 'allow', 'perfect', 'want', 'real', 'love', 'full', 'show', 'play', 'make', 'backgammonthi', 'mind', 'amazon', 'game', 'difficulti', 'offer', 'descript', 'websit', 'quick', 'season', 'phone', 'variety', 'ware']
I'm trying to make a function to see if words appear within a certain distance of one another, my code is as follows:
# Corpus: each sub-list is one document's words, in document order.
file_cont = [['man', 'once', 'upon', 'time', 'love',
'princess'], ['python', 'code', 'cool', 'uses', 'java'],
['man', 'help', 'test', 'weird', 'love']] #words I want to measure 'distance' between
# One dict per document mapping word -> its position in that document.
# NOTE(review): if a word repeats, later positions overwrite earlier ones,
# so only the last index survives — this is what breaks on larger corpora.
dat = [{ind: val for val, ind in enumerate(el)} for el in file_cont]
def myfunc(w1, w2, dist, dat):
    """Collect, per document, the word span from w1 up to w2 when their
    recorded indices are at most `dist` apart (w2 at or after w1).

    `dat` is a list of dicts mapping word -> index; documents missing
    either word are skipped.
    """
    matches = []
    for mapping in dat:
        start = mapping.get(w1)
        end = mapping.get(w2)
        # Guard clause: both words must be present and close enough.
        if start is None or end is None or end - start > dist:
            continue
        matches.append(list(mapping.keys())[start:end + 1])
    return matches
It works in this instance,
myfunc("man", "love",4, dat) returns [['man', 'once', 'upon', 'time', 'love'],
['man', 'help', 'test', 'weird', 'love']] which is what I want
The problem I have is when I use a much bigger dataset (the elements of file_cont becomes thousands of words), it outputs odd results
For example I know the words 'jon' and 'snow' appear together in at least one instance in one of the elements of file_cont
When I do myfunc('jon','snow',6,dat) I get:
[[], [], ['castle', 'ward'], [], [], []]
something completely out of context, it doesn't mention 'jon' or 'snow'
What is the problem here and how would I go about fixing it?
The problem comes from the fact that your text may contain multiple occurrences of the same word, which you typically observe with larger excerpts.
Here's a minimal working example showing how the function may fail:
# Minimal reproduction: a repeated word collapses to a single dict key.
new_file = [['man', 'once', 'man', 'time', 'love', 'once']]
data = [{ind: val for val, ind in enumerate(el)} for el in new_file]


def myfunc(w1, w2, dist, dat):
    """Return, per mapping, the key slice from w1's index to w2's index
    when both words are present and w2's index is within `dist` of w1's."""
    out = []
    for mapping in dat:
        first, second = mapping.get(w1), mapping.get(w2)
        if first is not None and second is not None and second - first <= dist:
            out.append(list(mapping)[first:second + 1])
    return out


myfunc("man", "love", 4, data)
# > [['time', 'love']]
Notice that here, your dictionary will look like this:
# > [{'man': 2, 'once': 5, 'time': 3, 'love': 4}]
This is because, when creating the dictionary, each new occurrence of a word replaces its key in the dictionary with the newly observed (higher) index. Thus, the function myfunc fails because the keys in the dictionary no longer correspond to the indices of the words in the excerpt.
A way to achieve what you want to do could be (for instance):
data = ['man', 'once', 'upon', 'man', 'time', 'love', 'princess', 'man']
w1 = 'man'
w2 = 'love'
dist = 3


def new_func(w1, w2, dist, data):
    """Print and return every span of `data` bounded by w1 and w2 that are
    fewer than `dist` positions apart (in either order).

    Unlike the dict-based approach, every occurrence of each word is
    compared, so repeated words are handled correctly.

    Returns
    -------
    list of word-list spans (previously the function only printed and
    returned None, so callers collecting results got nothing).
    """
    spans = []
    w1_indices = [i for i, x in enumerate(data) if x == w1]
    w2_indices = [i for i, x in enumerate(data) if x == w2]
    for i in w1_indices:
        for j in w2_indices:
            if abs(i - j) < dist:
                span = data[min(i, j):max(i, j) + 1]
                print(span)  # same printed output as before
                spans.append(span)
    return spans


new_func(w1, w2, dist, data)
# > ['man', 'time', 'love']
# > ['love', 'princess', 'man']
With a list of lists like in your case, you can do:
# Run new_func over every document in the corpus.
file_cont = [['man', 'once', 'upon', 'time', 'love', 'princess'], ['python', 'code', 'cool', 'uses', 'java'],
['man', 'help', 'test', 'weird', 'love']]
# NOTE(review): the two span lines shown below come from the print calls
# inside new_func; `results` itself only collects the function's return
# values, so unless new_func is changed to return its matches it will be
# a list of Nones.
results = [new_func(w1, w2, dist, x) for x in file_cont]
print(results)
# > ['man', 'once', 'upon', 'time', 'love']
# > ['man', 'help', 'test', 'weird', 'love']
Is there a way to rank a multiple string sets in descending order in Python by their length and if two or more lists are tied, rank the lists alphabetically?
Let's say I am given the following lists:
rankings = []
general = {'hello', 'how are you', 'good', 'thanks'}
fun = {'lowkey', 'jello', 'karaoke', 'stardown', 'hilarious', 'highkey', 'drunk', 'sports'}
subjects = {'math', 'science', 'english', 'french', 'history'}
cities = {'Rome', 'NYC', 'London', 'Toronto'}
animals = {'bird', 'elephant', 'mouse', 'dog', 'cat'}
foods = {'banana', 'fish', 'meat'}
My expected output is:
rankings = ["fun", "animals", "subjects", "cities", "general", "foods"]
Any ideas on the basic code behind this?
Thanks.
You will need to use string literals for each identifier, in order to sort by identifiers as a secondary "tie-breaker".
Please refer to the "Key Functions" section in the Python docs for sorting.
# Name every set so the identifier can serve as the alphabetical tie-breaker.
all_sets = {'general': {'hello', 'how are you', 'good', 'thanks'},
            'fun': {'lowkey', 'jello', 'karaoke', 'stardown', 'hilarious', 'highkey', 'drunk', 'sports'},
            'subjects': {'math', 'science', 'english', 'french', 'history'},
            'cities': {'Rome', 'NYC', 'London', 'Toronto'},
            'animals': {'bird', 'elephant', 'mouse', 'dog', 'cat'},
            'foods': {'banana', 'fish', 'meat'}}

items = list(all_sets.items())


def sorter(dict_item):
    """Sort key: larger sets first; equal sizes fall back to name A-to-Z."""
    name, members = dict_item
    return (-len(members), name)


sorted_items = sorted(items, key=sorter)
# Keep only the identifiers, in ranked order.
sorted_keys = [name for name, _ in sorted_items]
The value of sorted_keys resulting:
['fun', 'animals', 'subjects', 'cities', 'general', 'foods']
I need to find the starting index of the specific sequences (sequence of strings) in the list in python.
For ex.
list = ['In', 'a', 'gesture', 'sure', 'to', 'rattle', 'the', 'Chinese', 'Government', ',', 'Steven', 'Spielberg', 'pulled', 'out', 'of', 'the', 'Beijing', 'Olympics', 'to', 'protest', 'against', 'China', '_s', 'backing', 'for', 'Sudan', '_s', 'policy', 'in', 'Darfur', '.']
ex.
seq0 = "Steven Spielberg"
seq1 = "the Chinese Government"
seq2 = "the Beijing Olympics"
The output should be like :
10
6
15
You could simply iterate over list of your words and check at every index if following words match any of your sequences.
# fix: the original list literal ended with a stray line-continuation
# backslash, which spliced the next statement onto the same logical line.
words = ['In', 'a', 'gesture', 'sure', 'to', 'rattle', 'the', 'Chinese', 'Government', ',',
         'Steven', 'Spielberg', 'pulled', 'out', 'of', 'the', 'Beijing', 'Olympics', 'to',
         'protest', 'against', 'China', '_s', 'backing', 'for', 'Sudan', '_s', 'policy',
         'in', 'Darfur', '.']
seq0 = "Steven Spielberg"
seq1 = "the Chinese Government"
seq2 = "the Beijing Olympics"
# Map names like 'seq0' to their word lists.
sequences = {'seq{}'.format(n): s.split() for n, s in enumerate([seq0, seq1, seq2])}
for idx in range(len(words)):
    for name, seq_words in sequences.items():
        # Slicing past the end just yields a short list that fails the
        # comparison, so no bounds guard is needed. (The original guard
        # `idx + len(v) < len(words)` also dropped matches ending exactly
        # at the last word — an off-by-one.)
        if words[idx:idx + len(seq_words)] == seq_words:
            print(name, idx)
Output:
seq1 6
seq0 10
seq2 15
You can do something like:
def find_sequence(seq, _list):
    """Return the start index of the first occurrence of the word sequence
    `seq` (a whitespace-separated string) in the word list `_list`.

    Returns -1 when the sequence does not occur.
    """
    seq_list = seq.split()
    # fix: the body referenced `list_`, which does not exist — the
    # parameter is `_list` (the original raised NameError unless a global
    # `list_` happened to be defined).
    all_occurrence = [
        idx
        for idx in (i for i, x in enumerate(_list) if x == seq_list[0])
        if seq_list == _list[idx:idx + len(seq_list)]
    ]
    return -1 if not all_occurrence else all_occurrence[0]
Output:
for seq in [seq0, seq1, seq2]:
print(find_sequence(seq, list_))
10
6
15
Note, if the sequence is not found you will get -1.
I am performing topic modelling and using functions to get the top keywords in the topic models as shown below.
def getTopKWords(self, K):
    """Return the top-K discriminative words for each topic, i.e. the
    words v for which p(v|t) is largest.

    p(v|t) is estimated by column-normalising the pseudo-count matrix
    self.n_vt (vocab x topics).

    Parameters
    ----------
    K : number of words to keep per topic.

    Returns
    -------
    list of self.numTopics lists, each holding that topic's top-K words
    in descending order of p(v|t). (Previously `results` was returned
    empty — the "storing the values" step was left as a TODO.)
    """
    # The original docstring sat after `results = []`, so it was a bare
    # string statement, not a docstring; it is now first. Unused locals
    # `index` and `key_terms` are removed.
    results = []
    pseudocounts = np.copy(self.n_vt)
    # Normalise each column so entries are p(v|t).
    normalizer = np.sum(pseudocounts, (0))
    pseudocounts /= normalizer[np.newaxis, :]
    # The vocabulary does not change per topic; fetch it once.
    vocab = self.vectorizer.get_feature_names()
    for t in range(self.numTopics):
        # Indices of the K largest probabilities, highest first.
        topWordIndices = pseudocounts[:, t].argsort()[-1:-(K + 1):-1]
        topic_words = [vocab[i] for i in topWordIndices]
        print(t, topic_words)
        results.append(topic_words)
    return results
The above function gives me the output shown below.
0 ['computer', 'laptop', 'mac', 'use', 'bought', 'like', 'warranty', 'screen', 'way', 'just']
1 ['laptop', 'computer', 'use', 'just', 'like', 'time', 'great', 'windows', 'macbook', 'months']
2 ['computer', 'great', 'laptop', 'mac', 'buy', 'just', 'macbook', 'use', 'pro', 'windows']
3 ['laptop', 'computer', 'great', 'time', 'battery', 'use', 'apple', 'love', 'just', 'work']
This results from the loop running 4 times, printing the index and all keywords in each vocab list.
Now, I want to return a single list from the function which returns me the following output.
return [keyword1, keyword2, keyword3, keyword4]
where keyword1/2/3/4 are the words occurring most often across the vocab lists with indices 0, 1, 2, 3 in the output.
You can use collection.Counter:
from collections import Counter

a = ['computer', 'laptop', 'mac', 'use', 'bought', 'like',
     'warranty', 'screen', 'way', 'just']
b = ['laptop', 'computer', 'use', 'just', 'like', 'time',
     'great', 'windows', 'macbook', 'months']
c = ['computer', 'great', 'laptop', 'mac', 'buy', 'just',
     'macbook', 'use', 'pro', 'windows']
d = ['laptop', 'computer', 'great', 'time', 'battery', 'use',
     'apple', 'love', 'just', 'work']


def get_most_common(*word_lists):
    """Fold any number of iterables into one Counter and return it."""
    tally = Counter()
    for words in word_lists:
        tally.update(words)
    return tally


# Merge the four keyword lists and rank words by total frequency.
mc = get_most_common(a, b, c, d).most_common()
# Keep only the four most frequent words.
top4 = [word for word, _count in mc[:4]]
print(top4)
Output:
['computer', 'laptop', 'use', 'just']
# Patch fragment for getTopKWords: accumulate each topic's top-K word list,
# then reduce them with get_most_common() (defined above) to the 4 words
# that are most frequent across all topics.
# NOTE(review): this is meant to replace the loop inside the method — the
# trailing `return` only makes sense in that context.
some_results = [] # store stuff
for t in range(self.numTopics):
topWordIndices = pseudocounts[:, t].argsort()[-1:-(K+1):-1]
vocab = self.vectorizer.get_feature_names()
print (t, [vocab[i] for i in topWordIndices])
# keep this topic's word list for the final tally
some_results.append( [vocab[i] for i in topWordIndices] )
# flatten all topic lists into one Counter; keep the 4 most common words
mc = get_most_common(*some_results).most_common()
return [k for k,v in mc[0:4]]