I'm trying to make a function to see if words appear within a certain distance of one another, my code is as follows:
# Sample corpus: each inner list is one tokenised document whose word
# distances we want to measure.
file_cont = [
    ['man', 'once', 'upon', 'time', 'love', 'princess'],
    ['python', 'code', 'cool', 'uses', 'java'],
    ['man', 'help', 'test', 'weird', 'love'],
]
# One word -> position mapping per document.
dat = []
for doc in file_cont:
    dat.append({word: pos for pos, word in enumerate(doc)})
def myfunc(w1, w2, dist, dat):
# Return, for each word->index mapping in dat, the run of words from w1
# through w2 when both occur and are at most `dist` positions apart.
# NOTE(review): this is the question's buggy code, kept as-is on purpose.
# It relies on every word appearing at most once per document: the dict
# comprehension that builds `dat` keeps only the LAST index of a repeated
# word, which is exactly the failure analysed in the answer below.
arr = []
for x in dat:
# dict.get returns None when the word is absent from this document
i1 = x.get(w1)
i2 = x.get(w2)
# only matches w1 occurring BEFORE w2 (the difference is not abs()-ed)
if (i1 is not None) and (i2 is not None) and (i2 - i1 <= dist ):
# assumes dict insertion order mirrors word order (Python 3.7+)
arr.append(list(x.keys())[i1:i2+1])
return arr
It works in this instance,
myfunc("man", "love",4, dat) returns [['man', 'once', 'upon', 'time', 'love'],
['man', 'help', 'test', 'weird', 'love']] which is what I want
The problem I have is when I use a much bigger dataset (the elements of file_cont becomes thousands of words), it outputs odd results
For example I know the words 'jon' and 'snow' appear together in at least one instance in one of the elements of file_cont
When I do myfunc('jon','snow',6,dat) I get:
[[], [], ['castle', 'ward'], [], [], []]
something completely out of context, it doesn't mention 'jon' or 'snow'
What is the problem here and how would I go about fixing it?
The problem comes from the fact that your text may contain multiple occurrences of the same word, which you typically observe with larger excerpts.
Here's a minimal working example showing how the function may fail:
# A document with repeated words ('man', 'once') to expose the bug.
new_file = [['man', 'once', 'man', 'time', 'love', 'once']]
# dict() keeps only the LAST value for a duplicated key, so each repeated
# word ends up mapped to its final position.
data = [dict(zip(el, range(len(el)))) for el in new_file]
def myfunc(w1, w2, dist, dat):
# Same function as in the question, repeated to show the failure: return
# the span from w1 through w2 for each mapping where both words exist and
# are within `dist` positions.
# NOTE(review): fails when words repeat, because the mapping in `data`
# keeps only each word's last index (see the explanation below).
arr = []
for x in dat:
i1 = x.get(w1)
i2 = x.get(w2)
# i1/i2 are None when the word is missing; only w1-before-w2 matches
if (i1 is not None) and (i2 is not None) and (i2 - i1 <= dist ):
# slices the key list, assuming keys are still in word order
arr.append(list(x.keys())[i1:i2+1])
return arr
myfunc("man", "love", 4, data)
# > [['time', 'love']]
Notice that here, your dictionary will look like this:
# > [{'man': 2, 'once': 5, 'time': 3, 'love': 4}]
This is because, when creating the dictionary, each new occurrence of a word replaces its key's value in the dictionary with the newly observed (higher) index. Thus, the function myfunc fails, as the keys in the dictionary no longer correspond to the indices of the words in the excerpt.
A way to achieve what you want to do could be (for instance):
# Query set-up: find 'man' and 'love' within 3 positions of each other.
data = ['man', 'once', 'upon', 'man', 'time', 'love', 'princess', 'man']
w1, w2, dist = 'man', 'love', 3
def new_func(w1, w2, dist, data):
    """Print every span of `data` where w1 and w2 occur strictly closer than `dist`."""
    # Collect every position of each query word (handles repeated words).
    positions = {w: [i for i, word in enumerate(data) if word == w] for w in (w1, w2)}
    for i in positions[w1]:
        for j in positions[w2]:
            if abs(i - j) < dist:
                # slice from the earlier to the later occurrence, inclusive
                print(data[min(i, j):max(i, j) + 1])
new_func(w1, w2, dist, data)
# > ['man', 'time', 'love']
# > ['love', 'princess', 'man']
With a list of lists like in your case, you can do:
file_cont = [['man', 'once', 'upon', 'time', 'love', 'princess'], ['python', 'code', 'cool', 'uses', 'java'],
             ['man', 'help', 'test', 'weird', 'love']]
# new_func prints its matches and returns None, so collecting its return
# values in a list comprehension would only produce [None, None, None].
# Also note 'man' and 'love' are 4 positions apart in these lists and
# new_func uses a strict '<', so dist must be 5 to reproduce the output.
for sub_list in file_cont:
    new_func(w1, w2, 5, sub_list)
# > ['man', 'once', 'upon', 'time', 'love']
# > ['man', 'help', 'test', 'weird', 'love']
Related
I have an array called "data" which contains the following information.
[['amazon',
'phone',
'serious',
'mind',
'blown',
'serious',
'enjoy',
'use',
'applic',
'full',
'blown',
'websit',
'allow',
'quick',
'track',
'packag',
'descript',
'say'],
['would',
'say',
'app',
'real',
'thing',
'show',
'ghost',
'said',
'quot',
'orang',
'quot',
'ware',
'orang',
'cloth',
'app',
'adiquit',
'would',
'recsmend',
'want',
'talk',
'ghost'],
['love',
'play',
'backgammonthi',
'game',
'offer',
'varieti',
'difficulti',
'make',
'perfect',
'beginn',
'season',
  'player']]
The case is that I would like to save in a list, the values that appear at least 1% in this array.
The closest approximation I have found is the following but it does not return what I need. Any ideas?
import numpy_indexed as npi
idx = [np.ones(len(a))*i for i, a in enumerate(tokens_list_train)]
(rows, cols), table = npi.count_table(np.concatenate(idx), np.concatenate(tokens_list_train))
table = table / table.sum(axis=1, keepdims=True)
print(table * 100)
Let's see: we can remove the nesting using itertools.chain.from_iterable, but we also need the total length, which we can compute with another generator to avoid looping twice, and we need to count the repetitions, which is done by a Counter.
from collections import Counter
from itertools import chain

# Count every word across all sub-lists in a single flattened pass.
counts = Counter(chain.from_iterable(my_list))
# The total number of words is just the sum of all counts, so there is no
# need for the original's generator that mutated a global accumulator
# (which silently double-counts if the snippet is run twice).
total_length = sum(counts.values())
# Keep the words that make up at least 1% of the corpus.
items = [item for item in counts if counts[item] / total_length >= 0.01]
print(items)
['amazon', 'phone', 'serious', 'mind', 'blown', 'enjoy', 'use', 'applic', 'full', 'websit', 'allow', 'quick', 'track', 'packag', 'descript', 'say', 'would', 'app', 'real', 'thing', 'show', 'ghost', 'said', 'quot', 'orang', 'ware', 'cloth', 'adiquit', 'recsmend', 'want', 'talk', 'love', 'play', 'backgammonthi', 'game', 'offer', 'varieti', 'difficulti', 'make', 'perfect', 'beginn', 'season', 'player']
Here's another way to generate a list of elements that appear 1% or more of the time, using pandas.DataFrame:
import numpy as np
import pandas as pd
# == Define `flatten` function to combine objects with multi-level nesting =======
def flatten(iterable, base_type=None, levels=None):
    """Flatten an iterable with multiple levels of nesting.

    >>> iterable = [(1, 2), ([3, 4], [[5], [6]])]
    >>> list(flatten(iterable))
    [1, 2, 3, 4, 5, 6]

    Binary and text strings are never collapsed.  Pass *base_type* to keep
    additional types intact:

    >>> iterable = ['ab', ('cd', 'ef'), ['gh', 'ij']]
    >>> list(flatten(iterable, base_type=tuple))
    ['ab', ('cd', 'ef'), 'gh', 'ij']

    Pass *levels* to stop flattening after that many levels:

    >>> iterable = [('a', ['b']), ('c', ['d'])]
    >>> list(flatten(iterable, levels=1))
    ['a', ['b'], 'c', ['d']]
    """
    def _descend(obj, depth):
        # Yield the object unexpanded past the depth limit, for text/bytes,
        # and for any caller-protected base_type.
        if levels is not None and depth > levels:
            yield obj
            return
        if isinstance(obj, (str, bytes)):
            yield obj
            return
        if base_type is not None and isinstance(obj, base_type):
            yield obj
            return
        try:
            children = iter(obj)
        except TypeError:
            # Not iterable at all: a leaf value.
            yield obj
        else:
            for child in children:
                yield from _descend(child, depth + 1)

    yield from _descend(iterable, 0)
# == Problem Solution ==========================================================
# Flatten `array` into a single-level pandas.Series of words.
series_array = pd.Series(list(flatten(array)))
# Total number of words in the flattened corpus.
element_count = len(series_array)
# Relative frequency of every distinct word, filtered at the 1% threshold,
# then reduced to a plain list of the surviving words.
elements = (
    (series_array.value_counts() / element_count)
    .loc[lambda freq: freq >= 0.01]
    .index.to_list()
)
print(elements)
['would', 'serious', 'blown', 'quot', 'orang', 'app', 'ghost', 'say', 'use', 'adiquit', 'enjoy', 'said', 'cloth', 'thing', 'applic', 'talk', 'player', 'track', 'recsmend', 'beginn', 'packag', 'allow', 'perfect', 'want', 'real', 'love', 'full', 'show', 'play', 'make', 'backgammonthi', 'mind', 'amazon', 'game', 'difficulti', 'offer', 'descript', 'websit', 'quick', 'season', 'phone', 'variety', 'ware']
I'd need some help with the following:
I'd like to create a function such that, when given a string, I get back a dictionary with the unique elements of the word list as keys, and as values the texts that appear before and after each of them.
for example with the following string:
Id like to have the following:
Important to note that, for example, some words are repeated and have different values next to them.
I am trying with the following function:
def ffg(txt):
# Strip punctuation, lowercase, and split the text into a word list.
txt = re.sub(r'[^\w\s]','',txt).lower().split()
# Unique words -- but set order is arbitrary, so positions are meaningless.
words = list(set(txt))
# NOTE(review): maps each word to its slot in the deduplicated list, not to
# its neighbours in the text; this is why the function "doesn't work".
indx = [words.index(i) for i in txt]
for x in range(len(txt)):
# res is overwritten on every pass and never used -- the body is unfinished
# and the function returns None.
res = txt[x]
But as you can see, it doesn't work at all.
I am assuming you pass a sequence of words, so split the text into words however you please.
from collections import defaultdict
def word_context(l):
    """Map each word of l to the set of words adjacent to any of its occurrences."""
    context = defaultdict(set)
    last = len(l) - 1
    for i, w in enumerate(l):
        # right-hand neighbour, unless w is the final word
        if i < last:
            context[w].add(l[i + 1])
        # left-hand neighbour, unless w is the first word
        if i > 0:
            context[w].add(l[i - 1])
    return dict(context)
Result:
>>> l
['half', 'a', 'league', 'half', 'a', 'league', 'half', 'a', 'league', 'onward', 'all', 'in', 'the', 'valley', 'of', 'death', 'rode', 'the', 'six', 'hundred']
>>> word_context(l)
{'half': {'a', 'league'}, 'a': {'half', 'league'}, 'league': {'half', 'a', 'onward'}, 'onward': {'all', 'league'}, 'all': {'onward', 'in'}, 'in': {'all', 'the'}, 'the': {'six', 'rode', 'in', 'valley'}, 'valley': {'the', 'of'}, 'of': {'death', 'valley'}, 'death': {'rode', 'of'}, 'rode': {'death', 'the'}, 'six': {'the', 'hundred'}, 'hundred': {'six'}}
Another variation:
import re
def collocate(txt):
    """Return a dict mapping each word of txt to the set of its neighbouring words."""
    # Drop punctuation, lowercase, and split into words.
    words = re.sub(r'[^\w\s]', '', txt).lower().split()
    neighbors = {}
    for i, word in enumerate(words):
        # Every word gets an entry, even an isolated one with no neighbours.
        bucket = neighbors.setdefault(word, set())
        if i > 0:
            bucket.add(words[i - 1])
        if i + 1 < len(words):
            bucket.add(words[i + 1])
    return neighbors
print(collocate("Half a league, half a league, Half a league onward, All in the valley of Death Rode the six hundred."))
How do I solve this: I have an table of letters which is labeled out by each grid point.
(0,0)(0,1)(0,2)(0,3) o l n c
(1,0)(1,1)(1,2)(1,3) e t e a
(2,0)(2,1)(2,2)(2,3) i b t a
(3,0)(3,1)(3,2)(3,3) o m m f
I am trying to find all possible combinations going through the grid creating lines of 3,4,5 length.
ie: PossibleSolutions = [[(0,0),(0,1),(0,2)],[(0,0),(1,1),(2,2)],[(0,0),(1,0),(2,0)]]
each of these representing:[[o,l,n],[o,t,t],[o,e,i]]
All possible combinations but keeping within the grid layout.
from itertools import combinations
def PossibleWords(possible, board):
    """Translate a sequence of grid coordinates into their letters on board."""
    return [board[coordinate] for coordinate in possible]
def Combinations(board):
    """Return every 3-letter string formed by choosing 3 cells of the board.

    board maps (row, col) coordinates to single letters.  Cells are taken in
    the board's own key order, so for the original 4x4 grid this reproduces
    the hard-coded coordinate list exactly, while also working for grids of
    any size (the original computed `coord` from the board and then ignored
    it in favour of a literal 4x4 coordinate list).
    """
    coord = list(board.keys())
    # Join the letters of each 3-cell combination directly; this replaces
    # the original's three redundant copy loops (form -> temp -> solutions).
    return [''.join(board[c] for c in combo) for combo in combinations(coord, 3)]
output = ['ole', 'ole', 'one', 'one', 'oaf', 'oaf', 'let', 'lee', 'lei', 'let', 'lei', 'let', 'lab', 'lam', 'lam', 'lit', 'lam', 'lam', 'net', 'nee', 'net', 'net', 'nam', 'nam', 'nam', 'nam', 'cee', 'cab', 'cat', 'cam', 'cam', 'cam', 'cam', 'eta', 'eta', 'eat', 'eta', 'tea', 'tet', 'tea', 'tab', 'tat', 'tit', 'tom', 'tom', 'eat', 'eta', 'aim', 'aim', 'bam', 'bam', 'tom', 'tom']
I've tried combinations() but since my grid is in a list it doesn't follow the grid boundaries. Any guidance would be helpful, thank you.
I have a string s and a list of strings, arr.
The length of s is equal to the total length of strings in arr.
I need to split s into a list, such that each element in the list has the same length as the corresponding element in arr.
For example:
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
expected == ['Python', 'is', 'an', 'programming', 'language']
It is much cleaner to use iter with next:
# Consume the string through a single iterator: each template word in arr
# pulls exactly as many characters as its own length.
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
new_s = iter(s)
result = [''.join(next(new_s) for _ in range(len(word))) for word in arr]
Output:
['Python', 'is', 'an', 'programming', 'language']
One way would be to do this:
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
# Walk an offset along s, taking one slice per template word.
expected = []
i = 0
for word in arr:
    end = i + len(word)
    expected.append(s[i:end])
    i = end
print(expected)
Using a simple for loop this can be done as follows:
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
# Pre-compute the template lengths, then cut s into pieces of those sizes.
sizes = [len(a) for a in arr]
start_index = 0
expected = list()
for size in sizes:
    expected.append(s[start_index:start_index + size])
    start_index += size
print(expected)
In the future, an alternative approach will be to use an assignment expression (new in Python 3.8):
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
i = 0
# Assignment expression (Python 3.8+): i is advanced in place inside the
# slice, so each iteration takes the next len(word) characters of s.
expected = [s[i:(i := i+len(word))] for word in arr]
You can use itertools.accumulate to get the positions where you want to split the string:
>>> s = 'Pythonisanprogramminglanguage'
>>> arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
>>> import itertools
>>> L = list(itertools.accumulate(map(len, arr)))
>>> L
[6, 8, 10, 21, 29]
Now if you zip the list with itself, you get the intervals:
>>> list(zip([0]+L, L))
[(0, 6), (6, 8), (8, 10), (10, 21), (21, 29)]
And you just have to use the intervals to split the string:
>>> [s[i:j] for i,j in zip([0]+L, L)]
['Python', 'is', 'an', 'programming', 'language']
The itertools module has a function named accumulate() (added in Py 3.2) which helps make this relatively easy:
from itertools import accumulate  # added in Py 3.2

# Cumulative word lengths mark where each slice of s ends.
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
cuts = tuple(accumulate(map(len, arr)))
# Pair each end point with the previous one (0 for the first word).
words = [s[start:stop] for start, stop in zip((0,) + cuts, cuts)]
print(words)  # -> ['Python', 'is', 'an', 'programming', 'language']
Create a simple loop and use the length of the words as your index:
s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
# ctr tracks how much of s has been consumed so far.
ctr = 0
words = []
for x in arr:
    step = len(x)
    words.append(s[ctr:ctr + step])
    ctr = ctr + step
print(words)
# ['Python', 'is', 'an', 'programming', 'language']
Here is another approach :
import numpy as np
# Prefix sums of the word lengths give every cut position in s.
ar = [0] + list(map(len, arr))
ar = list(np.cumsum(ar))
# Pair consecutive cut positions directly.  The original looked each start
# back up with ar.index(i), which returns the FIRST occurrence and would
# mis-slice whenever two cumulative sums are equal (a zero-length entry).
output_ = [s[start:stop] for start, stop in zip(ar, ar[1:])]
Output :
['Python', 'is', 'an', 'programming', 'language']
One more way
# Accumulate one slice of s per template word length; a is the running offset.
l = []
a = 0
for word_len in map(len, arr):
    l.append(s[a:a + word_len])
    a = a + word_len
print(l)
#['Python', 'is', 'an', 'programming', 'language']
#['Python', 'is', 'an', 'programming', 'language']
Props to the answer using iter. The accumulate answers are my favorite. Here is another accumulate answer using map instead of a list comprehension
import itertools

s = 'Pythonisanprogramminglanguage'
arr = ['lkjhgf', 'zx', 'qw', 'ertyuiopakk', 'foacdhlc']
# Running totals of the word lengths give each word's end position; shifting
# them right by one (prefixing 0) gives each word's start position.
ticks = itertools.accumulate(len(word) for word in arr)
starts = (0,) + tuple(ticks)
# zip stops at the shorter input, so the final tick is harmlessly ignored.
words = [s[start:start + len(word)] for start, word in zip(starts, arr)]
Output:
['Python', 'is', 'an', 'programming', 'language']
You could collect slices off the front of s.
# Repeatedly bite the next len(word) characters off the front of s.
output = []
for word in arr:
    cut = len(word)
    chunk, s = s[:cut], s[cut:]
    output.append(chunk)
print(output)  # -> ['Python', 'is', 'an', 'programming', 'language']
Yet another approach would be to create a regex pattern describing the desired length of words. You can replace every character by . (=any character) and surround the words with ():
arr = ['lkjhgf', 'zx', 'q', 'ertyuiopakk', 'foacdhlc']
import re
# One capture group of '.' wildcards per template word.
pattern = ''.join('(' + '.' * len(word) + ')' for word in arr)
#=> '(......)(..)(.)(...........)(........)'
s = 'Pythonisaprogramminglanguage'
re.match(pattern, s).groups()
#=> ('Python', 'is', 'a', 'programming', 'language')
I am trying to find the most efficient way of matching each element in a List to each element in a List of Lists using Python. For e.g input:
>>>myList = [['hi', 'no', 'bye', 'Global', 24],['morning', 'X', 'place'],['so', 'large', 'mall','test'], ['hi', 'X', 'place', 'bye']]
>>>check_against_myLIst = ['bye','place','hi','australia']
Now I am not sure if the best way would be to use a map function, for loops or any other python data analysis methodology.
The output needs to be converted into a dataframe, such that Output is.
Index Value Result
0 ['hi', 'no', 'bye', 'Global', 24] True
1 ['morning', 'X', 'place'] True
2 ['so', 'large', 'mall','test'] False
3 ['hi', 'X', 'place', 'bye'] True
Thanks !
You can just store the result in a list and construct a dataframe from that
# One boolean per sub-list: True when it shares any element with the check list.
result = [False]*len(myList)
for n, _list in enumerate(myList):
    result[n] = any(element in check_against_myLIst for element in _list)
First, create sets from myList, for the O(1) membership test.
>>> myList = [['hi', 'no', 'bye', 'Global', 24],['morning', 'X', 'place'],['so', 'large', 'mall','test'], ['hi', 'X', 'place', 'bye']]
>>> checks = ['bye','place','hi','australia'] # renamed from check_against_myLIst
>>> sets = map(set, myList)
Use efficient any checks to find out whether any element of checks is in a given set. (As opposed to computing intersections, any is lazy.)
>>> result = [(lst, any(s in set_ for s in checks)) for lst, set_ in zip(myList, sets)]
Build the dataframe.
>>> df = pd.DataFrame(result, columns=['Value', 'Result'])
>>> df.index.name = 'Index'
df
Value Result
Index
0 [hi, no, bye, Global, 24] True
1 [morning, X, place] True
2 [so, large, mall, test] False
3 [hi, X, place, bye] True
Here you have example about the way you can get it without use pandas.
Anyway, let me explain another point of view:
# tuple of word lists to scan
my_list = (
    ['hi', 'no', 'bye', 'Global', 24],
    ['morning', 'X', 'place'],
    ['so', 'large', 'mall', 'test', 'TESSTTTT'],
    ['hi', 'X', 'place', 'bye'],
)
# tuple of words to check for
check_against_myLIst = ('bye', 'place', 'hi', 'australia')
# Function to find intersection of two arrays
# Function to find the intersection of two arrays and print one table row.
def interSection(index, arr1, arr2):
    """Print a table row stating whether arr1 and arr2 share any element.

    index -- row label to print
    arr1  -- the list being checked
    arr2  -- the collection of words to look for
    """
    result = 'false'
    # Keep every element of arr2 that also appears in arr1.
    output = list(filter(lambda x: x in arr1, arr2))
    if output:
        result = 'true'
    # print() calls: the original used Python 2 print statements, which are
    # a SyntaxError under Python 3 (the language the rest of the file uses).
    print('index', "\t" * 1, 'Value', "\t" * 6, 'Result')
    print(index, "\t" * 1, arr1, "\t" * 4, result)
    print('')
# Driver program
# Driver program: check every sub-list against the reference words.
if __name__ == "__main__":
    for count, arrayItem in enumerate(my_list):
        interSection(count, arrayItem, check_against_myLIst)
You can make a function that tests intersections between sets of your list and the compared list.
Given
import pandas as pd

# Reference words to look for, and the nested lists to scan.
# (`cmp` shadows the builtin, but the name is kept for the later one-liner.)
cmp = ["bye", "place", "hi", "australia"]
lst = [
    ["hi", "no", "bye", "Global", 24],
    ["morning", "X", "place"],
    ["so", "large", "mall", "test"],
    ["hi", "X", "place", "bye"],
]
Code
def is_in(nested, compare):
    """Return a list of (row, bool) pairs, True where the row intersects compare."""
    targets = set(compare)
    pairs = []
    for row in nested:
        # isdisjoint is the set-intersection truth test without building the set
        pairs.append((row, not targets.isdisjoint(row)))
    return pairs
bool_lst = is_in(lst, cmp)
bool_lst
Output
[(['hi', 'no', 'bye', 'Global', 24], True),
(['morning', 'X', 'place'], True),
(['so', 'large', 'mall', 'test'], False),
(['hi', 'X', 'place', 'bye'], True)]
It looks similar to your output. From here, we just need to make a DataFrame:
df = pd.DataFrame(bool_lst, columns=["Value", "Result"])
df.rename_axis("Index")
Output
The latter can all be reduced to a single line:
pd.DataFrame([(x, bool(set(x) & set(cmp))) for x in lst], columns=["Value", "Result"]).rename_axis("Index")