There's a C++ comparison to get union of lists from lists of lists: The fastest way to find union of sets
And there's several other python related questions but none suggest the fastest way to unionize the lists:
Finding a union of lists of lists in Python
Flattening a shallow list in Python
From the answers, I've gathered that there are at least 2 ways to do it:
>>> from itertools import chain
>>> x = [[1,2,3], [3,4,5], [1,7,8]]
>>> list(set().union(*x))
[1, 2, 3, 4, 5, 7, 8]
>>> list(set(chain(*x)))
[1, 2, 3, 4, 5, 7, 8]
Note that I'm casting the set to list afterwards because I need the order of the list to be fixed for further processing.
After some comparison, it seems like list(set(chain(*x))) is more stable and takes less time:
# Benchmark: compare two ways of unioning a list of lists, accumulating
# wall-clock deltas over 1000 fresh random inputs.
# NOTE: Python 2 code (print statements). time.time() deltas are a coarse
# micro-benchmarking tool; timeit is generally preferred.
from itertools import chain
import time
import random
# Dry run.
# Warm-up pass so allocator/cache effects don't bias the first timed loop.
x = [[random.choice(range(10000))
      for i in range(10)] for j in range(10)]
list(set().union(*x))
list(set(chain(*x)))
y_time = 0  # accumulated seconds for list(set().union(*x))
z_time = 0  # accumulated seconds for list(set(chain(*x)))
for _ in range(1000):
    # Fresh input each round: 10 sublists of 10 random values in [0, 10000).
    x = [[random.choice(range(10000))
          for i in range(10)] for j in range(10)]
    start = time.time()
    y = list(set().union(*x))
    y_time += time.time() - start
    #print 'list(set().union(*x)):\t', y_time
    start = time.time()
    z = list(set(chain(*x)))
    z_time += time.time() - start
    #print 'list(set(chain(*x))):\t', z_time
    # Both approaches must compute the identical union.
    assert sorted(y) == sorted(z)
    #print
# Mean per-iteration time of each approach.
print y_time / 1000.
print z_time / 1000.
[out]:
1.39586925507e-05
1.09834671021e-05
Taking out the variable of casting sets to list:
# Same benchmark, without the final list() cast, to isolate the cost of
# building the set itself. (Python 2 print statements.)
y_time = 0
z_time = 0
for _ in range(1000):
    x = [[random.choice(range(10000))
          for i in range(10)] for j in range(10)]
    start = time.time()
    y = set().union(*x)
    y_time += time.time() - start
    start = time.time()
    z = set(chain(*x))
    z_time += time.time() - start
    assert sorted(y) == sorted(z)
print y_time / 1000.
print z_time / 1000.
[out]:
1.22241973877e-05
1.02684497833e-05
Here's the full output when I try to print the intermediate timings (without list casting): http://pastebin.com/raw/y3i6dXZ8
Why is it that list(set(chain(*x))) takes less time than list(set().union(*x))?
Is there another way of achieving the same union of lists? Using numpy or pandas or sframe or something? Is the alternative faster?
What's fastest depends on the nature of x -- whether it is a long list or a short list, with many sublists or few sublists, whether the sublists are long or short, and whether there are many duplicates or few duplicates.
Here are some timeit results comparing some alternatives. There are so many possibilities that this is by no means a complete analysis, but perhaps this will give you a framework for studying your use case.
func | x | time
unique_concatenate | many_uniques | 0.863
empty_set_union | many_uniques | 1.191
short_set_union_rest | many_uniques | 1.192
long_set_union_rest | many_uniques | 1.194
set_chain | many_uniques | 1.224
func | x | time
long_set_union_rest | many_duplicates | 0.958
short_set_union_rest | many_duplicates | 0.969
empty_set_union | many_duplicates | 0.971
set_chain | many_duplicates | 1.128
unique_concatenate | many_duplicates | 2.411
func | x | time
empty_set_union | many_small_lists | 1.023
long_set_union_rest | many_small_lists | 1.028
set_chain | many_small_lists | 1.032
short_set_union_rest | many_small_lists | 1.036
unique_concatenate | many_small_lists | 1.351
func | x | time
long_set_union_rest | few_large_lists | 0.791
empty_set_union | few_large_lists | 0.813
unique_concatenate | few_large_lists | 0.814
set_chain | few_large_lists | 0.829
short_set_union_rest | few_large_lists | 0.849
Be sure to run the timeit benchmarks on your own machine since results may vary.
from __future__ import print_function
import random
import timeit
from itertools import chain
import numpy as np
def unique_concatenate(x):
    """Union the sublists of *x* via numpy: flatten, then dedupe (sorted)."""
    flattened = np.concatenate(x)
    return np.unique(flattened)
def short_set_union_rest(x):
    """Union of all sublists, seeding the set from the first sublist.

    Assumes x[0] is the shortest list in x (cheapest seed to hash).
    """
    seed = set(x[0])
    return list(seed.union(*x[1:]))
def long_set_union_rest(x):
    """Union of all sublists, seeding the set from the last sublist.

    Assumes x[-1] is the longest list in x, so the bulk of the elements
    are hashed once while building the seed set.
    """
    # BUG FIX: the original unioned *x[1:], which re-included x[-1] but
    # skipped x[0] entirely, silently dropping its elements from the result.
    return list(set(x[-1]).union(*x[:-1]))
def empty_set_union(x):
    """Union of all sublists, starting from an empty set."""
    merged = set()
    merged = merged.union(*x)
    return list(merged)
def set_chain(x):
    """Union of all sublists: flatten lazily with chain, dedupe with set."""
    flattened = chain.from_iterable(x)
    return list(set(flattened))
# Value pools: big_range makes collisions rare (mostly-unique draws);
# small_range forces heavy duplication across sublists.
big_range = list(range(10**7))
small_range = list(range(10**5))
# ~1000 sublists of growing length (10, 20, ..., 9990), rarely-repeated values.
many_uniques = [[random.choice(big_range) for i in range(j)]
                for j in range(10, 10000, 10)]
# Same shape, drawn from the small pool, so many duplicates.
many_duplicates = [[random.choice(small_range) for i in range(j)]
                   for j in range(10, 10000, 10)]
# ~1000 sublists of only 10 elements each.
many_small_lists = [[random.choice(big_range) for i in range(10)]
                    for j in range(10, 10000, 10)]
# Just 9 sublists of 1000 elements each.
few_large_lists = [[random.choice(big_range) for i in range(1000)]
                   for j in range(10, 100, 10)]
if __name__=='__main__':
    # Each dataset gets its own repeat count n so total runtime stays bounded.
    for x, n in [('many_uniques', 1), ('many_duplicates', 4),
                 ('many_small_lists', 800), ('few_large_lists', 800)]:
        timing = dict()
        for func in [
                'unique_concatenate', 'short_set_union_rest', 'long_set_union_rest',
                'empty_set_union', 'set_chain']:
            # timeit pulls the function and the dataset by name from __main__.
            timing[func, x] = timeit.timeit(
                '{}({})'.format(func, x), number=n,
                setup='from __main__ import {}, {}'.format(func, x))
        print('{:20} | {:20} | {}'.format('func', 'x', 'time'))
        # Print results fastest-first.
        for key, t in sorted(timing.items(), key=lambda item: item[1]):
            func, x = key
            print('{:20} | {:20} | {:.3f}'.format(func, x, t))
        print(end='\n')
Related
I want to do
df[(df['col']==50) | (df['col']==150) | etc ..]
"etc" is size changing from 1 to many
so I do a loop
result is like
str= "(df['col']==50) | (df['col']==150) | (df['col']==100)"
then I do this
df[str]
but this does not work
How can I make it work ?
A simple solution:
list_of_numbers = [50,150]
df[df["col"].isin(list_of_numbers)]
Where list_of_numbers are the numbers you want to include in the condition. I'm assuming here your condition is always or.
Use query to filter a dataframe from a string
# Filter a DataFrame with a query string built from a list of values.
df = pd.DataFrame({'col': range(25, 225, 25)})
l = [50, 100, 150]
# Builds e.g. 'col == 50 | col == 100 | col == 150'; DataFrame.query's
# pandas parser gives | lower precedence than ==, so this reads as an OR.
q = ' | '.join([f"col == {i}" for i in l])
# BUG FIX: the original passed the undefined name `f`; the query string is `q`.
out = df.query(q)
>>> q
'col == 50 | col == 100 | col == 150'
>>> out
col
1 50
3 100
5 150
I have defined a function which returns a dataframe of intersection of all dataframes given as the input. However when I store the output of the function in some variable, it won't get stored in the variable. It is shown as a nonetype object
def intersection(list1, intersection_df, i):
    """Recursively intersect the Spark DataFrames in list1 on 'value'.

    BUG FIX: the original discarded the results of the recursive calls, so
    the top-level call always returned None. Every branch now returns.
    """
    if (i == 1):
        # First call: start the running intersection from the first frame.
        intersection_df = list1[0]
        print(type(intersection_df))
        return intersection(list1, intersection_df, i+1)
    elif (i > len(list1)):
        # Base case: all frames consumed; materialize and return the result.
        print(type(intersection_df))
        a = spark.createDataFrame(intersection_df.rdd)
        a.show()
        return a
    else:
        # Join the running intersection with the next frame on 'value'.
        intersection_df = intersection_df.alias('intersection_df')
        tb = list1[i-1]
        tb = tb.alias('tb')
        intersection_df = intersection_df.join(
            tb, intersection_df['value'] == tb['value']
        ).where(col('tb.value').isNotNull()).select(['intersection_df.value'])
        print(type(intersection_df))
        return intersection(list1, intersection_df, i+1)
e.g if I give the input as following,
list1 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
list2 = [3,4,5,6,7,8,9,10,11,12,13,14,15,16]
list3 = [6,7,8,9,10,11,12,13,4,16,343]
df1 = spark.createDataFrame(list1, StringType())
df2 = spark.createDataFrame(list2, StringType())
df3 = spark.createDataFrame(list3, StringType())
list4 = [df1,df2,df3]
empty_df = []
intersection_df = intersection(list4, empty_df, 1)
I expect the following output to be stored in interesection_df
+-----+
|value|
+-----+
| 7 |
| 11 |
| 8 |
| 6 |
| 9 |
| 10 |
| 4 |
| 12 |
| 13 |
+-----+
I think you got hit by the curse of recursion.
Problem:
You are calling intersection recursively but returning a value from only one branch of the if/elif/else. So when the base case returns your df, the intermediate calls have nowhere to pass it up to (recall: each function call creates its own stack frame), and the top-level call returns None.
Solution:
Return the result when you call intersection recursively from the if and else branches — for example, `return intersection(list1, intersection_df, i+1)` instead of a bare call.
Is there a way to reduce by a constant number each element of a dataframe verifying a condition including their own value without using a loop?
For instance, each cells < 2 sees its value reducing by 1.
Thank you very much.
I like to do this masking.
Here is an inefficient loop using your example
#Example using loop
# Illustrative per-element loop (slow). NOTE(review): rebinding the loop
# variable `val` never writes back into the DataFrame, so as written this
# loop has no effect on df at all -- the mask version below is the fix.
for val in df['column']:
    if(val<2):
        val = val - 1
The following code gives the same result, but it will generally be much faster because it does not use a loop.
# Same effect using masks
# `mask` is a boolean Series; df.loc[mask, 'column'] selects just those rows,
# so the subtraction is vectorized instead of looping in Python.
mask = (df['column'] < 2) #Find entries that are less than 2.
df.loc[mask,'column'] = df.loc[mask,'column'] - 1 #Subtract 1.
I am not sure if this is the fastest, but you can use the .apply function:
import pandas as pd
df = pd.DataFrame(data=np.array([[1,2,3], [2,2,2], [4,4,4]]),
columns=['x', 'y', 'z'])
def conditional_add(x):
    """Return x + 2 when x is greater than 2; otherwise return x unchanged."""
    return x + 2 if x > 2 else x
df['x'] = df['x'].apply(conditional_add)
Will add 2 to the final row of column x.
More like (data from Willie)
df-((df<2)*2)
Out[727]:
x y z
0 -1 2 3
1 2 2 2
2 4 4 4
In this case I would use the np.where method from the NumPy library.
The method uses the following logic:
np.where(<condition>, <value if true>, <value if false>)
Example:
# import modules which are needed
import pandas as pd
import numpy as np
# create example dataframe
df = pd.DataFrame({'A':[3,1,5,0.5,2,0.2]})
| A |
|-----|
| 3 |
| 1 |
| 5 |
| 0.5 |
| 2 |
| 0.2 |
# apply the np.where method with conditional statement
df['A'] = np.where(df.A < 2, df.A - 1, df.A)
| A |
|------|
| 3 |
| 0.0 |
| 5 |
| -0.5 |
| 2 |
| -0.8 |
I have a list of data that I'm trying to find the max value from with python. My current code will loop over the data and render all possible combinations of the data, however I can't figure out how to render the max from the results.
Below is my current setup:
# NOTE: `%sql` is an IPython/Jupyter magic; this assignment only works in a
# notebook cell, not in plain Python. (Python 2 code: print statement below.)
street = %sql SELECT * FROM streets
# Triple nested loop over every (i, j, k) combination of rows in `flight`.
for i in range(len(flight)):
    for j in range(len(flight)):
        for k in range(len(flight)):
            A = flight[i][2]  # column 2 of row i (the table shows: Start)
            B = flight[k][2]  # column 2 of row k
            # column 4 of the three selected rows (the table shows: dis)
            num = flight[i][4] , flight[j][4] , flight[k][4]
            numsum = sum(num)
            print A, B, numsum
Printing flight will render the below
+----+-----------+----------------------+----------------------+---+
| id | flight | Start | End |dis|
+----+-----------+----------------------+----------------------+---+
| 0 | w | SFO | DEN | 4 |
| 1 | e | DEN | NYC | 7 |
| 1 | e | DEN | ORD | 7 |
However, adding max will throw the error below.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-283-770cd29ebd83> in <module>()
8 num = street[i][4] , street[j][4] , street[k][4]
9 numsum = sum(num)
---> 10 print A, B, max(numsum)
11
TypeError: 'int' object is not iterable
If I remove the max from the last line everything in the database will print. For example:
SFO ORD 35
DEN JFK 12
SFO JFK 94
LAX DEN 54
...
Can someone help me figure out how to get the max value in numsum so the result prints like this:
SFO JFK 94
Thanks in advance!
You're not doing what you're trying to do. Your algorithm isn't well thought-out. look at it this way:
for each_item in whatever_iterator:
a = each_item[some_element]
b = each_item[another_element]
num = some, other, numbers
sumnum = sum(num) # hey that was easy!
print a, b, sumnum # every time through, let's print it!
Nowhere does this find the LARGEST. In order to do that, you'd want to iterate through and save current_max = max(current_max, new_value)
Looks like what you're looking to do is:
max_sumnum = (0, 0, 0)
for i, j, k in itertools.product(len(x), repeat=3):
num = x[i][4], x[j][4], x[???][1][k][4]
cur_sumnum = x[i][2], x[k][2], sum(num)
max_sumnum = max(max_numsum, cur_sumnum, key=lambda tup: tup[2])
print max_sumnum
I use itertools.product here because it's a great shortcut for nested for loops.
for i in range(3):
for j in range(5):
for k in range(100):
for m in range(2):
foo(i, j, k, m)
# equivalent to....
for i, j, k, m in itertools.product(range(3), range(5),
range(100), range(2)):
And I use the repeat keyword since you're doing a bunch of the same loops.
for i in range(3):
for j in range(3):
for k in range(3):
# equivalent to....
for i, j, k in itertools.product(range(3), repeat=3):
I have two list,
l1 = [1,2,3,4,5,6]
l2 = [3,2]
what i want is to remove the element of list l1 which is in l2, for that i have done something like this,
# BUG (this is the question's point): removing from l1 while iterating it
# makes the iterator skip the element that shifts into the freed slot.
for x in l1:
    if x in l2:
        l1.remove(x)
it gives output like
[1, 3, 4, 5, 6]
but the output should be like
[1, 4, 5, 6]
Can anyone shed some light on this?
This is easily explained like this.
consider the first array you have:
| 1 | 2 | 3 | 4 | 5 | 6 |
Now you start iterating
| 1 | 2 | 3 | 4 | 5 | 6 |
^
Nothing happens, iterator increments
| 1 | 2 | 3 | 4 | 5 | 6 |
^
2 gets removed
| 1 | 3 | 4 | 5 | 6 |
^
iterator increments
| 1 | 3 | 4 | 5 | 6 |
^
And voila, 3 is still there.
The solution is to iterate over a copy of the list, e.g.
for x in l1[:]:  # the full slice makes a copy of the entire list
if x in l2:
l1.remove(x)
or to iterate in reverse:
for x in reversed(l1):
if x in l2:
l1.remove(x)
Which acts like this:
| 1 | 2 | 3 | 4 | 5 | 6 |
^
| 1 | 2 | 3 | 4 | 5 | 6 |
^
| 1 | 2 | 4 | 5 | 6 |
^
| 1 | 2 | 4 | 5 | 6 |
^
| 1 | 4 | 5 | 6 |
^
| 1 | 4 | 5 | 6 |
^
Why not making it a bit simpler? No need to actually iterate over l1 if we only want to remove elements present in l2:
# Iterate over the (short) removal list instead of l1; the inner while loop
# strips every occurrence, so duplicates in l1 are handled too.
for item in l2:
    while item in l1:
        l1.remove(item)
This gives you exactly the output desired...
Also, as commenters point out, if there is a possibility that we can have duplicates:
l1 = filter(lambda x: x not in l2, l1)
.. or many other variations using list comprehensions.
You want the outer loop to read:
for x in l1[:]:
...
You can't change a list while iterating over it and expect reasonable results. The above trick makes a copy of l1 and iterates over the copy instead.
Note that if order doesn't matter in the output list, and your elements are unique and hashable, you could use a set:
set(l1).difference(l2)
which will give you a set as output, but you can construct a list from it easily:
l1 = list(set(l1).difference(l2))
As others have said, you can't edit a list while you loop over it. A good option here is to use a list comprehension to create a new list.
removals = set(l2)
l1 = [item for item in l1 if item not in removals]
We make a set as a membership check on a set is significantly faster than on a list.
If the order and loss of duplicates in l1 do not matter:
list(set(l1) - set(l2))
The last list() is only required if you need the result as a list. You could also just use the resulting set, it's also iterable.
If you need it ordered you can of course call l.sort() on the resulting list.
Edit: Removed my original answer because even though it did give correct results, it did so for non-intuitive reasons, and was not very fast either... so I've just left the timings:
import timeit
# Shared fixture: l1 deliberately contains duplicates (two copies of
# range(20)); l2 lists the values to strip out.
setup = """l1 = list(range(20)) + list(range(20))
l2 = [2, 3]"""
# One candidate implementation per answer author; each value is the exact
# statement string handed to timeit.
stmts = {
    "mgilson": """for x in l1[:]:
    if x in l2:
        l1.remove(x)""",
    "petr": """for item in l2:
    while item in l1:
        l1.remove(item)""",
    "Lattyware": """removals = set(l2)
l1 = [item for item in l1 if item not in removals]""",
    "millimoose": """for x in l2:
    try:
        while True: l1.remove(x)
    except ValueError: pass""",
    "Latty_mgilson": """removals = set(l2)
l1[:] = (item for item in l1 if item not in removals)""",
    "mgilson_set": """l1 = list(set(l1).difference(l2))"""
}
# Time every snippet with the same setup (timeit's default number=1000000).
for idea in stmts:
    print("{0}: {1}".format(idea, timeit.timeit(setup=setup, stmt=stmts[idea])))
Results (Python 3.3.0 64bit, Win7):
mgilson_set: 2.5841989922197333
mgilson: 3.7747968857414813
petr: 1.9669433777815701
Latty_mgilson: 7.262900152285258
millimoose: 3.1890831105541793
Lattyware: 4.573971325181478
You're modifying the list l1 while you're iterating over it, this will cause weird behaviour. (The 3 will get skipped during iteration.)
Either iterate over a copy, or change your algorithm to iterate over l2 instead:
for x in l2:
try:
while True: l1.remove(x)
except ValueError: pass
(This should perform better than testing if x in l1 explicitly.) Nope, this performs terribly as l1 grows in size.
FWIW I get significantly different results than #Tim Pietzcker did using what I believe is more realistic input data set and by using a little more rigorous (but otherwise the same) approach to timing different people's answers.
The names and code snippets are the same as Tim's except I added a variation of the one named Lattyware called Lattyware_rev which determines what elements to keep rather than reject -- it turned out to be a slower than the former. Note that the two fastest don't preserve the order of l1.
Here's the latest timing code:
import timeit
# Fixture: pseudo-random data with a fixed seed so every timed snippet sees
# identical input. NOTE: Python 2 code (xrange here, print statement below).
setup = """
import random
random.seed(42) # initialize to constant to get same test values
l1 = [random.randrange(100) for _ in xrange(100)]
l2 = [random.randrange(100) for _ in xrange(10)]
"""
# One entry per proposed approach; keys are the answer authors' names.
stmts = {
    "Minion91": """
for x in reversed(l1):
    if x in l2:
        l1.remove(x)
""",
    "mgilson": """
for x in l1[:]: # correction
    if x in l2:
        l1.remove(x)
""",
    "mgilson_set": """
l1 = list(set(l1).difference(l2))
""",
    "Lattyware": """
removals = set(l2)
l1 = [item for item in l1 if item not in removals]
""",
    "Lattyware_rev": """
keep = set(l1).difference(l2)
l1 = [item for item in l1 if item in keep]
""",
    "Latty_mgilson": """
removals = set(l2)
l1[:] = (item for item in l1 if item not in removals)""",
    "petr": """
for item in l2:
    while item in l1:
        l1.remove(item)
""",
    "petr (handles dups)": """
l1 = filter(lambda x: x not in l2, l1)
""",
    "millimoose": """
for x in l2:
    try:
        while True: l1.remove(x)
    except ValueError: pass
""",
    "K.-Michael Aye": """
l1 = list(set(l1) - set(l2))
""",
}
N = 10000  # timeit calls per repeat
R = 3      # repeats; the best (minimum) is reported
# min() over repeats is the standard way to suppress scheduler noise.
timings = [(idea,
            min(timeit.repeat(stmts[idea], setup=setup, repeat=R, number=N)),
            ) for idea in stmts]
longest = max(len(t[0]) for t in timings) # length of longest name
exec(setup) # get an l1 & l2 just for heading length measurements
print('fastest to slowest timings of ideas:\n' +\
      ' ({:,d} timeit calls, best of {:d} executions)\n'.format(N, R)+\
      ' len(l1): {:,d}, len(l2): {:,d})\n'.format(len(l1), len(l2)))
for i in sorted(timings, key=lambda x: x[1]): # sort by speed (fastest first)
    print "{:>{width}}: {}".format(*i, width=longest)
Output:
fastest to slowest timings of ideas:
(10,000 timeit calls, best of 3 executions)
len(l1): 100, len(l2): 10)
mgilson_set: 0.143126456832
K.-Michael Aye: 0.213544010551
Lattyware: 0.23666971551
Lattyware_rev: 0.466918513924
Latty_mgilson: 0.547516608553
petr: 0.552547776807
mgilson: 0.614238139366
Minion91: 0.728920176815
millimoose: 0.883061820848
petr (handles dups): 0.984093136969
Of course, please let me know if there's something radically wrong that would explain the radically different results.
l1 = [1, 2, 3, 4, 5, 6]
l2 = [3, 2]
# NOTE(review): a list comprehension used purely for its side effect is an
# anti-pattern -- it builds a throwaway list of Nones, and l1.remove(x)
# raises ValueError if an element of l2 is absent from l1.
[l1.remove(x) for x in l2]
print l1
[1, 4, 5, 6]