I am using the audio-analysis tools from the Spotify API (via the spotipy wrapper, as sp) to process tracks, using the following code:
def loudness_drops(track_ids):
    names = set()
    tids = set()
    tracks_with_drop_name = set()
    tracks_with_drop_id = set()
    for id_ in track_ids:
        track_id = sp.track(id_)['uri']
        tids.add(track_id)
        track_name = sp.track(id_)['name']
        names.add(track_name)
        # get audio features
        features = sp.audio_features(tids)
        # and then audio analysis id
        urls = {x['analysis_url'] for x in features if x}
        print len(urls)
        # fetch analysis data
        for url in urls:
            # print len(urls)
            analysis = sp._get(url)
            # extract loudness sections from analysis
            x = [_['start'] for _ in analysis['segments']]
            print len(x)
            l = [_['loudness_max'] for _ in analysis['segments']]
            print len(l)
            # get max and min values
            min_l = min(l)
            max_l = max(l)
            # normalize stream
            norm_l = [(_ - min_l)/(max_l - min_l) for _ in l]
            # define silence as a value below 0.1
            silence = [l[i] for i in range(len(l)) if norm_l[i] < .1]
            # more than one silence means one of them happens in the middle of the track
            if len(silence) > 1:
                tracks_with_drop_name.add(track_name)
                tracks_with_drop_id.add(track_id)
    return tracks_with_drop_id
The code works, but if the number of songs I search is set to, say, limit=20, the time it takes to process all the audio segments x and l makes the process too expensive, e.g.:
time.time() prints 452.175742149
QUESTION:
How can I drastically reduce the complexity here?
I've tried to use sets instead of lists, but working with set objects prohibits indexing.
EDIT: 10 urls:
[u'https://api.spotify.com/v1/audio-analysis/5H40slc7OnTLMbXV6E780Z', u'https://api.spotify.com/v1/audio-analysis/72G49GsqYeWV6QVAqp4vl0', u'https://api.spotify.com/v1/audio-analysis/6jvFK4v3oLMPfm6g030H0g', u'https://api.spotify.com/v1/audio-analysis/351LyEn9dxRxgkl28GwQtl', u'https://api.spotify.com/v1/audio-analysis/4cRnjBH13wSYMOfOF17Ddn', u'https://api.spotify.com/v1/audio-analysis/2To3PTOTGJUtRsK3nQemP4', u'https://api.spotify.com/v1/audio-analysis/4xPRxqV9qCVeKLQ31NxhYz', u'https://api.spotify.com/v1/audio-analysis/1G1MtHxrVngvGWSQ7Fj4Oj', u'https://api.spotify.com/v1/audio-analysis/3du9aoP5vPGW1h70mIoicK', u'https://api.spotify.com/v1/audio-analysis/6VIIBKYJAKMBNQreG33lBF']
This is what I see, not knowing much about Spotify:
for id_ in track_ids:
    # this runs N times, where N = len(track_ids)
    ...
    tids.add(track_id)  # tids contains all track_ids processed until now
    # in the end: len(tids) == N
    ...
    features = sp.audio_features(tids)
    # features contains features of all tracks processed until now
    # in the end, I guess: len(features) == N * num_features_per_track
    urls = {x['analysis_url'] for x in features if x}
    # very probably: len(urls) == len(features)
    for url in urls:
        # for the first track, this processes features of the first track only
        # for the second track, this processes features of 1st and 2nd
        # etc.
        # in the end, this loop repeats N * N * num_features_per_track times
You should not process any url twice. And you do, because you keep all tracks in tids and then, for each track, you process everything in tids, which turns the complexity of this into O(n²).
In general, always look for loops inside loops when trying to reduce complexity.
I believe in this case this should work, if audio_features expects a set of ids:
# replace this: features = sp.audio_features(tids)
# with:
features = sp.audio_features({track_id})
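Putting that advice together, a minimal restructuring sketch might look like the following. It only reuses calls already present in the question (sp.track, sp.audio_features, sp._get) and assumes the audio-features response carries a uri field alongside analysis_url; batching limits and error handling are left out:

def loudness_drops(track_ids):
    # one metadata request per track, collected up front
    meta = {}
    for id_ in track_ids:
        t = sp.track(id_)
        meta[t['uri']] = t['name']

    # a single audio-features request for the whole batch,
    # instead of one per iteration of the track loop
    features = sp.audio_features(list(meta))
    url_to_uri = {x['analysis_url']: x['uri'] for x in features if x}

    tracks_with_drop_id = set()
    for url, uri in url_to_uri.items():
        # every analysis URL is fetched exactly once
        analysis = sp._get(url)
        l = [seg['loudness_max'] for seg in analysis['segments']]
        min_l, max_l = min(l), max(l)
        norm_l = [(v - min_l) / (max_l - min_l) for v in l]
        silence = [v for v, nv in zip(l, norm_l) if nv < .1]
        if len(silence) > 1:
            tracks_with_drop_id.add(uri)
    return tracks_with_drop_id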
A recruiter wants to form a team with different skills and wants to pick the minimum number of persons who can cover all the required skills.
N represents the number of persons and K is the number of distinct skills that need to be covered. The list spec_skill = [[1,3],[0,1,2],[0,2,4]] provides information about the skills of each person, e.g. person 0 has skills 1 and 3, person 1 has skills 0, 1 and 2, and so on.
The code should output the size of the smallest team the recruiter could find (the minimum number of persons) and the IDs of the people to recruit onto the team.
I implemented it with brute force as below, but since some inputs contain thousands of persons, it seems it needs to be solved with heuristic approaches. In this case an approximate answer is acceptable.
Any suggestion on how to solve it with heuristic methods will be appreciated.
import itertools

N, K = 3, 5
spec_skill = [[1,3],[0,1,2],[0,2,4]]

A = list(range(K))
set_a = set(A)
solved = False

for L in range(0, len(spec_skill)+1):
    for subset in itertools.combinations(spec_skill, L):
        s = set(item for sublist in subset for item in sublist)
        if set_a.issubset(s):
            print(str(len(subset)) + '\n' + ' '.join([str(spec_skill.index(item)) for item in subset]))
            solved = True
            break
    if solved: break
Here is my way of doing this. There are probably optimization opportunities left in the code, but the basic idea should be understandable.
import random
import time

def man_power(lst, K, iterations=None, period=0):
    """
    Specify a fixed number of iterations
    or a period in seconds to limit the total computation time.
    """
    # mapping each sublist into a (sublist, original_index) tuple
    lst2 = [(lst[i], i) for i in range(len(lst))]
    mini_sample = [0]*(len(lst)+1)
    if period < 0 or (period == 0 and iterations is None):
        raise AttributeError("You must specify iterations or a positive period")

    def shuffle_and_pick(lst, iterations):
        mini = [0]*len(lst)
        for _ in range(iterations):
            random.shuffle(lst2)
            skillset = set()
            chosen_ones = []
            idx = 0
            fullset = True
            # Breaks from the loop when all skillsets are found
            while len(skillset) < K:
                # No need to go further, we didn't find a better combination
                if len(chosen_ones) >= len(mini):
                    fullset = False
                    break
                before = len(skillset)
                skillset.update(lst2[idx][0])
                after = len(skillset)
                if after > before:
                    # We append with the original index of the sublist
                    chosen_ones.append(lst2[idx][1])
                idx += 1
            if fullset:
                mini = chosen_ones.copy()
        return mini

    # Estimates how many iterations we can do in the specified period
    if iterations is None:
        t0 = time.perf_counter()
        mini_sample = shuffle_and_pick(lst, 1)
        iterations = int(period / (time.perf_counter() - t0)) - 1
    mini_result = shuffle_and_pick(lst, iterations)
    if len(mini_sample) < len(mini_result):
        return mini_sample, len(mini_sample)
    else:
        return mini_result, len(mini_result)
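For reference, a call on the example data from the question might look like this (the team returned can vary between runs because of the shuffling):

spec_skill = [[1, 3], [0, 1, 2], [0, 2, 4]]
K = 5

team, size = man_power(spec_skill, K, iterations=1000)
print(size, sorted(team))   # e.g. 2 [0, 2] -- persons 0 and 2 cover skills 0..4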
I am trying to iterate over a large list. I want a method that can iterate this list quickly, but it takes much time to iterate. Is there any method to iterate quickly, or is Python not built to do this?
My code snippet is :-
for i in THREE_INDEX:
    if check_balanced(rc, pc):
        print('balanced')
    else:
        rc, pc = equation_suffix(rc, pc, i)
Here THREE_INDEX has a length of 117649, and it takes around 4-5 minutes to iterate over it. Is there any method to iterate it quicker?
The equation_suffix function:
def equation_suffix(rn, pn, suffix_list):
    len_rn = len(rn)
    react_suffix = suffix_list[: len_rn]
    prod_suffix = suffix_list[len_rn:]
    for re in enumerate(rn):
        rn[re[0]] = add_suffix(re[1], react_suffix[re[0]])
    for pe in enumerate(pn):
        pn[pe[0]] = add_suffix(pe[1], prod_suffix[pe[0]])
    return rn, pn
The check_balanced function:
def check_balanced(rl, pl):
    total_reactant = []
    total_product = []
    reactant_name = []
    product_name = []
    for reactant in rl:
        total_reactant.append(separate_num(separate_brackets(reactant)))
    for product in pl:
        total_product.append(separate_num(separate_brackets(product)))
    for react in total_reactant:
        for key in react:
            val = react.get(key)
            val_dict = {key: val}
            reactant_name.append(val_dict)
    for prod in total_product:
        for key in prod:
            val = prod.get(key)
            val_dict = {key: val}
            product_name.append(val_dict)
    reactant_name = flatten_dict(reactant_name)
    product_name = flatten_dict(product_name)
    for elem in enumerate(reactant_name):
        val_r = reactant_name.get(elem[1])
        val_p = product_name.get(elem[1])
        if val_r == val_p:
            if elem[0] == len(reactant_name) - 1:
                return True
        else:
            return False
I believe the reason why "iterating" the list takes a long time is the methods you are calling inside the for loop. I took the methods out just to test the speed of the iteration, and it appears that iterating through a list of size 117649 is very fast. Here is my test script:
import time

start_time = time.time()
new_list = [(1, 2, 3) for i in range(117649)]
end_time = time.time()
print(f"Creating the list took: {end_time - start_time}s")

start_time = time.time()
for i in new_list:
    pass
end_time = time.time()
print(f"Iterating the list took: {end_time - start_time}s")
Output is:
Creating the list took: 0.005337953567504883s
Iterating the list took: 0.0035648345947265625s
Edit: time() returns seconds.
In general, for loops aren't an issue, but using them to build (or rebuild) lists is usually slower than using list comprehensions (or in some cases, map/filter, though those are advanced tools that are often a pessimization).
Your functions could be made significantly simpler this way, and they'd get faster to boot. Example rewrites:
def equation_suffix(rn, pn, suffix_list):
    prod_suffix = suffix_list[len(rn):]
    # Change `rn =` to `rn[:] =` if you must modify the caller's list as in your
    # original code, not just return the modified list (which would be fine in your original code)
    rn = [add_suffix(r, suffix) for r, suffix in zip(rn, suffix_list)]  # No need to slice suffix_list; zip'll stop when rn is exhausted
    pn = [add_suffix(p, suffix) for p, suffix in zip(pn, prod_suffix)]
    return rn, pn
def check_balanced(rl, pl):
    # These can be generator expressions, since they're iterated once and thrown away anyway
    total_reactant = (separate_num(separate_brackets(reactant)) for reactant in rl)
    total_product = (separate_num(separate_brackets(product)) for product in pl)
    reactant_name = []
    product_name = []
    # Use .items() to avoid repeated lookups, and concat simple listcomps to reduce calls to append
    for react in total_reactant:
        reactant_name += [{key: val} for key, val in react.items()]
    for prod in total_product:
        product_name += [{key: val} for key, val in prod.items()]
    # These calls are suspicious, and may indicate optimizations to be had on prior lines
    reactant_name = flatten_dict(reactant_name)
    product_name = flatten_dict(product_name)
    for i, (elem, val_r) in enumerate(reactant_name.items()):
        if val_r == product_name.get(elem):
            if i == len(reactant_name) - 1:
                return True
        else:
            # I'm a little suspicious of returning False the first time a single
            # key's value doesn't match. Either it's wrong, or it indicates an
            # opportunity to write short-circuiting code that doesn't have
            # to fully construct reactant_name and product_name when much of the time
            # there will be an early mismatch
            return False
I'll also note that using enumerate without unpacking the result gives worse performance and more cryptic code. In this case (and many others) enumerate isn't needed at all, since listcomps and genexprs can accomplish the same result without knowing the index. When it is needed, always unpack: for i, elem in enumerate(...): and then using i and elem separately will always run faster than for packed in enumerate(...): followed by packed[0] and packed[1] (and if you have more useful names than i and elem, it'll be much more readable to boot).
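To make the loop-versus-comprehension point concrete, here is a small, self-contained timing sketch (illustrative only; exact numbers depend on your machine, and your real functions will behave differently):

import timeit

def with_loop(n):
    out = []
    for i in range(n):
        out.append(i * 2)
    return out

def with_comprehension(n):
    return [i * 2 for i in range(n)]

# Each call builds a 117,649-element list, roughly the size of THREE_INDEX.
print(timeit.timeit(lambda: with_loop(117649), number=50))
print(timeit.timeit(lambda: with_comprehension(117649), number=50))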
I am trying to find percent match between keywords using filters, and have had some trouble getting the correct percent result when using a loop.
Here's what I've tried so far:
import pandas as pd

def percentmatch(component=[], manufacture=[]):
    dummy = 0
    for i in component:
        if i in manufacture:
            dummy += 1
    requirements = len(component)
    return (dummy/requirements)*100

def isDesired(innovator=[], manufacture=[]):
    for i in innovator:
        if i in manufacture:
            return True
    return False

part = pd.read_csv("fakedata.csv")
# Change the value for the test case
part['Size'].iloc[5] = 'Startup'

manufacture = pd.read_csv("book1.csv")

# First filter: does the manufacturer want to work with this kind of customer?
criteria = []
for i, r in manufacture.iterrows():
    criteria.append(isDesired([part['Size'].iloc[0]], r['Desired Customer**'].split(", ")))

manufacture['criteria'] = criteria
firstfilter = manufacture[criteria]
Now the second filter.
# Second filter: can the manufacturer handle a certain phase? Ex: prototype, pre-release
criteria2 = []
for i, r in firstfilter.iterrows():
    criteria2.append(isDesired([part['Phase'].iloc[0]], r['Preferred'].split(", ")))

firstfilter['criteria2'] = criteria2
secondfilter = firstfilter[criteria2]

# Third filter: find the percent match in Methods
percentmatch1 = []
for i, r in secondfilter.iterrows():
    print(r['Method'].split(", "))
    print(part['Method'].iloc[0].split(", "))
    percentmatch1.append(percentmatch([part['Method'].iloc[0].split(", ")], r['Method'].split(",")))

secondfilter['Method match'] = percentmatch1
In the above code block, my output is
['CNC Machining', '3D printing', 'Injection Molding']
['CNC Machining', '3D printing']
Doing a quick secondfilter.head() lookup gives me the following:
secondfilter.head() output here
The method match should be 100%, not 0%. How do I correct this?
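No answer is included here, but judging from the printed output above, the likely culprit is the call percentmatch([part['Method'].iloc[0].split(", ")], r['Method'].split(",")): the first argument is a list wrapped inside another list, so each i in percentmatch is the whole keyword list rather than a single keyword and never matches, and the second split uses "," instead of ", ", which leaves leading spaces on the tokens. A sketch of the corrected loop, under those assumptions:

# Hypothetical fix: pass the split list itself (no extra brackets) and split
# the manufacturer's methods on ", " so the tokens line up exactly.
component_methods = part['Method'].iloc[0].split(", ")

percentmatch1 = []
for i, r in secondfilter.iterrows():
    percentmatch1.append(percentmatch(component_methods, r['Method'].split(", ")))

secondfilter['Method match'] = percentmatch1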
I have a set of data which has an ID, a timestamp, and identifiers. I have to go through it, calculate the entropy, and save some other links for the data. At each step more identifiers are added to the identifiers dictionary, and I have to re-compute the entropy and append it. I have a really large amount of data, and the program gets stuck due to the growing number of identifiers and their entropy calculation after each step. I read the following solution, but it is about data consisting of numbers.
Incremental entropy computation
I have copied two functions from this page and the incremental calculation of entropy gives different values than the classical full entropy calculation at every step.
Here is the code I have:
from math import log

# ---------------------------------------------------------------------#
# Functions copied from https://stackoverflow.com/questions/17104673/incremental-entropy-computation

# maps x to -x*log2(x) for x>0, and to 0 otherwise
h = lambda p: -p*log(p, 2) if p > 0 else 0

# entropy of union of two samples with entropies H1 and H2
def update(H1, S1, H2, S2):
    S = S1 + S2
    return 1.0*H1*S1/S + h(1.0*S1/S) + 1.0*H2*S2/S + h(1.0*S2/S)

# compute entropy using the classic equation
def entropy(L):
    n = 1.0*sum(L)
    return sum([h(x/n) for x in L])
# ---------------------------------------------------------------------#

# Below is the input data (actually I read it from a csv file)
input_data = [["1","2008-01-06T02:13:38Z","foo,bar"], ["2","2008-01-06T02:12:13Z","bar,blup"], ["3","2008-01-06T02:13:55Z","foo,bar"],
              ["4","2008-01-06T02:12:28Z","foo,xy"], ["5","2008-01-06T02:12:44Z","foo,bar"], ["6","2008-01-06T02:13:00Z","foo,bar"],
              ["7","2008-01-06T02:13:00Z","x,y"]]

total_identifiers = {}   # To store the occurrences of identifiers. Values show the number of occurrences
all_entropies = []       # Classical way of calculating entropy at every step
updated_entropies = []   # Incremental way of calculating entropy at every step

for item in input_data:
    temp = item[2].split(",")
    identifiers_sum = sum(total_identifiers.values())  # Sum of all identifiers
    old_entropy = 0 if all_entropies[-1:] == [] else all_entropies[-1]  # Get previous entropy calculation
    for identifier in temp:
        S_new = len(temp)  # sum of new samples
        temp_dictionaty = {a: 1 for a in temp}  # Store current identifiers and their occurrence
        if identifier not in total_identifiers:
            total_identifiers[identifier] = 1
        else:
            total_identifiers[identifier] += 1

    current_entropy = entropy(total_identifiers.values())  # Entropy for current set of identifiers
    updated_entropy = update(old_entropy, identifiers_sum, current_entropy, S_new)
    updated_entropies.append(updated_entropy)

    entropy_value = entropy(total_identifiers.values())  # Classical entropy calculation for comparison. This step becomes too expensive with big data
    all_entropies.append(entropy_value)

print(total_identifiers)
print('Sum of Total Identifiers: ', identifiers_sum)  # Gives 12 while the sum is 14 ???
print("All Classical Entropies: ", all_entropies)     # print for comparison
print("All Updated Entropies: ", updated_entropies)
The other issue is that when I print "Sum of Total Identifiers", it gives 12 instead of 14! (Due to the very large amount of data, I read the actual file line by line and write the results directly to disk; I do not store anything in memory apart from the dictionary of identifiers.)
The code above uses Theorem 4; it seems to me that you want to use Theorem 5 instead (from the paper in the next paragraph).
Note, however, that if the number of identifiers is really the problem, then the incremental approach below isn't going to work either; at some point the dictionaries are going to get too large.
Below you can find a proof-of-concept Python implementation that follows the description from "Updating Formulas and Algorithms for Computing Entropy and Gini Index from Time-Changing Data Streams".
import collections
import math
import random

def log2(p):
    return math.log(p, 2) if p > 0 else 0

CountChange = collections.namedtuple('CountChange', ('label', 'change'))

class EntropyHolder:
    def __init__(self):
        self.counts_ = collections.defaultdict(int)
        self.entropy_ = 0
        self.sum_ = 0

    def update(self, count_changes):
        r = sum([change for _, change in count_changes])
        residual = self._compute_residual(count_changes)
        self.entropy_ = self.sum_ * (self.entropy_ - log2(self.sum_ / (self.sum_ + r))) / (self.sum_ + r) - residual
        self._update_counts(count_changes)
        return self.entropy_

    def _compute_residual(self, count_changes):
        r = sum([change for _, change in count_changes])
        residual = 0
        for label, change in count_changes:
            p_new = (self.counts_[label] + change) / (self.sum_ + r)
            p_old = self.counts_[label] / (self.sum_ + r)
            residual += p_new * log2(p_new) - p_old * log2(p_old)
        return residual

    def _update_counts(self, count_changes):
        for label, change in count_changes:
            self.sum_ += change
            self.counts_[label] += change

    def entropy(self):
        return self.entropy_

def naive_entropy(counts):
    s = sum(counts)
    return sum([-(r/s) * log2(r/s) for r in counts])

if __name__ == '__main__':
    print(naive_entropy([1, 1]))
    print(naive_entropy([1, 1, 1, 1]))

    entropy = EntropyHolder()
    freq = collections.defaultdict(int)
    for _ in range(100):
        index = random.randint(0, 5)
        entropy.update([CountChange(index, 1)])
        freq[index] += 1

    print(naive_entropy(freq.values()))
    print(entropy.entropy())
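As a quick sanity check on the demo above (the expected values follow from the standard formula H = -sum(p * log2(p)), not from anything specific to this implementation):

# Two equally likely labels carry 1 bit of entropy, four carry 2 bits.
assert abs(naive_entropy([1, 1]) - 1.0) < 1e-9
assert abs(naive_entropy([1, 1, 1, 1]) - 2.0) < 1e-9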
Thanks @blazs for providing the entropy_holder class; that solves the problem. The idea is to import entropy_holder.py (from https://gist.github.com/blazs/4fc78807a96976cc455f49fc0fb28738) and use it to store the previous entropy and update it at every step when new identifiers come in.
So the minimum working code would look like this:
import entropy_holder

input_data = [["1","2008-01-06T02:13:38Z","foo,bar"], ["2","2008-01-06T02:12:13Z","bar,blup"], ["3","2008-01-06T02:13:55Z","foo,bar"],
              ["4","2008-01-06T02:12:28Z","foo,xy"], ["5","2008-01-06T02:12:44Z","foo,bar"], ["6","2008-01-06T02:13:00Z","foo,bar"],
              ["7","2008-01-06T02:13:00Z","x,y"]]

entropy = entropy_holder.EntropyHolder()  # This class will hold the current entropy and counts of identifiers
for item in input_data:
    for identifier in item[2].split(","):
        entropy.update([entropy_holder.CountChange(identifier, 1)])

print(entropy.entropy())
The entropy computed with blazs's incremental formulas is very close to the entropy calculated the classical way, and it saves us from iterating over all the data again and again.
import itertools
import numpy as np

def models():
    default = [0.6, 0.67, 2.4e-2, 1e-2, 2e-5, 1.2e-3, 2e-5]
    lower = [np.log10(i/10) for i in default]
    upper = [np.log10(i*10) for i in default]
    n = 5
    a = np.logspace(lower[0], upper[0], n)
    b = np.logspace(lower[1], upper[1], n)
    c = np.logspace(lower[2], upper[2], n)
    d = np.logspace(lower[3], upper[3], n)
    e = np.logspace(lower[4], upper[4], n)
    f = np.logspace(lower[5], upper[5], n)
    g = np.logspace(lower[6], upper[6], n)

    combs = itertools.product(a, b, c, d, e, f, g)
    list1 = []
    for x in combs:
        x = list(x)
        list1.append(x)
    return list1
The code above returns a list of 5^7 = 78,125 lists. Is there a way I can combine the items in a, b, c, d, e, f, g, possibly randomly, to create a list of, say, 10,000 lists?
You could take random samples of each array and combine them, especially if you don't need to guarantee that specific combinations don't occur more than once:
import itertools
import numpy as np
import random

def random_models(num_values):
    n = 5
    default = [0.6, 0.67, 2.4e-2, 1e-2, 2e-5, 1.2e-3, 2e-5]
    ranges = zip((np.log10(i/10) for i in default),
                 (np.log10(i*10) for i in default))
    data_arrays = []
    for lower, upper in ranges:
        data_arrays.append(np.logspace(lower, upper, n))
    results = []
    for i in xrange(num_values):
        results.append([random.choice(arr) for arr in data_arrays])
    return results

l = random_models(10000)
print len(l)
Here's a version that will avoid repeats up until you request more data than can be given without repeating:
def random_models_avoid_repeats(num_values):
    n = 5
    default = [0.6, 0.67, 2.4e-2, 1e-2, 2e-5, 1.2e-3, 2e-5]

    # Build the range data (tuples of (lower, upper) range)
    ranges = zip((np.log10(i/10) for i in default),
                 (np.log10(i*10) for i in default))

    # Create the data arrays to sample from
    data_arrays = []
    for lower, upper in ranges:
        data_arrays.append(np.logspace(lower, upper, n))

    sequence_data = []
    for entry in itertools.product(*data_arrays):
        sequence_data.append(entry)

    results = []

    # Holds the current choices to choose from. The data will come from
    # sequence_data above, but randomly shuffled. Values are popped off the
    # end to keep things efficient. It's possible to ask for more data than
    # the samples can give without repeats. In that case, we'll reload
    # temp_data, randomly shuffle again, and start the process over until we've
    # delivered the number of desired results.
    temp_data = []

    # Build the lists
    for i in xrange(num_values):
        if len(temp_data) == 0:
            temp_data = sequence_data[:]
            random.shuffle(temp_data)
        results.append(temp_data.pop())

    return results
Also note that we can avoid building a results list if you make this a generator by using yield. However, you'd want to consume the results using a for statement as well:
def random_models_avoid_repeats_generator(num_values):
    n = 5
    default = [0.6, 0.67, 2.4e-2, 1e-2, 2e-5, 1.2e-3, 2e-5]

    # Build the range data (tuples of (lower, upper) range)
    ranges = zip((np.log10(i/10) for i in default),
                 (np.log10(i*10) for i in default))

    # Create the data arrays to sample from
    data_arrays = []
    for lower, upper in ranges:
        data_arrays.append(np.logspace(lower, upper, n))

    sequence_data = []
    for entry in itertools.product(*data_arrays):
        sequence_data.append(entry)

    # Holds the current choices to choose from. The data will come from
    # sequence_data above, but randomly shuffled. Values are popped off the
    # end to keep things efficient. It's possible to ask for more data than
    # the samples can give without repeats. In that case, we'll reload
    # temp_data, randomly shuffle again, and start the process over until we've
    # delivered the number of desired results.
    temp_data = []

    # Build the lists
    for i in xrange(num_values):
        if len(temp_data) == 0:
            temp_data = sequence_data[:]
            random.shuffle(temp_data)
        yield temp_data.pop()
You'd have to use it like this:
for entry in random_models_avoid_repeats_generator(10000):
    # Do stuff...
Or manually iterate over it using next().
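For example, a minimal sketch of manual consumption (assuming the generator above is defined):

gen = random_models_avoid_repeats_generator(10000)
first = next(gen)    # one 7-value combination
second = next(gen)   # the next one; no repeats until the pool is exhausted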