I have a dataset consisting of 2 million records. I am trying to convert the values to numbers and save the mapping in a dictionary, then use that dictionary as a lookup table, all in order to reduce the size of the file.
the data looks like this
[{
'a' : ['one','two'],
'b' : 'fine',
'c' : ['help']},
{
'a' : ['four','hen'],
'b' : 'happy',
'c' : ['mouse']},
{
'a' : ['two','hen'],
'b' : 'fine'}.......]
def convertDataToNumber(newdata):
    """Replace the values of every record with small integer IDs.

    Each distinct value is given the next free ID (starting at 1) the first
    time it is encountered, scanning the records in order and the keys of
    each record in the record's own iteration order.

    * A list value becomes the list of its elements' IDs.
    * A string value becomes a one-element list holding its ID.
    * A value of any other type is left untouched.

    The ID table is a single dict, so the "have we seen this value before?"
    check is a constant-time hash lookup.  Keeping the known values in a list
    and testing membership with ``in`` makes every check a linear scan, which
    is quadratic over millions of values.

    Args:
        newdata: iterable of dicts whose values are strings or lists of
            hashable items.  The dicts are modified in place.

    Returns:
        A ``(lookup, newdata)`` tuple: ``lookup`` maps each original value to
        its ID, and ``newdata`` is the same object that was passed in, now
        holding IDs.
    """
    dataRD = {}
    for d in newdata:
        # Only values of existing keys are reassigned, so the dict never
        # changes size while it is being iterated.
        for key, val in d.items():
            if isinstance(val, list):
                # setdefault returns the existing ID, or stores and returns
                # the next unused one (current table size + 1).
                d[key] = [dataRD.setdefault(x, len(dataRD) + 1) for x in val]
            elif isinstance(val, str):
                d[key] = [dataRD.setdefault(val, len(dataRD) + 1)]
    return dataRD, newdata
Is there a better way to convert the values to numbers?
Currently it takes around 1 hour to execute this operation.
output:
[{'a' : [1,2],
'b':[3],
'c':[4]},
{'a' : [5,6],
'b':[7],
'c':[8]},
{'a':[2,6],
'b':[3]}]
You can keep a dict that maps each string to a unique integer ID. Iterate over the records once; for each string, if it is already in the dict, reuse its number, and if it is not, store the next unused number for it. Because a dict lookup takes constant time on average, the whole conversion is a single pass over the data.
def cnvrt_num(v, mem_cat):
    """Translate ``v`` into a list of integer IDs.

    Values not yet present in ``mem_cat`` are registered with the next free
    ID (current size of ``mem_cat`` plus one).

    Args:
        v: either a list of hashable values or a single hashable value.
        mem_cat: dict mapping value -> ID; updated in place.

    Returns:
        A list of IDs: one per element when ``v`` is a list, otherwise a
        one-element list for ``v`` itself.
    """
    # Treat a lone value as a one-item list so both cases share one loop.
    items = v if isinstance(v, list) else [v]
    ids = []
    for item in items:
        if item not in mem_cat:
            mem_cat[item] = len(mem_cat) + 1
        ids.append(mem_cat[item])
    return ids
mem_cat = {}
for dct in lst:
for k,v in dct.items():
dct[k] = cnvrt_num(v, mem_cat)
print(mem_cat)
# {'one': 1, 'two': 2, 'fine': 3, 'help': 4, 'four': 5, 'hen': 6, 'happy': 7, 'mouse': 8}
print(lst)
[
{'a': [1, 2], 'b': [3], 'c': [4]},
{'a': [5, 6], 'b': [7], 'c': [8]},
{'a': [2, 6], 'b': [3]}
]
Input:
lst = [
{'a' : ['one','two'],'b' : 'fine','c' : ['help']},
{'a' : ['four','hen'],'b' : 'happy','c' : ['mouse']},
{'a' : ['two','hen'],'b' : 'fine'}]
This is a follow-up to my previous question here. I have a optimization model that tries to find the highest coverage of a set of probe to a sequence. I approached it by creating an overlap matrix as shown below.
import pyomo
import pyomo.environ as pe
import pyomo.opt as po
import numpy as np
import matplotlib.pyplot as plt
# Initialise all sequences and probes
sequence = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
probes = ["a", "b", "c", "d", "e", "f", "g", "h"]
probe_starts = {"a": 0, "b": 1, "c": 4, "d": 5, "e": 6, "f": 8, "g": 13, "h": 12}
probe_ends = {"a": 2, "b": 2, "c": 6, "d": 6, "e": 8, "f": 11, "g": 15, "h": 14}
probe_lengths = {
p: e - s + 1 for (p, s), e in zip(probe_starts.items(), probe_ends.values())
}
# Create a matrix of probes against probes to check for overlap
def is_overlapping(x, y):
    """Tell whether two inclusive ranges share at least one position.

    Args:
        x: ``(start, end)`` pair of the first range.
        y: ``(start, end)`` pair of the second range.

    Returns:
        True if an endpoint of either range lies within the other range
        (bounds inclusive), otherwise False.
    """
    x_start, x_end = x
    y_start, y_end = y
    # An endpoint of x falls inside y ...
    if y_start <= x_start <= y_end or y_start <= x_end <= y_end:
        return True
    # ... or an endpoint of y falls inside x (covers y nested within x).
    return x_start <= y_start <= x_end or x_start <= y_end <= x_end
overlap = {}
matrix = np.zeros((len(probes), len(probes)))
for row, x in enumerate(zip(probe_starts.values(), probe_ends.values())):
for col, y in enumerate(zip(probe_starts.values(), probe_ends.values())):
matrix[row, col] = is_overlapping(x, y)
overlap[probes[row]] = list(matrix[row].astype(int))
I now build up my model as normal, adding a constraint that if one probe is assigned, then any probes overlapping it cannot be assigned.
# Model definition
model = pe.ConcreteModel()
model.probes = pe.Set(initialize=probes)
model.lengths = pe.Param(model.probes, initialize=probe_lengths)
model.overlap = pe.Param(model.probes, initialize=overlap, domain=pe.Any)
model.assign = pe.Var(model.probes, domain=pe.Boolean)
# Objective - highest coverage
obj = sum(model.assign[p] * probe_lengths[p] for p in model.probes)
model.objective = pe.Objective(expr=obj, sense=pe.maximize)
# Constraints
model.no_overlaps = pe.ConstraintList()
for query in model.probes:
model.no_overlaps.add(
sum(
[
model.assign[query] * model.assign[p]
for idx, p in enumerate(model.probes)
if model.overlap[query][idx]
]
)
<= 1
)
This works when solving with the quadratic BONMIN solver as shown below. However, when scaling up to a few thousand probes with significantly more overlap, this becomes prohibitively slow.
solver = po.SolverFactory("BONMIN")
results = solver.solve(model)
visualize = np.zeros((len(probes), len(sequence)))
for idx, (start, end, val) in enumerate(
zip(probe_starts.values(), probe_ends.values(), model.assign.get_values().values())
):
visualize[idx, start : end + 1] = val + 1
plt.imshow(visualize)
plt.yticks(ticks=range(len(probes)), labels=probes)
plt.xticks(range(len(sequence)))
plt.colorbar()
plt.show()
Any suggestions regarding how to convert this into a linear problem would be appreciated. Thanks in advance!
You can attack this as an Integer Program (IP). There are 2 variables you need: one to indicate whether a probe has been "assigned" and another to indicate (or count) if a spot s in the sequence is covered by probe p in order to do the accounting.
It also helps to chop up the sequence into subsets (shown) that are indexed by the probes which could cover them, if assigned.
There is probably a dynamic programming approach to this as well that somebody might chip in. This works...
Code:
# model to make non-contiguous connections across a sequence
# with objective to "cover" as many points in sequence as possible
import pyomo.environ as pe
sequence = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
probes = ["a", "b", "c", "d", "e", "f", "g", "h"]
probe_starts = {"a": 0, "b": 1, "c": 4, "d": 5, "e": 6, "f": 8, "g": 13, "h": 12}
probe_ends = {"a": 2, "b": 2, "c": 6, "d": 6, "e": 8, "f": 11, "g": 15, "h": 14}
# sequence = [0, 1, 2, 3, 4, 5]
# probes = ["a", "b", "c"]
# probe_starts = {"a": 0, "b": 2, "c": 3}
# probe_ends = {"a": 2, "b": 4, "c": 5}
coverages = {p:[t for t in sequence if t>=probe_starts[p] and t<=probe_ends[p]] for p in probes}
# Model definition
model = pe.ConcreteModel()
model.sequence = pe.Set(initialize=sequence)
model.probes = pe.Set(initialize=probes)
# make an indexed set as convenience of probes:coverage ...
model.covers = pe.Set(model.probes, within=model.sequence, initialize=coverages)
model.covers_flat_set = pe.Set(initialize=[(p,s) for p in probes for s in model.covers[p]])
model.assign = pe.Var(model.probes, domain=pe.Binary) # 1 if probe p is used...
model.covered = pe.Var(model.covers_flat_set, domain=pe.Binary) # s is covered by p
# model.pprint()
# Objective
obj = sum(model.covered[p, s] for (p, s) in model.covers_flat_set)
model.objective = pe.Objective(expr=obj, sense=pe.maximize)
# Constraints
# selected probe must cover the associated points between start and end, if assigned
def cover(model, p):
    """Constraint rule tying probe ``p``'s coverage to its assignment.

    The sum of ``model.covered[p, s]`` over every position ``s`` in
    ``model.covers[p]`` must equal the number of those positions multiplied
    by ``model.assign[p]``.

    Args:
        model: the model holding ``covered``, ``covers`` and ``assign``.
        p: the probe this constraint instance is built for.

    Returns:
        The equality relating total coverage to the assignment of ``p``.
    """
    positions = model.covers[p]
    covered_total = sum(model.covered[p, s] for s in positions)
    required_total = len(model.covers[p]) * model.assign[p]
    return covered_total == required_total
model.C1 = pe.Constraint(model.probes, rule=cover)
# cannot cover any point by more than 1 probe
def over_cover(model, s):
    """Constraint rule limiting position ``s`` to at most one covering probe.

    Args:
        model: the model holding ``probes``, ``covers_flat_set`` and
            ``covered``.
        s: the sequence position this constraint instance is built for.

    Returns:
        ``pe.Constraint.Skip`` when no ``(probe, s)`` pair exists in
        ``model.covers_flat_set``; otherwise the inequality stating that the
        ``covered`` variables for ``s`` sum to at most 1.
    """
    candidates = [
        (p, s) for p in model.probes if (p, s) in model.covers_flat_set
    ]
    if candidates:
        return sum(model.covered[pair] for pair in candidates) <= 1
    return pe.Constraint.Skip  # no possible coverages
model.C2 = pe.Constraint(model.sequence, rule=over_cover)
solver = pe.SolverFactory('glpk')
result = solver.solve(model)
print(result)
#model.display()
# el-cheapo visualization...
for s in model.sequence:
probe = None
print(f'{s:3d}', end='')
for p in model.probes:
if (p, s) in model.covers_flat_set and model.assign[p].value:
probe = p
if probe:
print(f' {probe}')
else:
print()
Yields:
Problem:
- Name: unknown
Lower bound: 13.0
Upper bound: 13.0
Number of objectives: 1
Number of constraints: 24
Number of variables: 32
Number of nonzeros: 55
Sense: maximize
Solver:
- Status: ok
Termination condition: optimal
Statistics:
Branch and bound:
Number of bounded subproblems: 5
Number of created subproblems: 5
Error rc: 0
Time: 0.007474184036254883
Solution:
- number of solutions: 0
number of solutions displayed: 0
0 a
1 a
2 a
3
4 c
5 c
6 c
7
8 f
9 f
10 f
11 f
12 h
13 h
14 h
15
16
[Finished in 609ms]
I am trying to convert
a = "546"
to
a = 546
without using any library functions.
The "purest" I can think of:
>>> a = "546"
>>> result = 0
>>> for digit in a:
result *= 10
for d in '0123456789':
result += digit > d
>>> result
546
Or using @Ajax1234's dictionary idea, if that's allowed:
>>> a = "546"
>>> value = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9}
>>> result = 0
>>> for digit in a:
result = 10 * result + value[digit]
>>> result
546
You can keep a dictionary that maps each digit character to its integer value, and then iterate over the string. While iterating, keep a counter for the current index, raise 10 to the power (length - index - 1), and multiply that by the digit's value from the dictionary:
a = "546"
length = 0
for i in a:
length += 1
d = {'1': 1, '0': 0, '3': 3, '2': 2, '5': 5, '4': 4, '7': 7, '6': 6, '9': 9, '8': 8}
count = 0
counter = 0
for i in a:
count += (10**(length-counter-1)*d[i])
counter += 1
print(count)
Output:
546
The trick is that 546 = 500 + 40 + 6, or 5*10^2 + 4*10^1 + 6*10^0.
Note how the exponent is just the index (in reverse). Using that, you can generalize this approach into a function:
def strToInt(number):
    """Convert a string of decimal digits into the integer it spells.

    Uses positional notation: "546" is 5*10**2 + 4*10**1 + 6*10**0, so the
    character at index ``i`` is weighted by 10 ** (len(number) - 1 - i).

    No validation is performed: each character is mapped with
    ``ord(ch) - ord('0')``, whatever the character is.

    Args:
        number: the string to convert.

    Returns:
        The accumulated integer; 0 for an empty string.
    """
    top_exponent = len(number) - 1  # exponent of the leftmost character
    return sum(
        (ord(ch) - ord('0')) * 10 ** (top_exponent - pos)
        for pos, ch in enumerate(number)
    )
And you can use it like so:
>>> strToInt("546")
546
def stringToInt(s):
    """Convert a string of decimal digits to an integer.

    Reads the characters left to right; at each step the running total is
    multiplied by 10 and the current digit's value is added.

    Args:
        s: string made up of the characters '0'-'9'.

    Returns:
        The integer value; 0 for an empty string.

    Raises:
        KeyError: if ``s`` contains a character that is not a decimal digit.
    """
    digit_values = {ch: n for n, ch in enumerate('0123456789')}
    total = 0
    for ch in s:
        total = total * 10 + digit_values[ch]
    return total
def int(a):
    """Convert ``a`` to an integer without calling the built-in ``int``.

    NOTE: this function shadows the built-in ``int`` in its module.  Inside
    the body the name ``int`` therefore refers to this function, not to the
    integer type, so all type dispatch is done on the class *name*.

    Args:
        a: one of
            * ``str`` of decimal digits,
            * ``bytes`` of ASCII decimal digits,
            * ``int`` (returned unchanged),
            * ``float`` (rounded down with ``__floor__``; note that this maps
              -1.5 to -2, whereas the built-in ``int`` truncates toward zero).

    Returns:
        The integer value.  An empty ``str``/``bytes`` gives 0.

    Raises:
        TypeError: if ``a`` is not a str, int, float or bytes instance.
        ValueError: if a str/bytes input contains a non-digit character.
    """
    ty = a.__class__.__name__
    di = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
          '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}
    if ty not in ("str", "int", "float", "bytes"):
        raise TypeError("unsupported format")
    if ty == "float":
        return a.__floor__()
    if ty == "int":
        return a
    out = 0
    for val in a:
        if ty == "bytes":
            # Iterating bytes yields integer code points; map them back to
            # one-character strings so the digit table applies.
            val = chr(val)
        if val not in di:
            raise ValueError("invalid input")
        # Shift the digits read so far one decimal place left and append.
        out = out * 10 + di[val]
    return out
print(int("55"))
55
You can loop through the string and perform the operation on each character using ord.
example:
a="546"
num=0
for i in a:
num = num * 10 + ord(i) - ord('0')
astr = "1234"
num = 0
for index,val in enumerate(astr[::-1]):
res = (ord(val) - ord('0')) * (10 ** index)
num += (res)
a=input()
r=0
for i in a:
r=r*10+(ord(i)-ord("0"))
print(r)
print(type(r))
This question already has answers here:
Modifying list while iterating [duplicate]
(7 answers)
Closed 8 years ago.
I want to iterate through a list, and remove the items that count more than once, so they don't get printed repeatedly by the for loop.
However, some items appearing only one time in the list seem to get affected too by this, and I can't figure out why.
Any input would be greatly appreciated.
Example Output:
listy = [2,2,1,3,4,2,1,2,3,4,5]
for i in listy:
if listy.count(i)>1:
print i, listy.count(i)
while i in listy: listy.remove(i)
else:
print i, listy.count(i)
Outputs:
2 4
3 2
1 2
thus ignoring completely 4 and 5.
You should not modify a list while iterating over it. This one should work:
listy = [2,2,1,3,4,2,1,2,3,4,5]
found = set()
for i in listy:
if not i in found:
print i, listy.count(i)
found.add(i)
The result is:
2 4
1 2
3 2
4 2
5 1
The reason for your problems is that you modify the list while you are iterating over it.
If you don't care about the order in which items appear in the output and don't care about the count, you can simply use a set:
>>> listy = [2,2,1,3,4,2,1,2,3,4,5]
>>> print set(listy)
set([1, 2, 3, 4, 5])
If you do care about the count, use the Counter class from the collections module in the Standard Library:
>>> import collections
>>> collections.Counter(listy)
Counter({2: 4, 1: 2, 3: 2, 4: 2, 5: 1})
>>> c = collections.Counter(listy)
>>> for item in c.iteritems():
... print "%i has a count of %i" % item
...
1 has a count of 2
2 has a count of 4
3 has a count of 2
4 has a count of 2
5 has a count of 1
If you do care about both the order and the count, you have to build a second list:
>>> checked = []
>>> counts = []
>>> for item in listy:
>>> if item not in checked:
>>> checked.append(item)
>>> counts.append(listy.count(item))
>>> print zip(checked, counts)
... [(2, 4), (1, 2), (3, 2), (4, 2), (5, 1)]
This is the least efficient solution, of course.
If you don't want to keep the counts for later, you don't need the counts list:
listy = [2,2,1,3,4,2,1,2,3,4,5]
checked = set()
for item in listy:
# "continue early" looks better when there is lots of code for
# handling the other case
if item in checked:
continue
checked.add(item)
print item, listy.count(item)
Don't modify a list while iterating over it, it will mess you up every time:
listy = [2,2,1,3,4,2,1,2,3,4,5]
# * * * Get hit
for i in listy:
print i
if listy.count(i) > 1:
print i, listy.count(i), 'item and occurences'
while i in listy: listy.remove(i)
else:
print i, listy.count(i)
First, you remove four 2s. Two are right at the beginning, so that puts you at the first 1.
Then you advance one when you get the next i from listy, putting you at the first 3.
Then you remove two 3s. The first is right there, so that puts you at the first 4.
Then you advance one again. The 2 is gone already, so this puts you at the second 1.
You then delete both 1s; this moves you forward two spaces. The 2 and 3 are gone, so this puts you at the 5.
You advance one, this moves you off the end of the list so the loop is over.
If what you want is to print each item only once, you can use the simple set method, or you could use the itertools unique_everseen recipe:
def unique_everseen(iterable, key=None):
    """List unique elements, preserving order. Remember all elements ever seen.

    Args:
        iterable: the items to filter; they (or their keys) must be hashable.
        key: optional one-argument function; when given, two elements count
            as duplicates if ``key`` returns equal values for them.

    Yields:
        Each element whose key (or the element itself when ``key`` is None)
        has not been produced before, in order of first appearance.
    """
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    # A single plain loop serves both cases and avoids relying on
    # itertools' ifilterfalse, which must be imported separately and only
    # exists under that name in Python 2.
    for element in iterable:
        k = element if key is None else key(element)
        if k not in seen:
            seen_add(k)
            yield element
Which extends the basic set version to allow you to specify a special way to compare items.
If you want to know which items are only in the list once:
listy2 = filter(lambda i: listy.count(i) == 1, listy)
listy2 now has all the single occurrences.
If you don't like the lambda, just do:
def getsingles(listy):
    """Build a predicate that recognises items occurring exactly once.

    Args:
        listy: the list whose occurrence counts are consulted each time the
            returned predicate is called.

    Returns:
        A one-argument function returning True when its argument appears
        exactly once in ``listy``.
    """
    def occurs_once(item):
        occurrences = listy.count(item)
        return occurrences == 1

    return occurs_once
then:
listy2 = filter(getsingles(listy), listy)
This makes a special function that will tell you which items are in listy only once.
The reason of the behavior you get is here, in the note:
http://docs.python.org/reference/compound_stmts.html#index-811
Update 1
agf's solution isn't a good one for performance reasons: the list is filtered according to the count of each element, and that count is recomputed for every element. In other words, the entire list is traversed once per element, so the work grows quadratically with the length of the list. That wastes a lot of time; imagine a list of length 1000.
A better solution I think is to use an instance of Counter:
import random
from collections import Counter
li = [ random.randint(0,20) for i in xrange(30)]
c = Counter(li)
print c
print type(c)
res = [ k for k in c if c[k]==1]
print res
result
Counter({8: 5, 0: 3, 4: 3, 9: 3, 2: 2, 5: 2, 11: 2, 3: 1, 6: 1, 10: 1, 12: 1, 15: 1, 16: 1, 17: 1, 18: 1, 19: 1, 20: 1})
<class 'collections.Counter'>
[3, 6, 10, 12, 15, 16, 17, 18, 19, 20]
Another solution would be to add each element to a set as it is read, so that the program does not count an element it has already seen.
Update 2
Errr... my solution misses the point: you don't want to select only the elements that appear exactly once in the list.
Then the following code is the right one, I think:
import random
from collections import Counter
listy = [ random.randint(0,20) for i in xrange(30)]
print 'listy==',listy
print
c = Counter(listy)
print c
print type(c)
print
slimmed_listy = []
for el in listy:
if el in c:
slimmed_listy.append(el)
print 'element',el,' count ==',c[el]
del c[el]
print
print 'slimmed_listy==',slimmed_listy
result
listy== [13, 10, 1, 1, 13, 11, 18, 15, 3, 15, 12, 11, 15, 18, 11, 10, 14, 10, 20, 3, 18, 9, 11, 2, 19, 15, 5, 14, 1, 1]
Counter({1: 4, 11: 4, 15: 4, 10: 3, 18: 3, 3: 2, 13: 2, 14: 2, 2: 1, 5: 1, 9: 1, 12: 1, 19: 1, 20: 1})
<class 'collections.Counter'>
element 13 count == 2
element 10 count == 3
element 1 count == 4
element 11 count == 4
element 18 count == 3
element 15 count == 4
element 3 count == 2
element 12 count == 1
element 14 count == 2
element 20 count == 1
element 9 count == 1
element 2 count == 1
element 19 count == 1
element 5 count == 1
slimmed_listy== [13, 10, 1, 11, 18, 15, 3, 12, 14, 20, 9, 2, 19, 5]
In case you wouldn't want the result in the order of listy, the code would be even simpler
Update 3
If you want only to print, then I propose:
import random
from collections import Counter
listy = [ random.randint(0,20) for i in xrange(30)]
print 'listy==',listy
print
def gener(li):
    """Yield ``(element, count)`` once per distinct element of ``li``.

    Pairs come out in the order in which each element first appears in
    ``li``; ``count`` is the number of times the element occurs in ``li``.

    Args:
        li: a re-iterable collection of hashable items (it is traversed once
            to count and once to emit).
    """
    remaining = Counter(li)
    for item in li:
        if item not in remaining:
            continue  # this element was already reported
        # Removing the entry marks the element as done and hands back its count.
        yield item, remaining.pop(item)
print '\n'.join('element %4s count %4s' % x for x in gener(listy))
result
listy== [16, 2, 4, 9, 15, 19, 1, 1, 3, 5, 12, 15, 12, 3, 17, 13, 8, 11, 4, 6, 15, 1, 0, 1, 3, 3, 6, 5, 0, 8]
element 16 count 1
element 2 count 1
element 4 count 2
element 9 count 1
element 15 count 3
element 19 count 1
element 1 count 4
element 3 count 4
element 5 count 2
element 12 count 2
element 17 count 1
element 13 count 1
element 8 count 2
element 11 count 1
element 6 count 2
element 0 count 2
Modifying a list while you iterate over it is a bad idea in every language I have encountered. My suggestion: don't do that. Here are some better ideas.
Use a set to get each distinct value once
source = [2,2,1,3,4,2,1,2,3,4,5]
for s in set(source):
print s
And you get this:
>>> source = [2,2,1,3,4,2,1,2,3,4,5]
>>> for s in set(source):
... print s
...
1
2
3
4
5
If you want the counts, use defaultdict
from collections import defaultdict
d = defaultdict(int)
source = [2,2,1,3,4,2,1,2,3,4,5]
for s in source:
d[s] += 1
for k, v in d.iteritems():
print k, v
You'll get this:
>>> for k, v in d.iteritems():
... print k, v
...
1 2
2 4
3 2
4 2
5 1
If you want your results sorted by count, use sorted and operator.itemgetter
import operator
for k, v in sorted(d.iteritems(), key=operator.itemgetter(1)):
print k, v
You'll get this:
>>> import operator
>>> for k, v in sorted(d.iteritems(), key=operator.itemgetter(1)):
... print k, v
...
5 1
1 2
3 2
4 2
2 4
I am not sure if it is a good idea to iterate the list and remove elements at the same time. If you really just want to output all items and their number of occurrences, I would do it like this:
listy = [2,2,1,3,4,2,1,2,3,4,5]
listx = []
listc = []
for i in listy:
if not i in listx:
listx += [i]
listc += [listy.count(i)]
for x, c in zip(listx, listc):
print x, c
Like agf said, modifying a list while you iterate it will cause problems. You could solve your code by using while and pop:
single_occurrences = []
while listy:
i = listy.pop(0)
count = listy.count(i)+1
if count > 1:
print i, count
while i in listy: listy.remove(i)
else:
print i, count
single_occurrences.append(i)
Output:
2 4
1 2
3 2
4 2
5 1
One way to do that would be to create a result list and test whether the tested value is in it :
res=[]
listy = [2,2,1,3,4,2,1,2,3,4,5]
for i in listy:
if listy.count(i)>1 and i not in res:
res.append(i)
for i in res:
print i, listy.count(i)
Result :
2 4
1 2
3 2
4 2