Unique values from multiple columns in pandas - python

distinct_values = df.col_name.unique().compute()
But what if I don't know the names of the columns?

I think you need:
df = pd.DataFrame({"colA": ['a', 'b', 'b', 'd', 'e'], "colB": [1, 2, 1, 2, 1]})
# df.columns lists every column label, so no column name has to be known
# in advance. .tolist() turns the array returned by .unique() into native
# Python values, which is what the printed output below shows.
unique_dict = {col: df[col].unique().tolist() for col in df.columns}
Output:
{'colA': ['a', 'b', 'd', 'e'], 'colB': [1, 2]}

You can try this,
>>> import pandas as pd
>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 5]})
>>> d = dict()
>>> d['any_column_name'] = pd.unique(df.values.ravel('K'))
>>> d
{'any_column_name': array([1, 2, 3, 5])}
or for just one feature,
>>> d = dict()
>>> d['a'] = df['a'].unique()
>>> d
{'a': array([1, 2, 3])}
or individually for all,
>>> d = dict()
>>> for col in df.columns:
... d[col] = df[col].unique()
...
>>> d
{'a': array([1, 2, 3]), 'b': array([2, 3, 5])}

Related

How to convert Pandas data frame to dict with values in a list

I have a huge Pandas data frame with the structure follows as an example below:
import pandas as pd
df = pd.DataFrame({'col1': ['A', 'A', 'B', 'C', 'C', 'C'], 'col2': [1, 2, 5, 2, 4, 6]})
df
col1 col2
0 A 1
1 A 2
2 B 5
3 C 2
4 C 4
5 C 6
The task is to build a dictionary with elements in col1 as keys and corresponding elements in col2 as values. For the example above the output should be:
A -> [1, 2]
B -> [5]
C -> [2, 4, 6]
Although I write a solution as
from collections import defaultdict

# The factory must be list: the values are built with append(), which set
# does not provide, and the expected result keeps row order and duplicates.
dd = defaultdict(list)
for row in df.itertuples():
    # itertuples() exposes each column as an attribute of the row tuple.
    dd[row.col1].append(row.col2)
I wonder if somebody is aware of a more "Python-native" solution, using built-in pandas functions.
Without apply, we can do it with a dict comprehension over the groups:
{x : y.tolist() for x , y in df.col2.groupby(df.col1)}
{'A': [1, 2], 'B': [5], 'C': [2, 4, 6]}
Use GroupBy.apply with list for Series of lists and then Series.to_dict:
d = df.groupby('col1')['col2'].apply(list).to_dict()
print (d)
{'A': [1, 2], 'B': [5], 'C': [2, 4, 6]}

How would you separate a list into sublists of all the duplicates in the original?

l = [a, a, a, b, b, c]
desired1 = [a, a, a]
desired2 = [b, b]
desired3 = [c]
desired could also be a list of all the lists above.
You can use itertools.groupby() to group common elements together:
>>> from itertools import groupby
>>> l = ['a', 'a', 'a', 'b', 'b', 'c']
>>>
>>> runs = [list(group) for _, group in groupby(l)]
>>> runs
[['a', 'a', 'a'], ['b', 'b'], ['c']]
>>>
Note this will only work if the list has already been sorted, so you may have to sort before grouping:
>>> l = ['a', 'b', 'a', 'b', 'a', 'c'] # unsorted
>>> runs = [list(group) for _, group in groupby(sorted(l))]
>>> runs
[['a', 'a', 'a'], ['b', 'b'], ['c']]
>>>
Another option using Counter :)
from collections import Counter

l = ['a', 'a', 'a', 'b', 'b', 'c']
c = Counter(l)
# Expand every (value, count) pair back into a run of `count` copies.
result = []
for value, count in c.items():
    result.append([value] * count)
An easy way is to use a dict:
items = ['a', 'a', 'a', 'b', 'b', 'c']  # example input

# Count occurrences. dict.get supplies 0 the first time a value is seen,
# so a new key no longer raises KeyError.
counts = {}
for thing in items:
    counts[thing] = counts.get(thing, 0) + 1

# One sublist per distinct value, repeated as often as the value occurred.
# Names avoid `dict` and `list` so the builtins stay usable.
groups = [[key] * count for key, count in counts.items()]
Slightly more straightforward dict approach
>>> l = [1,2,1,3,4,2,3,1,1,4]
>>> out = {}
>>> for i in set(l):
out[i] = []
...
>>> for i in l:
... out[i].append(i)
>>> out
{1: [1, 1, 1, 1], 2: [2, 2], 3: [3, 3], 4: [4, 4]}
>>> out.values()
[[1, 1, 1, 1], [2, 2], [3, 3], [4, 4]]

More Elegant way to number a list according to values

I would like to map a list into numbers according to the values.
For example:
['aa', 'b', 'b', 'c', 'aa', 'b', 'a'] -> [0, 1, 1, 2, 0, 1, 3]
I'm trying to achieve this by using numpy and a mapping dict.
def number(lst):
    """Map every element of ``lst`` to an integer label.

    Labels are the positions of the values in the sorted array of distinct
    values returned by ``np.unique``, so equal values share a label and the
    numbering follows sort order, not order of first appearance.

    Args:
        lst: Sequence of values that ``np.array`` can hold and sort.

    Returns:
        An integer NumPy array with one label per element of ``lst``
        (empty for empty input).
    """
    x = np.array(lst)
    unique_names = list(np.unique(x))
    # Translation dict: distinct value -> its position in sorted order.
    mapping = {name: label for label, name in enumerate(unique_names)}
    # otypes is given explicitly because np.vectorize cannot infer the
    # output type from a size-0 input.
    map_func = np.vectorize(lambda name: mapping[name], otypes=[int])
    return map_func(x)
Is there a more elegant / faster way to do this?
Update: Bonus question -- do it with the order maintained.
You can use the return_inverse keyword:
x = np.array(['aa', 'b', 'b', 'c', 'aa', 'b', 'a'])
uniq, map_ = np.unique(x, return_inverse=True)
map_
# array([1, 2, 2, 3, 1, 2, 0])
Edit: Order preserving version:
x = np.array(['aa', 'b', 'b', 'c', 'aa', 'b', 'a'])
# uniq: sorted distinct values; idx: index in x of the first occurrence of
# each distinct value; map_: for every element of x, its position in uniq.
uniq, idx, map_ = np.unique(x, return_index=True, return_inverse=True)
mxi = idx.max()+1
# Boolean mask over x[:mxi] flagging the positions that are first occurrences.
mask = np.zeros((mxi,), bool)
mask[idx] = True
# First-occurrence positions in ascending order, i.e. in order of appearance.
oidx = np.where(mask)[0]
# iidx[k] becomes the appearance rank of the k-th sorted distinct value.
iidx = np.empty_like(oidx)
iidx[map_[oidx]] = np.arange(oidx.size)
# Relabel every element of x with the appearance rank of its value.
iidx[map_]
# array([0, 1, 1, 2, 0, 1, 3])
Here's a vectorized NumPy based solution -
def argsort_unique(idx):
    """Return the inverse of the permutation ``idx``.

    Args:
        idx: Integer array holding each of ``0 .. idx.size - 1`` exactly once.

    Returns:
        Integer array ``out`` with ``out[idx[k]] == k`` for every ``k``.
    """
    # Original idea: http://stackoverflow.com/a/41242285/3293881 by @Andras
    n = idx.size
    sidx = np.empty(n, dtype=int)
    sidx[idx] = np.arange(n)
    return sidx


def map_uniquetags_keep_order(a):
    """Label each element of ``a`` by the order in which its value first appears.

    The first distinct value gets 0, the next new value gets 1, and so on;
    repeated values reuse the label of their first occurrence.

    Args:
        a: One-dimensional sequence of sortable values (strings or numbers).

    Returns:
        Integer NumPy array with one label per element of ``a`` (empty for
        empty input).
    """
    arr = np.asarray(a)
    if arr.size == 0:
        # Nothing to label; the masking below needs at least one element.
        return np.empty(0, dtype=int)
    # A stable sort keeps equal values in input order, so the first entry of
    # each run in s_arr really is the first occurrence in arr. The default
    # sort kind gives no such guarantee.
    sidx = np.argsort(arr, kind='stable')
    s_arr = arr[sidx]
    # True at the start of every run of equal values in the sorted array.
    m = np.concatenate(([True], s_arr[1:] != s_arr[:-1]))
    unq = s_arr[m]
    # Position of each element's value within the sorted distinct values.
    tags = np.searchsorted(unq, arr)
    # Rank the distinct values by the index of their first occurrence.
    first_seen = sidx[np.searchsorted(s_arr, unq)]
    rev_idx = argsort_unique(first_seen.argsort())
    return rev_idx[tags]
Sample run -
In [169]: a = ['aa', 'b', 'b', 'c', 'aa', 'b', 'a'] # String input
In [170]: map_uniquetags_keep_order(a)
Out[170]: array([0, 1, 1, 2, 0, 1, 3])
In [175]: a = [4, 7, 7, 5, 4, 7, 2] # Numeric input
In [176]: map_uniquetags_keep_order(a)
Out[176]: array([0, 1, 1, 2, 0, 1, 3])
Use sets to remove duplicates:
myList = ['a', 'b', 'b', 'c', 'a', 'b']
mySet = set(myList)
Then build your dictionary using comprehension:
mappingDict = {letter:number for number,letter in enumerate(mySet)}
I did it using the ASCII values because it is easy and short.
def number(list):
    """Number single lowercase letters by alphabet position ('a' -> 0).

    Args:
        list: Iterable of one-character strings. The parameter name shadows
            the builtin and is kept only for backward compatibility.

    Returns:
        A list with ``ord(letter) - ord('a')`` for every letter.
    """
    # Build a real list: on Python 3, map() would return a lazy iterator.
    return [ord(x) - ord('a') for x in list]


l = ['a', 'b', 'b', 'c', 'a', 'b']
print(number(l))
Output:
[0, 1, 1, 2, 0, 1]
If the order is not a concern:
[sorted(set(x)).index(item) for item in x]
# returns:
[1, 2, 2, 3, 1, 2, 0]

Python list to dictionary with indexes

I am trying to convert a list :
[A, B, A, A, B, C]
to a dictionary with each item and the indexes where it was found:
{ A : [0,2,3], B : [1,4], C : [5] }
Any idea of an efficient way to do that ?
Use a defaultdict and enumerate:
>>> lst = ['a','b','a','a','b','c']
>>> from collections import defaultdict
>>> d = defaultdict(list)
>>> for i, value in enumerate(lst):
... d[value].append(i)
...
>>> d
defaultdict(<class 'list'>, {'a': [0, 2, 3], 'c': [5], 'b': [1, 4]})
Or, this can be accomplished with a plain dict, although, it is usually slower:
>>> lst = ['a','b','a','a','b','c']
>>> d = {}
>>> for i, value in enumerate(lst):
... d.setdefault(value, []).append(i)
...
>>> d
{'a': [0, 2, 3], 'c': [5], 'b': [1, 4]}
You could have, of course, converted the defaultdict to a dict:
>>> d
defaultdict(<class 'list'>, {'a': [0, 2, 3], 'c': [5], 'b': [1, 4]})
>>> dict(d)
{'a': [0, 2, 3], 'c': [5], 'b': [1, 4]}
>>> help(dict)
Try this,
lst = ['A', 'B', 'A', 'A', 'B', 'C']
# dict.fromkeys(lst) yields each distinct item once, in order of first
# appearance, so the key order is deterministic (set order is not).
indexes = {
    item: [pos for pos, value in enumerate(lst) if value == item]
    for item in dict.fromkeys(lst)
}
print(indexes)
Result
{'A': [0, 2, 3], 'B': [1, 4], 'C': [5]}
Use list comprehension and a dict comprehension. Create a set out of the list first. Then you can easily use enumerate and do this.
>>> l = ["A", "B", "A", "A", "B", "C"]
>>> {i:[j for j,k in enumerate(l) if k==i] for i in set(l)}
{'C': [5], 'B': [1, 4], 'A': [0, 2, 3]}

How can I convert a dictionary table to a list table in python?

def change_table(table):
"""
(dict) -> list of lists

Intended contract: given a dict that maps column names to lists of items
(presumably all of the same length), return a table whose first row holds
the column names and whose remaining rows hold the items at each position.

Intended example:
change_table({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]})
should give [['a', 'b', 'c'], [1, 1, 1], [2, 2, 2], [3, 3, 3]]

NOTE(review): the accompanying text reports that this implementation does
not produce that output.
"""
table_list = []
item_table = []
item_row = []
column_row = []
for key in table:
# Collect the column names (dict keys) for the header.
column_row.append(key)
i = 0
for item in range(len(table[key])):
# item_table is never reset, so items keep accumulating in this one list.
item_table.append(table[key][item])
# Inserts a reference to the same item_table object each time.
item_row.insert(i, item_table)
i = i + 1
# extend() adds the elements one by one, so the column names are not nested
# as a single header row.
table_list.extend(column_row)
table_list.extend(item_row)
return table_list
This doesn't do what I want it to do. Your help is greatly appreciated!
[[key for key in table.keys()]] + [[val[i] for val in table.values()] for i in range(len(table.values()[0]))]
Edit: The comment below is valid, the key order is not the same as it is in the original table, however column order is preserved. Using sorted(keys) will disrupt the order:
table = {'a':['a1','a2','a3'],'b':['b1','b2','b3'],'c':['1','2','3']}
[[key for key in sorted(table.keys())]] + [[val[i] for val in table.values()] for i in range(len(table.values()[0]))]
>>>[['a', 'b', 'c'], ['a1', '1', 'b1'], ['a2', '2', 'b2'], ['a3', '3', 'b3']]
In Python 2:
def change_table(d):
    """Convert a column-oriented dict into a row-oriented table.

    Args:
        d: Mapping of column name to a sequence holding that column's items.

    Returns:
        A list of rows: the sorted column names as the first row, then one
        row per item position with the items in the same column order as
        the header. An empty mapping yields ``[[]]``.

    Raises:
        IndexError: If a column is shorter than the first sorted column.
    """
    keys = sorted(d)
    # Fetch columns by sorted key so every item lines up with its header;
    # d.values() follows the dict's own order, not the sorted one.
    columns = [d[key] for key in keys]
    n_rows = len(columns[0]) if columns else 0
    # The header is wrapped in a list so it stays a single row.
    return [keys] + [[column[i] for column in columns]
                     for i in range(n_rows)]


if __name__ == '__main__':
    print(change_table({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]}))
...or Python 3:
def change_table(d):
    """Turn a dict of columns into a list of rows with a header row.

    Args:
        d: Mapping of column name to a sequence holding that column's items.

    Returns:
        ``[header, row_0, row_1, ...]`` where ``header`` is the sorted list
        of column names and each row lists the items at one position, in
        header order. An empty mapping yields ``[[]]``.

    Raises:
        IndexError: If a column is shorter than the first sorted column.
    """
    header = sorted(d)
    # Read the columns in header order. list(d.values()) follows insertion
    # order and would misalign items whenever the keys were not inserted
    # in sorted order.
    values = [d[name] for name in header]
    n_rows = len(values[0]) if values else 0
    return [header] + [[item[i] for item in values]
                       for i in range(n_rows)]


if __name__ == '__main__':
    print(change_table({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]}))

Categories

Resources