Create a matrix from dynamic dictionary - python

I want to create a matrix.
Input:
data = [
{'a': 2, 'g': 1},
{'p': 3, 'a': 5, 'cat': 4}
...
]
Output:
a p cat g
1st 2 0 0 1
2nd 5 3 4 0
This is my code. But I think it's not smart and very slow when data size huge.
Have any good ways to do this one?
Thank you.
data = [
{'a': 2, 'g': 1},
{'p': 3, 'a': 5, 'cat': 4}
]
### Get keyword map ###
key_map = set()
for row in data:
key_map = key_map.union(set(row.keys()))
key_map = list(key_map) # ['a', 'p', 'g', 'cat']
### Create matrix ###
result = []
for row in data:
matrix = [0] * len(key_map)
for k, v in row.iteritems():
matrix[key_map.index(k)] = v
result.append(matrix)
print result
# [[2, 0, 0, 1], [5, 3, 4, 0]]
Edited
By #wwii advice. Use Pandas looks good:
from pandas import DataFrame
result = DataFrame(data, index=range(len(data)))
print result.fillna(0, downcast=int).as_matrix().tolist()
# [[2, 0, 1, 0], [5, 4, 0, 3]]

You can use set comprehension to generate the key_map
key_map = list({data for row in data for data in row})

Here is a partial answer. I couldn't get the columns in the order specified - it is limited by how the keys get ordered in the set, key_map. It uses string formatting to line the data up - you can play around with the spacing to fit larger or smaller numbers.
# ordinal from
# http://code.activestate.com/recipes/576888-format-a-number-as-an-ordinal/
from ordinal import ordinal
data = [
{'a': 2, 'g': 1},
{'p': 3, 'a': 5, 'cat': 4}
]
### Get keyword map ###
key_map = set()
for row in data:
key_map = key_map.union(set(row.keys()))
key_map = list(key_map) # ['a', 'p', 'g', 'cat']
# strings to format the output
header = '{: >10}{: >8}{: >8}{: >8}'.format(*key_map)
line_fmt = '{: <8}{: >2}{: >8}{: >8}{: >8}'
print header
def ordered_data(d, keys):
"""Returns an ordered list of dictionary values.
returns 0 if key not in d
d --> dict
keys --> list of keys
returns list
"""
return [d.get(key, 0) for key in keys]
for i, thing in enumerate(data):
print line_fmt.format(ordinal(i+1), *ordered_data(thing, key_map))
Output
a p g cat
1st 2 0 1 0
2nd 5 3 0 4
It might be worthwhile to dig into the Pandas docs and look at its DataFrame - it might make life easier.

I second the answer using the Pandas dataframes. However, my code should be a bit simpler than yours.
In [1]: import pandas as pd
In [5]: data = [{'a': 2, 'g': 1},{'p': 3, 'a': 5, 'cat': 4}]
In [6]: df = pd.DataFrame(data)
In [7]: df
Out[7]:
a cat g p
0 2 NaN 1 NaN
1 5 4 NaN 3
In [9]: df = df.fillna(0)
In [10]: df
Out[10]:
a cat g p
0 2 0 1 0
1 5 4 0 3
I did my coding in iPython, which I highly recommend!
To save to csv, just use an additional line of code:
df.to_csv('filename.csv')

I am a freshie in python, just suggestions that may be helpful hopefully:)
key_map = []
for row in data:
key_map.extend(row.keys())
key_map = list(set(key_map))
you can change the middle part to this, which will save you some time to find the key_map
In your case union will at least scan through each row to find the different item.

Related

Replacing multiple columns with values in pandas

I am replacing multiple columns values in pandas with the pd.DataFrame.replace method, however, this will not update any values inside my loop, and I cannot understand why it wont.
For example:
import pandas as pd
df = pd.DataFrame({'A': [0, 1, 2, 2, 2],
'B': [5, 6, 7, 8, 9],
'C': ['a', 'b', 'c', 'd', 'e']})
operators = { 'A':{ 0 : 2 } , 'B': { 5 : 8 }, 'C': None }
for keys, values in operators.items():
if values == None:
continue
else:
for existing, new in values.items():
if keys == 'A' and new is not None:
print(keys, existing, new)
df.replace({keys: existing}, new)
elif keys == 'B' and new is not None:
df.replace({keys: existing}, new)
else:
df.replace({keys: existing}, new)
Will print the exact same values for the dataframe.
You need to store your replacements. That means you should either do:
put inplace=True
df.replace({keys: existing}, new, inplace=True)
save the replacement back to df without inplace
df = df.replace({keys: existing}, new)
Loop
Anyway, why do you need that monstrous 👾 loop?
You can simply pass a dict to pandas.DataFrame.replace, it works like a charm.
It does not support 'C': None though, as the dictionary should either be value to value for all columns {0:2, 5:8} or column value to value as you have it for 'A' and 'B'.
Let's get rid of your None and do the replacement in one go:
import pandas as pd
df = pd.DataFrame({'A': [0, 1, 2, 2, 2],
'B': [5, 6, 7, 8, 9],
'C': ['a', 'b', 'c', 'd', 'e']})
operators = { 'A':{ 0 : 2 } , 'B': { 5 : 8 }, 'C': None }
# df = df.replace({k:v for k,v in operators.items() if v!=None})
df.replace({k:v for k,v in operators.items() if v!=None}, inplace=True)
print(df)
Output:
A B C
0 2 8 a
1 1 6 b
2 2 7 c
3 2 8 d
4 2 9 e

Set tuple pair of (x, y) coordinates into dict as key with id value

The data looks like this:
d = {'location_id': [1, 2, 3, 4, 5], 'x': [47.43715, 48.213889, 46.631111, 46.551111, 47.356628], 'y': [11.880689, 14.274444, 14.371, 13.665556, 11.705181]}
df = pd.DataFrame(data=d)
print(df)
location_id x y
0 1 47.43715 11.880689
1 2 48.213889 14.274444
2 3 46.631111 14.371
3 4 46.551111 13.665556
4 5 47.356628 11.705181
Expected output:
{(47.43715, 11.880689): 1, (48.213889, 14.274444): 2, (46.631111, 14.371): 3, ...}
So i can simply access ID providing point coordinates.
What i have tried:
dict(zip(df['x'].astype('float'), df['y'].astype('float'), zip(df['location_id'])))
Error: ValueError: dictionary update sequence element #0 has length 3; 2 is required
or
dict(zip(tuple(df['x'].astype('float'), df['y'].astype('float')), zip(df['location_id'])))
TypeError: tuple expected at most 1 arguments, got 2
I have Googled for it a while, but I am not very clear about it. Thank you for any assistance.
I think this
result = dict(zip(zip(df['x'], df['y']), df['location_id']))
should give you what you want? Result:
{(47.43715, 11.880689): 1,
(48.213889, 14.274444): 2,
(46.631111, 14.371): 3,
(46.551111, 13.665556): 4,
(47.356628, 11.705181): 5}
I didn't use a dataframe, is this what you wanted?
my_dict = {}
d = {'location_id': [1, 2, 3, 4, 5], 'x': [47.43715, 48.213889, 46.631111, 46.551111, 47.356628], 'y': [11.880689, 14.274444, 14.371, 13.665556, 11.705181]}
for i in range(len(d['location_id'])):
my_dict[ (d['x'][i] , d['y'][i]) ] = d['location_id'][i]
You can set x and y column as index then export location_id column to dictionary
d = df.set_index(['x', 'y'])['location_id'].to_dict()
print(d)
{(47.43715, 11.880689): 1, (48.213889, 14.274444): 2, (46.631111, 14.371): 3, (46.551111, 13.665556): 4, (47.356628, 11.705181): 5}

Count of rows linked by ids in Pandas dataframe

I have a table of ids, and previous ids (see image 1), I want to count the number of unique ids in total linked in one chain, e.g. if we take the latest id as the 'parent' then the result for the example data below would be something like Image 2, where 'a' is linked to 5 total ids (a, b, c, d & e) and 'w' is linked to 4 ids (w, x, y & z). In practicality, I am dealing with randomly generated ids, not sequenced letters.
Python Code to produce example dataframes:
import pandas as pd
raw_data = pd.DataFrame([['a','b'], ['b','c'], ['c', 'd'],['d','e'],['e','-'],
['w','x'], ['x', 'y'], ['y','z'], ['z','-']], columns=['id', 'previous_id'])
output = pd.DataFrame([['a',5],['w',4]], columns = ['parent_id','linked_ids'])
Use convert_matrix.from_pandas_edgelist with connected_components first, then create dictionary for mapping, get first mapped values per groups by Series.map filtered by Series.duplicated and last add new column by Series.map with Counter for mapp dictionary:
df1 = raw_data[raw_data['previous_id'].ne('-')]
import networkx as nx
from collections import Counter
g = nx.from_pandas_edgelist(df1,'id','previous_id')
connected_components = nx.connected_components(g)
d = {y:i for i, x in enumerate(connected_components) for y in x}
print (d)
{'c': 0, 'e': 0, 'b': 0, 'd': 0, 'a': 0, 'y': 1, 'x': 1, 'w': 1, 'z': 1}
c = Counter(d.values())
mapp = {k: c[v] for k, v in d.items()}
print (mapp)
{'c': 5, 'e': 5, 'b': 5, 'd': 5, 'a': 5, 'y': 4, 'x': 4, 'w': 4, 'z': 4}
df = (raw_data.loc[~raw_data['id'].map(d).duplicated(), ['id']]
.rename(columns={'id':'parent_id'})
.assign(linked_ids = lambda x: x['parent_id'].map(mapp)))
print (df)
parent_id linked_ids
0 a 5
5 w 4

How to find sum of dictionaries in a pandas DataFrame across all rows?

I have a DataFrame
df = pd.DataFrame({'keywords': [{'a': 3, 'b': 4, 'c': 5}, {'c':1, 'd':2}, {'a':5, 'c':21, 'd':4}, {'b':2, 'c':1, 'g':1, 'h':1, 'i':1}]})
I want to add all the elements across all rows that would give the result without using iterrows:
a: 8
b: 6
c: 28
d: 6
g: 1
h: 1
i: 1
note: no element occurs twice in a single row in the original DataFrame.
Using collections.Counter, you can sum an iterable of Counter objects. Since Counter is a subclass of dict, you can then feed to pd.DataFrame.from_dict.
from collections import Counter
counts = sum(map(Counter, df['keywords']), Counter())
res = pd.DataFrame.from_dict(counts, orient='index')
print(res)
0
a 8
b 6
c 28
d 6
g 1
h 1
i 1
Not sure how this compares in terms of optimization with #jpp's answer, but I'll give it a shot.
# What we're starting out with
df = pd.DataFrame({'keywords': [{'a': 3, 'b': 4, 'c': 5}, {'c':1, 'd':2}, {'a':5, 'c':21, 'd':4}, {'b':2, 'c':1, 'g':1, 'h':1, 'i':1}]})
# Turns the array of dictionaries into a DataFrame
values_df = pd.DataFrame(df["keywords"].values.tolist())
# Sums up the individual keys
sums = {key:values_df[key].sum() for key in values_df.columns}

drop non-json object rows from python dataframe column

I have a dataframe such that the column contains both json objects and strings. I want to get rid of rows that does not contains json objects.
Below is how my dataframe looks like :
import pandas as pd
df = pd.DataFrame({'A': ["hello","world",{"a":5,"b":6,"c":8},"usa","india",{"a":9,"b":10,"c":11}]})
print(df)
How should i remove the rows that contains only strings, so that after removing those string rows, I can apply below to this column to convert json object into separate columns of dataframe:
from pandas.io.json import json_normalize
df = json_normalize(df['A'])
print(df)
I think I would prefer to use an isinstance check:
In [11]: df.loc[df.A.apply(lambda d: isinstance(d, dict))]
Out[11]:
A
2 {'a': 5, 'b': 6, 'c': 8}
5 {'d': 9, 'e': 10, 'f': 11}
If you want to include numbers too, you can do:
In [12]: df.loc[df.A.apply(lambda d: isinstance(d, (dict, np.number)))]
Out[12]:
A
2 {'a': 5, 'b': 6, 'c': 8}
5 {'d': 9, 'e': 10, 'f': 11}
Adjust this to whichever types you want to include...
The last step, json_normalize takes a list of json objects, for whatever reason a Series is no good (and gives the KeyError), you can make this a list and your good to go:
In [21]: df1 = df.loc[df.A.apply(lambda d: isinstance(d, (dict, np.number)))]
In [22]: json_normalize(list(df1["A"]))
Out[22]:
a b c d e f
0 5.0 6.0 8.0 NaN NaN NaN
1 NaN NaN NaN 9.0 10.0 11.0
df[df.applymap(np.isreal).sum(1).gt(0)]
Out[794]:
A
2 {'a': 5, 'b': 6, 'c': 8}
5 {'d': 9, 'e': 10, 'f': 11}
If you want an ugly solution that also works...here's a function I created that finds columns that contain only strings, and returns the df minus those rows. (since your df has only one column, you'll just dataframe containing 1 column with all dicts). Then, from there, you'll want to use
df = json_normalize(df['A'].values) instead of just df = json_normalize(df['A']).
For a single column dataframe...
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
def delete_strings(df):
nrows = df.shape[0]
rows_to_keep = []
for row in np.arange(nrows):
if type(df.iloc[row,0]) == dict:
rows_to_keep.append(row) #add the row number to list of rows
#to keep if the row contains a dict
return df.iloc[rows_to_keep,0] #return only rows with dicts
df = pd.DataFrame({'A': ["hello","world",{"a":5,"b":6,"c":8},"usa","india",
{"a":9,"b":10,"c":11}]})
df = delete_strings(df)
df = json_normalize(df['A'].values)
print(df)
#0 {'a': 5, 'b': 6, 'c': 8}
#1 {'a': 9, 'b': 10, 'c': 11}
For a multi-column df (also works with a single column df):
def delete_rows_of_strings(df):
rows = df.shape[0] #of rows in df
cols = df.shape[1] #of coluns in df
rows_to_keep = [] #list to track rows to keep
for row in np.arange(rows): #for every row in the dataframe
#num_string will count the number of strings in the row
num_string = 0
for col in np.arange(cols): #for each column in the row...
#if the value is a string, add one to num_string
if type(df.iloc[row,col]) == str:
num_string += 1
#if num_string, the number of strings in the column,
#isn't equal to the number of columns in the row...
if num_string != cols: #...add that row number to the list of rows to keep
rows_to_keep.append(row)
#return the df with rows containing at least one non string
return(df.iloc[rows_to_keep,:])
df = pd.DataFrame({'A': ["hello","world",{"a":5,"b":6,"c":8},"usa","india"],
'B' : ['hi',{"a":5,"b":6,"c":8},'sup','america','china']})
# A B
#0 hello hi
#1 world {'a': 5, 'b': 6, 'c': 8}
#2 {'a': 5, 'b': 6, 'c': 8} sup
print(delete_rows_of_strings(df))
# A B
#1 world {'a': 5, 'b': 6, 'c': 8}
#2 {'a': 5, 'b': 6, 'c': 8} sup

Categories

Resources