Python dict to csv in columns format - python

I have the following dict
items = {'people': ['Peter', 'Danny'], 'numbers': [1,2,3,4], 'cities': ['London']}
And I would like to write that dict to a CSV file by columns, that is, with the following format:
people,numbers,cities
Peter,1,London
Danny,2,
,3,
,4,
My current approach won't work because I get the CSV file by rows:
people,Peter,Danny
numbers,1,2,3,4
cities,London
How can I do what I need?

Or you can use Pandas for that, which only takes two lines
import pandas as pd

# pd.DataFrame(items) raises ValueError when the lists differ in length.
# Wrapping each list in a Series aligns the columns on a shared index and
# leaves NaN in the missing cells; dtype=object keeps ints from becoming floats.
frame = pd.DataFrame(
    {name: pd.Series(values, dtype=object) for name, values in items.items()}
)
# fillna('') blanks the missing cells; index=False omits the row-number column
# so the file contains only the data columns.
frame.fillna('').to_csv('file_path', index=False)

You can use itertools.zip_longest (itertools.izip_longest in Python2):
from itertools import zip_longest
import csv

items = {'people': ['Peter', 'Danny'], 'numbers': [1, 2, 3, 4], 'cities': ['London']}
headers = ['people', 'numbers', 'cities']

# zip_longest turns the columns into rows; fillvalue='' pads the shorter
# columns directly, so genuine falsy values such as 0 are kept as they are.
full_listing = [
    list(row)
    for row in zip_longest(*(items[c] for c in headers), fillvalue='')
]

# newline='' lets the csv module control line endings itself (otherwise extra
# blank lines can appear on platforms that translate '\n').
with open('filename.csv', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerows([headers] + full_listing)
Output:
people,numbers,cities
Peter,1,London
Danny,2,
,3,
,4,

A simple way is to calculate the length of the longest list in your dictionary, and then append '' to all the lists so they have this length.
# The longest list decides how many rows the table will have.
num_rows = max(map(len, items.values()))
# Right-pad every list with empty strings until it reaches that length.
padded = {}
for name, column in items.items():
    missing = num_rows - len(column)
    padded[name] = column + [''] * missing
items = padded
print(items)
# {'cities': ['London', '', '', ''],
#  'numbers': [1, 2, 3, 4],
#  'people': ['Peter', 'Danny', '', '']}
Then write the dict to csv using the csv module.
Or you can build a pandas DataFrame from your dictionary:
import pandas as pd
# Build one DataFrame column per dictionary key.
# NOTE: this expects every list in items to have the same length (for example
# after padding with ''); lists of unequal length make the constructor fail.
df = pd.DataFrame(items)
print(df)
# cities numbers people
#0 London 1 Peter
#1 2 Danny
#2 3
#3 4
Now you can write it to a file using the to_csv() method.

If you do not want to rely on external dependencies like pandas, you can quickly achieve this in pure Python with the join method of str objects.
items = {'people': ['Peter', 'Danny'],
         'numbers': [1, 2, 3, 4],
         'cities': ['London']}


def to_csv(items):
    """Render a dict of lists as comma-separated text, one column per key.

    The keys form the header line (they must be strings). Row ``i`` holds
    the ``i``-th element of every list; a list that is too short contributes
    an empty cell, so every row has the same number of fields. Values are
    converted with ``str`` and are not quoted or escaped.

    Returns the header and rows joined by newlines, without a trailing
    newline. An empty dict yields an empty string.
    """
    columns = list(items.values())
    # names of columns
    header = ','.join(items.keys())
    # default=0 keeps an empty dict from raising in max()
    max_len = max((len(column) for column in columns), default=0)
    # building lines: emit '' for a missing cell instead of skipping it,
    # otherwise later values would slide into the wrong column
    lines = [
        ','.join(str(column[i]) if i < len(column) else '' for column in columns)
        for i in range(max_len)
    ]
    # return header and lines separated by new lines
    return '\n'.join([header] + lines)


print(to_csv(items))
outputs :
people,numbers,cities
Peter,1,London
Danny,2,
,3,
,4,

Related

Extract key value pairs from dict in pandas column using list items in another column

Trying to create a new column that is the key/value pairs extracted from a dict in another column using list items in a second column.
Sample Data:
names name_dicts
['Mary', 'Joe'] {'Mary':123, 'Ralph':456, 'Joe':789}
Expected Result:
names name_dicts new_col
['Mary', 'Joe'] {'Mary':123, 'Ralph':456, 'Joe':789} {'Mary':123, 'Joe':789}
I have attempted to use AST to convert the name_dicts column to a column of true dictionaries.
This function errored out with a "cannot convert string" error.
col here is the df['name_dicts'] col
# Asker's attempt, reproduced as posted (indentation was lost in this listing).
def get_name_pairs(col):
# Walk the (key, value) pairs of col.
for k,v in col.items():
# NOTE(review): .isin is a pandas Series/DataFrame method; presumably k is a
# plain dict key here, which would not have it - confirm against the real data.
if k.isin(df['names']):
# Returns None without collecting any key/value pair.
return
Using a list comprehension and operator.itemgetter:
from operator import itemgetter
# For each row, fetch the values for the listed names from that row's dict and
# zip them back together with the names to form the filtered dict.
# NOTE: itemgetter(*l) returns a tuple only when l holds two or more names.
# With exactly one name it returns the bare value, so zip no longer pairs
# names with values correctly; with an empty list itemgetter() raises
# TypeError. A name that is missing from d raises KeyError.
df['new_col'] = [dict(zip(l, itemgetter(*l)(d)))
for l,d in zip(df['names'], df['name_dicts'])]
output:
names name_dicts new_col
0 [Mary, Joe] {'Mary': 123, 'Ralph': 456, 'Joe': 789} {'Mary': 123, 'Joe': 789}
used input:
df = pd.DataFrame({'names': [['Mary', 'Joe']],
'name_dicts': [{'Mary':123, 'Ralph':456, 'Joe':789}]
})
You can apply a lambda function with dictionary comprehension at row level to get the values from the dict in second column based on the keys in the list of first column:
# If col values are stored as string:
# parse the text in every column of df into Python objects first.
import ast
for col in df:
df[col] = df[col].apply(ast.literal_eval)
# Row by row, build {name: value} for every name in the row's list; a name
# that is absent from the row's dict gets the default 0 from .get(k, 0).
df['new_col']=df.apply(lambda x: {k:x['name_dicts'].get(k,0) for k in x['names']},
axis=1)
# Replace above lambda by
# lambda x: {k:x['name_dicts'][k] for k in x['names'] if k in x['name_dicts']}
# If you want to include only key/value pairs for the key that is in
# both the list and the dictionary
names ... new_col
0 [Mary, Joe] ... {'Mary': 123, 'Joe': 789}
[1 rows x 3 columns]
PS: ast.literal_eval runs without error for the sample data you have posted for above code.
Your function needs only small change - and you can use it with .apply()
import pandas as pd

df = pd.DataFrame({
    'names': [['Mary', 'Joe']],
    'name_dicts': [{'Mary': 123, 'Ralph': 456, 'Joe': 789}],
})


def filter_data(row):
    """Return the entries of ``row['name_dicts']`` whose key is in ``row['names']``.

    ``row`` is anything indexable by the two column names (a DataFrame row
    when used with ``df.apply(..., axis=1)``). The result keeps the order of
    the dictionary; names that are not keys of the dictionary are ignored.
    """
    return {
        key: val
        for key, val in row['name_dicts'].items()
        if key in row['names']
    }


# axis=1 hands each row to filter_data, producing one dict per row.
df['new_col'] = df.apply(filter_data, axis=1)
print(df.to_string())
Result:
names name_dicts new_col
0 [Mary, Joe] {'Mary': 123, 'Ralph': 456, 'Joe': 789} {'Mary': 123, 'Joe': 789}
EDIT:
If you have string "{'Mary':123, 'Ralph':456, 'Joe':789}" in name_dicts then you can replace ' with " and you will have json which you can convert to dictionary using json.loads
import json
# Swapping every ' for " yields valid JSON only while no key or value itself
# contains a quote character; json.loads then parses each cell.
df['name_dicts'] = df['name_dicts'].str.replace("'", '"').apply(json.loads)
Or directly convert it as Python's code:
import ast
df['name_dicts'] = df['name_dicts'].apply(ast.literal_eval)
As a last resort (only on data you trust, because eval executes whatever code the string contains):
df['name_dicts'] = df['name_dicts'].apply(eval)
Full code:
import ast

import pandas as pd

df = pd.DataFrame({
    'names': [['Mary', 'Joe']],
    'name_dicts': ["{'Mary':123, 'Ralph':456, 'Joe':789}"],  # strings
})

# Turn the text in each cell into a real dict. ast.literal_eval accepts only
# Python literals, so unlike eval it cannot run code hidden in the text.
# Alternatives:
#   import json
#   df['name_dicts'] = df['name_dicts'].str.replace("'", '"').apply(json.loads)
#   df['name_dicts'] = df['name_dicts'].apply(eval)  # trusted data only
df['name_dicts'] = df['name_dicts'].apply(ast.literal_eval)


def filter_data(row):
    """Return the entries of ``row['name_dicts']`` whose key is in ``row['names']``.

    ``row`` is anything indexable by the two column names (a DataFrame row
    when used with ``df.apply(..., axis=1)``). Names that are not keys of
    the dictionary are ignored.
    """
    return {
        key: val
        for key, val in row['name_dicts'].items()
        if key in row['names']
    }


df['new_col'] = df.apply(filter_data, axis=1)
print(df.to_string())

How to split a python dictionary containing lists into separate dictionaries

I have a dict like this:
dictl = {'name':['william','tom','kite','john'],'age':[1,2,3,4]}
the output should look like this:
dict1 = {'name':['william','tom'],'age':[1,2]}
dict2 = {'name':['kite','john'],'age':[3,4]}
code:
# Asker's attempt, reproduced as posted (indentation was lost in this listing).
dicts =[]
new_dict ={}
for k,v in dictl.items():
# NOTE(review): `dict` is not the dictl being iterated; presumably dictl[k]
# (or v) was intended - confirm. Only the first two items are sliced out.
new_dict[k]=dict[k][:2]
dicts.append(new_dict)
This code works, but is any better way to do that?
If you are allowed to use a library, you can do this:
import numpy as np
import pandas as pd

df = pd.DataFrame({'name': ['william', 'tom', 'kite', 'john'], 'age': [1, 2, 3, 4]})
# np.arange(len(df)) // 2 labels the rows 0, 0, 1, 1, ... so groupby yields
# consecutive pairs of rows; each group is converted back to a dict of lists.
split_dicts = [g.to_dict(orient='list') for _, g in df.groupby(np.arange(len(df)) // 2)]
print(split_dicts)
# [{'name': ['william', 'tom'], 'age': [1, 2]},
#  {'name': ['kite', 'john'], 'age': [3, 4]}]
Your code doesn't work, so any way that does work would be better.
We can step through the sublists and grab slices:
start_dict = {'name': ['william', 'tom', 'kite', 'john'], 'age': [1, 2, 3, 4]}
# how many items of each list go into one split dictionary
chunk_size = 2

# find out how many dictionaries we need after the split: ceiling division on
# the length of the first list (an empty start_dict needs none)
first_len = len(next(iter(start_dict.values()), []))
split_count = (first_len - 1) // chunk_size + 1
# prepare blanks:
split_dict = [dict() for _ in range(split_count)]
# grab the dictionary entries:
for k, v in start_dict.items():
    # split out suitable portions:
    for ix, sub in enumerate(split_dict):
        sub[k] = v[chunk_size * ix:chunk_size * (ix + 1)]
But I have to wonder what you are using this data structure for, and whether there is a better way without the synchronized lists.

write dictionary of lists to a tab delimited file in python, with dictionary key values as columns without Pandas

the dictionary I am using is:
dict={'item': [1,2,3], 'id':['a','b','c'], 'car':['sedan','truck','moped'], 'color': ['r','b','g'], 'speed': [2,4,10]}
I am trying to produce tab-delimited output like this:
item id
1 a
2 b
3 c
The code I have written:
# Asker's attempt, reproduced as posted (indentation was lost in this listing).
# `dict` here is the asker's own variable holding the data, not the builtin type.
with open('file.txt', 'w') as tab_file:
dict_writer = DictWriter(tab_file, dict.keys(), delimiter = '\t')
dict_writer.writeheader()
# NOTE: writerows expects an iterable of row mappings (one per output line);
# it is given the dict of columns itself, so it does not write column-wise.
dict_writer.writerows(dict)
specifically, I am struggling with writing to the file in a column based manner. Meaning, that the dictionary keys populate as the header, and the dictionary values populate vertically underneath the associated header. Also, I do NOT have the luxury of using Pandas
This solution will work for an arbitrary number of keys and list items in the dict:
d = {'item': [1, 2, 3], 'id': [4, 5, 6]}

# Header row: the keys separated by tabs (join leaves no trailing tab).
print("\t".join(d))

# Number of data rows. Taking the shortest list keeps the indexing below in
# range even if the lists differ in length; an empty dict gives 0 rows.
numSubItems = min((len(values) for values in d.values()), default=0)

# One line per row: the level-th entry of every column, tab-separated.
for level in range(numSubItems):
    print("\t".join(str(d[i][level]) for i in d))
EDIT:
To implement this with writing to a text file:
d = {'item': [1, 2, 3], 'id': [4, 5, 6], 'test': [6, 7, 8]}

# Number of data rows. Taking the shortest list keeps the indexing below in
# range even if the lists differ in length; an empty dict gives 0 rows.
numSubItems = min((len(values) for values in d.values()), default=0)

with open('file.txt', 'w') as f:
    # Header row: the keys separated by tabs (join leaves no trailing tab).
    f.write("\t".join(d) + "\n")
    # One line per row: the level-th entry of every column, tab-separated.
    for level in range(numSubItems):
        f.write("\t".join(str(d[i][level]) for i in d) + "\n")
Here's a way to do this using a one-off function and zip:
d = {
    'item': [1, 2, 3],
    'id': ['a', 'b', 'c'],
    'car': ['sedan', 'truck', 'moped'],
    'color': ['r', 'b', 'g'],
    'speed': [2, 4, 10],
}


def row_printer(row, file=None):
    """Print the items of *row* on one line, separated by tabs.

    *file* is an optional open text-file object to write to; the default
    ``None`` prints to standard output.
    """
    print(*row, sep='\t', file=file)


row_printer(d.keys())  # Print header
# zip(*d.values()) regroups the columns into rows (stops at the shortest list)
for t in zip(*d.values()):  # Print rows
    row_printer(t)
To print to a file, pass an open file object (not a file name) to print: with open('file.txt', 'w') as f: print(..., file=f)
You can use a simple loop with a zip:
d = {'item': [1, 2, 3], 'id': ["a", "b", "c"]}
print('item\tid')
for num, letter in zip(d['item'], d['id']):
    # Join the two cells themselves. Joining str(num) + letter would put a tab
    # between every character and break for values longer than one character.
    print('\t'.join([str(num), letter]))
item id
1 a
2 b
3 c
EDIT:
If you don't want to hard code column names you can use this:
d = {'item': [1, 2, 3], 'id': ["a", "b", "c"]}
print('\t'.join(d.keys()))
# zip(*d.values()) yields one tuple per row, whatever the number of columns;
# each cell is converted to text and the cells (not their characters) are joined.
for row in zip(*d.values()):
    print('\t'.join(map(str, row)))
However, the order of the columns is only guaranteed in Python 3.7+ if you use a plain dict. On a lower Python version, use an OrderedDict instead, like this:
from collections import OrderedDict

# Build from a list of pairs: a dict literal passed to OrderedDict has already
# lost its order on the old Python versions this variant is meant for.
d = OrderedDict([('item', [1, 2, 3]), ('id', ["a", "b", "c"])])
print('\t'.join(d.keys()))
# One tuple per row; join the cells themselves rather than the characters of
# their concatenation, so multi-character values stay intact.
for row in zip(*d.values()):
    print('\t'.join(map(str, row)))
Instead of using csv.DictWriter you can also use a module like pandas for this:
import pandas as pd

# d is the dict of equal-length lists; each key becomes a column.
df = pd.DataFrame.from_dict(d)
# Plain ASCII quotes are required (typographic quotes are a SyntaxError).
# sep="\t" writes tab-delimited output; index=False omits the row numbers.
df.to_csv("test.csv", sep="\t", index=False)
Probably, you have to install it first by using
pip3 install pandas
See here for an example.

What is the Python list or NumPy equivalent of Excel's SUMIF function?

I have a 2D array:
expenses = np.array([['jim', 'sam', 'bill', 'sam'],[1,2,6,5]])
I want to know the total expenses for each unique person in a new array without hardcoding any names (real list is very long) so that I get an output like this:
totals = [['jim', 'sam', 'bill'],[1,7,6]]
Is there a way of doing this with a list or NumPy? I don't want to use Pandas for this.
Thanks in advance!
names = np.asarray(['jim', 'sam', 'bill', 'sam'])
values = np.asarray([1, 2, 6, 5])
# One entry per distinct name: a boolean mask selects that person's values,
# which are then summed.
result = {}
for person in np.unique(names):
    mask = names == person
    result[person] = values[mask].sum()
Another fun way to do this (without numpy) is using a Counter:
from collections import Counter

names = ['jim', 'sam', 'bill', 'sam']
counts = [1, 2, 6, 5]

# Accumulate the amount for each name; missing names start at 0.
c = Counter()
for name, count in zip(names, counts):
    c[name] += count

# Remapping of dict to list of lists: [[names...], [totals...]]
totals = list(map(list, zip(*c.items())))
print(totals)
Output:
[['sam', 'jim', 'bill'], [7, 1, 6]]

Assigning values to an unhashable list in pandas dataframe

I have a column in dataframe which contains lists. As you can see from the below image.
I want to know how can I extract all the words from this column without any duplicate words and need to iterate over the list of unique words from 0 to len(uniquewordlist) and assign a value to each word based on which iteration I'm in.
Thanks for your help.
This is sort of how your data is!
import pandas as pd

# One list of words per row, stored in a single 'ingredients' column.
word_lists = [
    ['kubernetes', 'client', 'bootstrapping', 'ponda'],
    ['micro', 'insu'],
    ['motor', 'upi'],
    ['secure', 'app', 'installation'],
    ['health', 'insu', 'express', 'credit', 'customer'],
    ['secure', 'app', 'installation'],
    ['aap', 'insta'],
    ['loan', 'house', 'loan', 'customers'],
]
df = pd.DataFrame({'ingredients': word_lists})
print(df)
Output:
ingredients
0 [kubernetes, client, bootstrapping, ponda]
1 [micro, insu]
2 [motor, upi]
3 [secure, app, installation]
4 [health, insu, express, credit, customer]
5 [secure, app, installation]
6 [aap, insta]
7 [loan, house, loan, customers]
Here is the code to bring out a list of unique words.
from sklearn.feature_extraction.text import CountVectorizer

# Join each row's word list into one space-separated string, which is the
# input format CountVectorizer tokenizes.
df['string'] = df['ingredients'].apply(" ".join)
df.drop(['ingredients'], axis=1, inplace=True)

countvec = CountVectorizer()
counts = countvec.fit_transform(df['string'])
vocab = pd.DataFrame(counts.toarray())
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
vocab.columns = countvec.get_feature_names_out()
print(list(vocab.columns))
Gives
['aap', 'app', 'bootstrapping', 'client', 'credit', 'customer', 'customers', 'express', 'health', 'house', 'insta', 'installation', 'insu', 'kubernetes', 'loan', 'micro', 'motor', 'ponda', 'secure', 'upi']
You now have a list of your unique vocabulary. If you can give further context as to how you want to assign values, I can continue this answer.
Extended answer:
wordlist = list(vocab.columns)
# Map every word to its position in the vocabulary list.
worddict = {word: i for i, word in enumerate(wordlist)}
print(worddict)
You can use enumerate and itertools.chain within a dictionary comprehension. set ensures mappings are unique.
Data from @Abhishek.
from itertools import chain
# chain.from_iterable flattens the per-row lists into one stream of words, set
# drops the duplicates, and enumerate numbers what is left; the comprehension
# flips each (number, word) pair into word -> number.
# NOTE: set iteration order is arbitrary, so the number a given word receives
# can differ from run to run.
res = {v: k for k, v in enumerate(set(chain.from_iterable(df['ingredients'])))}
print(res)
{'aap': 15,
'app': 3,
'bootstrapping': 1,
...
'ponda': 0,
'secure': 17,
'upi': 5}
You can obtain @jpp's answer with a different one-liner (works for dataframes as well):
import pandas as pd
from collections import Counter
s = pd.Series([['apple', 'orange', 'raspberry'],
['apple', 'cucumber', 'strawberry', 'orange']])
# Each list is turned into a Counter of its words; summing the Series adds
# those Counters together, giving the total count of every word.
s.apply(Counter).sum()
Counter({'apple': 2,
'cucumber': 1,
'orange': 2,
'raspberry': 1,
'strawberry': 1})
If you use
list(s.apply(Counter).sum().keys())
you get exactly @Abhishek's answer, which is in my opinion more readable. Applying set won't work, since + is not defined for sets.

Categories

Resources