Pretty Print output in a sideways tree format in console window - python

I have a dictionary such as this created using Python.
d = {'a': ['Adam', 'Book', 4], 'b': ['Bill', 'TV', 6, 'Jill', 'Sports', 1, 'Bill', 'Computer', 5], 'c': ['Bill', 'Sports', 3], 'd': ['Quin', 'Computer', 3, 'Adam', 'Computer', 3], 'e': ['Quin', 'TV', 2, 'Quin', 'Book', 5], 'f': ['Adam', 'Computer', 7]}
I want to print this out in a sideways tree format on the console instead. I've tried pretty print, but when the dictionary gets long it becomes difficult to read.
For example, with this dictionary, it would return:
a -> Book -> Adam -> 4
b -> TV -> Bill -> 6
  -> Sports -> Jill -> 1
  -> Computer -> Bill -> 5
c -> Sports -> Bill -> 3
d -> Computer -> Quin -> 3
              -> Adam -> 3
e -> TV -> Quin -> 2
     Book -> Quin -> 5
f -> Computer -> Adam -> 7
Essentially, the output is organized by activity (the item in the second position of each group of three), then by name, and then by the number.
The sample output above is just an example. I tried working with the code from Pretty print a tree, but was unable to figure out how to turn that into a sideways format.

You can have a look at the code of the ETE toolkit. The function _asciiArt produces nice representations of trees, even with internal node labels:
from ete2 import Tree
t = Tree("(((A,B), C), D);")
print t
#                /-A
#           /---|
#      /---|     \-B
#     |    |
# ----|     \-C
#     |
#      \-D
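The ete2 package above is Python 2 only; with its successor ete3 the same idea works in Python 3 (a minimal sketch, assuming ete3 is installed):

from ete3 import Tree

t = Tree("(((A,B), C), D);")
print(t.get_ascii())   # same ASCII-art rendering as printing the tree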

Here's how I would do it. Since the tree is only two levels deep -- despite what your desired output format might seem to imply -- there's no need to use recursion to traverse its contents; iteration works quite well. This is probably nothing like the F# code you referenced, since I don't know the language, but it's a lot shorter and more readable -- at least to me.
from itertools import izip

def print_tree(tree):
    for key in sorted(tree.iterkeys()):
        data = tree[key]
        previous = data[0], data[1], data[2]
        first = True
        for name, activity, value in izip(*[iter(data)]*3):  # groups of three
            activity = activity if first or activity != previous[1] else ' '*len(activity)
            print '{} ->'.format(key) if first else ' ',
            print '{} -> {} -> {}'.format(activity, name, value)
            previous = name, activity, value
            first = False

d = {'a': ['Adam', 'Book', 4],
     'b': ['Bill', 'TV', 6, 'Jill', 'Sports', 1, 'Bill', 'Computer', 5],
     'c': ['Bill', 'Sports', 3],
     'd': ['Quin', 'Computer', 3, 'Adam', 'Computer', 3],
     'e': ['Quin', 'TV', 2, 'Quin', 'Book', 5],
     'f': ['Adam', 'Computer', 7]}

print_tree(d)
Output:
a -> Book -> Adam -> 4
b -> TV -> Bill -> 6
     Sports -> Jill -> 1
     Computer -> Bill -> 5
c -> Sports -> Bill -> 3
d -> Computer -> Quin -> 3
              -> Adam -> 3
e -> TV -> Quin -> 2
     Book -> Quin -> 5
f -> Computer -> Adam -> 7
Update
To organize the output by name instead of activity you'd need to change three lines as indicated below:
from itertools import izip

def print_tree(tree):
    for key in sorted(tree.iterkeys()):
        data = tree[key]
        previous = data[0], data[1], data[2]
        first = True
        for name, activity, value in sorted(izip(*[iter(data)]*3)):        # changed
            name = name if first or name != previous[0] else ' '*len(name) # changed
            print '{} ->'.format(key) if first else ' ',
            print '{} -> {} -> {}'.format(name, activity, value)           # changed
            previous = name, activity, value
            first = False
Output after modification:
a -> Adam -> Book -> 4
b -> Bill -> Computer -> 5
          -> TV -> 6
     Jill -> Sports -> 1
c -> Bill -> Sports -> 3
d -> Adam -> Computer -> 3
     Quin -> Computer -> 3
e -> Quin -> Book -> 5
          -> TV -> 2
f -> Adam -> Computer -> 7
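For reference, a rough Python 3 port of the same function (an untested sketch: zip replaces itertools.izip, and the continuation-line padding is made explicit):

def print_tree(tree):
    for key in sorted(tree):
        data = tree[key]
        previous = data[0], data[1], data[2]
        first = True
        for name, activity, value in zip(*[iter(data)] * 3):  # groups of three
            shown = activity if first or activity != previous[1] else ' ' * len(activity)
            prefix = '{} ->'.format(key) if first else ' ' * (len(key) + 3)
            print(prefix, '{} -> {} -> {}'.format(shown, name, value))
            previous = name, activity, value
            first = False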

def treePrint(tree):
    for key in tree:
        print key,              # comma prevents a newline character
        treeElem = tree[key]    # multiple lookups is expensive, even amortized O(1)!
        for subElem in treeElem:
            print " -> ", subElem,
            if type(subElem) != str:    # OP wants indenting after digits
                print "\n "             # newline and a space to match indenting
        print ""                # forces a newline

Related

Convert nested dictionary to pandas dataframe

I have a nested dictionary as below:
stud_data_dict = { 's1' : {'Course 1' : {'Course Name':'Maths',
'Marks':95,
'Grade': 'A+'},
'Course 2' : {'Course Name':'Science',
'Marks': 75,
'Grade': 'B-'}},
's2' : {'Course 1' : {'Course Name':'English',
'Marks': 82,
'Grade': 'B'},
'Course 2' : {'Course Name':'Maths',
'Marks': 90,
'Grade': 'A'}}}
I need to convert it into a dataframe like below
Student  Course 1                     Course 2
         Course Name  Marks  Grade    Course Name  Marks  Grade
s1       Maths        95     A+       Science      75     B-
s2       English      82     B        Maths        90     A
I have tried the following code from this answer
stud_df = pandas.DataFrame.from_dict(stud_data_dict, orient="index").stack().to_frame()
final_df = pandas.DataFrame(stud_df[0].values.tolist(), index=stud_df.index)
I am getting the dataframe like below
             Course Name  Marks  Grade
s1 Course 1  Maths        95     A+
   Course 2  Science      75     B-
s2 Course 1  English      82     B
   Course 2  Maths        90     A
This is the closest I got to the desired output. What changes do I need to make to the code to get the desired dataframe?
Reshape the dictionary first, then pass it to Series and reshape with Series.unstack:
# reformat nested dict
# https://stackoverflow.com/a/39807565/2901002
d = {(level1_key, level2_key, level3_key): values
     for level1_key, level2_dict in stud_data_dict.items()
     for level2_key, level3_dict in level2_dict.items()
     for level3_key, values in level3_dict.items()}

stud_df = pd.Series(d).unstack([1, 2])
print(stud_df)
    Course 1                    Course 2
    Course Name  Marks  Grade   Course Name  Marks  Grade
s1  Maths        95     A+      Science      75     B-
s2  English      82     B       Maths        90     A
Another idea is to create a dictionary with tuples as keys using defaultdict:
from collections import defaultdict

d = defaultdict(dict)
for k, v in stud_data_dict.items():
    for k1, v1 in v.items():
        for k2, v2 in v1.items():
            d[(k1, k2)].update({k: v2})

df = pd.DataFrame(d)
print(df)
    Course 1                    Course 2
    Course Name  Marks  Grade   Course Name  Marks  Grade
s1  Maths        95     A+      Science      75     B-
s2  English      82     B       Maths        90     A
One option is to create data frames from the inner dictionaries, concatenate into a single frame, reshape and cleanup:
out = {key: pd.DataFrame.from_dict(value, orient='index')
       for key, value in stud_data_dict.items()}

(pd
 .concat(out)
 .unstack()
 .swaplevel(axis=1)
 .sort_index(axis=1)
 .rename_axis('Student')
 .reset_index()
)
  Student  Course 1                     Course 2
           Course Name  Grade  Marks    Course Name  Grade  Marks
0  s1      Maths        A+     95       Science      B-     75
1  s2      English      B      82       Maths        A      90
You should get more performance if you can do all the initial wrangling in vanilla Python or NumPy before creating the final DataFrame:
out = []; outer = []; bottom = []; index = []

for key, value in stud_data_dict.items():
    out = []
    for k, v in value.items():
        out.extend(v.values())
    outer.append(out)
    index.append(key)
    bottom.extend(v.keys())   # inner column labels; relies on every student having the same courses/fields

top = np.repeat([*value.keys()], len(v))  # outer column labels ('Course 1', 'Course 2')

pd.DataFrame(outer,
             columns=[top, bottom],
             index=index)
    Course 1                    Course 2
    Course Name  Marks  Grade   Course Name  Marks  Grade
s1  Maths        95     A+      Science      75     B-
s2  English      82     B       Maths        90     A

Trying to return my list as a table in Python [duplicate]

I am quite new to Python and I am now struggling with formatting my data nicely for printed output.
I have one list that is used for two headings, and a matrix that should be the contents of the table. Like so:
teams_list = ["Man Utd", "Man City", "T Hotspur"]
data = np.array([[1, 2, 1],
                 [0, 1, 0],
                 [2, 4, 2]])
Note that the heading names are not necessarily the same lengths. The data entries are all integers, though.
Now, I want to represent this in a table format, something like this:
            Man Utd   Man City   T Hotspur
Man Utd     1         0          0
Man City    1         1          0
T Hotspur   0         1          2
I have a hunch that there must be a data structure for this, but I cannot find it. I have tried using a dictionary and formatting the printing, I have tried for-loops with indentation and I have tried printing as strings.
I am sure there must be a very simple way to do this, but I am probably missing it due to lack of experience.
There are some lightweight and useful Python packages for this purpose:
1. tabulate: https://pypi.python.org/pypi/tabulate
from tabulate import tabulate
print(tabulate([['Alice', 24], ['Bob', 19]], headers=['Name', 'Age']))
Name      Age
------  -----
Alice      24
Bob        19
tabulate has many options to specify headers and table format.
print(tabulate([['Alice', 24], ['Bob', 19]], headers=['Name', 'Age'], tablefmt='orgtbl'))
| Name   |   Age |
|--------+-------|
| Alice  |    24 |
| Bob    |    19 |
2. PrettyTable: https://pypi.python.org/pypi/PrettyTable
from prettytable import PrettyTable
t = PrettyTable(['Name', 'Age'])
t.add_row(['Alice', 24])
t.add_row(['Bob', 19])
print(t)
+-------+-----+
| Name  | Age |
+-------+-----+
| Alice | 24  |
| Bob   | 19  |
+-------+-----+
PrettyTable has options to read data from CSV, HTML, or a SQL database. You can also select a subset of the data, sort the table, and change table styles.
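A quick sketch of a couple of those options (assuming a reasonably recent prettytable release; the CSV file name is made up):

from prettytable import PrettyTable, from_csv

t = PrettyTable(['Name', 'Age'])
t.add_row(['Alice', 24])
t.add_row(['Bob', 19])
t.sortby = 'Age'     # sort rows by the Age column
t.align = 'l'        # left-align every column
print(t)

# reading a table straight from a CSV file
# with open('people.csv') as fp:
#     print(from_csv(fp))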
3. texttable: https://pypi.python.org/pypi/texttable
from texttable import Texttable
t = Texttable()
t.add_rows([['Name', 'Age'], ['Alice', 24], ['Bob', 19]])
print(t.draw())
+-------+-----+
| Name  | Age |
+=======+=====+
| Alice | 24  |
+-------+-----+
| Bob   | 19  |
+-------+-----+
With texttable you can control horizontal/vertical alignment, border style, and data types.
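A small sketch of those texttable options (alignment, border decoration, and column data types):

from texttable import Texttable

t = Texttable()
t.set_deco(Texttable.HEADER)      # draw only the header rule
t.set_cols_align(['l', 'r'])      # left-align names, right-align ages
t.set_cols_dtype(['t', 'i'])      # text and integer columns
t.add_rows([['Name', 'Age'], ['Alice', 24], ['Bob', 19]])
print(t.draw())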
4. termtables: https://github.com/nschloe/termtables
import termtables as tt

string = tt.to_string(
    [["Alice", 24], ["Bob", 19]],
    header=["Name", "Age"],
    style=tt.styles.ascii_thin_double,
    # alignment="ll",
    # padding=(0, 1),
)
print(string)
+-------+-----+
| Name  | Age |
+=======+=====+
| Alice | 24  |
+-------+-----+
| Bob   | 19  |
+-------+-----+
Other options:
terminaltables Easily draw tables in terminal/console applications from a list of lists of strings. Supports multi-line rows.
asciitable Asciitable can read and write a wide range of ASCII table formats via built-in Extension Reader Classes.
Some ad-hoc code:
row_format = "{:>15}" * (len(teams_list) + 1)
print(row_format.format("", *teams_list))
for team, row in zip(teams_list, data):
    print(row_format.format(team, *row))
This relies on str.format() and the Format Specification Mini-Language.
>>> import pandas
>>> pandas.DataFrame(data, teams_list, teams_list)
           Man Utd  Man City  T Hotspur
Man Utd          1         2          1
Man City         0         1          0
T Hotspur        2         4          2
Python actually makes this quite easy.
Something like
for i in range(10):
    print '%-12i%-12i' % (10 ** i, 20 ** i)
will have the output
1           1
10          20
100         400
1000        8000
10000       160000
100000      3200000
1000000     64000000
10000000    1280000000
100000000   25600000000
1000000000  512000000000
The % within the string is essentially an escape character and the characters following it tell python what kind of format the data should have. The % outside and after the string is telling python that you intend to use the previous string as the format string and that the following data should be put into the format specified.
In this case I used "%-12i" twice. To break down each part:
'-' (left align)
'12' (how much space to be given to this part of the output)
'i' (we are printing an integer)
From the docs: https://docs.python.org/2/library/stdtypes.html#string-formatting
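For comparison, a quick sketch of the same layout using new-style formatting (Python 3):

for i in range(10):
    # '<12' left-aligns the value in a 12-character field, matching '%-12i'
    print('{:<12}{:<12}'.format(10 ** i, 20 ** i))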
Updating Sven Marnach's answer to work in Python 3.4:
row_format = "{:>15}" * (len(teams_list) + 1)
print(row_format.format("", *teams_list))
for team, row in zip(teams_list, data):
    print(row_format.format(team, *row))
I know that I am late to the party, but I just made a library for this that I think could really help. It is extremely simple, which is why I think you should use it. It is called TableIt.
Basic Use
To use it, first follow the download instructions on the GitHub Page.
Then import it:
import TableIt
Then make a list of lists where each inner list is a row:
table = [
    [4, 3, "Hi"],
    [2, 1, 808890312093],
    [5, "Hi", "Bye"]
]
Then all you have to do is print it:
TableIt.printTable(table)
This is the output you get:
+--------------------------------------------+
| 4            | 3            | Hi           |
| 2            | 1            | 808890312093 |
| 5            | Hi           | Bye          |
+--------------------------------------------+
Field Names
You can use field names if you want to (if you aren't using field names you don't have to say useFieldNames=False because it is set to that by default):
TableIt.printTable(table, useFieldNames=True)
From that you will get:
+--------------------------------------------+
| 4            | 3            | Hi           |
+--------------+--------------+--------------+
| 2            | 1            | 808890312093 |
| 5            | Hi           | Bye          |
+--------------------------------------------+
There are other uses too; for example, you could do this:
import TableIt
myList = [
    ["Name", "Email"],
    ["Richard", "richard@fakeemail.com"],
    ["Tasha", "tash@fakeemail.com"]
]

TableIt.printTable(myList, useFieldNames=True)
From that:
+-----------------------------------------------+
| Name                  | Email                 |
+-----------------------+-----------------------+
| Richard               | richard@fakeemail.com |
| Tasha                 | tash@fakeemail.com    |
+-----------------------------------------------+
Or you could do:
import TableIt
myList = [
    ["", "a", "b"],
    ["x", "a + x", "a + b"],
    ["z", "a + z", "z + b"]
]
TableIt.printTable(myList, useFieldNames=True)
And from that you get:
+-----------------------+
|       | a     | b     |
+-------+-------+-------+
| x     | a + x | a + b |
| z     | a + z | z + b |
+-----------------------+
Colors
You can also use colors.
You use colors by using the color option (by default it is set to None) and specifying RGB values.
Using the example from above:
import TableIt
myList = [
    ["", "a", "b"],
    ["x", "a + x", "a + b"],
    ["z", "a + z", "z + b"]
]
TableIt.printTable(myList, useFieldNames=True, color=(26, 156, 171))
Then you will get the same table rendered in the specified color (shown as a screenshot in the original answer).
Please note that printing colors might not work in every terminal, but it works the same way as other libraries that print colored text. I have tested it and every single color works. The blue is not messed up either, as it would be when using the default 34m ANSI escape sequence (if you don't know what that is, it doesn't matter). It all comes from the fact that every color is an RGB value rather than a system default.
More Info
For more info check the GitHub Page
Just use beautifultable:
from beautifultable import BeautifulTable
table = BeautifulTable()
table.column_headers = ["", "Man Utd","Man City","T Hotspur"]
table.append_row(['Man Utd', 1, 2, 3])
table.append_row(['Man City', 7, 4, 1])
table.append_row(['T Hotspur', 3, 2, 2])
print(table)
As a result, you will get a neat table, and that's it.
A simple way to do this is to loop over all columns, measure their width, create a row_template for that max width, and then print the rows. It's not exactly what you are looking for, because in this case, you first have to put your headings inside the table, but I'm thinking it might be useful to someone else.
table = [
    ["", "Man Utd", "Man City", "T Hotspur"],
    ["Man Utd", 1, 0, 0],
    ["Man City", 1, 1, 0],
    ["T Hotspur", 0, 1, 2],
]
def print_table(table):
    longest_cols = [
        (max([len(str(row[i])) for row in table]) + 3)
        for i in range(len(table[0]))
    ]
    row_format = "".join(["{:>" + str(longest_col) + "}" for longest_col in longest_cols])
    for row in table:
        print(row_format.format(*row))
You use it like this:
>>> print_table(table)
               Man Utd   Man City   T Hotspur
     Man Utd         1          0           0
    Man City         1          1           0
   T Hotspur         0          1           2
When I do this, I like to have some control over the details of how the table is formatted. In particular, I want header cells to have a different format than body cells, and the table column widths to only be as wide as each one needs to be. Here's my solution:
def format_matrix(header, matrix,
                  top_format, left_format, cell_format, row_delim, col_delim):
    table = [[''] + header] + [[name] + row for name, row in zip(header, matrix)]
    table_format = [['{:^{}}'] + len(header) * [top_format]] \
                 + len(matrix) * [[left_format] + len(header) * [cell_format]]
    col_widths = [max(
                      len(format.format(cell, 0))
                      for format, cell in zip(col_format, col))
                  for col_format, col in zip(zip(*table_format), zip(*table))]
    return row_delim.join(
               col_delim.join(
                   format.format(cell, width)
                   for format, cell, width in zip(row_format, row, col_widths))
               for row_format, row in zip(table_format, table))

print format_matrix(['Man Utd', 'Man City', 'T Hotspur', 'Really Long Column'],
                    [[1, 2, 1, -1], [0, 1, 0, 5], [2, 4, 2, 2], [0, 1, 0, 6]],
                    '{:^{}}', '{:<{}}', '{:>{}.3f}', '\n', ' | ')
Here's the output:
                   | Man Utd | Man City | T Hotspur | Really Long Column
Man Utd            |   1.000 |    2.000 |     1.000 |             -1.000
Man City           |   0.000 |    1.000 |     0.000 |              5.000
T Hotspur          |   2.000 |    4.000 |     2.000 |              2.000
Really Long Column |   0.000 |    1.000 |     0.000 |              6.000
I think this is what you are looking for.
It's a simple module that just computes the maximum required width for the table entries and then just uses rjust and ljust to do a pretty print of the data.
If you want your left heading right aligned just change this call:
print >> out, row[0].ljust(col_paddings[0] + 1),
From line 53 with:
print >> out, row[0].rjust(col_paddings[0] + 1),
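Since that module isn't reproduced here, a minimal sketch of the same idea (measure the widest entry per column, then rjust each cell; the variable names are mine):

rows = [["", "Man Utd", "Man City", "T Hotspur"],
        ["Man Utd", 1, 0, 0],
        ["Man City", 1, 1, 0],
        ["T Hotspur", 0, 1, 2]]

widths = [max(len(str(row[i])) for row in rows) for i in range(len(rows[0]))]
for row in rows:
    print("  ".join(str(cell).rjust(width) for cell, width in zip(row, widths)))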
Pure Python 3
def print_table(data, cols, wide):
    '''Prints formatted data on columns of given width.'''
    n, r = divmod(len(data), cols)
    pat = '{{:{}}}'.format(wide)
    line = '\n'.join(pat * cols for _ in range(n))
    last_line = pat * r
    print(line.format(*data))
    print(last_line.format(*data[n*cols:]))

data = [str(i) for i in range(27)]
print_table(data, 6, 12)
Will print
0           1           2           3           4           5
6           7           8           9           10          11
12          13          14          15          16          17
18          19          20          21          22          23
24          25          26
table_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
for row in table_data:
    print("{: >20} {: >20} {: >20}".format(*row))
OUTPUT:
                   1                    2                    3
                   4                    5                    6
                   7                    8                    9
In this string formatting:
">" is used for right alignment
"<" is used for left alignment
20 is the field width, which can be changed according to the requirement.
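The same row layout can also be written with f-strings (Python 3.6+, a small equivalent sketch):

table_data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
for a, b, c in table_data:
    print(f"{a: >20} {b: >20} {c: >20}")  # ': >20' right-aligns each value in a 20-character field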
try rich: https://github.com/Textualize/rich
from rich.console import Console
from rich.table import Table
console = Console()
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Date", style="dim", width=12)
table.add_column("Title")
table.add_column("Production Budget", justify="right")
table.add_column("Box Office", justify="right")
table.add_row(
    "Dec 20, 2019", "Star Wars: The Rise of Skywalker", "$275,000,000", "$375,126,118"
)
table.add_row(
    "May 25, 2018",
    "[red]Solo[/red]: A Star Wars Story",
    "$275,000,000",
    "$393,151,347",
)
table.add_row(
    "Dec 15, 2017",
    "Star Wars Ep. VIII: The Last Jedi",
    "$262,000,000",
    "[bold]$1,332,539,889[/bold]",
)
console.print(table)
https://github.com/willmcgugan/rich/raw/master/imgs/table.png
The following function will create the requested table (with or without numpy) in Python 3 (and maybe also Python 2). I have chosen to set the width of each column to match that of the longest team name. You could modify it to use the length of the team name for each column, but that will be more complicated.
Note: For a direct equivalent in Python 2 you could replace the zip with izip from itertools.
def print_results_table(data, teams_list):
    str_l = max(len(t) for t in teams_list)
    print(" ".join(['{:>{length}s}'.format(t, length=str_l) for t in [" "] + teams_list]))
    for t, row in zip(teams_list, data):
        print(" ".join(['{:>{length}s}'.format(str(x), length=str_l) for x in [t] + row]))

teams_list = ["Man Utd", "Man City", "T Hotspur"]
data = [[1, 2, 1],
        [0, 1, 0],
        [2, 4, 2]]

print_results_table(data, teams_list)
This will produce the following table:
            Man Utd  Man City T Hotspur
  Man Utd         1         2         1
 Man City         0         1         0
T Hotspur         2         4         2
If you want to have vertical line separators, you can replace " ".join with " | ".join.
References:
lots about formatting (old and new formatting styles) - https://pyformat.info/
the official Python tutorial (quite good) - https://docs.python.org/3/tutorial/inputoutput.html#the-string-format-method
official Python information (can be difficult to read) - https://docs.python.org/3/library/string.html#string-formatting
another resource - https://www.python-course.eu/python3_formatted_output.php
For simple cases you can just use modern string formatting (a simplified version of Sven's answer), e.g. f'{column1_value:15} {column2_value}':
table = {
    'Amplitude': [round(amplitude, 3), 'm³/h'],
    'MAE': [round(mae, 2), 'm³/h'],
    'MAPE': [round(mape, 2), '%'],
}

for metric, value in table.items():
    print(f'{metric:14} : {value[0]:>6.3f} {value[1]}')
Output:
Amplitude      :  1.438 m³/h
MAE            :  0.171 m³/h
MAPE           : 27.740 %
Source: https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals
I found this just looking for a way to output simple columns. If you just need no-fuss columns, then you can use this:
print("Titlex\tTitley\tTitlez")
for x, y, z in data:
    print(x, "\t", y, "\t", z)
EDIT: I was trying to be as simple as possible, and thereby did some things manually instead of using the teams list. To generalize to the OP's actual question:
# Column headers
print("", end="\t")
for team in teams_list:
    print(" ", team, end="")
print()

# rows
for team, row in enumerate(data):
    teamlabel = teams_list[team]
    while len(teamlabel) < 9:
        teamlabel = " " + teamlabel
    print(teamlabel, end="\t")
    for entry in row:
        print(entry, end="\t")
    print()
Outputs:
            Man Utd  Man City  T Hotspur
  Man Utd   1        2         1
 Man City   0        1         0
T Hotspur   2        4         2
But this no longer seems any simpler than the other answers, with perhaps the benefit that it doesn't require any extra imports. However, @campkeith's answer already met that and is more robust, as it can handle a wider variety of label lengths.
I would try to loop through the list and use a CSV formatter to represent the data you want.
You can specify tabs, commas, or any other char as the delimiter.
Otherwise, just loop through the list and print "\t" after each element
http://docs.python.org/library/csv.html
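A minimal sketch of that idea with the standard csv module, writing tab-separated rows to stdout (reusing the question's teams_list and data):

import csv
import sys

teams_list = ["Man Utd", "Man City", "T Hotspur"]
data = [[1, 2, 1], [0, 1, 0], [2, 4, 2]]

writer = csv.writer(sys.stdout, delimiter='\t')
writer.writerow([""] + teams_list)              # header row
for team, row in zip(teams_list, data):
    writer.writerow([team] + list(row))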
Here is another approach that saves a lot of space by sizing each column to its widest value.
table = [
    ['number1', 'x', 'name'],
    ["4x", "3", "Hi"],
    ["2", "1", "808890312093"],
    ["5", "Hi", "Bye"]
]

column_max_width = [max(len(row[column_index]) for row in table) for column_index in range(len(table[0]))]
row_format = ["{:>" + str(width) + "}" for width in column_max_width]
for row in table:
    print("|".join([print_format.format(value) for print_format, value in zip(row_format, row)]))
output:
number1| x|        name
     4x| 3|          Hi
      2| 1|808890312093
      5|Hi|         Bye
To create a simple table using terminaltables open the terminal or your command prompt and run pip install terminaltables.
You can print a Python list as the following:
from terminaltables import AsciiTable
l = [
    ['Head', 'Head'],
    ['R1 C1', 'R1 C2'],
    ['R2 C1', 'R2 C2'],
    ['R3 C1', 'R3 C2']
]
table = AsciiTable(l)
print(table.table)
list1 = [1, 2, 3]
list2 = [10, 20, 30]

l = []
for i in range(0, len(list1)):
    l.append(list1[i]), l.append(list2[i])
# print(l)

for i in range(0, len(l), 2):
    print(l[i], "", l[i + 1])

Count regex matches in one column by values in another column with pandas

I am working with pandas and have a dataframe that contains a list of sentences and people who said them, like this:
sentence                 person
'hello world'            Matt
'cake, delicious cake!'  Matt
'lovely day'             Maria
'i like cake'            Matt
'a new day'              Maria
'a new world'            Maria
I want to count non-overlapping matches of regex strings in sentence (e.g. cake, world, day) by the person. Note each row of sentence may contain more than one match (e.g cake):
person  'day'  'cake'  'world'
Matt    0      3       1
Maria   2      0       1
So far I am doing this:
rows_cake = df[df['sentence'].str.contains(r"cake")]
counts_cake = rows_cake.value_counts()
However this str.contains gives me rows containing cake, but not individual instances of cake.
I know I can use str.count(r"cake") on rows_cake. However, in practice my dataframe is extremely large (> 10 million rows) and the regexes I am using are quite complex, so I am looking for a more efficient solution if possible.
Maybe you should first try to get the sentence itself and then use re to do your optimized regex stuff like that:
for row in df.itertuples(index=False):
    do_some_regex_stuff(row[0], row[1])  # in this case row[0] is a sentence, row[1] is person
As far as I know, itertuples is quite fast (Notes no. 1 here). So the only optimization problem you have is with the regex itself.
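A concrete sketch of that approach, with the hypothetical do_some_regex_stuff folded into the loop using pre-compiled patterns and a Counter per person (keyword patterns taken from the question):

import re
from collections import Counter, defaultdict

import pandas as pd

df = pd.DataFrame({'sentence': ["'hello world'", "'cake, delicious cake!'", "'lovely day'",
                                "'i like cake'", "'a new day'", "'a new world'"],
                   'person': ['Matt', 'Matt', 'Maria', 'Matt', 'Maria', 'Maria']})

# in practice these could be arbitrary, more complex regexes
patterns = {name: re.compile(name) for name in ('day', 'cake', 'world')}
counts = defaultdict(Counter)

for sentence, person in df.itertuples(index=False):
    for name, pattern in patterns.items():
        counts[person][name] += len(pattern.findall(sentence))  # non-overlapping matches

result = pd.DataFrame(counts).T.fillna(0).astype(int)
print(result)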
I came up with a rather simple solution, but I can't claim it to be the fastest or most efficient.
import pandas as pd
import numpy as np

# to be used with read_clipboard()
'''
sentence                   person
'hello world'              Matt
'cake, delicious cake!'    Matt
'lovely day'               Maria
'i like cake'              Matt
'a new day'                Maria
'a new world'              Maria
'''

df = pd.read_clipboard()
# print(df)
Output:
                  sentence person
0            'hello world'   Matt
1  'cake, delicious cake!'   Matt
2             'lovely day'  Maria
3            'i like cake'   Matt
4              'a new day'  Maria
5            'a new world'  Maria
.
# if the list of keywords is fixed and relatively small
keywords = ['day', 'cake', 'world']

# for each keyword and each string, count the occurrences
for key in keywords:
    df[key] = [(len(val.split(key)) - 1) for val in df['sentence']]

# print(df)
Output:
                  sentence person  day  cake  world
0            'hello world'   Matt    0     0      1
1  'cake, delicious cake!'   Matt    0     2      0
2             'lovely day'  Maria    1     0      0
3            'i like cake'   Matt    0     1      0
4              'a new day'  Maria    1     0      0
5            'a new world'  Maria    0     0      1
.
# create a simple pivot with the data you need
df_pivot = pd.pivot_table(df,
                          values=['day', 'cake', 'world'],
                          columns=['person'],
                          aggfunc=np.sum).T

# print(df_pivot)
Final Output:
        cake  day  world
person
Maria      0    2      1
Matt       3    0      1
Open to suggestions if this seems to be a good approach especially given the volume of data. Eager to learn.
Since this primarily involves strings, I would suggest taking the computation out of Pandas; Python is faster than Pandas in most cases when it comes to string manipulation:
# read in data
df = pd.read_clipboard(sep='\s{2,}', engine='python')

# create a dictionary of persons and sentences:
from collections import defaultdict, ChainMap

d = defaultdict(list)
for k, v in zip(df.person, df.sentence):
    d[k].append(v)
d = {k: ",".join(v) for k, v in d.items()}

# search words
strings = ("cake", "world", "day")

# get count of words and create a dict
m = defaultdict(list)
for k, v in d.items():
    for st in strings:
        m[k].append({st: v.count(st)})

res = {k: dict(ChainMap(*v)) for k, v in m.items()}
print(res)
{'Matt': {'day': 0, 'world': 1, 'cake': 3},
 'Maria': {'day': 2, 'world': 1, 'cake': 0}}

output = pd.DataFrame(res).T

       day  world  cake
Matt     0      1     3
Maria    2      1     0
Test the speeds and see which one is better; it would be useful for me and others as well.

Efficient grouping with count of items

Here is how my dataset looks like:
Name | Country
---------------
Alex | USA
Tony | DEU
Alex | GBR
Alex | USA
I am trying to get something like this out, essentially grouping and counting:
Name | Country
---------------
Alex | {USA:2,GBR:1}
Tony | {DEU:1}
Works, but slow on LARGE datasets
Here is my code that does work on smaller dfs, but takes forever on bigger dfs (mine is around 14 million rows). I also use the multiprocessing module to speed up, but it doesn't help much:
from collections import Counter

def countNames(x):
    return dict(Counter(x))

def aggregate(df_full, nameList):
    df_list = []
    for q in nameList:
        df = df_full[df_full['Name'] == q]
        df_list.append(df.groupby('Name')['Country'].apply(lambda x: str(countNames(x))).to_frame().reset_index())
    return pd.concat(df_list)

df = pd.DataFrame({'Name': ['Alex', 'Tony', 'Alex', 'Alex'],
                   'Country': ['USA', 'GBR', 'USA', 'DEU']})[['Name', 'Country']]

aggregate(df, df.Name.unique())
Is there anything that can speed up the internal logic (except for running with multiprocessing)?
This is essentially a cross tabulation. You said "something like this" which implies that you aren't quite sure what the output should be.
Option 1
Group by and value_counts
df.groupby('Name').Country.value_counts()
Name  Country
Alex  USA        2
      GBR        1
Tony  DEU        1
Name: Country, dtype: int64
To get your specified output:
pd.Series({
    name: pd.value_counts(d).to_dict()
    for name, d in df.groupby('Name').Country
}).rename_axis('Name').reset_index(name='Country')
   Name               Country
0  Alex  {'USA': 2, 'GBR': 1}
1  Tony            {'DEU': 1}
Option 2
However, I'd prefer these representations. There are a number of ways to do this, as shown in the answer to question #9 in this answer:
pd.crosstab(df.Name, df.Country)
Country  DEU  GBR  USA
Name
Alex       0    1    2
Tony       1    0    0
Are you looking for this?
import pandas as pd

df = pd.DataFrame({'Name': ['Alex', 'Tony', 'Alex', 'Alex'],
                   'Country': ['USA', 'GBR', 'USA', 'DEU']})[['Name', 'Country']]

df = (df.groupby('Name')['Country']
        .apply(lambda x: str(x.value_counts().to_dict()))
        .reset_index(name='Country'))
Returns:
   Name               Country
0  Alex  {'USA': 2, 'DEU': 1}
1  Tony            {'GBR': 1}
For an O(n) complexity solution, use collections.Counter.
from collections import Counter, defaultdict
import pandas as pd

df = pd.DataFrame({'Name': ['Alex', 'Tony', 'Alex', 'Alex'],
                   'Country': ['USA', 'GBR', 'USA', 'DEU']})[['Name', 'Country']]

c = Counter(map(tuple, df.values))
# Counter({('Alex', 'DEU'): 1, ('Alex', 'USA'): 2, ('Tony', 'GBR'): 1})
Dictionary result
You can then get a Name -> Country dictionary mapping via collections.defaultdict. I would not put dictionaries in a pandas dataframe, it's not designed for this purpose.
tree = lambda: defaultdict(tree)
d = tree()
for k, v in c.items():
    d[k[0]][k[1]] = v

for k, v in d.items():
    print(k, v)
# Alex defaultdict(<function <lambda>>, {'USA': 2, 'DEU': 1})
# Tony defaultdict(<function <lambda>>, {'GBR': 1})
Dataframe result
For display purposes, you can build a dataframe directly from the defaultdict:
res_df = pd.DataFrame.from_dict(d, orient='index').fillna(0)
#        USA  DEU  GBR
# Alex   2.0  1.0  0.0
# Tony   0.0  0.0  1.0

In Pandas, how to calculate the relative probabilities of values of a column given a value of another column?

I have two data frames, vehicles and casualties, each with a common column Accident_Index:
import pandas as pd

vehicles = pd.DataFrame({'Accident_Index': [1, 1, 2, 3, 3, 4, 4],
                         'Vehicle_Type': ['car', 'car', 'motorcyle', 'car', 'car', 'car', 'car'],
                         'Sex_Driver': ['male', 'female', 'male', 'female', 'female', 'male', 'male']})
casualties = pd.DataFrame({'Accident_Index': [1, 1, 2, 3, 4],
                           'Casualty_Severity': ['fatal', 'serious', 'fatal', 'light', 'fatal']})
For ease of visualization, here is vehicles:
   Accident_Index Sex_Driver Vehicle_Type
0               1       male          car
1               1     female          car
2               2       male    motorcyle
3               3     female          car
4               3     female          car
5               4       male          car
6               4       male          car
and here is casualties:
   Accident_Index Casualty_Severity
0               1             fatal
1               1           serious
2               2             fatal
3               3             light
4               4             fatal
I would like to calculate how many times more likely accidents involving male car drivers are to be fatal, compared to accidents involving female car drivers.
So far, I've come up with the following solution:
dfm = casualties.merge(vehicles, on='Accident_Index')
dfm_cars = dfm.loc[dfm.Vehicle_Type == 'car']
dfm_cars_fatal_male = dfm_cars.isin({'Casualty_Severity': ['fatal'], 'Sex_Driver': ['male']})
male_driver_involved_in_fatal_car_accident = (dfm_cars_fatal_male['Casualty_Severity'] & dfm_cars_fatal_male['Sex_Driver']).sum()
dfm_cars_fatal_female = dfm_cars.isin({'Casualty_Severity': ['fatal'], 'Sex_Driver': ['female']})
female_driver_involved_in_fatal_car_accident = (dfm_cars_fatal_female['Casualty_Severity'] & dfm_cars_fatal_female['Sex_Driver']).sum()
print(male_driver_involved_in_fatal_car_accident / female_driver_involved_in_fatal_car_accident)
The answer, in this case, is 3, because there are two car accidents with a fatality, one involving a male and a female driver and one involving two male drivers.
This code doesn't seem particularly succinct, however. How could I refactor this?
IIUC, you could use merge + query + groupby:
g = casualties.merge(vehicles, on='Accident_Index')\
              .query("Vehicle_Type == 'car' and Casualty_Severity == 'fatal'")\
              .groupby('Sex_Driver').Sex_Driver.count()

g / g.sum()
Sex_Driver
female    0.25
male      0.75
Name: Sex_Driver, dtype: float64
To make this simpler, you can have query work with variables:
vehicle = 'car'
severity = 'fatal'
You can then rewrite the query step to:
query("Vehicle_Type == @vehicle and Casualty_Severity == @severity")
This makes it easier to reuse your code, if you want to, say, put it inside a function and test against various combinations of input.
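Putting it together, a small sketch of wrapping this in a function (the function name is mine); query's @variable syntax picks up the surrounding local variables:

def fatality_ratio(vehicles, casualties, vehicle='car', severity='fatal'):
    """Male/female ratio among drivers in accidents matching the given vehicle type and severity."""
    g = (casualties.merge(vehicles, on='Accident_Index')
                   .query("Vehicle_Type == @vehicle and Casualty_Severity == @severity")
                   .groupby('Sex_Driver').Sex_Driver.count())
    shares = g / g.sum()
    return shares['male'] / shares['female']

print(fatality_ratio(vehicles, casualties))   # 3.0 for the sample data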
