I have a dataframe df with 4 unique UID - 1001,1002,1003,1004.
I want to write a user-defined function in python that does the following:
growth curve -plots Turbidity against Time for each unique UID. Turbidity values are the ones in the Time_1, Time_2, Time_3,Time_4 & Time_5 columns. For example, UID = 1003 will have 4 plots on each graph
Add a legend to each graph such as M+L, F+L, M+R, and F+R (from columns Gen and Type)
Add a title to each graph. For example- UID:1003 + Site:FRX
Export the graphs as a pdf or jpeg or tiff file - 4 graphs per page
# The dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
df= {
'Gen':['M','M','M','M','F','F','F','F','M','M','M','M','F','F','F','F'],
'Site':['FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX','FRX'],
'Type':['L','L','L','L','L','L','L','L','R','R','R','R','R','R','R','R'],
'UID':[1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004],
'Time1':[100.78,112.34,108.52,139.19,149.02,177.77,79.18,89.10,106.78,102.34,128.52,119.19,129.02,147.77,169.18,170.11],
'Time2':[150.78,162.34,188.53,197.69,208.07,217.76,229.48,139.51,146.87,182.54,189.57,199.97,229.28,244.73,269.91,249.19],
'Time3':[250.78,262.34,288.53,297.69,308.07,317.7,329.81,339.15,346.87,382.54,369.59,399.97,329.28,347.73,369.91,349.12],
'Time4':[240.18,232.14,258.53,276.69,338.07,307.74,359.16,339.25,365.87,392.48,399.97,410.75,429.08,448.39,465.15,469.33],
'Time5':[270.84,282.14,298.53,306.69,318.73,327.47,369.63,389.59,398.75,432.18,449.78,473.55,494.85,509.39,515.52,539.23]
}
df = pd.DataFrame(df,columns = ['Gen','Site','Type','UID','Time1','Time2','Time3','Time4','Time5'])
df
My attempt
# See below for my thoughts/attempt- I am open to other python libraries and approaches
def graph2pdf(inputdata):
#1. convert from wide to long
inputdata = pd.melt(df,id_vars = ['Gen','Type','UID'],var_name = 'Time',value_name = 'Turbidity')
#
cmaps = ['Reds', 'Blues', 'Greens', 'Greys','Yellows']
label_patches = []
for i, cmap in enumerate(cmaps):
# I want a growth curve not a distribution curve
sns.kdeplot(x = Time, y = Turbidity,data = data, cmap=cmaps[i]+'_d')
label_patch = mpatches.Patch(color=sns.color_palette(cmaps[i])[2],label=label)
label_patches.append(label_patch)
#2. add legend
plt.legend(handles=label_patches, loc='upper left')
#3. add title- 'UID number+ SiteName: FRX' to each of the graphs
plt.title('UID:1003+FRX')
plt.show()
#4. export as pdf file i.e 4 graphs per page
with PdfPages('turbidityvstime_pdf.pdf') as pdf:
plt.figure(figsize=(2,2)) # 4 graphs per page, I am anticipating more pages in the future
pdf.savefig() # saves the current figure into a pdf page
plt.close()
# testing the user-defined function
graph2pdf(df)
I want the graph to look something like the figure below (turbidity instead of density on the y-axis and time on the x-axis). if possible, a white or clear background is preferred
Thanks
I line plot is usually not appropriate for discrete data, because the slope of the lines can imply trends that do not exist.
This is discrete because measurements are taken at discrete moments in time, not a continuous time series.
Discrete data is best visualized with a bar plot.
Use seaborn figure-level methods like sns.catplot or sns.replot to create the figure with four subplots.
Tested in python 3.8.11, pandas 1.3.2, matplotlib 3.4.3, seaborn 0.11.2
import pandas as pd
import seaborn as sns
def graph2pdf(df):
# melt the dataframe; any column not a var or value, should be in id_vars
data = df.melt(id_vars=df.columns[:4], var_name='Time', value_name='Turbidity')
# combine Gen and Type to create label, which can be used for hue
data['label'] = data.Gen + '-' + data.Type
# plot a catplot for bars
p1 = sns.catplot(data=data, kind='bar', x='Time', y='Turbidity', hue='label', col='UID', col_wrap=2, height=3.25)
p1.fig.subplots_adjust(top=0.9) # adjust the figure
p1.fig.suptitle('UID:1003+FRX')
p1.savefig("barplots.png")
# plot a relplot for lines
p2 = sns.relplot(data=data, kind='line', x='Time', y='Turbidity', hue='label', col='UID', col_wrap=2, height=3.25, marker='o')
p2.fig.subplots_adjust(top=0.9)
p2.fig.suptitle('UID:1003+FRX')
p2.savefig("lineplots.png")
graph2pdf(df)
I'm trying to plot some data that I have and I'm having issues with the x-axis tick labels. Does anyone have a fix for this? Also, is there an easier way to plot this data with certain conditions? For example, I'm looking at poker hands here, and I only want to plot this data for individuals that have over 50 hands (ie. data points). To do this, I created a new list and filtered out those with Hands < 50, is there a way of plotting this with pandas without creating a new list?
## For data handling
import pandas as pd
import numpy as np
from pandas import plotting
## For plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.figure import Figure
preflop = pd.read_csv("all_player_preflop_report_tourney.csv", thousands=',')
#preflop['Hands'] = preflop['Hands'].astype(int) if preflop['Hands'] < 20000
preflop['Hands'] = preflop['Hands'].astype(int)
preflop = preflop.rename(columns={'BB/100':'BB_100','Raise First':'RFI','WTSD %': 'WTSD', 'All-In Adj BB/100':'adj_BB_100','Avg PF All-In Equity':'pf_all_in','CC 2Bet PF':'cc_2bet','3Bet PF':'3bet','2Bet PF & Call 3Bet':'2Bet_call_3Bet','Raise & 4Bet+ PF':'rfi_and_4bet+','2Bet PF & Fold':'2bet_and_fold','5Bet+ PF':'5bet+','3Bet PF & Fold':'3bet_and_fold','Call Any PFR':'call_any_pfr','Call Steal':'call_steal', 'Call vs BTN Open':'call_btn_open','CC 3Bet+ PF':'cc_3bet+','Limp Behind':'limp_behind','Raise Limpers':'raise_limpers'})
preflop = preflop.set_index('Player')
preflop_copy = preflop.copy()
preflop_train = preflop_copy.sample(frac = .75, random_state = 250)
preflop_test = preflop_copy.drop(preflop_train.index)
## first make a figure
## this makes a figure that is 8 units by 8 units
plt.figure(figsize = (8,8))
preflop_50 = preflop_copy.loc[(preflop_copy.Hands > 100)]
#preflop_50.plot.scatter(x="RFI", y="BB_100")
plt.scatter(preflop_50.RFI,preflop_50.BB_100)
x = np.arange(0,1,0.1)
plt.xticks(x)
#Figure.align_xlabels(plot)
## Always good practice to label well when
## presenting a figure to others
## place an xlabel
plt.xlabel("RFI", fontsize =16)
## place a ylabel
plt.ylabel("BB/100", fontsize = 16)
## type this to show the plot
plt.show()
I have a situation with my data. I like the behaviour of .plot() over a data frame. But sometimes it doesn't work, because the frequency of the time index is not an integer.
But reproducing the plot in matplotlib is OK. Just ugly.
The part that bother me the most is the settings of the x axis. The tick frequency and the limits. Is there any easy way that I can reproduce this behaviour in matplotlib?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Create Data
f = lambda x: np.sin(0.1*x) + 0.1*np.random.randn(1,x.shape[0])
x = np.arange(0,217,0.001)
y = f(x)
# Create DataFrame
data = pd.DataFrame(y.transpose(), columns=['dp'], index=None)
data['t'] = pd.date_range('2021-01-01 14:32:09', periods=len(data['dp']),freq='ms')
data.set_index('t', inplace=True)
# Pandas plot()
data.plot()
# Matplotlib plot (ugly x-axis)
plt.plot(data.index,data['dp'])
EDIT: Basically, what I want to achieve is a similar spacing in the xtics labels, and the tight margin adjust of the values. Legends and axis title, I can do them
Pandas output
Matplotlib output
Thanks
You can use some matplotlib date utilities:
Figure.autofmt_xdate() to unrotate and center the date labels
Axis.set_major_locator() to change the interval to 1 min
Axis.set_major_formatter() to reformat as %H:%M
fig, ax = plt.subplots()
ax.plot(data.index, data['dp'])
import matplotlib.dates as mdates
fig.autofmt_xdate(rotation=0, ha='center')
ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
# uncomment to remove the first `xtick`
# ax.set_xticks(ax.get_xticks()[1:])
I have this dataframe and I want to line plot it. As I have plotted it.
Graph is
Code to generate is
fig, ax = plt.subplots(figsize=(15, 5))
date_time = pd.to_datetime(df.Date)
df = df.set_index(date_time)
plt.xticks(rotation=90)
pd.DataFrame(df, columns=df.columns).plot.line( ax=ax,
xticks=pd.to_datetime(frame.Date))
I want a marker of innovationScore with value(where innovationScore is not 0) on open, close line. I want to show that that is the change when InnovationScore changes.
You have to address two problems - marking the corresponding spots on your curves and using the dates on the x-axis. The first problem can be solved by identifying the dates, where the score is not zero, then plotting markers on top of the curve at these dates. The second problem is more of a structural nature - pandas often interferes with matplotlib when it comes to date time objects. Using pandas standard plotting functions is good because it addresses common problems. But mixing pandas with matplotlib plotting (and to set the markers, you have to revert to matplotlib afaik) is usually a bad idea because they do not necessarily present the date time in the same format.
import pandas as pd
from matplotlib import pyplot as plt
#fake data generation, the following code block is just for illustration
import numpy as np
np.random.seed(1234)
n = 50
date_range = pd.date_range("20180101", periods=n, freq="D")
choice = np.zeros(10)
choice[0] = 3
df = pd.DataFrame({"Date": date_range,
"Open": np.random.randint(100, 150, n),
"Close": np.random.randint(100, 150, n),
"Innovation Score": np.random.choice(choice, n)})
fig, ax = plt.subplots()
#plot the three curves
l = ax.plot(df["Date"], df[["Open", "Close", "Innovation Score"]])
ax.legend(iter(l), ["Open", "Close", "Innovation Score"])
#filter dataset for score not zero
IS = df[df["Innovation Score"] > 0]
#plot markers on these positions
ax.plot(IS["Date"], IS[["Open", "Close"]], "ro")
#and/or set vertical lines to indicate the position
ax.vlines(IS["Date"], 0, max(df[["Open", "Close"]].max()), ls="--")
#label x-axis score not zero
ax.set_xticks(IS["Date"])
#beautify the output
ax.set_xlabel("Month")
ax.set_ylabel("Artifical score people take seriously")
fig.autofmt_xdate()
plt.show()
Sample output:
You can achieve it like this:
import matplotlib.pyplot as plt
plt.plot([1, 2, 3], "ro-") # r is red, o is for larger marker, - is for line
plt.plot([3, 2, 1], "b.-") # b is blue, . is for small marker, - is for line
plt.show()
Check out also example here for another approach:
https://matplotlib.org/3.3.3/gallery/lines_bars_and_markers/markevery_prop_cycle.html
I very often get inspiration from this list of examples:
https://matplotlib.org/3.3.3/gallery/index.html
(Using Python 3.0) In increments of 0.25, I want to calculate and plot PDFs for the given data across specified ranges for easy visualization.
Calculating the individual plot has been done thanks to the SO community, but I cannot quite get the algorithm right to iterate properly across the range of values.
Data: https://www.dropbox.com/s/y78pynq9onyw9iu/Data.csv?dl=0
What I have so far is normalized toy data that looks like a shotgun blast with one of the target areas isolated between the black lines with an increment of 0.25:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import seaborn as sns
Data=pd.read_csv("Data.csv")
g = sns.jointplot(x="x", y="y", data=Data)
bottom_lim = 0
top_lim = 0.25
temp = Data.loc[(Data.y>=bottom_lim)&(Data.y<top_lim)]
g.ax_joint.axhline(top_lim, c='k', lw=2)
g.ax_joint.axhline(bottom_lim, c='k', lw=2)
# we have to create a secondary y-axis to the joint-plot, otherwise the kde
might be very small compared to the scale of the original y-axis
ax_joint_2 = g.ax_joint.twinx()
sns.kdeplot(temp.x, shade=True, color='red', ax=ax_joint_2, legend=False)
ax_joint_2.spines['right'].set_visible(False)
ax_joint_2.spines['top'].set_visible(False)
ax_joint_2.yaxis.set_visible(False)
And now what I want to do is make a ridgeline/joyplot of this data across each 0.25 band of data.
I tried a few techniques from the various Seaborn examples out there, but nothing really accounts for the band or range of values as the y-axis. I'm struggling to translate my written algorithm into working code as a result.
I don't know if this is exactly what you are looking for, but hopefully this gets you in the ballpark. I also know very little about python, so here is some R:
library(tidyverse)
library(ggridges)
data = read_csv("https://www.dropbox.com/s/y78pynq9onyw9iu/Data.csv?dl=1")
data2 = data %>%
mutate(breaks = cut(x, breaks = seq(-1,7,.5), labels = FALSE))
data2 %>%
ggplot(aes(x=x,y=breaks)) +
geom_density_ridges() +
facet_grid(~breaks, scales = "free")
data2 %>%
ggplot(aes(x=x,y=y)) +
geom_point() +
geom_density() +
facet_grid(~breaks, scales = "free")
And please forgive the poorly formatted axis.