Scatter plot for multiple classes - python

I have 4 arrays of clusters that I need to plot in a scatter plot. The documentation shows a simple example of X and Y plotting. I've tried some tutorials but most of them work with datasets or dataframes, so I was unable to properly figure out how to plot my data the right way. In short, I'm trying to plot these 4 arrays as clusters:
[ 4.33976958 19.73690959 9.05452373 1.29938447 1.25155903
18.07181231
1.28825463 14.31906422 1.58 4.04618339 4.27626005 1.28062485
1.00079968 12.40582121 5.31973684 3.59755473 6.18436739 4.96310387
4.21620683]
[1.31590273 3.75281228 2.5215868 1.99959996 1.06376689 2.35703203
1.02449988 1.64012195 2.755431 1.35661343 6.20786598 1.26
1.18389189 2.10864886 1.81118746 1.4 1.6857046 1.23693169
1.18810774]
[2.45348731 8.16029411 3.09767655 1.9078784 1.23951603 8.81716508
1.08885261 3.22546121 3.85585269 1.34164079 5.62138773 1.74688294
1.20016666 1.96203975 2.9662097 1.63963411 1.69339895 1.27687118
1.34699666]
[2.48386795 4.32485838 2.03381415 2.3 3.48137904 4.8340873
3.52278299 1.41421356 1.41265707 1.26743836 3.90384426 2.44532206
1.36367151 3.3346664 2.16 0.97897906 1.68534863 1.6503333
1.47837749]
My current code:
import matplotlib.pyplot as plt
std_colomns1 = [4.33976958, 19.73690959, 9.05452373, 1.29938447, 1.25155903, 18.07181231, 1.28825463, 14.31906422, 1.58, 4.04618339, 4.27626005, 1.28062485, 1.00079968, 12.40582121, 5.31973684, 3.59755473, 6.18436739, 4.96310387, 4.21620683]
std_colomns2 = [1.31590273, 3.75281228, 2.5215868, 1.99959996, 1.06376689, 2.35703203, 1.02449988, 1.64012195, 2.755431, 1.35661343, 6.20786598, 1.26, 1.18389189, 2.10864886, 1.81118746, 1.4, 1.6857046, 1.23693169, 1.18810774]
std_colomns3 = [2.45348731, 8.16029411, 3.09767655, 1.9078784, 1.23951603, 8.81716508, 1.08885261, 3.22546121, 3.85585269, 1.34164079, 5.62138773, 1.74688294, 1.20016666, 1.96203975, 2.9662097, 1.63963411, 1.69339895, 1.27687118, 1.34699666]
std_colomns4 = [2.48386795, 4.32485838, 2.03381415, 2.3, 3.48137904, 4.8340873, 3.52278299, 1.41421356, 1.41265707, 1.26743836, 3.90384426, 2.44532206, 1.36367151, 3.3346664, 2.16, 0.97897906, 1.68534863, 1.6503333, 1.47837749]
# Pair the first series (horizontal axis) with the fourth (vertical axis)
x, y = std_colomns1, std_colomns4
# Title and axis captions
plt.title("Faces Features")
plt.xlabel('X')
plt.ylabel('y')
# One black marker of size 10 per (x, y) pair
plt.scatter(x, y, s=10, color='k', label="Face clusters")
plt.legend()
plt.show()
I wish to plot those 4 arrays in a 2D space and distinguish them either by class (color) or centroids plotted in the center of each cluster.

import matplotlib.pyplot as plt
import numpy as np
# plot style
plt.rcParams['figure.figsize'] = (16.0, 10.0)
plt.style.use('ggplot')
# create list of data lists
# Gather the four series so they can be drawn in a single loop
data = [std_colomns1, std_colomns2, std_colomns3, std_colomns4]
# Draw each series as unconnected markers against its positional index and
# print its median; both statements must be indented to belong to the loop
for i, d in enumerate(data, 1):
    plt.plot(d, marker='.', linestyle='none', markersize=7, label=f'col_{i}')
    print(f'Median col_{i}: {np.median(d)}')
# format plot: one x tick per index 0..18, one y tick per integer 1..20
plt.xticks(range(0, 19, 1))
plt.yticks(range(1, 21, 1))
plt.ylabel('Values')
plt.xlabel('Index')
plt.legend()
plt.show()
Alternative:
I think a bar plot displays the data more clearly
I didn't add column names to the dataframe, but that can be done with the columns parameter.
columns=['a', 'b', 'c', 'd'] as an example.
import pandas as pd
import matplotlib.pyplot as plt
# plot style
plt.rcParams['figure.figsize'] = (16.0, 10.0)
plt.style.use('ggplot')
# create list of data lists
data = [std_colomns1, std_colomns2, std_colomns3, std_colomns4]
# create dataframe
df = pd.DataFrame(list(zip(*data)))
# print median
stats = df.agg(['median', 'mean'])
print(stats)
0 1 2 3
median 4.276260 1.640122 1.907878 2.160000
mean 6.222733 1.993142 2.875864 2.425034
# plot
df.plot.bar()
# format plot
plt.xticks(rotation=0)
plt.yticks(range(1, 21, 1))
plt.ylabel('Values')
plt.xlabel('Index')
plt.legend()
plt.show()

Check this code:
import matplotlib.pyplot as plt
import numpy as np
std_colomns1 = [4.33976958,19.73690959,9.05452373,1.29938447,1.25155903,18.07181231,1.28825463,14.31906422,1.58,4.04618339,4.27626005,1.28062485,1.00079968,12.40582121,5.31973684,3.59755473,6.18436739,4.96310387,4.21620683]
std_colomns2 = [1.31590273,3.75281228,2.5215868,1.99959996,1.06376689,2.35703203,1.02449988,1.64012195,2.755431,1.35661343,6.20786598,1.26,1.18389189,2.10864886,1.81118746,1.4,1.6857046,1.23693169,1.18810774]
std_colomns3 = [2.45348731,8.16029411,3.09767655,1.9078784,1.23951603,8.81716508,1.08885261,3.22546121,3.85585269,1.34164079,5.62138773,1.74688294,1.20016666,1.96203975,2.9662097,1.63963411,1.69339895,1.27687118,1.34699666]
std_colomns4 = [2.48386795,4.32485838,2.03381415,2.3,3.48137904,4.8340873,3.52278299,1.41421356,1.41265707,1.26743836,3.90384426,2.44532206,1.36367151,3.3346664,2.16,0.97897906,1.68534863,1.6503333,1.47837749]
# The "center" of each series is taken to be its median
center_colomn1 = np.median(np.array(std_colomns1))
center_colomn2 = np.median(np.array(std_colomns2))
center_colomn3 = np.median(np.array(std_colomns3))
center_colomn4 = np.median(np.array(std_colomns4))
# Print the centers; previously they were computed but never displayed
for center in (center_colomn1, center_colomn2, center_colomn3, center_colomn4):
    print(center)
# Draw every series against its positional index as round markers,
# one colour per series (black, red, green, blue)
plt.plot(std_colomns1, 'ko', label="Face 1")
plt.plot(std_colomns2, 'ro', label="Face 2")
plt.plot(std_colomns3, 'go', label="Face 3")
plt.plot(std_colomns4, 'bo', label="Face 4")
plt.xlabel('X')
plt.ylabel('Y')
plt.title("Faces Features")
plt.legend()
plt.show()
it will provide these centers:
4.27626005
1.64012195
1.9078784
2.16
and this scatter plot:

Here is another possibility, showing 4 boxplots:
import matplotlib.pyplot as plt
import numpy as np
std_colomns1 = [4.33976958,19.73690959,9.05452373,1.29938447,1.25155903,18.07181231,1.28825463,14.31906422,1.58,4.04618339,4.27626005,1.28062485,1.00079968,12.40582121,5.31973684,3.59755473,6.18436739,4.96310387,4.21620683]
std_colomns2 = [1.31590273,3.75281228,2.5215868,1.99959996,1.06376689,2.35703203,1.02449988,1.64012195,2.755431,1.35661343,6.20786598,1.26,1.18389189,2.10864886,1.81118746,1.4,1.6857046,1.23693169,1.18810774]
std_colomns3 = [2.45348731,8.16029411,3.09767655,1.9078784,1.23951603,8.81716508,1.08885261,3.22546121,3.85585269,1.34164079,5.62138773,1.74688294,1.20016666,1.96203975,2.9662097,1.63963411,1.69339895,1.27687118,1.34699666]
std_colomns4 = [2.48386795,4.32485838,2.03381415,2.3,3.48137904,4.8340873,3.52278299,1.41421356,1.41265707,1.26743836,3.90384426,2.44532206,1.36367151,3.3346664,2.16,0.97897906,1.68534863,1.6503333,1.47837749]
# Map each tick label to the series drawn at that position
series_by_name = {
    'std_colomns1': std_colomns1,
    'std_colomns2': std_colomns2,
    'std_colomns3': std_colomns3,
    'std_colomns4': std_colomns4,
}
box_positions = range(4)
# One box per series, placed at x = 0..3
plt.boxplot(list(series_by_name.values()), positions=box_positions)
# Label every box position with the name of its series
plt.xticks(ticks=box_positions, labels=list(series_by_name))
plt.show()
Or, using seaborn (and pandas) you could draw a violin plot or a swarm plot:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
df = pd.DataFrame({'std_colomns1': std_colomns1, 'std_colomns2': std_colomns2,
'std_colomns3': std_colomns3, 'std_colomns4': std_colomns4})
sns.violinplot(data=df)
plt.show()
At the left sns.violinplot(data=df), at the right sns.swarmplot(data=df):

Related

plot data on Geopandas matplotlib

I want to plot x and y from a CSV file on a geopandas graph, but only the graph axes show up.
import fiona
import geopandas as gpd  # was missing although every `gpd.` call below needs it
import matplotlib.pyplot as plt
from mpl_toolkits.axisartist.axislines import Subplot
import pandas as pd
# Mark the KML driver as read/write so that the .kml file below can be read
# (this assignment was duplicated; once is enough)
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
# Read the whitespace-separated node table and save a CSV copy of it
dfN = pd.read_csv("nodes.txt", delimiter="\\s+")
dfN.to_csv("nodes.csv", index=None)
# Read the KML geometries and reproject them to EPSG:32733
df = gpd.read_file("data.kml", driver="KML")
df = df.to_crs(epsg=32733)
# Build point geometries from the X / Y columns of the node table
gdf = gpd.GeoDataFrame(dfN, geometry=gpd.points_from_xy(dfN.X, dfN.Y))
# Shift the reprojected geometries by a fixed (x, y) offset
dg = df.translate(433050, 299)
fig, ax = plt.subplots()
ax.set_aspect('equal')
# Nodes drawn on top (zorder=1) of the translated outlines (zorder=0)
ax.scatter(gdf.X, gdf.Y, zorder=1, alpha=1, c='r', s=10)
dg.plot(ax=ax, zorder=0, color='white', edgecolor='black', aspect='equal')
plt.show()
This is not a MWE, so I have sourced data from publicly available datasets and applied the same transformations...
The plotting code can be simplified, and then it works: calling plot() on a GeoDataFrame that contains POINT objects produces a scatter plot.
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests, io
# The question's data is not available, so two stand-in geopandas frames
# are built here to make the example self-contained
# Polygons: the naturalearth_lowres dataset, reprojected to EPSG:32733
df = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")).to_crs(epsg=32733)
# Keep only valid geometries whose iso_a3 code is GBR, then apply the same
# fixed translation as in the question
is_valid_gbr = df["geometry"].is_valid & df["iso_a3"].eq("GBR")
dg = df.loc[is_valid_gbr].translate(433050, 299)
# Points: download the hospital list and keep name, latitude and longitude
hospital_csv = requests.get("https://assets.nhs.uk/data/foi/Hospital.csv").text
dfN = pd.read_csv(io.StringIO(hospital_csv), sep="Č", engine="python")
dfN = dfN.loc[:, ["OrganisationName", "Latitude", "Longitude"]]
dfN = dfN.rename(columns={"Latitude": "Y", "Longitude": "X"})
# Longitude/latitude points are declared as EPSG:4326 and then reprojected
# to the same CRS as the polygons
hospital_points = gpd.points_from_xy(dfN.X, dfN.Y)
gdf = gpd.GeoDataFrame(dfN, geometry=hospital_points)
gdf = gdf.set_crs("EPSG:4326").to_crs(epsg=32733)
# plotting code is simplified as: outlines first, points on the same axes
ax = dg.plot(zorder=0, color='white', edgecolor='black', aspect='equal')
gdf.plot(ax=ax, zorder=1, alpha=1, c='r', markersize=10)
output
clearly within the defined CRS, plus one set of geometry has been transformed

making colorbar values integer in a heatmap matplotlib seaborn

I'm trying to make my colourbar have integer values instead of decimals, but coding this is a lot harder than anticipated.
my initial code
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#sns.set()
# read data
revels_data = pd.read_csv("revels2.txt")
rd = revels_data
# Reshape to one row per flavour and one column per packet number.
# Keyword arguments are used because recent pandas releases no longer accept
# positional index/columns/values here.
revels = rd.pivot(index="Flavour", columns="Packet number", values="Contents")
# orders flavours: a categorical index sorts in the order listed below
revels.index = pd.CategoricalIndex(revels.index, categories=["orange", "toffee", "chocolate", "malteser", "raisin", "coffee"])
# DataFrame.sortlevel was removed from pandas; sort_index(level=...) replaces it
revels.sort_index(level=0, inplace=True)
# Draw a heatmap with the numeric values in each cell
ax = sns.heatmap(revels, annot=True, fmt="d", linewidths=0.4, cmap="YlOrRd")
ax.set_title('REVELS PACKET COUNT HEATMAP', weight="bold")
plt.show()
which produces
Trying to reverse engineer one of the answers from here
by adding the following code
cmap = plt.get_cmap("YlOrRd", np.max(rd.Contents)-np.min(rd.Contents)+1)
plt.get_cmap("YlOrRd", np.max(rd.Contents)-np.min(rd.Contents)+1)
# set limits .5 outside true range
mat = plt.matshow(rd.Contents, cmap=cmap, vmin = np.min(rd.Contents)-.5, vmax = np.max(rd.Contents)+.5)
plt.matshow(rd.Contents ,cmap=cmap, vmin = np.min(rd.Contents)-.5, vmax = np.max(rd.Contents)+.5)
#tell the colorbar to tick at integers
cax = plt.colorbar(mat, ticks=np.arange(np.min(rd.Contents),np.max(rd.Contents)+1))
plt.colorbar(mat, ticks=np.arange(np.min(rd.Contents),np.max(rd.Contents)+1))
but getting errors, namely ValueError: not enough values to unpack.
I think I may have applied the code wrong, would appreciate any help.
Here is a full working example, which creates a discrete colorbar for a seaborn heatmap plot with integer values as colorbar ticks.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# seaborn.apionly no longer exists in current seaborn releases;
# the plain package import is used instead
import seaborn as sns

# Fixed seed so the random example data is reproducible
np.random.seed(8)
plt.rcParams["figure.figsize"] = 10, 5.5
flavours = ["orange", "toffee", "chocolate", "malteser", "raisin", "coffee"]
# Example data: 6*36 rows with packet numbers 0..35, a random flavour per row
# and a random content count in 0..5
num = np.arange(0, 6*36).astype(int) % 36
flavs = np.random.choice(flavours, size=len(num))
conts = np.random.randint(0, 6, len(num)).astype(int)
df = pd.DataFrame({"Packet number": num, "Flavour": flavs, "Contents": conts})
# Sum the contents per (flavour, packet number) cell
revels = pd.pivot_table(df, index=["Flavour"], columns=["Packet number"], values="Contents", aggfunc="sum")
revels.index = pd.CategoricalIndex(revels.index, categories=flavours)
# DataFrame.sortlevel was removed from pandas; sort_index(level=...) replaces it
revels.sort_index(level=0, inplace=True)
# Flavour/packet combinations that never occurred count as zero
revels = revels.fillna(0)
# One colorbar tick per integer value, with colour boundaries half-way
# between consecutive integers so every integer gets its own colour band
ticks = np.arange(revels.values.min(), revels.values.max()+1)
boundaries = np.arange(revels.values.min()-.5, revels.values.max()+1.5)
# Number of discrete colours = number of integer values; cast to int because
# the values are floats after fillna
n_colors = int(revels.values.max() - revels.values.min() + 1)
cmap = plt.get_cmap("YlOrRd", n_colors)
ax = sns.heatmap(revels, annot=True, linewidths=0.4, cmap=cmap,
                 cbar_kws={"ticks": ticks, "boundaries": boundaries})
ax.set_title('REVELS PACKET COUNT HEATMAP', weight="bold")
plt.tight_layout()
plt.show()

Change the facecolor of boxplot in pandas

I need to change the colors of the boxplot drawn using pandas utility function. I can change most properties using the color argument but can't figure out how to change the facecolor of the box. Someone knows how to do it?
import pandas as pd
import numpy as np
data = np.random.randn(100, 4)
labels = list("ABCD")
df = pd.DataFrame(data, columns=labels)
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray")
df.plot.box(color=props)
While I still recommend seaborn and raw matplotlib over the plotting interface in pandas, it turns out that you can pass patch_artist=True as a kwarg to df.plot.box, which will pass it as a kwarg to df.plot, which will pass it as a kwarg to matplotlib.Axes.boxplot.
import pandas as pd
import numpy as np
data = np.random.randn(100, 4)
labels = list("ABCD")
df = pd.DataFrame(data, columns=labels)
props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray")
df.plot.box(color=props, patch_artist=True)
As suggested, I ended up creating a function to plot this, using raw matplotlib.
def plot_boxplot(data, ax):
    """Draw a colour-customised box plot of ``data`` on ``ax``.

    Boxes are filled and outlined in DarkGreen, whiskers are DarkOrange,
    caps are Gray and medians are white. A dotted DarkBlue horizontal line
    is drawn at y=0.

    Parameters
    ----------
    data
        Object exposing ``.values`` (passed to ``ax.boxplot``) and
        ``.columns`` (used as x tick labels) -- presumably a pandas
        DataFrame; confirm against the caller.
    ax
        Axes-like object to draw on -- presumably a matplotlib ``Axes``.
    """
    # patch_artist=True makes the boxes filled patches, so that
    # facecolor can be set on them
    bp = ax.boxplot(data.values, patch_artist=True)

    for box in bp['boxes']:
        box.set(color='DarkGreen')
        box.set(facecolor='DarkGreen')

    for whisker in bp['whiskers']:
        whisker.set(color="DarkOrange")

    for cap in bp['caps']:
        cap.set(color="Gray")

    for median in bp['medians']:
        median.set(color="white")

    # Dotted reference line at zero
    ax.axhline(0, color="DarkBlue", linestyle=":")
    ax.set_xticklabels(data.columns)
I suggest using df.plot.box with patch_artist=True and return_type='both' (which returns the matplotlib axes the boxplot is drawn on and a dictionary whose values are the matplotlib Lines of the boxplot) in order to have the best customization possibilities.
For example, given this data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame(
data=np.random.randn(100, 4),
columns=list("ABCD")
)
you can set a specific color for all the boxes:
fig, ax = plt.subplots(figsize=(9, 6))
# return_type='both' yields the axes together with the dictionary of
# box-plot artists; patch_artist=True makes the boxes fillable patches
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
# Fill every box with the same colour (the loop body must be indented)
for patch in props['boxes']:
    patch.set_facecolor('lime')
plt.show()
you can set a specific color for each box:
# One fill colour per box, in column order
colors = ['green', 'blue', 'yellow', 'red']
fig, ax = plt.subplots(figsize=(9, 6))
# return_type='both' yields the axes together with the dictionary of
# box-plot artists; patch_artist=True makes the boxes fillable patches
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
# Pair each box with its colour (the loop body must be indented)
for patch, color in zip(props['boxes'], colors):
    patch.set_facecolor(color)
plt.show()
you can easily integrate a colormap:
# One random integer score in 0..9 per box; the score selects the box colour
colors = np.random.randint(0, 10, 4)
# plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib removed
cm = plt.get_cmap('rainbow')
# Normalize maps [min, max] onto [0, 1] and, unlike a hand-written
# (c - min) / (max - min), does not divide by zero when all scores are equal
norm = plt.Normalize(colors.min(), colors.max())
# One RGBA row per box
colors_cm = cm(norm(colors))
fig, ax = plt.subplots(figsize=(9, 6))
ax, props = df.plot.box(patch_artist=True, return_type='both', ax=ax)
for patch, color in zip(props['boxes'], colors_cm):
    patch.set_facecolor(color)
# to add colorbar: the mappable already carries the norm and the colormap,
# so no separate cmap argument is passed to colorbar()
fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap='rainbow'), ax=ax)
plt.show()

Matplotlib: cbar.set_xticklabels has no effects

I've assigned the 365 days of a year to several clusters and I'm now trying to plot them on a heatmap.
My code works fine, except that cbar.set_ticks(some_range) has no effect: the tick labels on my colorbar have the right text but the wrong position.
Here is a MCVE
from datetime import date
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import matplotlib
import seaborn as sns
#create some random data
n_cluster = 4
index = pd.date_range('01/01/2016', end='31/12/2016', freq='1D')
df = pd.DataFrame(np.random.randint(0, n_cluster, len(index)),
index=index, columns=['cluster'])
pivot = df.pivot_table('cluster',
columns=[lambda x: x.weekofyear],
index= [lambda x: x.dayofweek])
#yticklabels of the heatmap
days = [date(2018, 1, d).strftime('%a')[:3] for d in range(1, 8)]
#get a discrete cmap
cmap = plt.cm.get_cmap('RdBu', n_cluster)
fig = plt.figure(figsize=(10,3))
gs = matplotlib.gridspec.GridSpec(1, 2, width_ratios=[50,1])
ax = plt.subplot(gs[0])
cbar = plt.subplot(gs[1])
sns.heatmap(pivot, square=True, cmap=cmap,
yticklabels=days, ax=ax, cbar_ax=cbar)
#There is something wrong here
cbar.set_yticks([i + 1/(2.0*n_cluster) for i in np.arange(0, 1, 1.0/n_cluster)])
#This one is ok
cbar.set_yticklabels(range(0, n_cluster))
Thanks for your help
As a workaround, the following adds the correct labels in the correct place,
cbar.yaxis.set_ticks([0.125, 0.375, 0.625, 0.875])
which looks like,
EDIT:
Or the more general suggestion of mfitzp,
cbar.yaxis.set_ticks([i + 1/(2.0*n_cluster)
for i in np.arange(0, 1, 1.0/n_cluster)])

matplotlib overlay a normal distribution with stddev axis onto another plot

I have a series of data that I'm reading in from a tutorial site.
I've managed to plot the distribution of the TV column in that data, however I also want to overlay a normal distribution curve with StdDev ticks on a second x-axis (so I can compare the two curves). I'm struggling to work out how to do it..
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# draw distribution curve
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
plt.plot(h, pdf)
Here is a diagram close to what I'm after, where x is the StdDeviations. All this example needs is a second x axis to show the values of data.TV
Not sure what you really want, but you could probably use second axis like this
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('Advertising.csv', index_col=0)
fig, ax1 = plt.subplots()
# draw distribution curve: the sorted TV values on the left (blue) y-axis
h = sorted(data.TV)
ax1.plot(h, 'b-')
ax1.set_xlabel('TV')
ax1.set_ylabel('Count', color='b')
# Colour the left tick labels like their curve (loop body must be indented)
for tl in ax1.get_yticklabels():
    tl.set_color('b')
# Normal pdf evaluated at the data, using the sample mean and
# standard deviation
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
# Second y-axis (right side, red) sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(h, pdf, 'r.')
ax2.set_ylabel('pdf', color='r')
for tl in ax2.get_yticklabels():
    tl.set_color('r')
plt.show()
Ok, assuming that you want to plot the distribution of your data together with the fitted normal distribution, using two x-axes, one way to achieve this is as follows.
Plot the normalized data together with the standard normal distribution. Then use matplotlib's twiny() to add a second x-axis to the plot. Use the same tick positions as the original x-axis on the second axis, but scale the labels so that you get the corresponding original TV values. The result looks like this:
Code
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
# Standardise the data so the x-axis is in units of standard deviations
h_n = (h - hmean) / hstd
# Standard normal pdf evaluated at the standardised values
pdf = stats.norm.pdf(h_n)
# plot data
f, ax1 = plt.subplots()
# density=True replaces the `normed` argument, which matplotlib removed
ax1.hist(h_n, 20, density=True)
ax1.plot(h_n, pdf, lw=3, c='r')
ax1.set_xlim([h_n.min(), h_n.max()])
ax1.set_xlabel(r'TV $[\sigma]$')
ax1.set_ylabel(r'Relative Frequency')
# Second x-axis on top with the same limits as the first
ax2 = ax1.twiny()
ax2.grid(False)
ax2.set_xlim(ax1.get_xlim())
ax2.set_ylim(ax1.get_ylim())
ax2.set_xlabel(r'TV')
# Keep the tick positions (in standard deviations) but label them with the
# corresponding original TV values
tick_positions = ax2.xaxis.get_ticklocs()
ticklocs = [round(t*hstd + hmean, 2) for t in tick_positions]
# Pin the tick positions before labelling so labels cannot drift from ticks
ax2.set_xticks(tick_positions)
# set_ticklabels needs a real sequence; map() is a one-shot iterator in Python 3
ax2.xaxis.set_ticklabels([str(t) for t in ticklocs])
# Pinning ticks may widen the view; restore the shared limits
ax2.set_xlim(ax1.get_xlim())
plt.show()

Categories

Resources