I need to form a new sequence of numbers by replacing every data value, starting with the 4th entry and ending with the 4th from the last entry, with a weighted average of the seven points around it, using the following formula:
(y[i-3] + 2y[i-2] + 3y[i-1] + 3y[i] + 3y[i+1] + 2y[i+2] + y[i+3]) // 15
(NOTE. The i- or i+ some number is a subscript in case that wasn't apparent.)
Here is the code I have which produces a raw graph, but I need to smooth a new graph with the above formula. The data file produces an array of integers set up as [-24, 4, -4, -12, -52...]. I am not even sure where to begin with the formula any help would be appreciated.
from matplotlib import pyplot as plt
with open('2_Record2308.dat', 'r') as f:
data = [int(x) for x in f]
graph = data
fig, ax = plt.subplots()
ax.plot(graph)
ax.legend()
ax.set_ylabel('Raw')
plt.tight_layout()
plt.show()
This code should do the trick:
avg = [(sum(y) + sum(y[1:-1]) + sum(y[2:-2])) // 15
for y in zip(data[:-6], data[1:-5], data[2:-4], data[3:-3], data[4:-2], data[5:-1], data[6:])]
Here zip(data[:-6], data[1:-5], ...) creates the successive 7-tuples.
And sum(y) takes the 7 numbers each once. sum(y[1:-1]) takes the 5 inner numbers once again. sum(y[2:-2]) takes the 3 inner numbers a third time.
By the way, adding 7 before dividing by 15 would be closer to averaging. In the original formulation the average always gets rounded downwards.
So, I would suggest (sum(y) + sum(y[1:-1]) + sum(y[2:-2]) + 7) // 15
Here is a test based on your code and random-walk data.
from matplotlib import pyplot as plt
import random
def do_averaging_7(data):
return [(sum(y) + sum(y[1:-1]) + sum(y[2:-2]) + 7) // 15
for y in zip(data[:-6], data[1:-5], data[2:-4], data[3:-3], data[4:-2], data[5:-1], data[6:])]
data = [random.randrange(-100,101) for _ in range(100)]
for i in range(1,len(data)):
data[i] += data[i-1]
avg = do_averaging_7(data)
fig, ax = plt.subplots()
ax.plot(range(len(data)), data, "blue")
ax.plot(range(3, 3+len(avg)), avg, color="red")
ax.set_ylabel('Raw')
plt.tight_layout()
plt.show()
Resulting plot:
Related
I would be so thankful if someone would be able to help me with this. I am creating a graph in matplotib however I would to love to split up the 14 lines created from the while loop into the x and y values of P, so instead of plt.plot(t,P) it would be plt.plot(t,((P[1])[0]))) and
plt.plot(t,((P[1])[1]))). I would love if someone could help me very quick, it should be easy but i am just getting errors with the arrays
`
#Altering Alpha in Tumor Cells vs PACCs
#What is alpha? α = Rate of conversion of cancer cells to PACCs
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
from google.colab import files
value = -6
counter = -1
array = []
pac = []
while value <= 0:
def modelP(x,t):
P, C = x
λc = 0.0601
K = 2000
α = 1 * (10**value)
ν = 1 * (10**-6)
λp = 0.1
γ = 2
#returning odes
dPdt = ((λp))*P*(1-(C+(γ*P))/K)+ (α*C)
dCdt = ((λc)*C)*(1-(C+(γ*P))/K)-(α*C) + (ν***P)
return dPdt, dCdt
#initial
C0= 256
P0 = 0
Pinit = [P0,C0]
#time points
t = np.linspace(0,730)
#solve odes
P = odeint(modelP,Pinit,t)
plt.plot(t,P)
value += 1
#plot results
plt.xlabel('Time [days]')
plt.ylabel('Number of PACCs')
plt.show()
`
You can use subplots() to create two subplots and then plot the individual line into the plot you need. To do this, firstly add the subplots at the start (before the while loop) by adding this line...
fig, ax = plt.subplots(2,1) ## Plot will 2 rows, 1 column... change if required
Then... within the while loop, replace the plotting line...
plt.plot(t,P)
with (do take care of the space so that the lines are within while loop)
if value < -3: ## I am using value = -3 as the point of split, change as needed
ax[0].plot(t,P)#, ax=ax[0]) ## Add to first plot
else:
ax[1].plot(t,P)#,ax=ax[1]) ## Add to second plot
This will give a plot like this.
I have the following data set where I have to estimate the joint density of 'bwt' and 'age' using kernel density estimation with a 2-dimensional Gaussian kernel and width h=5. I can't use modules such as scipy where there are ready functions to do this and I have to built functions to calculate the density. Here's what I've gotten so far.
import numpy as np
import pandas as pd
babies_full = pd.read_csv("https://www2.helsinki.fi/sites/default/files/atoms/files/babies2.txt", sep='\t')
#Getting the columns I need
babies_full1=babies_full[['gestation', 'age']]
x=np.array(babies_full1,'int')
#2d Gaussian kernel
def k_2dgauss(x):
return np.exp(-np.sum(x**2, 1)/2) / np.sqrt(2*np.pi)
#Multivariate kernel density
def mv_kernel_density(t, x, h):
d = x.shape[1]
return np.mean(k_2dgauss((t - x)/h))/h**d
t = np.linspace(1.0, 5.0, 50)
h=5
print(mv_kernel_density(t, x, h))
However, I get a value error 'ValueError: operands could not be broadcast together with shapes (50,) (1173,2)' which think is because different shape of the matrices. I also don't understand why k_2dgauss(x) for me returns an array of zeros since it should only return one value. In general, I am new to the concept of kernel density estimation I don't really know if I've written the functions right so any hints would help!
Following on from my comments on your original post, I think this is what you want to do, but if not then come back to me and we can try again.
# info supplied by OP
import numpy as np
import pandas as pdbabies_full = \
pd.read_csv("https://www2.helsinki.fi/sites/default/files/atoms/files/babies2.txt", sep='\t')
#Getting the columns I need
babies_full1=babies_full[['gestation', 'age']]
x=np.array(babies_full1,'int')
# my contributions
from math import floor, ceil
def binMaker(arr, base):
"""function I already use for this sort of thing.
arr is the arr I want to make bins for
base is the bin separation, but does require you to import floor and ceil
otherwise you can make these bins manually yourself"""
binMin = floor(arr.min() / base) * base
binMax = ceil(arr.max() / base) * base
return np.arange(binMin, binMax + base, base)
bins1 = binMaker(x[:,0], 20.) # bins from 140. to 360. spaced 20 apart
bins2 = binMaker(x[:,1], 5.) # bins from 15. to 45. spaced 5. apart
counts = np.zeros((len(bins1)-1, len(bins2)-1)) # empty array for counts to go in
for i in range(0, len(bins1)-1): # loop over the intervals, hence the -1
boo = (x[:,0] >= bins1[i]) * (x[:,0] < bins1[i+1])
for j in range(0, len(bins2)-1): # loop over the intervals, hence the -1
counts[i,j] = np.count_nonzero((x[boo,1] >= bins2[j]) *
(x[boo,1] < bins2[j+1]))
# if you want your PDF to be a fraction of the total
# rather than the number of counts, do the next line
counts /= x.shape[0]
# plotting
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
# setting the levels so that each number in counts has its own colour
levels = np.linspace(-0.5, counts.max()+0.5, int(counts.max())+2)
cmap = plt.get_cmap('viridis') # or any colormap you like
norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)
fig, ax = plt.subplots(1, 1, figsize=(6,5), dpi=150)
pcm = ax.pcolormesh(bins2, bins1, counts, ec='k', lw=1)
fig.colorbar(pcm, ax=ax, label='Counts (%)')
ax.set_xlabel('Age')
ax.set_ylabel('Gestation')
ax.set_xticks(bins2)
ax.set_yticks(bins1)
plt.title('Manually making a 2D (joint) PDF')
If this is what you wanted, then there is an easier way with np.histgoram2d, although I think you specified it had to be using your own methods, and not built in functions. I've included it anyway for completeness' sake.
pdf = np.histogram2d(x[:,0], x[:,1], bins=(bins1,bins2))[0]
pdf /= x.shape[0] # again for normalising and making a percentage
levels = np.linspace(-0.5, pdf.max()+0.5, int(pdf.max())+2)
cmap = plt.get_cmap('viridis') # or any colormap you like
norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)
fig, ax = plt.subplots(1, 1, figsize=(6,5), dpi=150)
pcm = ax.pcolormesh(bins2, bins1, pdf, ec='k', lw=1)
fig.colorbar(pcm, ax=ax, label='Counts (%)')
ax.set_xlabel('Age')
ax.set_ylabel('Gestation')
ax.set_xticks(bins2)
ax.set_yticks(bins1)
plt.title('using np.histogram2d to make a 2D (joint) PDF')
Final note - in this example, the only place where counts doesn't equal pdf is for the bin between 40 <= age < 45 and 280 <= gestation 300, which I think is due to how, in my manual case, I've used <= and <, and I'm a little unsure how np.histogram2d handles values outside the bin ranges, or on the bin edges etc. We can see the element of x that is responsible
>>> print(x[1011])
[280 45]
I have a python program that reads tsv data and plots it using the matplotlib library.
I feel like my code works pretty well:
def main(compsPath: str, gibbsPath: str):
"""
Given the file paths for comps.tsv and
gibbs.tsv, this main function will
produce two separate plots - one for each file.
"""
# Read tsv data into np record arrays
# Slice off header text
with open(compsPath, 'r') as fcomps:
reader = csv.reader(fcomps, delimiter='\t')
compsHeader = next(reader)
compsData = np.array(list(reader)).astype(np.double)
with open(gibbsPath, 'r') as fgibbs:
reader = csv.reader(fgibbs, delimiter='\t')
gibbsHeader = next(reader)
gibbsData = np.array(list(reader)).astype(np.double)
# Get data dimensions:
# - - - M := Number of metabolites
# - - - N := Number of reactions
M = compsData.shape[1] - 1
N = gibbsData.shape[1] - 1
plotComps(M, compsData, compsHeader)
plotGibbs(N, gibbsData, gibbsHeader)
plt.show()
The plotGibbs function produces the following graphic for the tsv file I'm working with. For this graphic, N=3 (3 reactions).
I would like to indicate at what point in time each reaction becomes unfavorable (in the context of my project, this just means that the reaction stops). This occurs when the gibbs free energy value (∆G) of the reaction is greater than or equal to 0.
I feel like I could best emphasize this by color-coding the line plots my program generates. For negative ∆G values, I would like the line to be green, and for positive or zero ∆G values, I would like the line to be red.
Here is my current code for generating the gibbs free energy plots (does not color-code):
def plotGibbs(N: int, gibbsData: np.ndarray, gibbsHeader):
gibbsFig = plt.figure()
gibbsFig.suptitle("∆G˚ Yield Plotted over Time (days)")
numCols = ceil(N / 2)
numRows = (N // numCols) + 1
for n in range (1, N+1):
ax = gibbsFig.add_subplot(numRows, numCols, n)
ax.set_ylabel(gibbsHeader[n])
ax.set_xlabel(gibbsHeader[0])
ax.plot(gibbsData[:, 0], gibbsData[:, n])
gibbsFig.tight_layout()
How could I make it so that negative values are plotted green, and non-negative values are plotted red?
You could try to find where a change of sign occurs in your data using np.where with a simple condition like gibbsData[:, n]>0 then plot negative/positive data accordingly:
def plotGibbs(N: int, gibbsData: np.ndarray, gibbsHeader):
gibbsFig = plt.figure()
gibbsFig.suptitle("∆G˚ Yield Plotted over Time (days)")
numCols = ceil(N / 2)
numRows = (N // numCols) + 1
for n in range (1, N+1):
ax = gibbsFig.add_subplot(numRows, numCols, n)
ax.set_ylabel(gibbsHeader[n])
ax.set_xlabel(gibbsHeader[0])
# idx where sign change occurs for data n
idx_zero = np.where(gibbsData[:, n]>0)[0][0]
# negatives y values
ax.plot(gibbsData[:idx_zero, 0], gibbsData[:idx_zero,n],'g')
# positive y values
ax.plot(gibbsData[idx_zero:, 0], gibbsData[idx_zero:,n],'r')
gibbsFig.tight_layout()
I want to get a plot similar to the following plot that has different colors based on values for x-axis. Ignore the u and f letters and also the blue curve and gray lines. I only need the green and red lines. So, if you use my code, you will get a plot that is all one color. What I want is to have different color when x is between 0 and the turning point (in this case it is x=50%) and then a different color for the rest.
Code:
import matplotlib.pyplot as plt
def GRLC(values):
n = len(values)
assert(n > 0), 'Empty list of values'
sortedValues = sorted(values) #Sort smallest to largest
#Find cumulative totals
cumm = [0]
for i in range(n):
cumm.append(sum(sortedValues[0:(i + 1)]))
#Calculate Lorenz points
LorenzPoints = [[], []]
sumYs = 0 #Some of all y values
robinHoodIdx = -1 #Robin Hood index max(x_i, y_i)
for i in range(1, n + 2):
x = 100.0 * (i - 1)/n
y = 100.0 * (cumm[i - 1]/float(cumm[n]))
LorenzPoints[0].append(x)
LorenzPoints[1].append(y)
sumYs += y
maxX_Y = x - y
if maxX_Y > robinHoodIdx: robinHoodIdx = maxX_Y
giniIdx = 100 + (100 - 2 * sumYs)/n #Gini index
return [giniIdx, giniIdx/100, robinHoodIdx, LorenzPoints]
reg=[400,200]
result_reg = GRLC(reg)
print 'Gini Index Reg', result_reg[0]
print 'Gini Coefficient Reg', result_reg[1]
print 'Robin Hood Index Reg', result_reg[2]
#Plot
plt.plot(result_reg[3][0], result_reg[3][1], [0, 100], [0, 100], '--')
plt.legend(['Reg-ALSRank#10','Equity-Line'], loc='upper left',prop={'size':16})
plt.xlabel('% of items ')
plt.ylabel('% of times being recommended')
plt.show()
This is how you would plot two lines of different colors, knowing the index in the array at which the color should change.
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(0,49, num=50)
y = x**2
x0 = 23
plt.plot(x[:x0+1], y[:x0+1])
plt.plot(x[x0:], y[x0:])
plt.show()
This works because by default, subsequent line plots have a different color, but you could of course set the color yourself,
plt.plot(x[:x0+1], y[:x0+1], color="cornflowerblue")
I’m trying to plot data an in order to check my code, I’m making a comparison of the resulting plots with what has already been generated with Matlab. I am encountering several issues however with this:
Generally, the parsing of RINEX files works, and the general pattern of the presentation of the data looks similar to that the Matlab scripts plotted. However there are small deviations in data that should become apparent when zooming in on the data i.e. when using a smaller time series, for example plotting over a special 2 hour period, not 24 hours. In Matlab, this small discrepancy can be seen, and a polynomial fitting applied. However for the Python plots (the first plot shown below), the curved line of this two hour period appears “smooth” and does not deviate at all, like that seen in the Matlab script (the second plot shows the blue line as the data, against the red line of the polyfit, hence, the blue line shows a slight discrepancy at x=9.4). The Matlab script is assumed correct, as this deviation is because of an Seismic activity that disrupts the ionosphere temporarily. Please refer to the plots below:
The third plot is in Matlab, where this is simply the polyfit minus the live data.
Therefore, it is not clear just how this data is being plotted on the axes for the Python script, because the data appears to smooth? Nor if my code is wrong (see below) and somehow “smooths” out the data somehow:
#Calculating by looping through
for sv in range(32):
sat = self.obs_data_chunks_dataframe[sv, :]
#print "sat.index_{0}: {1}".format(sv+1, sat.index)
phi1 = sat['L1'] * LAMBDA_1 #Change units of L1 to meters
phi2 = sat['L2'] * LAMBDA_2 #Change units of L2 to meters
pr1 = sat['P1']
pr2 = sat['P2']
#CALCULATION: teqc Calculation
iono_teqc = COEFF * (pr2 - pr1) / 1000000 #divide to make values smaller (tbc)
print "iono_teqc_{0}: {1}".format(sv+1, iono_teqc)
#PLOTTING
#Plotting of the data
plt.plot(sat.index, iono_teqc, label=‘teqc’)
plt.xlabel('Time (UTC)')
plt.ylabel('Ionosphere Delay (meters)')
plt.title("Ionosphere Delay on {0} for Satellite {1}.".format(self.date, sv+1))
plt.legend()
ax = plt.gca()
ax.ticklabel_format(useOffset=False)
plt.grid()
if sys.platform.startswith('win'):
plt.savefig(winpath + '\Figure_SV{0}'.format(sv+1))
elif sys.platform.startswith('darwin'):
plt.savefig(macpath + 'Figure_SV{0}'.format(sv+1))
plt.close()
Following on from point 1, the polynomial fitting code below does not run the way I’d like, so I’m overlooking something here. I assume this has to do with the data used upon the x,y-axes but can’t pinpoint exactly what. Would anyone know where I am going wrong here?
#Zoomed in plots
if sv == 19:
#Plotting of the data
plt.plot(sat.index, iono_teqc, label=‘teqc’) #sat.index to plot for time in UTC
plt.xlim(8, 10)
plt.xlabel('Time (UTC)')
plt.ylabel('Ionosphere Delay (meters)')
plt.title("Ionosphere Delay on {0} for Satellite {1}.".format(self.date, sv+1))
plt.legend()
ax = plt.gca()
ax.ticklabel_format(useOffset=False)
plt.grid()
#Polynomial fitting
coefficients = np.polyfit(sat.index, iono_teqc, 2)
plt.plot(coefficients)
if sys.platform.startswith('win'):
#os.path.join(winpath, 'Figure_SV{0}'.format(sv+1))
plt.savefig(winpath + '\Zoom_SV{0}'.format(sv+1))
elif sys.platform.startswith('darwin'):
plt.savefig(macpath + 'Zoom_SV{0}'.format(sv+1))
plt.close()
My RINEX file comprises 32 satellites. However when trying to generate the plots for all 32, I receive:
IndexError: index 31 is out of bounds for axis 0 with size 31
Changing the code below to 31 solves this partly, only excluding the 32nd satellite. I’d like to also plot for satellite 32. The functions for the parsing, and formatting of the data are given below:
def read_obs(self, RINEXfile, n_sat, sat_map):
obs = np.empty((TOTAL_SATS, len(self.obs_types)), dtype=np.float64) * np.NaN
lli = np.zeros((TOTAL_SATS, len(self.obs_types)), dtype=np.uint8)
signal_strength = np.zeros((TOTAL_SATS, len(self.obs_types)), dtype=np.uint8)
for i in range(n_sat):
# Join together observations for a single satellite if split across lines.
obs_line = ''.join(padline(RINEXfile.readline()[:-1], 16) for _ in range((len(self.obs_types) + 4) / 5))
#obs_line = ''.join(padline(RINEXfile.readline()[:-1], 16) for _ in range(2))
#while obs_line
for j in range(len(self.obs_types)):
obs_record = obs_line[16*j:16*(j+1)]
obs[sat_map[i], j] = floatornan(obs_record[0:14])
lli[sat_map[i], j] = digitorzero(obs_record[14:15])
signal_strength[sat_map[i], j] = digitorzero(obs_record[15:16])
return obs, lli, signal_strength
def read_data_chunk(self, RINEXfile, CHUNK_SIZE = 10000):
obss = np.empty((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.float64) * np.NaN
llis = np.zeros((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.uint8)
signal_strengths = np.zeros((CHUNK_SIZE, TOTAL_SATS, len(self.obs_types)), dtype=np.uint8)
epochs = np.zeros(CHUNK_SIZE, dtype='datetime64[us]')
flags = np.zeros(CHUNK_SIZE, dtype=np.uint8)
i = 0 #ggfrfg
while True:
hdr = self.read_epoch_header(RINEXfile)
if hdr is None:
break
epoch_time, flags[i], sats = hdr
#epochs[i] = np.datetime64(epoch_time)
epochs[i] = epoch_time
sat_map = np.ones(len(sats)) * -1
for n, sat in enumerate(sats):
if sat[0] == 'G':
sat_map[n] = int(sat[1:]) - 1
obss[i], llis[i], signal_strengths[i] = self.read_obs(RINEXfile, len(sats), sat_map)
i += 1
if i >= CHUNK_SIZE:
break
return obss[:i], llis[:i], signal_strengths[:i], epochs[:i], flags[:i]
def read_data(self, RINEXfile):
obs_data_chunks = []
while True:
obss, _, _, epochs, _ = self.read_data_chunk(RINEXfile)
epochs = epochs.astype(np.int64)
epochs = np.divide(epochs, float(3600.000))
if obss.shape[0] == 0:
break
obs_data_chunks.append(pd.Panel(
np.rollaxis(obss, 1, 0),
items=['G%02d' % d for d in range(1, 33)],
major_axis=epochs,
minor_axis=self.obs_types
).dropna(axis=0, how='all').dropna(axis=2, how='all'))
self.obs_data_chunks_dataframe = obs_data_chunks[0]
Any suggestions?
Cheers, pymat.
I managed to solve Qu1 as it was a conversion issue with my calculation that was overlooked, the other two points are however open...