Integral of a distribution returns nan value - python

I have a distribution whose points on both axes are:
x = [21.381625583382622 ,21.201155495759807 ,21.008654068962123 ,20.90037201638843 ,20.888340677213577 ,20.701854920003328 ,20.62365121536677,
20.467243806093656 ,20.395055771044525 ,20.196538674659422 ,20.094272291673157,20.040131265386314 ,19.949896221574896 ,19.727316446840085,19.619034394266393 ,19.57090903756697 ,19.33028225406988 ,19.167859175209337 ,19.119733818509914 ,18.975357748411653 ,18.86707569583797 ,18.69863694739 ,18.620433242753442 ,18.578323555641447 ,18.27754007627008 ,18.235430389158086 ,18.169258023696383 ,17.99480360566099 ,17.86847454432501 ,17.76019249175132 ,17.639879100002773 ,17.519565708254223 ,17.278938924757128 ,17.309017272694263 ,17.122531515484013 ,16.990186784560613 ,16.875889062399487 ,16.749560001063514 ,16.64729361807725 ,16.50291754797899 ,16.394635495405296 ,16.26830643406932 ,16.11189902479621 ,16.03369532015965 ,15.91939759799853 ,15.76299018872542 ,15.648692466564295 ,15.540410413990605 ,15.408065683067202 ,15.221579925856952 ,15.071188186171264 ,15.071188186171264 ,14.87267108978616 ,14.75837336762504 ,14.674153993401056 ,14.553840601652508 ,14.409464531554251 ,14.253057122281138 ,14.19891609599429 ,14.036493017133752 ,13.843991590336074 ,13.789850564049226 ,13.669537172300679 ,13.513129763027564 ,13.392816371279016 ,13.266487309943043 ,13.140158248607065 ,12.869453117172833 ,12.82734343006084 ,12.64085767285059 ,12.496481602752333 ,12.370152541416356 ,12.261870488842664 ,12.093431740394694 ,12.003196696583283 ,11.94304000070901 ,11.75655424349876 ,11.600146834225647 ,11.497880451239382 ,11.407645407427971 ,11.19709697186801 ,11.088814919294316 ,10.992564205895478 ,10.830141127034938 ,10.7338904136361 ,10.595530013125268 ,10.469200951789292 ,10.33685622086589 ,10.180448811592775 ,10.07818242860651 ,9.957869036857963 ,9.807477297172277 ,9.687163905423729 ,9.560834844087754 ,9.452552791514059 ,9.29012971265352 ,9.1758319904924 ,9.073565607506133 ,8.953252215757583 ,8.814891815246753 ,8.54418668381252 ,8.441920300826254 ,8.297544230727997 ,8.225356195678867 ,8.050901777643471 ,7.9245727163074955 ,7.82230633332123 ,7.677930263222972 ,7.521522853949859 ,7.425272140551021 ,7.292927409627618 ,7.160582678704215 ,7.010190939018529 ,6.913940225619691 ,6.805658173045997 ,6.625188085423175 ,6.5409687111991905 ,6.4387023282129245 ,6.282294918939812 ,6.149950188016408 ,6.047683805030142 ,5.921354743694167 ,5.776978673595909 ,5.638618273085079 ,5.560414568448522 ,5.40400715917541 ,5.2716624282520055 ,5.127286358153748 ,5.0069729664052 ,4.874628235481797 ,4.730252165383538 ,4.603923104047563 ,4.49564105147387 ,4.321186633438475 ,4.218920250452209 ,4.086575519528806 ,3.9542307886054022 ,3.8218860576819993 ,3.6955569963460233 ,3.5692279350100478 ,3.4188361953243622 ,3.3045384731632415 ,3.196256420589548 ,3.0157863329667256 ,2.9014886108056044 ,2.7992222278193384 ,2.6668774968959355 ,2.534532765972532 ,2.402188035049129 ,2.2878903128880084 ,2.1435142427897502 ,1.993122503104065 ,1.8908561201177987 ,1.7585113891943958 ,1.6381979974458472 ,1.4998375969350168 ,1.3674928660116135 ,1.241163804675638 ,1.1148347433396621 ,0.9824900124162592 ,0.880223629429993 ,0.7298318897443077 ,0.6035028284083319 ,0.471158097484929 ,0.3388133665615254 ,0.21849997481297745 ,0.10420225265185667 ,-0.022126808684119315 ,-0.17251854836980485 ,-0.2868162705309256 ,-0.395098323104619 ,-0.5575214019651598 ,-0.7018974720634175 ,-0.8041638550496835 ,-0.930492916385659 ,-1.056821977721635 ,-1.1951823782324649 ,-1.3215114395684413 ,-1.441824831316989 ,-1.616279249352385 ,-1.7185456323386505 ,-1.850890363262054 ,-1.9651880854231747 ,-2.097532816346578 ,-2.2419088864448353 ,-2.3682379477808118 ,-2.512614017879069 ,-2.6329274096276176 
,-2.7291781230264567 ,-2.9036325410618513 ,-3.011914593635545 ,-3.1382436549715216 ,-3.276604055482351 ,-3.4149644559931813 ,-3.529262178154302 ,-3.64957556990285 ,-3.799967309588536 ,-3.93231204051194 ,-4.034578423498205 ,-4.172938824009036 ,-4.341377572457003 ,-4.455675294618124 ,-4.563957347191818 ,-4.714349086877503 ,-4.852709487388334 ,-4.9549758703746 ,-5.081304931710576 ,-5.243728010571116 ,-5.358025732732237 ,-5.466307785305931 ,-5.62715194579043 ,-5.730997247152738 ,-6.00170237858697 ,-6.140062779097802 ,-6.2603761708463495 ,-6.380689562594897 ,-6.494987284756018 ,-6.657410363616558 ,-6.777723755365107 ,-6.904052816701082 ,-7.042413217211912 ,-7.180773617722744 ,-7.271008661534154 ,-7.409369062044986 ,-7.55976080173067 ,-7.6860898630666465 ,-7.800387585227767 ,-7.962810664088307 ,-8.107186734186566 ,-8.191406108410549 ,-8.32976650892138 ,-8.456095570257354 ,-8.582424631593332 ,-8.672659675404743 ,-8.79297306715329 ,-9.015552841888105 ,-9.117819224874372 ,-9.208054268685784 ,-9.364461677958897 ,-9.484775069707442 ,-9.611104131043419 ,-9.695323505267403 ,-9.875793592890226 ,-10.020169662988483 ,-10.080326358862758 ,-10.26079644648558 ,-10.375094168646703 ,-10.501423229982677 ,-10.65783063925579 ,-10.760097022242057 ,-10.904473092340314 ,-11.024786484088862 ,-11.139084206249985 ,-11.337601302635088 ,-11.373695320159651 ,-11.578228086132183 ,-11.830886208804134 ,-11.95721527014011 ,-12.035418974776668 ,-12.26401441909891 ,-12.408390489197167 ,-12.40237481960974 ,-12.612923255169699 ,-12.751283655680531 ,-12.84753436907937 ,-12.991910439177627 ,-13.12425517010103 ,-13.23855289226215 ,-13.334803605660989 ,-13.503242354108957 ,-13.707775120081488 ,-13.791994494305476 ,-13.88824520770431 ,-14.056683956152279 ,-14.189028687075682 ,-14.309342078824232 ,-14.4777808272722 ,14.598094219020746 ,-14.724423280356723 ,-14.77856430643569 ,-15.019191090140664 ,-15.049269438077802 ,-15.19364550817606 ,-15.33200590868689 ,-15.494428987547431 ,-15.602711040121127 ,-15.692946083932535 ,-15.843337823618223 ,-16.077948937527893 ,-16.132089963814735 ,-16.234356346801 ,-16.38448086486688 ,-16.517092817410095 ,-16.607327861221506 ,-16.727641252970052]
y = [ 14.0 ,20.0 ,16.0 ,12.0 ,8.0 ,12.0 ,22.0 ,16.0 ,18.0 ,24.0 ,17.0 ,22.0 ,15.0 ,13.0 ,16.0 ,30.0 ,16.0 ,9.0 ,11.0 ,4.0 ,9.0 ,24.0 ,22.0 ,11.0 , 11.0 ,0.0 ,7.0 ,11.0 ,8.0, 14.0 ,13.0 ,9.0 ,4.0 ,33.0 ,27.0 ,27.0 ,26.0 ,32.0 ,18.0 ,23.0 ,26.0 ,27.0 ,21.0 ,17.0 ,33.0 ,27.0 ,32.0 ,24.0 ,31.0 ,34.0 ,14.0 ,27.0 ,22.0 ,26.0 ,17.0 ,22.0 ,21.0 ,11.0 ,37.0 ,24.0 ,34.0 ,20.0 ,29.0 ,26.0 ,25.0 ,34.0 ,26.0 ,12.0 ,28.0 ,18.0 ,20.0 ,20.0 ,25.0 ,18.0 ,36.0 ,22.0 ,20.0 ,29.0 ,19.0 ,17.0 ,16.0 ,6.0 ,17.0 ,11.0 ,25.0 ,12.0 ,21.0 ,31.0 ,29.0 ,20.0 ,11.0 ,9.0 ,0.0 ,21.0 ,24.0 ,8.0 ,12.0 ,6.0 ,16.0 ,24.0 ,17.0 ,16.0 ,10.0 ,18.0 ,15.0 ,3.0 ,8.0 ,12.0 ,10.0 ,11.0 ,7.0 ,16.0 ,12.0 ,30.0 ,23.0 ,27.0 ,28.0 ,33.0 ,29.0 ,27.0 ,25.0 ,44.0 ,40.0 ,47.0 ,48.0 ,55.0 ,75.0 ,75.0 ,71.0 ,91.0 ,92.0 ,83.0 ,120.0 ,133.0 ,162.0 ,163.0 ,187.0 ,237.0 ,262.0 ,306.0 ,316.0 ,385.0 ,417.0 ,474.0 ,597.0 ,639.0 ,687.0 ,780.0 ,868.0 ,953.0 ,1079.0 ,1187.0 ,1279.0 ,1431.0 ,1628.0 ,1875.0 ,2180.0 ,2542.0 ,2988.0 ,3406.0 ,3890.0 ,4356.0 ,4824.0 ,5222.0 ,5621.0 ,5834.0 ,5937.0 ,5875.0 ,5838.0 ,5578.0 ,5269.0 ,4847.0 ,4403.0 ,3773.0 ,3335.0 ,2934.0 ,2420.0 ,2132.0 ,1873.0 ,1612.0 ,1425.0 ,1264.0 ,1173.0 ,1030.0 ,925.0 ,857.0 ,768.0 ,684.0 ,614.0 ,523.0 ,491.0 ,422.0 ,397.0 ,339.0 ,281.0 ,236.0 ,227.0 ,202.0 ,155.0 ,155.0 ,116.0 ,101.0 ,106.0 ,96.0 ,90.0 ,68.0 ,65.0 ,66.0 ,61.0 ,58.0 ,44.0 ,37.0 ,39.0 ,32.0 ,40.0 ,40.0 ,27.0 ,33.0 ,30.0 ,19.0 ,17.0 ,31.0 ,23.0 ,42.0 ,14.0 ,26.0 ,22.0 ,15.0 ,10.0 ,22.0 ,20.0 ,18.0 ,28.0 ,23.0 ,26.0 ,14.0 ,9.0 ,17.0 ,11.0 ,15.0 ,24.0 ,25.0 ,3.0 ,10.0 ,25.0 ,9.0 ,23.0 ,16.0 ,7.0 ,28.0 ,14.0 ,20.0 ,13.0 ,11.0 ,14.0 ,22.0 ,19.0 ,11.0 ,15.0 ,25.0 ,28.0 ,28.0 ,26.0 ,12.0 ,28.0 ,28.0 ,10.0 ,13.0 ,11.0 ,37.0 ,5.0 ,26.0 ,20.0 ,15.0 ,32.0 ,24.0 ,13.0 ,29.0 ,28.0 ,22.0 ,36.0 ,36.0 ,26.0 ,27.0 ,25.0 ,27.0 ,30.0 ,35.0 ,35.0 ,35.0 ,19.0 ,26.0 ,23.0 ,24.0 ,32.0]
I want to evaluate the area under the distribution. This is the code I've used:
from scipy.integrate import simps

area = simps(y, x)
But if I run it, it gives me this error:
RuntimeWarning: divide by zero encountered in true_divide
  y[slice1]*hsum*hsum/hprod +
RuntimeWarning: invalid value encountered in add
  y[slice1]*hsum*hsum/hprod +
I've tried to print the value of the area, but it returns nan.
Can anyone help me please?

I've run your code myself and it raised the same warnings.
I'd recommend using the trapz method instead of simps.
But be careful: first read about the mathematical background of these two functions in
How to decide between scipy.integrate.simps or numpy.trapz?.
If trapz suits your needs, use it :)
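As a minimal sketch of that suggestion: note that your x array contains a repeated value (15.071188186171264 appears twice), which gives Simpson's rule a zero-width interval and is the likely source of the divide-by-zero; the trapezoidal rule simply contributes zero area for such an interval.
import numpy as np

# trapezoidal rule over the same samples; x is in descending order,
# so the signed result comes out negative
area = np.trapz(y, x)
print(abs(area))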

Related

Find local maxima in data from dataframe

Is there a way to find the local maxima in data I read from a CSV file and put their values on the plot?
The x and y values, which are in a pandas dataframe, look something like this:
x = 1598.78, 1596.85, 1594.92, 1592.99, 1591.07, 1589.14, 1587.21, 1585.28, 1583.35, 1581.42, 1579.49, 1577.57, 1575.64, 1573.71, 1571.78, 1569.85, 1567.92, 1565.99, 1564.07, 1562.14, 1560.21, 1558.28, 1556.35, 1554.42, 1552.49, 1550.57, 1548.64, 1546.71, 1544.78, 1542.85, 1540.92, 1538.99, 1537.07, 1535.14, 1533.21, 1531.28, 1529.35, 1527.42, 1525.49, 1523.57, 1521.64, 1519.71, 1517.78, 1515.85, 1513.92, 1511.99, 1510.07, 1508.14, 1506.21, 1504.28, 1502.35, 1500.42, 1498.49, 1496.57, 1494.64, 1492.71, 1490.78, 1488.85, 1486.92, 1484.99, 1483.07, 1481.14, 1479.21, 1477.28, 1475.35, 1473.42, 1471.49, 1469.57, 1467.64, 1465.71, 1463.78, 1461.85, 1459.92, 1457.99, 1456.07, 1454.14, 1452.21, 1450.28, 1448.35, 1446.42, 1444.49, 1442.57, 1440.64, 1438.71, 1436.78, 1434.85, 1432.92, 1430.99, 1429.07, 1427.14, 1425.21, 1423.28, 1421.35, 1419.42, 1417.49, 1415.57, 1413.64, 1411.71, 1409.78, 1407.85, 1405.92, 1403.99, 1402.07, 1400.14
y = 0.640, 0.624, 0.609, 0.594, 0.581, 0.569, 0.558, 0.547, 0.537, 0.530, 0.523, 0.516, 0.508, 0.502, 0.497, 0.491, 0.487, 0.484, 0.481, 0.480, 0.479, 0.482, 0.490, 0.503, 0.520, 0.542, 0.566, 0.586, 0.600, 0.606, 0.593, 0.569, 0.557, 0.548, 0.538, 0.531, 0.527, 0.524, 0.522, 0.522, 0.523, 0.525, 0.526, 0.527, 0.530, 0.534, 0.536, 0.539, 0.547, 0.553, 0.557, 0.563, 0.573, 0.599, 0.654, 0.738, 0.852, 0.891, 0.810, 0.744, 0.711, 0.694, 0.686, 0.683, 0.683, 0.690, 0.700, 0.706, 0.713, 0.723, 0.731, 0.732, 0.737, 0.756, 0.779, 0.786, 0.790, 0.794, 0.802, 0.815, 0.827, 0.832, 0.831, 0.826, 0.823, 0.828, 0.834, 0.834, 0.832, 0.832, 0.831, 0.825, 0.816, 0.804, 0.798, 0.794, 0.786, 0.775, 0.764, 0.752, 0.739, 0.722, 0.708, 0.697
and I'm trying to get something like this.
P.S. Note that numeric values were added with the plt.text function just to exemplify what I want.
x = [1598.78, 1596.85, 1594.92, 1592.99, 1591.07, 1589.14, 1587.21, 1585.28, 1583.35, 1581.42, 1579.49, 1577.57, 1575.64, 1573.71, 1571.78, 1569.85, 1567.92, 1565.99, 1564.07, 1562.14, 1560.21, 1558.28, 1556.35, 1554.42, 1552.49, 1550.57, 1548.64, 1546.71, 1544.78, 1542.85, 1540.92, 1538.99, 1537.07, 1535.14, 1533.21, 1531.28, 1529.35, 1527.42, 1525.49, 1523.57, 1521.64, 1519.71, 1517.78, 1515.85, 1513.92, 1511.99, 1510.07, 1508.14, 1506.21, 1504.28, 1502.35, 1500.42, 1498.49, 1496.57, 1494.64, 1492.71, 1490.78, 1488.85, 1486.92, 1484.99, 1483.07, 1481.14, 1479.21, 1477.28, 1475.35, 1473.42, 1471.49, 1469.57, 1467.64, 1465.71, 1463.78, 1461.85, 1459.92, 1457.99, 1456.07, 1454.14, 1452.21, 1450.28, 1448.35, 1446.42, 1444.49, 1442.57, 1440.64, 1438.71, 1436.78, 1434.85, 1432.92, 1430.99, 1429.07, 1427.14, 1425.21, 1423.28, 1421.35, 1419.42, 1417.49, 1415.57, 1413.64, 1411.71, 1409.78, 1407.85, 1405.92, 1403.99, 1402.07, 1400.14]
y = [0.640, 0.624, 0.609, 0.594, 0.581, 0.569, 0.558, 0.547, 0.537, 0.530, 0.523, 0.516, 0.508, 0.502, 0.497, 0.491, 0.487, 0.484, 0.481, 0.480, 0.479, 0.482, 0.490, 0.503, 0.520, 0.542, 0.566, 0.586, 0.600, 0.606, 0.593, 0.569, 0.557, 0.548, 0.538, 0.531, 0.527, 0.524, 0.522, 0.522, 0.523, 0.525, 0.526, 0.527, 0.530, 0.534, 0.536, 0.539, 0.547, 0.553, 0.557, 0.563, 0.573, 0.599, 0.654, 0.738, 0.852, 0.891, 0.810, 0.744, 0.711, 0.694, 0.686, 0.683, 0.683, 0.690, 0.700, 0.706, 0.713, 0.723, 0.731, 0.732, 0.737, 0.756, 0.779, 0.786, 0.790, 0.794, 0.802, 0.815, 0.827, 0.832, 0.831, 0.826, 0.823, 0.828, 0.834, 0.834, 0.832, 0.832, 0.831, 0.825, 0.816, 0.804, 0.798, 0.794, 0.786, 0.775, 0.764, 0.752, 0.739, 0.722, 0.708, 0.697]
import numpy as np
import matplotlib.pyplot as plt

# The slope of a line is a measure of its steepness. Mathematically, slope is
# calculated as "rise over run" (change in y divided by change in x).
slope = [np.sign((y[i] - y[i-1]) / (x[i] - x[i-1])) for i in range(1, len(y))]
x_prev = slope[0]
optima_dic = {'minima': [], 'maxima': []}
for i in range(1, len(slope)):
    if slope[i] * x_prev == -1:  # slope changed
        if x_prev == 1:  # slope changed from 1 to -1
            optima_dic['maxima'].append(i)
        else:  # slope changed from -1 to 1
            optima_dic['minima'].append(i)
        x_prev = -x_prev

plt.rcParams["figure.figsize"] = (20, 10)
ix = 0
for x_, y_ in zip(x, y):
    plt.plot(x_, y_, 'o--', color='grey')
    if ix in optima_dic['maxima']:  # label the maxima, as the question asks
        plt.text(x_, y_, s=x_, fontsize=10)
    ix += 1
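Alternatively, a shorter sketch (assuming SciPy is available) uses scipy.signal.find_peaks, which returns the indices of the local maxima directly:
import numpy as np
from scipy.signal import find_peaks

y_arr = np.array(y)
maxima, _ = find_peaks(y_arr)   # indices of local maxima
minima, _ = find_peaks(-y_arr)  # local minima are peaks of -y
print([round(x[i], 2) for i in maxima])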

How can I smooth the graph values or extract main signals only?

When I try to run the code below, I get this graph.
My code:
from numpy import nan
import json
import os
import numpy as np
import subprocess
import math
import matplotlib.pyplot as plt
from statistics import mean, stdev

def smooth(t):
    new_t = []
    for i, x in enumerate(t):
        neighbourhood = t[max(i-2, 0): i+3]
        m = mean(neighbourhood)
        s = stdev(neighbourhood, xbar=m)
        if abs(x - m) > s:
            x = (t[i - 1 + (i == 0)*2] + t[i + 1 - (i+1 == len(t))*2]) / 2
        new_t.append(x)
    return new_t
def outLiersFN(*U):
    outliers = []  # after preprocessing list
    # preprocessing: Fc = | 2*LF1 (prev by 1) - LF2 (prev by 2) |
    c0 = -2  # (previous by 2) # from original
    c1 = -1  # (previous)      # from original
    c2 = 0   # (current)       # from original
    c3 = 1   # (next)          # from original
    preP = U[0]  # original list
    if c2 == 0:
        outliers.append(preP[0])
        c1 += 1
        c2 += 1
        c0 += 1
        c3 += 1
    oldlen = len(preP)
    M_RangeOfMotion = 90
    while oldlen > c2:
        if c3 == oldlen:
            outliers.insert(c2, preP[c2])  # preP[c2] >> last element in old list
            break
        if ((preP[c2] > M_RangeOfMotion and preP[c2] < (preP[c1] + preP[c3]) / 2) or
                (preP[c2] < M_RangeOfMotion and preP[c2] > (preP[c1] + preP[c3]) / 2)):  # Check Paper 3.3.1
            Equ = (preP[c1] + preP[c3]) / 2  # preprocessing: replace the current frame
            formatted_float = "{:.2f}".format(Equ)  # keep 2 decimals only
            equu = float(formatted_float)  # from string back to float
            outliers.insert(c2, equu)  # insert the preprocessed value into the list
            c1 += 1
            c2 += 1
            c0 += 1
            c3 += 1
        else:
            Equ = preP[c2]  # keep the same element (do nothing)
            formatted_float = "{:.2f}".format(Equ)  # keep 2 decimals only
            equu = float(formatted_float)  # from string back to float
            outliers.insert(c2, equu)  # insert the value into the list
            c1 += 1
            c2 += 1
            c0 += 1
            c3 += 1
    return outliers
def remove_nan(values):
    return [x for x in values if not math.isnan(x)]
the_angel = [176.04, 173.82, 170.09, 165.3, 171.8, 178.3, 178.77, 179.24, 179.93, 180.0, 173.39, 166.78, 166.03, 165.28, 165.72, 166.17, 166.71, 167.26, 168.04, 167.22, 166.68, 166.13, 161.53, 165.81, 170.1, 170.05, 170.5, 173.01, 176.02, 174.53, 160.09, 146.33, 146.38, 146.71, 150.33, 153.95, 154.32, 154.69, 134.52, 114.34, 115.6, 116.86, 134.99, 153.12, 152.28, 151.43, 151.36, 152.32, 158.9, 166.52, 177.74, 178.61, 179.47, 167.44, 155.4, 161.54, 167.68, 163.96, 160.24, 137.45, 114.66, 117.78, 120.89, 139.95, 139.62, 125.51, 111.79, 112.07, 112.74, 110.22, 107.7, 107.3, 106.52, 105.73, 103.07, 101.35, 102.5, 104.59, 104.6, 104.49, 104.38, 102.81, 101.25, 100.62, 100.25, 100.15, 100.32, 99.84, 99.36, 100.04, 100.31, 99.14, 98.3, 97.92, 97.41, 96.9, 96.39, 95.88, 95.9, 95.9, 96.02, 96.14, 96.39, 95.2, 94.56, 94.02, 93.88, 93.8, 93.77, 93.88, 94.04, 93.77, 93.65, 93.53, 94.2, 94.88, 92.59, 90.29, 27.01, 32.9, 38.78, 50.19, 61.59, 61.95, 62.31, 97.46, 97.38, 97.04, 96.46, 96.02, 96.1, 96.33, 95.61, 89.47, 89.34, 89.22, 89.48, 89.75, 90.02, 90.28, 88.16, 88.22, 88.29, 88.17, 88.17, 94.98, 94.84, 94.69, 94.94, 94.74, 94.54, 94.69, 94.71, 94.64, 94.58, 94.19, 94.52, 94.85, 87.7, 87.54, 87.38, 95.71, 96.57, 97.11, 97.05, 96.56, 96.07, 95.76, 95.56, 95.35, 95.28, 95.74, 96.2, 96.32, 96.33, 96.2, 96.14, 96.07, 96.07, 96.12, 96.17, 96.28, 96.31, 96.33, 96.16, 96.05, 95.94, 95.33, 88.96, 95.0, 95.78, 88.19, 88.19, 88.19, 87.92, 87.93, 88.03, 87.94, 87.86, 87.85, 87.89, 88.08, 88.01, 87.88, 88.02, 88.15, 88.15, 88.66, 88.73, 88.81, 88.41, 88.55, 88.68, 88.69, 88.02, 87.35, 95.19, 95.39, 95.38, 95.37, 95.27, 95.17, 95.33, 95.32, 95.31, 95.37, 95.42, 95.34, 95.44, 95.53, 95.47, 95.41, 95.13, 94.15, 94.78, 97.64, 97.1, 96.87, 97.03, 96.76, 35.44, 23.63, 23.27, 24.71, 26.16, 96.36, 113.13, 129.9, 96.82, 63.74, 34.25, 33.42, 32.6, 30.69, 31.06, 31.43, 97.14, 97.51, 97.23, 98.54, 100.13, 100.95, 28.82, 33.81, 66.81, 99.82, 102.63, 101.9, 101.44, 102.19, 103.22, 103.67, 104.13, 104.07, 104.73, 105.46, 103.74, 102.02, 103.32, 102.59, 29.54, 28.08, 28.76, 29.79, 30.82, 113.51, 129.34, 145.16, 143.18, 148.29, 153.67, 166.14, 161.16, 151.64, 149.27, 146.9, 151.67, 153.02, 149.28, 145.53, 149.1, 152.67, 158.78, 164.89, 164.84, 164.8, 162.11, 159.42, 156.73, 156.28, 155.83, 156.4, 161.0, 165.59, 164.44, 159.73, 155.76, 156.97, 158.92, 159.15, 159.39, 159.99, 160.44, 160.88, 163.89, 166.9, 167.71, 167.11, 167.0, 167.44, 168.38, 153.16, 137.94, 137.65, 152.09, 169.49, 171.36, 173.22, 174.01, 174.0, 174.2, 174.41, 157.74, 141.09, 149.32, 157.57, 156.4, 148.4, 140.78, 141.06, 141.73, 143.05, 143.91, 156.59, 169.29, 172.17, 175.05, 175.29, 175.27, 175.15, 175.02, 174.81, 174.59, 174.76, 174.94, 175.18, 175.41, 175.23, 174.51, 174.64, 174.77, 174.56, 173.25, 172.38, 174.17, 176.4, 177.27, 177.29, 177.33, 178.64, 179.98, 179.99, 176.0, 172.88, 173.77, 173.8, 173.97, 174.72, 175.24, 176.89, 179.07, 179.27, 178.78, 178.29, 175.61, 174.21, 172.8, 173.05, 173.41, 173.77, 174.65, 175.52, 175.58, 176.15, 176.71, 159.12, 141.54, 141.12, 155.62, 170.53, 165.54, 160.71, 158.22, 156.35, 156.82, 158.55, 160.27, 161.33, 162.39, 162.37, 159.48, 156.59, 156.77, 158.05, 159.32, 158.49, 157.66, 157.7, 157.74, 158.44, 159.14, 150.13, 143.06, 136.0, 125.7, 115.41, 111.19, 106.97, 107.1, 107.24, 107.45, 107.67, 113.34, 119.01, 144.87, 170.73, 174.31, 177.89, 174.78, 171.67, 163.26, 134.58, 105.9, 102.98, 100.77, 101.05, 101.39, 101.73, 99.79, 98.71, 97.64, 97.8, 97.89, 96.67, 95.45, 94.33, 93.38, 92.44, 48.53, 91.4, 91.35, 91.34, 91.33, 
90.92, 90.51, 88.63, 87.0, 86.74, 86.48, 96.79, 96.09, 95.46, 95.39, 94.32, 93.25, 93.31, 93.37, 93.11, 92.57, 93.41, 94.25, 96.48, 92.71, 88.94, 90.07, 90.43, 78.06, 77.69, 77.32, 90.1, 89.15, 89.14, 88.85, 88.38, 87.63, 121.2, 120.66, 86.89, 86.42, 85.69, 84.86, 84.86, 85.34, 85.82, 86.07, 86.32, 85.82, 85.32, 86.23, 86.69, 87.15, 87.04, 86.87, 86.58, 86.0, 85.41, 85.41, 85.53, 85.66, 85.7, 85.72, 85.75, 85.92, 86.09, 85.77, 85.45, 84.94, 85.55, 86.16, 86.21, 86.1, 85.77, 85.27, 84.56, 84.99, 85.38, 85.42, 85.98, 86.54, 86.5, 86.45, 86.56, 86.63, 86.35, 86.08, 85.82, 85.51, 85.21, 84.6, 84.84, 84.97, 85.1, 86.12, 86.88, 86.8, 86.46, 86.47, 87.23, 87.8, 88.0, 88.08, 88.16, 87.72, 87.63, 87.37, 86.42, 86.48, 87.24, 87.97, 88.09, 88.19, 88.32, 88.44, 87.82, 87.2, 86.03, 85.78, 91.5, 93.0, 88.2, 88.52, 88.42, 87.28, 85.73, 85.62, 85.5, 85.5, 87.06, 87.6, 88.1, 88.31, 88.53, 88.77, 89.14, 89.52, 89.46, 89.4, 90.28, 89.74, 91.28, 92.17, 92.16, 92.15, 93.08, 94.0, 94.66, 95.32, 94.13, 93.7, 93.32, 93.69, 94.58, 95.47, 97.25, 99.03, 99.63, 99.67, 99.71, 100.33, 101.58, 103.36, 103.49, 103.41, 106.31, 109.34, 109.28, 109.21, 107.76, 106.31, 105.43, 104.94, 104.44, 111.19, 117.93, 115.59, 113.24, 116.15, 119.06, 125.43, 140.72, 156.0, 161.7, 143.52, 135.33, 127.13, 127.68, 148.68, 169.68, 172.2, 174.72, 174.75, 174.66, 158.57, 142.63, 145.13, 153.29, 161.45, 163.34, 165.24, 162.25, 159.89, 159.07, 156.39, 155.21, 156.04, 159.29, 160.07, 160.85, 163.45, 162.93, 161.71, 160.06, 158.4, 144.74, 132.64, 134.57, 150.22, 165.86, 172.95, 174.12, 175.3, 175.5, 176.31, 177.71, 179.72, 168.13, 156.55, 146.24, 155.75, 176.0, 175.99, 175.98, 176.0, 176.02, 176.25, 175.13, 174.26, 173.38, 173.37, 173.46, 176.34, 174.55, 172.77, 168.45, 166.35, 166.47, 168.81, 167.43, 166.79, 167.35, 168.65, 168.51, 168.37, 168.88, 169.74, 171.19, 171.33, 169.91, 168.49, 167.11, 166.83, 167.01, 168.68, 170.34, 170.43, 172.15, 173.86, 177.62, 177.61, 175.34, 173.06, 176.47, 179.87, 179.9, 177.67, 175.67, 175.39, 175.36, 177.03, 176.0, 174.98, 174.96, 174.94, 175.76, 176.57, 169.05, 162.99, 164.97, 168.74, 172.51, 167.38, 165.08, 163.03, 163.81, 164.83, 164.81, 164.8, 165.88, 165.36, 159.61, 153.86, 153.57, 153.61, 153.65, 154.62, 155.58, 157.97, 156.35, 155.66, 154.98, 156.11, 157.24, 159.25, 159.6, 160.43, 161.26, 164.71, 168.17, 147.46, 126.92, 106.38, 105.23, 104.4, 105.37, 106.65, 109.21, 107.44, 104.65, 101.86, 102.35, 102.84, 102.79, 102.19, 101.59, 100.98, 100.38, 98.72, 97.73, 97.32, 96.9, 95.11, 93.97, 94.12, 94.12, 93.1, 92.08, 89.29, 90.35, 90.35, 90.35, 90.35, 86.95, 86.37, 86.06, 85.74, 94.56, 93.16, 92.46, 91.76, 88.55, 85.33, 87.52, 92.18, 93.68, 95.18, 94.4, 92.17, 89.94, 89.4, 89.37, 99.44, 100.98, 102.52, 103.18, 88.96, 88.23, 87.5, 85.2, 85.19, 86.87, 121.42, 155.96, 155.97, 155.97, 86.2, 86.5, 86.8, 87.22, 87.36, 87.34, 87.03, 87.04, 87.05, 86.36, 85.68, 85.71, 85.84, 85.93, 86.01, 86.04, 86.08, 85.92, 86.05, 86.18, 86.17, 86.19, 86.23, 86.22, 86.09, 85.92, 85.66, 85.69, 85.69, 85.31, 84.91, 84.93, 84.95, 84.93, 84.91, 84.9, 84.9, 84.9, 84.9, 85.38, 85.52, 85.66, 85.66, 85.4, 85.14, 85.47, 85.8, 85.72, 85.64, 86.09, 85.84, 85.27, 85.47, 85.66, 85.59, 85.52, 85.38, 85.39, 85.28, 85.17, 85.39, 85.7, 85.98, 86.26, 86.61, 92.97, 93.15, 86.58, 86.58, 86.53, 86.47, 98.55, 99.41, 100.16, 100.9, 89.19, 90.28, 91.38, 91.39, 91.4, 91.44, 92.05, 131.05, 170.63, 170.13, 162.43, 125.64, 88.85, 88.85, 99.08, 100.38, 101.69, 100.74, 99.79, 96.33, 93.31, 93.73, 94.87, 96.01, 96.93, 97.85, 98.97, 97.85, 98.14, 99.37, 102.01, 
103.8, 105.58, 108.52, 108.12, 107.72, 106.75, 106.82, 109.08, 112.37, 112.52, 112.66, 112.97, 114.12, 115.64, 117.1, 118.57, 126.13, 133.69, 149.27, 163.96, 166.62, 169.27, 164.94, 160.61, 149.35, 141.18, 143.41, 143.57, 149.26, 157.49, 159.94, 151.93, 147.47, 145.97, 145.56, 145.15, 143.85, 142.54, 142.18, 142.43, 143.12, 144.41, 144.38, 151.99, 159.59, 174.81, 174.94, 175.84, 176.87, 162.41, 152.94, 151.59, 155.24, 155.22, 155.19, 155.04]
p0 = outLiersFN(smooth(remove_nan(the_angel)))
the_angel = p0
plt.plot(the_angel)
plt.show()
print(the_angel)
How can I smooth the values in the_angel to get a graph like this (red line)?
I mean ignoring all the unnecessary and noisy values and keeping only the main line.
You can edit my code or suggest a new filter or algorithm.
pandas has a rolling() method for dataframes that you can use to calculate the mean over a window of values, e.g. the 70 closest ones:
import pandas as pd
import matplotlib.pyplot as plt

WINDOW_SIZE = 70

the_angel = [176.04, 173.82, 170.09, 165.3, 171.8,  # ...
]
df = pd.DataFrame({'the angel': the_angel})
df[f'mean of {WINDOW_SIZE}'] = df['the angel'].rolling(
    window=WINDOW_SIZE, center=True).mean()
df.plot(color=['blue', 'red']);
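If the rolling mean lags too much for your data, a Savitzky-Golay filter is another common choice; here is a minimal sketch, assuming SciPy is installed and the_angel holds the full list from the question:
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt

# fit a cubic polynomial over a sliding 71-point window
# (window_length must be odd and no longer than the data)
smoothed = savgol_filter(the_angel, window_length=71, polyorder=3)

plt.plot(the_angel, color='blue', label='raw')
plt.plot(smoothed, color='red', label='smoothed')
plt.legend()
plt.show()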

Python: Input variable for my "matching" function [duplicate]

This question already has an answer here:
Python: How to call a long function containing hundreds of lists in a short form
(1 answer)
Closed 5 years ago.
I have a lot of lists such as:
ABCC8 = ['TRIM29', 'IGL#', 'DOCK6', 'SVEP1', 'S100A11', 'EPHA2', 'KLHL7', 'ANXA3', 'NAB1', 'CELF2', 'EDNRB', 'PLAGL1', 'IL6ST', 'S100A8', 'CKLF', 'TIPARP', 'CDH3', 'MAP3K8', 'LYST', 'LEPR', 'FHL2', 'ARL4C', 'IL1RN', 'ESR1', 'CD93', 'ATP2B4', 'KAT2B', 'ELOVL5', 'SCD', 'SPTBN1', 'AKAP13', 'LDLR', 'ADRB2', 'LTBP4', 'TGM2', 'TIMP3', 'RAN', 'LAMA3', 'ASPH', 'ID4', 'STX11', 'CNN2', 'EGR1']
APP = ['GULP1', 'PREPL', 'FHL1', 'METTL7A', 'TRIM13', 'YPEL5', 'PTEN', 'FAM190B', 'GSN', 'UBL3', 'PTGER3', 'COBLL1', 'EPB41L3', 'KLF4', 'BCL2L2', 'CYLD', 'SLK', 'ENSA', 'SKAP2', 'FBXO3', 'PDCD4', 'ATP2A2', 'AKAP11', 'PAFAH1B1', 'RALGAPA1', 'YWHAZ', 'BNIP3L', 'ATP8A1', 'TNXB', 'DICER1', 'C17orf91', 'BEX4', 'PPM1A', '2017-09-10', 'NDRG2', 'NCOA1', 'NAB1', 'STX7', 'ZFAND5', 'CD47', 'SFRS5', 'CLASP2', 'PBX1', 'NR3C1', 'ABCA8', 'ETFDH', 'RBPMS', 'FOXO1', 'KLF6', 'ADH1B', 'RAB22A', 'CCNG2', 'NFIB', 'IDS', 'NR3C2', 'MAF', 'NDEL1', 'EZR', 'PCDH9', 'KIAA0494', 'CITED2', 'MGEA5', 'RUFY3', 'ALDH3A2', 'N4BP2L2', 'EPS15', 'TSPAN5', 'SNRPN', 'SSBP2', 'ELOVL5', 'C5orf4', 'FOXN3', 'ABCA5', 'SEC62', 'PELI1', 'MYCBP2', 'USP15', 'TACC1', 'SHMT1', 'RNF103', 'CDC14B', 'SYNE1', 'NDN', 'PHKB', 'EIF1', 'TROVE2', 'MBD4', 'GAB1']
BECN1 = ['LMNA', 'NHP2L1', 'IDS', 'ATP6V0B', 'ENSA', 'TBCB', 'NDUFA13', 'TOLLIP', 'PLEKHB2', 'MBOAT7', 'C16orf13', 'PGAM1', 'MIF', 'ACTR1A', 'OAZ1', 'GNAS', 'ARF1', 'MAPKAPK3', 'LCMT1', 'ATP6V1D', 'FLOT1', 'PRR13', 'COX5B', 'PGP', 'CYB561', 'CNIH4', 'COX6B1', 'ARPC5L', 'NCKIPSD', 'C9orf16', 'LSM4', 'ATP5L', 'C14orf2', 'AURKAIP1', 'MRPL41', 'PDPK1', 'NOP10', 'CANT1', 'CALM3', 'PSEN2', 'C9orf86', 'ATP6V0E1', 'PIN1', 'LARP1', 'HTATIP2', 'PPP1R7', 'HCFC1R1', 'UQCR10', 'FAM134A', 'GPAA1', 'THY1', 'PPM1A', 'NAPA', 'NDUFC2', 'EPS8L1', 'PSME2', 'UBE2M', 'ORMDL2', 'TCEB2', 'RMND5B', 'ATPIF1', 'RNF19B', 'PEBP1', 'PCBP2', 'GHITM', 'AP3S2', 'TSPAN5', 'AP2S1', 'C20orf24', 'RABIF', 'NDUFB2', 'PFDN2', 'GPR172A', 'RTN4', 'GAPDH', 'MAPK13', 'FKBP8', 'PTGER3', 'BSCL2', 'TUBG1', 'FAM162A', 'GDI1', 'SPTLC2', 'YWHAZ', 'BCAP31', 'OSBPL1A', 'ATP6AP1', 'CALM1', 'PEX16', 'MYCBP2']
ARNTL = ['NCAM1', 'SLC11A2', 'RPL35A', 'PDLIM5', 'RPL31', 'NFIB', 'GYG2', 'IGHG1', 'NAAA', 'DLC1', 'EPOR', 'DIO2', 'ESR1', 'KLK10', 'CYP2C9', 'SPN', 'RPS9', 'PRELP', 'CYP3A43', 'PLAGL1', 'COBLL1', 'ADCK2', 'RPL13', 'NRP2', 'SCEL', 'DOCK6', 'NENF', 'MLLT4', 'SERPINB13', 'PALMD', 'TMEM132A', 'ASAP3', 'MTAP', 'NOVA1', 'ALOX12', 'SPINK5', 'LDB3', 'ATP5S', 'LMNA', 'BAIAP2', 'FZD4', 'GNAS', 'OBSL1', 'TCL6', 'ICOSLG', 'MACROD2', 'MAST4', 'EDA', 'ADAM22', 'CSHL1', 'SYNGR1', 'THBS1', 'PEX16', 'NOS1', 'SLCO1A2', 'CYP2A7', 'PRDM2', 'DTNA', 'HSD17B4', 'RPL29', 'PDCD4', 'IL1RN', 'CASZ1', 'C9orf16', 'RGS12', 'TRD#', 'ATP1A2', 'MPRIP', 'PDE4C', 'SPTLC2', 'TNXB', 'DDAH2', 'AOX1', 'PAIP2B', 'HNF4A', 'GLS', 'EMP1', 'ARHGEF4', 'FUT6', 'ACACB', 'NR5A2', 'N4BP2L1', 'APAF1', 'DSC2', 'EDNRB', 'RPL27A', 'CYP2C18']
I have a function which returns the number of matching strings between each of the different lists and my reference (`ref`) list.
def sort_by_matches(ref, lists):
    reference = set(ref)
    lists = sorted([[len(reference.intersection(set(l))), name, l] for name, l in lists],
                   key=lambda x: (x[0], -len(x[2])), reverse=True)
    for matches, name, a_list in lists:
        print("Matches {} in {}".format(matches, name))
sort_by_matches(APP, [("ABCC8", ABCC8), ("APP", APP), ("BECN1", BECN1), ("ARNTL", ARNTL), ("BMI1", BMI1), ("CASP8", CASP8), ("CASP9", CASP9), ("CLOCK", CLOCK), ("CRAT", CRAT), ("CRY2", CRY2), ("CSF1", CSF1), ("CTCF", CTCF), ("DNMT1", DNMT1), ("EP300", EP300), ("FBXW7", FBXW7), ("FOXA1", FOXA1), ("FOXO1", FOXO1), ("FOXO3", FOXO3), ("GADD34", GADD34), ("GATA3", GATA3), ("GCK", GCK), ("GLI1", GLI1), ("GLP1", GLP1), ("GLP1R", GLP1R), ("GLUT1", GLUT1),("GLUT2", GLUT2),("HES1", HES1),("HEY1", HEY1),("HIF1A", HIF1A),("HNF1A", HNF1A),("HNF4A", HNF4A),("ICMT", ICMT),("ID1", ID1),("IDH1", IDH1),("IL4", IL4),("IL6", IL6),("LC3A", LC3A),("LYL1", LYL1),("MFSD2A", MFSD2A),("MOAP1", MOAP1),("MTNR1B", MTNR1B),("MTOR", MTOR),("MYF5", MYF5),("MYOD1", MYOD1),("MSTN", MSTN),("NANOG", NANOG),("NOTCH1", NOTCH1),("NR1D1", NR1D1),("POU5F1", POU5F1),("PAX7", PAX7),("PDK1", PDK1),("PER2", PER2),("PHF6", PHF6),("PRMT5", PRMT5),("PSEN1", PSEN1),("PSEN2", PSEN2),("PTCH1", PTCH1),("RMST", RMST),("RUNX1", RUNX1),("SETD2", SETD2),("SIN3A", SIN3A),("SOCS1", SOCS1),("SOX2", SOX2),("STAT3", STAT3),("STK11", STK11),("TAF1", TAF1),("TCF3", TCF3),("TEAD1", TEAD1), ("TERT", TERT),("RANKL", RANKL),("TOP2A", TOP2A), ("TOX3", TOX3), ("TRIM28", TRIM28), ("TSHZ2", TSHZ2), ("TSHZ3", TSHZ3), ("TSP1", TSP1), ("TWIST1", TWIST1), ("FN1", FN1), ("VHL", VHL), ("WLS", WLS), ("WNT3", WNT3), ("WNT3A", WNT3A), ("WNT5A", WNT5A), ("WT1", WT1), ("YAP1", YAP1), ('MYBPC3', MYBPC3), ("PPARG", PPARG), ("NKD1", NKD1), ("LRP5", LRP5), ("SMO", SMO), ("CSNK1E", CSNK1E), ("DKK1", DKK1), ("MYH7", MYH7), ("AXIN2", AXIN2), ("TCF7", TCF7), ("NEUROD1", NEUROD1), ("FZD5", FZD5), ("FZD8", FZD8), ("CREB1", CREB1), ("TCF7L2", TCF7L2), ("SOX17", SOX17), ("TP53", TP53), ("PTGER3", PTGER3), ("FERMT2", FERMT2), ("WNT1", WNT1), ("WNT7B", WNT7B), ("MDM4", MDM4), ("IL10", IL10 ), ("DVL1", DVL1 ), ("PGR", PGR), ("TSC1", TSC1), ("ASCL2", ASCL2)])
How can I use an input variable for ref in my function sort_by_matches(ref, lists), instead of copy-pasting the call with a different reference every time?
Copy-pasting the matching call above is far too long, since I have hundreds of lists. How can I solve this problem?
There is probably a more succinct way of doing this, as I am new to Python, but I made the following modifications:
The lists you have defined are put into a list of tuples, with the first item in the tuple representing the name of the list, and the second item referencing the list you have already defined (similar to how you're calling your function already):
myLists = [("ABCC8", ABCC8), ("APP", APP), ("BECN1", BECN1), ("ARNTL", ARNTL)]
I modified the ref parameter so your function sort_by_matches expects a tuple for that parameter (this is so you can reference the name of the list, if needed):
def sort_by_matches(ref, lists):
    reference = set(ref[1])
    lists = sorted([[len(reference.intersection(set(l))), name, l] for name, l in lists],
                   key=lambda x: (x[0], -len(x[2])), reverse=True)
    for matches, name, a_list in lists:
        print("{} Matches {} in {}".format(ref[0], matches, name))
Next, loop over myLists, calling sort_by_matches for the reference list and all lists following it:
i = 0
while i < len(myLists) - 1:
    sort_by_matches(myLists[i], myLists[i + 1:])
    i = i + 1
This compares each list to every other list.
The output looks like this:
ABCC8 Matches 5 in ARNTL
ABCC8 Matches 2 in APP
ABCC8 Matches 0 in BECN1
APP Matches 7 in BECN1
APP Matches 4 in ARNTL
BECN1 Matches 5 in ARNTL
EDIT
You will need some way to reference all the lists, since they are defined within the code. Below I am storing references to your lists in the variable MY_LISTS; you would just need to add an entry to this variable each time you define another list in your program. (However you decide to implement this, because the lists are defined explicitly in the code, there will need to be a collection referencing them to avoid spelling out every list each time you call your function.)
No modifications have been made to the sort_by_matches function. Instead, in the while loop, the name of the current reference list is printed out so you'll know which list the other lists are being compared to.
The loop starts with the first list in MY_LISTS, comparing it to all subsequent lists in MY_LISTS. In each iteration of the loop, the reference list is only compared to the subsequent lists in MY_LISTS. This makes it so a comparison is only performed on every possible pair of lists once.
ABCC8 = ['TRIM29', 'IGL#', 'DOCK6', 'SVEP1', 'S100A11', 'EPHA2', 'KLHL7', 'ANXA3', 'NAB1', 'CELF2', 'EDNRB', 'PLAGL1', 'IL6ST', 'S100A8', 'CKLF', 'TIPARP', 'CDH3', 'MAP3K8', 'LYST', 'LEPR', 'FHL2', 'ARL4C', 'IL1RN', 'ESR1', 'CD93', 'ATP2B4', 'KAT2B', 'ELOVL5', 'SCD', 'SPTBN1', 'AKAP13', 'LDLR', 'ADRB2', 'LTBP4', 'TGM2', 'TIMP3', 'RAN', 'LAMA3', 'ASPH', 'ID4', 'STX11', 'CNN2', 'EGR1']
APP = ['GULP1', 'PREPL', 'FHL1', 'METTL7A', 'TRIM13', 'YPEL5', 'PTEN', 'FAM190B', 'GSN', 'UBL3', 'PTGER3', 'COBLL1', 'EPB41L3', 'KLF4', 'BCL2L2', 'CYLD', 'SLK', 'ENSA', 'SKAP2', 'FBXO3', 'PDCD4', 'ATP2A2', 'AKAP11', 'PAFAH1B1', 'RALGAPA1', 'YWHAZ', 'BNIP3L', 'ATP8A1', 'TNXB', 'DICER1', 'C17orf91', 'BEX4', 'PPM1A', '2017-09-10', 'NDRG2', 'NCOA1', 'NAB1', 'STX7', 'ZFAND5', 'CD47', 'SFRS5', 'CLASP2', 'PBX1', 'NR3C1', 'ABCA8', 'ETFDH', 'RBPMS', 'FOXO1', 'KLF6', 'ADH1B', 'RAB22A', 'CCNG2', 'NFIB', 'IDS', 'NR3C2', 'MAF', 'NDEL1', 'EZR', 'PCDH9', 'KIAA0494', 'CITED2', 'MGEA5', 'RUFY3', 'ALDH3A2', 'N4BP2L2', 'EPS15', 'TSPAN5', 'SNRPN', 'SSBP2', 'ELOVL5', 'C5orf4', 'FOXN3', 'ABCA5', 'SEC62', 'PELI1', 'MYCBP2', 'USP15', 'TACC1', 'SHMT1', 'RNF103', 'CDC14B', 'SYNE1', 'NDN', 'PHKB', 'EIF1', 'TROVE2', 'MBD4', 'GAB1']
BECN1 = ['LMNA', 'NHP2L1', 'IDS', 'ATP6V0B', 'ENSA', 'TBCB', 'NDUFA13', 'TOLLIP', 'PLEKHB2', 'MBOAT7', 'C16orf13', 'PGAM1', 'MIF', 'ACTR1A', 'OAZ1', 'GNAS', 'ARF1', 'MAPKAPK3', 'LCMT1', 'ATP6V1D', 'FLOT1', 'PRR13', 'COX5B', 'PGP', 'CYB561', 'CNIH4', 'COX6B1', 'ARPC5L', 'NCKIPSD', 'C9orf16', 'LSM4', 'ATP5L', 'C14orf2', 'AURKAIP1', 'MRPL41', 'PDPK1', 'NOP10', 'CANT1', 'CALM3', 'PSEN2', 'C9orf86', 'ATP6V0E1', 'PIN1', 'LARP1', 'HTATIP2', 'PPP1R7', 'HCFC1R1', 'UQCR10', 'FAM134A', 'GPAA1', 'THY1', 'PPM1A', 'NAPA', 'NDUFC2', 'EPS8L1', 'PSME2', 'UBE2M', 'ORMDL2', 'TCEB2', 'RMND5B', 'ATPIF1', 'RNF19B', 'PEBP1', 'PCBP2', 'GHITM', 'AP3S2', 'TSPAN5', 'AP2S1', 'C20orf24', 'RABIF', 'NDUFB2', 'PFDN2', 'GPR172A', 'RTN4', 'GAPDH', 'MAPK13', 'FKBP8', 'PTGER3', 'BSCL2', 'TUBG1', 'FAM162A', 'GDI1', 'SPTLC2', 'YWHAZ', 'BCAP31', 'OSBPL1A', 'ATP6AP1', 'CALM1', 'PEX16', 'MYCBP2']
ARNTL = ['NCAM1', 'SLC11A2', 'RPL35A', 'PDLIM5', 'RPL31', 'NFIB', 'GYG2', 'IGHG1', 'NAAA', 'DLC1', 'EPOR', 'DIO2', 'ESR1', 'KLK10', 'CYP2C9', 'SPN', 'RPS9', 'PRELP', 'CYP3A43', 'PLAGL1', 'COBLL1', 'ADCK2', 'RPL13', 'NRP2', 'SCEL', 'DOCK6', 'NENF', 'MLLT4', 'SERPINB13', 'PALMD', 'TMEM132A', 'ASAP3', 'MTAP', 'NOVA1', 'ALOX12', 'SPINK5', 'LDB3', 'ATP5S', 'LMNA', 'BAIAP2', 'FZD4', 'GNAS', 'OBSL1', 'TCL6', 'ICOSLG', 'MACROD2', 'MAST4', 'EDA', 'ADAM22', 'CSHL1', 'SYNGR1', 'THBS1', 'PEX16', 'NOS1', 'SLCO1A2', 'CYP2A7', 'PRDM2', 'DTNA', 'HSD17B4', 'RPL29', 'PDCD4', 'IL1RN', 'CASZ1', 'C9orf16', 'RGS12', 'TRD#', 'ATP1A2', 'MPRIP', 'PDE4C', 'SPTLC2', 'TNXB', 'DDAH2', 'AOX1', 'PAIP2B', 'HNF4A', 'GLS', 'EMP1', 'ARHGEF4', 'FUT6', 'ACACB', 'NR5A2', 'N4BP2L1', 'APAF1', 'DSC2', 'EDNRB', 'RPL27A', 'CYP2C18']
MY_LISTS = [("ABCC8", ABCC8), ("APP", APP), ("BECN1", BECN1), ("ARNTL", ARNTL)]
def sort_by_matches(ref, lists):
    reference = set(ref)
    lists = sorted([[len(reference.intersection(set(l))), name, l] for name, l in lists],
                   key=lambda x: (x[0], -len(x[2])), reverse=True)
    for matches, name, a_list in lists:
        print("Matches {} in {}".format(matches, name))
i = 0
while i < len(MY_LISTS) - 1:
    print("Comparing Lists to " + MY_LISTS[i][0])
    sort_by_matches(MY_LISTS[i][1], MY_LISTS[i + 1:])
    i = i + 1
The output looks like this:
Comparing Lists to ABCC8
Matches 5 in ARNTL
Matches 2 in APP
Matches 0 in BECN1
Comparing Lists to APP
Matches 7 in BECN1
Matches 4 in ARNTL
Comparing Lists to BECN1
Matches 5 in ARNTL
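A more compact variant of the same pairwise comparison (a sketch using only the standard library) is itertools.combinations, which yields every unordered pair exactly once, so the index bookkeeping disappears:
from itertools import combinations

for (name_a, list_a), (name_b, list_b) in combinations(MY_LISTS, 2):
    matches = len(set(list_a) & set(list_b))
    print("{}: Matches {} in {}".format(name_a, matches, name_b))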

Peak Detection Using the Lomb-Scargle Method

I am trying to get a Python code working that finds peaks in data using the Lomb-Scargle method.
http://www.astropython.org/snippets/fast-lomb-scargle-algorithm32/
Using this method as below,
import numpy as np
import lomb

x = np.arange(10)
y = np.sin(x)
fx, fy, nout, jmax, prob = lomb.fasper(x, y, 6., 6.)
print jmax
works fine, without problems. It prints 8. However on another piece of data (data dump below),
import pandas as pd

df = pd.read_csv('extinct.csv', header=None)
Y = pd.rolling_mean(df[0], window=5)
fx, fy, nout, jmax, prob = lomb.fasper(np.array(Y.index), np.array(Y), 6., 6.)
print jmax
displays only 0. I tried passing different ofac, hifac values; none gives me sensible results.
Main function
"""
from numpy import *
from numpy.fft import *
def __spread__(y, yy, n, x, m):
"""
Given an array yy(0:n-1), extirpolate (spread) a value y into
m actual array elements that best approximate the "fictional"
(i.e., possible noninteger) array element number x. The weights
used are coefficients of the Lagrange interpolating polynomial
Arguments:
y :
yy :
n :
x :
m :
Returns:
"""
nfac=[0,1,1,2,6,24,120,720,5040,40320,362880]
if m > 10. :
print 'factorial table too small in spread'
return
ix=long(x)
if x == float(ix):
yy[ix]=yy[ix]+y
else:
ilo = long(x-0.5*float(m)+1.0)
ilo = min( max( ilo , 1 ), n-m+1 )
ihi = ilo+m-1
nden = nfac[m]
fac=x-ilo
for j in range(ilo+1,ihi+1): fac = fac*(x-j)
yy[ihi] = yy[ihi] + y*fac/(nden*(x-ihi))
for j in range(ihi-1,ilo-1,-1):
nden=(nden/(j+1-ilo))*(j-ihi)
yy[j] = yy[j] + y*fac/(nden*(x-j))
def fasper(x,y,ofac,hifac, MACC=4):
""" function fasper
Given abscissas x (which need not be equally spaced) and ordinates
y, and given a desired oversampling factor ofac (a typical value
being 4 or larger). this routine creates an array wk1 with a
sequence of nout increasing frequencies (not angular frequencies)
up to hifac times the "average" Nyquist frequency, and creates
an array wk2 with the values of the Lomb normalized periodogram at
those frequencies. The arrays x and y are not altered. This
routine also returns jmax such that wk2(jmax) is the maximum
element in wk2, and prob, an estimate of the significance of that
maximum against the hypothesis of random noise. A small value of prob
indicates that a significant periodic signal is present.
Reference:
Press, W. H. & Rybicki, G. B. 1989
ApJ vol. 338, p. 277-280.
Fast algorithm for spectral analysis of unevenly sampled data
(1989ApJ...338..277P)
Arguments:
X : Abscissas array, (e.g. an array of times).
Y : Ordinates array, (e.g. corresponding counts).
Ofac : Oversampling factor.
Hifac : Hifac * "average" Nyquist frequency = highest frequency
for which values of the Lomb normalized periodogram will
be calculated.
Returns:
Wk1 : An array of Lomb periodogram frequencies.
Wk2 : An array of corresponding values of the Lomb periodogram.
Nout : Wk1 & Wk2 dimensions (number of calculated frequencies)
Jmax : The array index corresponding to the MAX( Wk2 ).
Prob : False Alarm Probability of the largest Periodogram value
MACC : Number of interpolation points per 1/4 cycle
of highest frequency
History:
02/23/2009, v1.0, MF
Translation of IDL code (orig. Numerical recipies)
"""
#Check dimensions of input arrays
n = long(len(x))
if n != len(y):
print 'Incompatible arrays.'
return
nout = 0.5*ofac*hifac*n
nfreqt = long(ofac*hifac*n*MACC) #Size the FFT as next power
nfreq = 64L # of 2 above nfreqt.
while nfreq < nfreqt:
nfreq = 2*nfreq
ndim = long(2*nfreq)
#Compute the mean, variance
ave = y.mean()
##sample variance because the divisor is N-1
var = ((y-y.mean())**2).sum()/(len(y)-1)
# and range of the data.
xmin = x.min()
xmax = x.max()
xdif = xmax-xmin
#extirpolate the data into the workspaces
wk1 = zeros(ndim, dtype='complex')
wk2 = zeros(ndim, dtype='complex')
fac = ndim/(xdif*ofac)
fndim = ndim
ck = ((x-xmin)*fac) % fndim
ckk = (2.0*ck) % fndim
for j in range(0L, n):
__spread__(y[j]-ave,wk1,ndim,ck[j],MACC)
__spread__(1.0,wk2,ndim,ckk[j],MACC)
#Take the Fast Fourier Transforms
wk1 = ifft( wk1 )*len(wk1)
wk2 = ifft( wk2 )*len(wk1)
wk1 = wk1[1:nout+1]
wk2 = wk2[1:nout+1]
rwk1 = wk1.real
iwk1 = wk1.imag
rwk2 = wk2.real
iwk2 = wk2.imag
df = 1.0/(xdif*ofac)
#Compute the Lomb value for each frequency
hypo2 = 2.0 * abs( wk2 )
hc2wt = rwk2/hypo2
hs2wt = iwk2/hypo2
cwt = sqrt(0.5+hc2wt)
swt = sign(hs2wt)*(sqrt(0.5-hc2wt))
den = 0.5*n+hc2wt*rwk2+hs2wt*iwk2
cterm = (cwt*rwk1+swt*iwk1)**2./den
sterm = (cwt*iwk1-swt*rwk1)**2./(n-den)
wk1 = df*(arange(nout, dtype='float')+1.)
wk2 = (cterm+sterm)/(2.0*var)
pmax = wk2.max()
jmax = wk2.argmax()
#Significance estimation
#expy = exp(-wk2)
#effm = 2.0*(nout)/ofac
#sig = effm*expy
#ind = (sig > 0.01).nonzero()
#sig[ind] = 1.0-(1.0-expy[ind])**effm
#Estimate significance of largest peak value
expy = exp(-pmax)
effm = 2.0*(nout)/ofac
prob = effm*expy
if prob > 0.01:
prob = 1.0-(1.0-expy)**effm
return wk1,wk2,nout,jmax,prob
def getSignificance(wk1, wk2, nout, ofac):
""" returns the peak false alarm probabilities
Hence the lower is the probability and the more significant is the peak
"""
expy = exp(-wk2)
effm = 2.0*(nout)/ofac
sig = effm*expy
ind = (sig > 0.01).nonzero()
sig[ind] = 1.0-(1.0-expy[ind])**effm
return sig
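For completeness, a small sketch of how the two routines above combine to flag significant peaks (same Python 2 environment as the snippet; the signal here is made up for illustration):
import numpy as np
import lomb

t = np.arange(100, dtype=float)
sig_y = np.sin(0.3*t)
fx, fy, nout, jmax, prob = lomb.fasper(t, sig_y, 6., 6.)
# per-frequency false alarm probabilities; low values mark significant peaks
fap = lomb.getSignificance(fx, fy, nout, 6.)
print fx[jmax], fy[jmax], fap[jmax]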
Data:
13.5945121951
13.5945121951
12.6615853659
12.6615853659
12.6615853659
4.10975609756
4.10975609756
4.10975609756
7.99695121951
7.99695121951
16.237804878
16.237804878
16.237804878
16.0823170732
16.237804878
16.237804878
8.92987804878
8.92987804878
10.6402439024
10.6402439024
28.0548780488
28.0548780488
28.0548780488
27.8993902439
27.8993902439
41.5823170732
41.5823170732
41.5823170732
41.5823170732
41.5823170732
41.5823170732
18.7256097561
15.9268292683
15.9268292683
15.9268292683
15.9268292683
15.9268292683
15.9268292683
14.0609756098
14.0609756098
14.0609756098
14.0609756098
14.0609756098
23.8567073171
23.8567073171
23.8567073171
23.8567073171
25.4115853659
25.4115853659
28.0548780488
40.0274390244
40.0274390244
40.0274390244
40.0274390244
40.0274390244
40.0274390244
20.5914634146
20.5914634146
20.4359756098
19.6585365854
18.2591463415
19.3475609756
18.2591463415
10.3292682927
27.743902439
27.743902439
27.743902439
27.743902439
27.743902439
27.743902439
22.3018292683
22.3018292683
21.368902439
21.368902439
21.368902439
21.5243902439
20.4359756098
20.4359756098
20.4359756098
20.4359756098
20.4359756098
20.4359756098
20.4359756098
11.8841463415
11.8841463415
1.0
11.1067073171
10.1737804878
14.5274390244
14.5274390244
14.5274390244
14.5274390244
14.5274390244
14.5274390244
11.7286585366
11.7286585366
12.6615853659
11.7286585366
8.15243902439
1.0
7.84146341463
6.90853658537
12.6615853659
12.6615853659
12.6615853659
12.6615853659
12.6615853659
12.6615853659
12.6615853659
12.6615853659
12.6615853659
13.1280487805
12.9725609756
12.9725609756
12.9725609756
10.3292682927
10.3292682927
10.3292682927
10.3292682927
9.55182926829
10.4847560976
29.9207317073
29.9207317073
29.9207317073
29.9207317073
30.0762195122
30.0762195122
26.1890243902
7.99695121951
25.256097561
7.99695121951
7.99695121951
7.99695121951
6.59756097561
6.59756097561
6.59756097561
6.59756097561
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
10.0182926829
10.0182926829
10.0182926829
10.0182926829
10.0182926829
10.0182926829
10.4847560976
15.9268292683
15.9268292683
15.9268292683
15.9268292683
15.9268292683
16.8597560976
15.9268292683
15.9268292683
16.8597560976
16.7042682927
16.7042682927
16.7042682927
9.08536585366
8.46341463415
8.46341463415
8.46341463415
8.46341463415
6.90853658537
7.84146341463
6.90853658537
4.26524390244
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
14.2164634146
14.2164634146
14.2164634146
14.0609756098
14.0609756098
14.0609756098
14.0609756098
16.8597560976
16.8597560976
16.7042682927
16.7042682927
16.7042682927
16.7042682927
17.9481707317
17.9481707317
19.6585365854
19.6585365854
19.6585365854
19.6585365854
10.7957317073
10.7957317073
10.7957317073
10.7957317073
10.7957317073
12.1951219512
12.1951219512
22.9237804878
22.9237804878
22.9237804878
22.9237804878
22.9237804878
22.9237804878
22.9237804878
7.84146341463
7.84146341463
7.84146341463
7.84146341463
8.7743902439
8.7743902439
7.84146341463
8.61890243902
8.61890243902
8.61890243902
8.61890243902
18.2591463415
18.2591463415
18.2591463415
18.2591463415
18.2591463415
18.2591463415
18.2591463415
18.2591463415
18.2591463415
9.39634146341
9.39634146341
9.24085365854
9.24085365854
9.24085365854
9.24085365854
9.08536585366
9.08536585366
9.08536585366
9.08536585366
9.55182926829
9.55182926829
9.55182926829
9.55182926829
9.55182926829
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
16.5487804878
1.0
16.0823170732
16.0823170732
16.0823170732
16.0823170732
16.0823170732
16.0823170732
16.0823170732
16.0823170732
16.0823170732
17.1707317073
17.0152439024
21.9908536585
21.9908536585
21.9908536585
21.9908536585
21.9908536585
21.9908536585
21.9908536585
7.84146341463
8.7743902439
7.84146341463
6.75304878049
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
3.95426829268
7.06402439024
7.06402439024
7.06402439024
11.262195122
11.262195122
11.262195122
11.262195122
11.262195122
11.262195122
9.08536585366
9.86280487805
7.99695121951
7.99695121951
14.2164634146
14.0609756098
14.0609756098
14.0609756098
14.0609756098
14.0609756098
2.24390243902
2.08841463415
3.02134146341
3.02134146341
2.08841463415
4.73170731707
4.73170731707
4.73170731707
4.73170731707
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.44207317073
6.59756097561
6.59756097561
6.59756097561
6.75304878049
1.0
6.28658536585
6.28658536585
7.21951219512
6.28658536585
10.6402439024
10.6402439024
10.6402439024
10.6402439024
10.6402439024
10.6402439024
10.6402439024
14.3719512195
14.3719512195
15.6158536585
15.6158536585
15.6158536585
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
35.6737804878
28.6768292683
28.6768292683
28.6768292683
28.6768292683
28.6768292683
51.8445121951
51.8445121951
51.8445121951
51.8445121951
51.8445121951
52.0
52.0
4.42073170732
4.42073170732
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
4.10975609756
3.95426829268
3.64329268293
3.64329268293
4.73170731707
4.73170731707
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
5.9756097561
5.82012195122
5.82012195122
5.82012195122
5.82012195122
5.82012195122
12.1951219512
12.1951219512
12.1951219512
12.1951219512
12.1951219512
12.1951219512
12.1951219512
12.1951219512
1.0
11.7286585366
11.7286585366
11.7286585366
11.7286585366
11.7286585366
11.7286585366
11.1067073171
11.1067073171
11.1067073171
11.1067073171
11.1067073171
11.1067073171
11.1067073171
11.1067073171
10.0182926829
10.0182926829
16.7042682927
16.7042682927
16.7042682927
16.7042682927
16.7042682927
16.7042682927
29.1432926829
29.1432926829
29.1432926829
29.1432926829
29.1432926829
29.1432926829
29.1432926829
29.1432926829
29.1432926829
1.15548780488
2.71036585366
2.71036585366
2.71036585366
2.71036585366
2.71036585366
2.71036585366
2.71036585366
3.17682926829
4.10975609756
4.10975609756
5.9756097561
5.9756097561
5.9756097561
6.90853658537
5.9756097561
10.1737804878
10.1737804878
10.1737804878
8.61890243902
8.46341463415
8.46341463415
9.39634146341
8.46341463415
8.46341463415
5.35365853659
5.35365853659
5.35365853659
5.35365853659
5.35365853659
5.35365853659
3.33231707317
4.42073170732
3.33231707317
6.59756097561
6.44207317073
5.82012195122
6.75304878049
5.82012195122
5.82012195122
5.82012195122
4.73170731707
5.66463414634
5.66463414634
4.73170731707
4.73170731707
5.66463414634
5.66463414634
5.50914634146
2.71036585366
5.50914634146
2.71036585366
2.71036585366
5.50914634146
5.50914634146
5.50914634146
6.28658536585
6.28658536585
5.9756097561
5.9756097561
7.06402439024
5.9756097561
7.53048780488
8.46341463415
8.46341463415
13.2835365854
13.2835365854
13.2835365854
13.2835365854
2.55487804878
2.55487804878
2.55487804878
2.55487804878
4.10975609756
3.17682926829
3.17682926829
4.26524390244
3.64329268293
3.64329268293
3.64329268293
3.33231707317
3.33231707317
3.33231707317
2.24390243902
3.33231707317
2.24390243902
2.24390243902
3.64329268293
3.64329268293
3.64329268293
3.64329268293
3.64329268293
3.64329268293
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
6.28658536585
6.28658536585
7.21951219512
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
3.7987804878
4.73170731707
3.7987804878
3.7987804878
3.7987804878
3.7987804878
3.7987804878
3.7987804878
4.26524390244
4.26524390244
5.19817073171
5.19817073171
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
7.53048780488
3.7987804878
3.7987804878
3.95426829268
3.02134146341
3.02134146341
3.02134146341
1.0
1.93292682927
2.55487804878
2.55487804878
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
5.9756097561
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
6.28658536585
16.0823170732
16.0823170732
31.3201219512
31.3201219512
31.3201219512
31.3201219512
31.3201219512
31.3201219512
31.3201219512
31.3201219512
3.64329268293
3.64329268293
4.26524390244
4.26524390244
3.7987804878
4.73170731707
3.7987804878
3.7987804878
2.55487804878
3.48780487805
2.55487804878
2.55487804878
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.17682926829
3.33231707317
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
12.3506097561
4.73170731707
4.73170731707
4.73170731707
4.73170731707
4.73170731707
4.73170731707
4.73170731707
4.73170731707
2.86585365854
2.86585365854
1.46646341463
1.46646341463
1.46646341463
1.46646341463
1.46646341463
1.46646341463
1.62195121951
1.62195121951
1.62195121951
1.77743902439
1.77743902439
4.42073170732
4.42073170732
4.42073170732
4.42073170732
4.42073170732
4.42073170732
4.42073170732
3.95426829268
3.95426829268
2.71036585366
2.71036585366
2.71036585366
2.71036585366
2.71036585366
1.77743902439
2.86585365854
3.02134146341
2.86585365854
2.86585365854
3.17682926829
3.17682926829
[plot of the data]
Any help would be appreciated,
After some digging, it looks like the AstroML method is the best.
import numpy as np
from matplotlib import pyplot as plt
from astroML.time_series import lomb_scargle, search_frequencies
import pandas as pd

df = pd.read_csv('extinct.csv', header=None)
Y = df[0]
dy = 0.5 + 0.5 * np.random.random(len(df))
omega = np.linspace(10, 100, 1000)
sig = np.array([0.1, 0.01, 0.001])
PS, z = lomb_scargle(df.index, Y, dy, omega, generalized=True, significance=sig)

plt.plot(omega, PS)
xlim = (omega[0], omega[-1])
for zi, pi in zip(z, sig):
    plt.plot(xlim, (zi, zi), ':k', lw=1)
    plt.text(xlim[-1] - 0.001, zi - 0.02, "$%.1g$" % pi, ha='right', va='top')
plt.show()
which gives
Significance levels are shown on the graph as well. I used the generalized LS and applied no smoothing.
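Note that astroML's Lomb-Scargle routines have since been superseded by astropy's; a minimal sketch of a comparable generalized periodogram with astropy (assuming it is installed) would be:
import numpy as np
import pandas as pd
from astropy.timeseries import LombScargle

df = pd.read_csv('extinct.csv', header=None)
y = df[0].values
t = np.arange(len(y), dtype=float)

# the default fit_mean=True gives the generalized ("floating mean")
# Lomb-Scargle method; autopower() chooses a frequency grid automatically
frequency, power = LombScargle(t, y).autopower()
print(frequency[np.argmax(power)])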

Incorrect scikit-learn linear model prediction with date offset

I'm trying to predict time-series data, but by offsetting the result by date_offset timepoints before training and prediction. The reason for doing this is to try and predict date_offset timepoints into the future with the present data. See http://glowingpython.blogspot.co.za/2015/01/forecasting-beer-consumption-with.html for an example.
So in summary:
data = [1,2,3,4,5] should predict result = [2,3,4,5,6] if date_offset = 1
The results on the plot below show the red line being shifted by date_offset, not predicting date_offset into the future. No matter how big I make date_offset, it keeps shifting and never predicts the last result I have, i.e. result = 5 (which is already known). In fact, the red line should not shift at all, just lose accuracy the bigger date_offset becomes. What am I doing wrong?
See example code and resulting image below:
from sklearn import linear_model
import matplotlib.pyplot as plt
import numpy as np
date_offset = 1
data = np.array([9330.0, 9470.0, 9550.0, 9620.0, 9600.0, 9585.0, 9600.0, 9600.0, 9430.0, 9460.0, 9450.0, 9650.0, 9620.0, 9650.0, 9500.0, 9400.0, 9165.0, 9100.0, 8755.0, 8850.0, 8990.0, 9150.0, 9195.0, 9175.0, 9250.0, 9200.0, 9350.0, 9280.0, 9370.0, 9470.0, 9445.0, 9440.0, 9280.0, 9325.0, 9170.0, 9270.0, 9200.0, 9450.0, 9510.0, 9371.0, 9499.0, 9499.0, 9400.0, 9500.0, 9550.0, 9670.0, 9700.0, 9760.0, 9767.4599999999991, 9652.0, 9520.0, 9600.0, 9610.0, 9700.0, 9825.0, 9900.0, 9950.0, 9801.0, 9770.0, 9545.0, 9630.0, 9710.0, 9700.0, 9700.0, 9600.0, 9615.0, 9575.0, 9500.0, 9600.0, 9480.0, 9565.0, 9510.0, 9475.0, 9600.0, 9400.0, 9400.0, 9400.0, 9300.0, 9430.0, 9410.0, 9380.0, 9320.0, 9000.0, 9100.0, 9000.0, 9200.0, 9210.0, 9251.0, 9460.0, 9400.0, 9600.0, 9621.0, 9440.0, 9490.0, 9675.0, 9850.0, 9680.0, 10100.0, 9900.0, 10100.0, 9949.0, 10040.0, 10050.0, 10200.0, 10400.0, 10350.0, 10200.0, 10175.0, 10001.0, 10110.0, 10400.0, 10401.0, 10300.0, 10548.0, 10515.0, 10475.0, 10200.0, 10481.0, 10500.0, 10540.0, 10559.0, 10300.0, 10400.0, 10202.0, 10330.0, 10450.0, 10540.0, 10540.0, 10650.0, 10450.0, 10550.0, 10501.0, 10206.0, 10250.0, 10345.0, 10225.0, 10330.0, 10506.0, 11401.0, 11245.0, 11360.0, 11549.0, 11415.0, 11450.0, 11460.0, 11600.0, 11530.0, 11450.0, 11402.0, 11299.0])
data = data[np.newaxis].T
results = np.array([9470.0, 9545.0, 9635.0, 9640.0, 9600.0, 9622.0, 9555.0, 9429.0, 9495.0, 9489.0, 9630.0, 9612.0, 9630.0, 9501.0, 9372.0, 9165.0, 9024.0, 8780.0, 8800.0, 8937.0, 9051.0, 9100.0, 9166.0, 9220.0, 9214.0, 9240.0, 9254.0, 9400.0, 9450.0, 9470.0, 9445.0, 9301.0, 9316.0, 9170.0, 9270.0, 9251.0, 9422.0, 9466.0, 9373.0, 9440.0, 9415.0, 9410.0, 9500.0, 9520.0, 9620.0, 9705.0, 9760.0, 9765.0, 9651.0, 9520.0, 9600.0, 9610.0, 9700.0, 9805.0, 9900.0, 9950.0, 9800.0, 9765.0, 9602.0, 9630.0, 9790.0, 9710.0, 9800.0, 9649.0, 9580.0, 9780.0, 9560.0, 9501.0, 9511.0, 9530.0, 9498.0, 9475.0, 9595.0, 9500.0, 9460.0, 9400.0, 9310.0, 9382.0, 9375.0, 9385.0, 9320.0, 9100.0, 8990.0, 9045.0, 9129.0, 9201.0, 9251.0, 9424.0, 9440.0, 9500.0, 9621.0, 9490.0, 9512.0, 9599.0, 9819.0, 9684.0, 10025.0, 9984.0, 10110.0, 9950.0, 10048.0, 10095.0, 10200.0, 10338.0, 10315.0, 10200.0, 10166.0, 10095.0, 10110.0, 10400.0, 10445.0, 10360.0, 10548.0, 10510.0, 10480.0, 10180.0, 10488.0, 10520.0, 10510.0, 10565.0, 10450.0, 10400.0, 10240.0, 10338.0, 10410.0, 10540.0, 10481.0, 10521.0, 10530.0, 10325.0, 10510.0, 10446.0, 10249.0, 10236.0, 10211.0, 10340.0, 10394.0, 11370.0, 11250.0, 11306.0, 11368.0, 11415.0, 11400.0, 11452.0, 11509.0, 11500.0, 11455.0, 11400.0, 11300.0, 11369.0])
# Date offset to predict next i-days results
data = data[:-date_offset]
results = results[date_offset:]
train_data = data[:-50]
train_results = results[:-50]
test_data = data[-50:]
test_results = results[-50:]
regressor = linear_model.BayesianRidge(normalize=True)
regressor.fit(train_data, train_results)
plt.figure(figsize=(8,6))
plt.plot(regressor.predict(test_data), '--', color='#EB3737', linewidth=2, label='Prediction')
plt.plot(test_results, label='True', color='green', linewidth=2)
plt.legend(loc='best')
plt.show()
First of all, the model is not really bad. For instance, when the real value is 10450 it predicts 10350, which is quite close. And, obviously, the farther in time the predicted point is, the less accurate the prediction, as the variance grows and sometimes the bias grows as well. You cannot expect the opposite.
Secondly, it is a linear model, so it cannot be absolutely exact when the predicted variable is not linear by nature.
Thirdly, one has to choose the predicted variable with care. For instance, in this case you might try to predict not the value at time T, but the change in value at time T (i.e. C[T] = V[T] - V[T-1]) or the moving average of the last K values. Here you might (or, on the contrary, might not) find out that you are trying to model a so-called "random walk", which is hard to predict exactly due to its random nature.
And lastly, you might consider other models, like ARIMA, which are better suited for predicting time series.
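As a quick sketch of the differencing suggestion (reusing the data array defined in the question):
import numpy as np

# C[T] = V[T] - V[T-1]: model the change instead of the level;
# predictions are turned back into levels with np.cumsum() plus
# the last known value
changes = np.diff(data.ravel())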
Adding back the organize_data step:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
def organize_data(to_forecast, window, horizon):
    """
    Input:
        to_forecast, univariate time series organized as numpy array
        window, number of items to use in the forecast window
        horizon, horizon of the forecast
    Output:
        X, a matrix where each row contains a forecast window
        y, the target values for each row of X
    """
    shape = to_forecast.shape[:-1] + \
            (to_forecast.shape[-1] - window + 1, window)
    strides = to_forecast.strides + (to_forecast.strides[-1],)
    # each row of X is a sliding window of `window` consecutive observations
    X = np.lib.stride_tricks.as_strided(to_forecast,
                                        shape=shape,
                                        strides=strides)
    # y[i] is the value `horizon` steps after the end of window i
    y = np.array([X[i+horizon][-1] for i in range(len(X)-horizon)])
    return X[:-horizon], y
data = np.array([9330.0, 9470.0, 9550.0, 9620.0, 9600.0, 9585.0, 9600.0, 9600.0, 9430.0, 9460.0, 9450.0, 9650.0, 9620.0, 9650.0, 9500.0, 9400.0, 9165.0, 9100.0, 8755.0, 8850.0, 8990.0, 9150.0, 9195.0, 9175.0, 9250.0, 9200.0, 9350.0, 9280.0, 9370.0, 9470.0, 9445.0, 9440.0, 9280.0, 9325.0, 9170.0, 9270.0, 9200.0, 9450.0, 9510.0, 9371.0, 9499.0, 9499.0, 9400.0, 9500.0, 9550.0, 9670.0, 9700.0, 9760.0, 9767.4599999999991, 9652.0, 9520.0, 9600.0, 9610.0, 9700.0, 9825.0, 9900.0, 9950.0, 9801.0, 9770.0, 9545.0, 9630.0, 9710.0, 9700.0, 9700.0, 9600.0, 9615.0, 9575.0, 9500.0, 9600.0, 9480.0, 9565.0, 9510.0, 9475.0, 9600.0, 9400.0, 9400.0, 9400.0, 9300.0, 9430.0, 9410.0, 9380.0, 9320.0, 9000.0, 9100.0, 9000.0, 9200.0, 9210.0, 9251.0, 9460.0, 9400.0, 9600.0, 9621.0, 9440.0, 9490.0, 9675.0, 9850.0, 9680.0, 10100.0, 9900.0, 10100.0, 9949.0, 10040.0, 10050.0, 10200.0, 10400.0, 10350.0, 10200.0, 10175.0, 10001.0, 10110.0, 10400.0, 10401.0, 10300.0, 10548.0, 10515.0, 10475.0, 10200.0, 10481.0, 10500.0, 10540.0, 10559.0, 10300.0, 10400.0, 10202.0, 10330.0, 10450.0, 10540.0, 10540.0, 10650.0, 10450.0, 10550.0, 10501.0, 10206.0, 10250.0, 10345.0, 10225.0, 10330.0, 10506.0, 11401.0, 11245.0, 11360.0, 11549.0, 11415.0, 11450.0, 11460.0, 11600.0, 11530.0, 11450.0, 11402.0, 11299.0])
train_window = 50
k = 5 # number of previous observations to use
h = 2 # forecast horizon
X,y = organize_data(data, k, h)
train_data = X[:train_window]
train_results = y[:train_window]
test_data = X[train_window:]
test_results = y[train_window:]
regressor = linear_model.BayesianRidge(normalize=True)
regressor.fit(train_data, train_results)
plt.figure(figsize=(8,6))
plt.plot(regressor.predict(X), '--', color='#EB3737', linewidth=2, label='Prediction')
plt.plot(y, label='True', color='green', linewidth=2)
plt.legend(loc='best')
plt.show()
