Get a coordinate distance matrix using Pandas without loops - python

I am currently getting a distance matrix of coordinates from two data frames (ref_df and comp_df) using a nested for-loop over rows in both data frames, as shown below.
import geopy.distance
import pandas as pd
ref_df = pd.DataFrame({"grp_id":['M-00353','M-00353','M-00353','M-00538','M-00538','M-00160','M-00160','M-00160',
'M-00509','M-00509','M-00509','M-00509'],"name": ['B1','IIS','IISB I','BK',
'MM - BK','H(SL)','H(PKS SL)','PTH','ASSM 1','PKS SSM','SSM',
'Sukajadi Sawit Mekar 1'],"lat": [0.43462,0.43462,0.43462,1.74887222,1.74887222,-2.6081,
-2.6081,-2.6081, -2.378258,-2.378258,-2.378258,-2.378258],"long":[101.822603,101.822603,101.822603,101.3710944,101.3710944,
104.12525,104.12525,104.12525,112.542356,112.542356,112.542356,112.542356]})
comp_df = pd.DataFrame({"uml_id": ['PO1000000021','PO1000000054','PO1000000058','PO1000000106'],
"mill_name": ['PT IIS-BI','PT MM-BK','HL','PT SSM'],
"Latitude": [0.4344444,0.077043,-2.6081,-2.381111],"Longitude":[101.825,102.030838,104.12525,112.539722]})
matched_coords = []
for row in ref_df.index:
    mill_id = ref_df.at[row, "grp_id"]
    mill_lat = ref_df.at[row, "lat"]
    mill_long = ref_df.at[row, "long"]
    for columns in comp_df.index:
        gm_id = comp_df.at[columns, "uml_id"]
        gm_lat = comp_df.at[columns, "Latitude"]
        gm_long = comp_df.at[columns, "Longitude"]
        dist = geopy.distance.distance(
            (mill_lat, mill_long),
            (gm_lat, gm_long)).km
        matched_coords.append([
            mill_id, mill_lat, mill_long,
            gm_id, gm_lat, gm_long, dist
        ])
# Convert to data frame
mc_df = pd.DataFrame(matched_coords)
mc_df.columns = [
    'grp_id', 'grp_lat', 'grp_long',
    'match_id', 'match_lat', 'match_long', 'dist'
]
# Pivot to create wide data frame (matrix of distances)
mc_wide_df = mc_df.pivot_table(
    values="dist",
    index=["grp_id", "grp_lat", "grp_long"],
    columns="match_id").reset_index()
However, I'd like to simplify the process and the code by creating a helper function and using an apply on the data frames. My attempt below is not working. Is anybody able to help me figure out what's going wrong here?
# Test apply!
def get_coords_dist(x):
    dist = geopy.distance.distance((x['lat'], x['long']), (comp_df['Latitude'], comp_df['Longitude'])).km
    return pd.Series({comp_df.iloc[i[2]]['uml_id']: i for i in dist})

mc_df = ref_df.merge(ref_df.sort_values('grp_id').apply(get_coords_dist, axis=1), left_index=True, right_index=True)

You're looking to perform a cross join between the two data frames ref_df and comp_df. One way to do this is to pd.merge on a dummy column.
def distance_km(x, y):
    return geopy.distance.distance(x, y).km
# it looks like your coordinates depend only on grp_id
ref_df_dd = ref_df.drop_duplicates(['grp_id', 'lat', 'long'])
# assign a dummy "_" column in both data frames, merge, and drop the dummy
# column afterwards
merged_df = pd.merge(
    ref_df_dd.assign(_=1),
    comp_df.assign(_=1),
).drop('_', axis=1)
# apply your distance function on (lat, long) tuples in the Cartesian product
merged_df['distance'] = list(
    map(distance_km,
        merged_df[['lat', 'long']].apply(tuple, 1),
        merged_df[['Latitude', 'Longitude']].apply(tuple, 1)))
# pivot table
merged_df.set_index(['grp_id', 'uml_id']).distance.unstack()
At this point merged_df looks like
uml_id PO1000000021 PO1000000054 PO1000000058 PO1000000106
grp_id
M-00160 422.745678 377.461999 0.000000 936.147322
M-00353 0.267531 45.832819 422.922708 1232.700696
M-00509 1232.642382 1200.904305 936.449658 0.430525
M-00538 153.871840 198.911938 571.009484 1324.234511
which is pretty close to what you want.
Another solution (which is more transparent and 2x faster than the approach above) makes use of itertools.product.
from itertools import product
# create a data frame by iterating over row pairs in the Cartesian product
merged_df = pd.DataFrame([{
    'grp_id': r.grp_id,
    'uml_id': c.uml_id,
    'distance': distance_km((r.lat, r.long), (c.Latitude, c.Longitude))
} for r, c in product(ref_df_dd.itertuples(), comp_df.itertuples())])
# pivot table
merged_df.set_index(['grp_id', 'uml_id']).distance.unstack()
This gives the same merged_df as above.
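If you are on pandas 1.2 or newer, the dummy-column trick can also be replaced by the built-in cross merge. A minimal sketch, assuming the same ref_df_dd, comp_df and distance_km defined above:
# explicit cross join, no dummy column needed (pandas >= 1.2)
merged_df = pd.merge(ref_df_dd, comp_df, how='cross')
merged_df['distance'] = [
    distance_km((r.lat, r.long), (r.Latitude, r.Longitude))
    for r in merged_df.itertuples()
]
merged_df.set_index(['grp_id', 'uml_id']).distance.unstack()
This produces the same wide distance matrix.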

Related

Grouping a dataframe and performing operations on the resulting matrix in a parallelized manner using Python/Dask/multiprocessing?

I am working on a project where I need to group molecules in a database by their ID and perform operations on the resulting matrix. I am using Python and I want to improve performance by parallelizing the process.
I am currently loading the molecules from an SDF file and storing them in a Pandas dataframe. Each molecule has an ID, a unique Pose ID, and a unique Structure. My goal is to group the dataframe by ID and create a matrix for each ID group. The rows and columns of the matrix would correspond to the unique Pose IDs of the molecules in that ID group. Then, I can calculate values for each cell in the matrix, such as the similarity between the molecules that define that cell. However, the specific operations on the molecules are not important for this question. I am primarily asking for advice on how to set up such a system for parallelized computing using Dask or Multiprocessing, or if there are other better options.
Here is a gist of the version without any parallelisation (please note I have heavily modified it to make my question clearer; the code below outputs the desired results, but I am looking to run the calculation on the molecules, not on the Pose IDs): https://gist.github.com/Tonylac77/abfd54b1ceef39f0d161fb6b21950edb
#Generate sample dataframe
import pandas as pd
df = pd.DataFrame(columns=['ID', 'Pose ID'])
ids = ['ID' + str(i) for i in range(1, 6)]
pose_ids = ['Pose ' + str(i) for i in range(1, 11)]
# For each ID, add 10 rows to the dataframe with the corresponding Pose IDs
df_list = []
for i in ids:
    temp_df = pd.DataFrame({'ID': [i] * 10, 'Pose ID': pose_ids})
    df_list.append(temp_df)
df = pd.concat(df_list)
print(df)
################
from tqdm import tqdm
import itertools
import functools
import numpy as np
from IPython.display import display
def full_matrix_calculation(df):
    # Here I am using just string concatenation as an example calculation, in reality I am calling external functions
    def matrix_calculation(df, id_list):
        matrices = {}
        calc_dataframes = []
        for id in tqdm(id_list):
            df_name = df[df['ID'] == id]
            df_name.index = range(len(df_name['Pose ID']))
            matrix = pd.DataFrame(0.0, index=[df_name['Pose ID']], columns=df_name['Pose ID'])
            for subset in itertools.combinations(df_name['Pose ID'], 2):
                result = subset[0] + subset[1]
                matrix.iloc[df_name[df_name['Pose ID'] == subset[0]].index.values, df_name[df_name['Pose ID'] == subset[1]].index.values] = result
                matrix.iloc[df_name[df_name['Pose ID'] == subset[1]].index.values, df_name[df_name['Pose ID'] == subset[0]].index.values] = result
            matrices[id] = matrix
        return matrices
    id_list = np.unique(np.array(df['ID']))
    calculated_dfs = matrix_calculation(df, id_list)
    return calculated_dfs

calculated_dfs = full_matrix_calculation(df)
display(calculated_dfs)
I have tried using multiprocessing, however, my implementation seems to be slower than the non-parallelized version : https://gist.github.com/Tonylac77/b4bbada97ee2bab7c37d4a29079af574
import multiprocessing

def function(tuple):
    return tuple[0] + tuple[1]

def full_matrix_calculation(df):
    # Here I am using just string concatenation as an example calculation, in reality I am calling external functions
    def matrix_calculation(df, id_list):
        matrices = {}
        calc_dataframes = []
        for id in tqdm(id_list):
            df_name = df[df['ID'] == id]
            df_name.index = range(len(df_name['Pose ID']))
            matrix = pd.DataFrame(0.0, index=[df_name['Pose ID']], columns=df_name['Pose ID'])
            with multiprocessing.Pool() as p:
                try:
                    results = p.map(function, itertools.combinations(df_name['Pose ID'], 2))
                except KeyError:
                    print('Incorrect clustering method selected')
                    return
            results_list = list(zip(itertools.combinations(df_name['Pose ID'], 2), results))
            for subset, result in results_list:
                matrix.iloc[df_name[df_name['Pose ID'] == subset[0]].index.values, df_name[df_name['Pose ID'] == subset[1]].index.values] = result
                matrix.iloc[df_name[df_name['Pose ID'] == subset[1]].index.values, df_name[df_name['Pose ID'] == subset[0]].index.values] = result
            matrices[id] = matrix
            for subset in itertools.combinations(df_name['Pose ID'], 2):
                result = subset[0] + subset[1]
                matrix.iloc[df_name[df_name['Pose ID'] == subset[0]].index.values, df_name[df_name['Pose ID'] == subset[1]].index.values] = result
                matrix.iloc[df_name[df_name['Pose ID'] == subset[1]].index.values, df_name[df_name['Pose ID'] == subset[0]].index.values] = result
            matrices[id] = matrix
        return matrices
    id_list = np.unique(np.array(df['ID']))
    calculated_dfs = matrix_calculation(df, id_list)
    return calculated_dfs

calculated_dfs = full_matrix_calculation(df)
display(calculated_dfs)
I have also started playing around with Dask; however, the main issue I'm facing is that I need all of the values of one ID to be in the same Dask partition, otherwise I will end up with incomplete matrices (if I understand correctly, at least). I have tried to find a solution to this (like chunking into x partitions, etc.), but so far to no avail. I will update this thread if something changes.
Any advice to speed these calculations up is welcome. For reference, the actual datasets I'm working with contain ~10000 unique IDs and ~300000 Pose IDs. With the calculations I'm running on the molecules, some of these take 40h to complete.
This should be pretty straightforward using Dask DataFrame and groupby:
ddf = your_dataframe_as_dask
def matrix_calculation(df):
    matrix = pd.DataFrame(0.0, index=[df['Pose ID']], columns=df['Pose ID'])
    for subset in itertools.combinations(df['Pose ID'], 2):
        result = subset[0] + subset[1]
        matrix.iloc[df[df['Pose ID'] == subset[0]].index.values, df[df['Pose ID'] == subset[1]].index.values] = result
        matrix.iloc[df[df['Pose ID'] == subset[1]].index.values, df[df['Pose ID'] == subset[0]].index.values] = result
    return matrix

ddf.groupby('ID').apply(matrix_calculation).compute()
See https://examples.dask.org/dataframes/02-groupby.html#Groupby-Apply.
This will parallelize the work for each ID.
You might then want to look at https://docs.dask.org/en/stable/scheduling.html to choose the scheduler that suits your needs (the default with DataFrame is threads, which might not be efficient depending on your code).
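A minimal sketch of wiring this up with the sample df from the question (dd.from_pandas and groupby-apply are standard Dask DataFrame API; Dask may warn about inferring meta unless you pass it explicitly):
import dask.dataframe as dd

# the shuffle performed by groupby-apply gathers each ID group onto a single
# worker, so rows of one ID do not need to start out in the same partition
ddf = dd.from_pandas(df.reset_index(drop=True), npartitions=8)
result = ddf.groupby('ID').apply(matrix_calculation).compute()
Since the per-pair work here is pure Python (it will not release the GIL), the processes scheduler or a distributed LocalCluster is usually a better fit than the default threads.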

How to concatenate a series to a pandas dataframe in python?

I would like to iterate through a dataframe's rows and concatenate each row to a different dataframe, basically building up a new dataframe from selected rows.
For example:
IPCSection and IPCClass Dataframes

allcolumns = np.concatenate((IPCSection.columns, IPCClass.columns), axis=0)
finalpatentclasses = pd.DataFrame(columns=allcolumns)
for isec, secrow in IPCSection.iterrows():
    for icl, clrow in IPCClass.iterrows():
        if (secrow[0] in clrow[0]):
            pdList = [finalpatentclasses, pd.DataFrame(secrow), pd.DataFrame(clrow)]
            finalpatentclasses = pd.concat(pdList, axis=0, ignore_index=True)
display(finalpatentclasses)
The output is:
I want the NaN values to disappear and all the data to line up under the correct columns. I tried axis=1, but that messes up the column names. Append does not work either; all values are placed diagonally in the table, again with NaN values.
Alright, I have figured it out. The idea is that you build a new single-row DataFrame from the concatenated values of both rows, and then concat it with the final dataframe.
Here is the code:
allcolumns = np.concatenate((IPCSection.columns, IPCClass.columns), axis=0)
finalpatentclasses = pd.DataFrame(columns=allcolumns)
for isec, secrow in IPCSection.iterrows():
    for icl, clrow in IPCClass.iterrows():
        newrow = pd.DataFrame(columns=allcolumns)
        values = np.concatenate((secrow.values, clrow.values), axis=0)
        newrow.loc[len(newrow.index)] = values
        finalpatentclasses = pd.concat([finalpatentclasses, newrow], axis=0)
finalpatentclasses.reset_index(drop=False, inplace=True)
display(finalpatentclasses)
Update: the code below is more efficient:
allcolumns = np.concatenate((IPCSection.columns, IPCClass.columns), axis=0)
newList = []
for secrow in IPCSection.itertuples():
    for clrow in IPCClass.itertuples():
        if (secrow[1] in clrow[1]):
            values = [secrow[1], secrow[2], clrow[1], clrow[2]]
            new_row = {IPCSection.columns[0]: [secrow[1]], IPCSection.columns[1]: [secrow[2]],
                       IPCClass.columns[0]: [clrow[1]], IPCClass.columns[1]: [clrow[2]]}
            newList.append(values)
finalpatentclasses = pd.DataFrame(newList, columns=allcolumns)
display(finalpatentclasses)
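For completeness, the same row-pair filter can also be written as a cross merge (pandas 1.2+) followed by a boolean mask, which avoids the nested loops entirely. A sketch, assuming the two frames have distinct column names and that the first column of each holds the codes being compared:
merged = IPCSection.merge(IPCClass, how='cross')
# keep only the pairs where the section code appears inside the class code
mask = [sec in cls for sec, cls in zip(merged[IPCSection.columns[0]], merged[IPCClass.columns[0]])]
finalpatentclasses = merged[mask].reset_index(drop=True)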

Creating Cartesian Product DataFrame without maxing Memory

I have several dataframes, from which I'm creating a cartesian product (on purpose!)
After this, I'm exporting the result to disk.
I believe the size of the resulting dataframe could exceed my memory, so I'm wondering: is there a way I can chunk this so that the whole dataframe doesn't need to be in memory at the same time?
Example Code:
import pandas as pd
def create_list_from_range(r1, r2):
    if (r1 == r2):
        return r1
    else:
        res = []
        while (r1 < r2 + 1):
            res.append(r1)
            r1 += 1
        return res
# make a list of options
color_opt = ['red','blue','green','orange']
dow_opt = create_list_from_range(1,7)
hod_opt = create_list_from_range(0,23)
# turn each list into a dataframe
df_color = pd.DataFrame({'color': color_opt})
df_day = pd.DataFrame({'day_of_week': dow_opt})
df_hour = pd.DataFrame({'hour_of_day': hod_opt})
# add a dummy columns to everything so I can easily do a cartesian product
df_color['dummy']=1
df_day['dummy']=1
df_hour['dummy']=1
# now cartesian product... cascading
merge1 = pd.merge(df_day, df_hour, on='dummy')
FINAL = pd.merge(merge1, df_color, on='dummy')
FINAL.to_csv('FINAL_OUTPUT.csv', index=False)
You could try building up individual rows using itertools.product. In your example, you could do this as follows:
from itertools import product
prod = product(color_opt, dow_opt, hod_opt)
You can then get a number of rows and append them to an existing csv file using
df.to_csv("file", mode="a")

Given a Geopandas GeoDataFrame with several geometry columns: How to calculate the distances to a main geometry column?

To make my problem comprehensible, I first build two simple GeoDataFrames with one point-geometry column each, then extend the first one by one column per point in the second one and fill those columns with the corresponding geometries.
import pandas as pd
import geopandas as gp
from shapely import wkt

df_1 = pd.DataFrame(
    {"geometry": ["POINT (4601713.002 3161641.211)",
                  "POINT (4596192.207 3241423.174)",
                  "POINT (4572005.011 3257270.689)"]})
df_1['geometry'] = df_1['geometry'].apply(wkt.loads)
gdf_1 = gp.GeoDataFrame(df_1,
                        geometry="geometry",
                        crs="EPSG:3035")
df_2 = pd.DataFrame(
    {'geometry': ["POINT (4627355.438 3211988.792)",
                  "POINT (4599267.641 3220442.514)",
                  "POINT (4557279.752 3237223.279)"]})
df_2['geometry'] = df_2['geometry'].apply(wkt.loads)
gdf_2 = gp.GeoDataFrame(df_2,
                        geometry="geometry",
                        crs="EPSG:3035")
gdf_2 = gdf_2.assign(Name=lambda x: 'gdf2_' + (gdf_2.index.astype(str)))
gdf_1 = gdf_1.assign(**dict.fromkeys(gdf_2.Name))
# Loop through df_1:
columnsOfInterest = gdf_1.columns[gdf_1.columns.str.startswith("gdf2_")]
for i in columnsOfInterest:
    # get the geometry from gdf_1: (works!)
    gdf_1[i] = list(gdf_1[columnsOfInterest == i].geometry) * len(gdf_1)
gdf_1
DataFrame_gdf_1
Now I have problems with the distance calculation. Since I have a lot of point data in my original datasets, I need a solution that lets me calculate all the distances in one step. What I tried so far is to loop through the new columns of gdf_1, but the distance calculation does not seem to work on a Series: I get "AttributeError: 'Series' object has no attribute 'distance'".
# Loop through df_1:
columnsOfInterest = gdf_1.columns[gdf_1.columns.str.startswith("gdf2_")]
for i in columnsOfInterest:
    # get the geometry from gdf_1: (works!)
    gdf_1[i] = list(gdf_1[columnsOfInterest == i].geometry) * len(gdf_1)
    # distance calculation: (does not work!)
    gdf_1[i] = gdf_1[i].distance(gdf_1.geometry)
gdf_1
Does anyone know a solution to the problem?
keys = [c for c in gdf_1 if c.startswith('gdf2_')]
gdf1_melted = pd.melt(gdf_1, id_vars=['geometry','Name'], value_vars=keys, value_name='TargetPointCoo')
gdf1_melted['dist_km'] = gdf1_melted.apply(lambda r: round(r['geometry'].distance(r['TargetPointCoo'])/1000,1), axis=1)
gdf1_melted.drop('TargetPointCoo', axis=1, inplace=True)
pivoted = gdf1_melted.pivot(index = "Name", columns = "variable", values = "dist_km").reset_index()
pivoted
joined = pivoted.join(copyFromPreviousGdf_1.set_index('Name'), on='Name')
print(joined)
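If the end goal is simply a matrix of distances from every point in gdf_1 to every point in gdf_2, the extra geometry columns can be skipped altogether: GeoSeries.distance is vectorized and broadcasts against a single geometry. A minimal sketch, assuming the gdf_1 and gdf_2 built above (EPSG:3035 is a projected CRS, so distances come out in metres):
import pandas as pd

dist_km = pd.DataFrame({
    name: gdf_1.geometry.distance(geom) / 1000   # one column per gdf_2 point, in km
    for name, geom in zip(gdf_2['Name'], gdf_2.geometry)
}, index=gdf_1.index).round(1)
print(dist_km)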

Deleting the same outliers in two timeseries

I have a question about eliminating outliers from two time series. One time series includes spot market prices and the other includes power outputs. The two series run from 2012 to 2016 and are both CSV files with a timestamp and then a value. As an example, for the power output: 2012-01-01 00:00:00,2335.2152646951617 and for the price: 2012-01-01 00:00:00,17.2
Because the spot market prices are very volatile and have a lot of outliers, I have filtered them. For the second time series, I have to delete the values with the same timestamps that were eliminated from the price series. I thought about generating a list of the deleted values and writing a loop to delete the values with the same timestamps in the second time series. But so far that has not worked, and I'm not really getting anywhere. Does anyone have an idea?
My python code looks as follow:
import pandas as pd
import matplotlib.pyplot as plt
power_output = pd.read_csv("./data/external/power_output.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(power_output.head())
plt.plot(power_output)
spotmarket = pd.read_csv("./data/external/spotmarket_dhp.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(spotmarket.head())
r = spotmarket['price'].pct_change().dropna() * 100
print(r)
plt.plot(r)
Q1 = r.quantile(.25)
Q3 = r.quantile(.75)
q1 = Q1-2*(Q3-Q1)
q3 = Q3+2*(Q3-Q1)
a = r[r.between(q1, q3)]
print(a)
plt.plot(a)
Can somebody help me?
If your question is about how to compare two timestamps you can have a look at this.
Basically you could do:
out = r[~r.between(q1, q3)] # negation of your between to get the outliers
df = pd.merge(spotmarket, out, on=['date'], how="outer", indicator=True)
df = df[df['_merge'] == 'left_only']
This is a merge operation that keeps only the rows present in the left dataframe alone.
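A simpler variant that avoids the merge, assuming both CSVs share the same timestamps as their index: keep the index of the filtered (inlier) price series and select those rows from the power series.
a = r[r.between(q1, q3)]   # inlier prices, as in the question
power_filtered = power_output.loc[power_output.index.intersection(a.index)]
Rows of power_output whose timestamps were dropped from the price series are dropped here as well.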
The following suggestion is based on an answer of mine from a previous post.
You can solve your problem by merging both of your series and storing them in a pandas DataFrame. Then you can use any desired technique to identify and remove outliers. Take a look at the post mentioned above.
Here is my take on your particular problem using a snippet that can handle more than one series:
Since I don't have access to your data, the following snippet will produce two series where one of them has a distinctive outlier:
def sample(colname):
    base = 100
    nsample = 20
    sigma = 10

    # Basic df with trend and sinus seasonality
    trend1 = np.linspace(0, 1, nsample)
    y1 = np.sin(trend1)
    dates = pd.date_range('2016-01-01', periods=nsample).tolist()
    df = pd.DataFrame({'dates': dates, 'trend1': trend1, 'y1': y1})
    df = df.set_index(['dates'])
    df.index = pd.to_datetime(df.index)

    # Gaussian Noise with amplitude sigma
    df['y2'] = sigma * np.random.normal(size=nsample)
    df['y3'] = df['y2'] + base + (np.sin(trend1))
    df['trend2'] = 1 / (np.cos(trend1) / 1.05)
    df['y4'] = df['y3'] * df['trend2']
    df = df['y4'].to_frame()
    df.columns = [colname]
    return df

df_sample1 = sample(colname='series1')
df_sample2 = sample(colname='series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
Series 1 - No outliers
Series 2 - A distinctive outlier
Now you can merge those series like this:
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
What is considered an outlier will depend fully on the nature of your dataset. In this case, you can set the level for identifying outliers using scipy.stats.zscore(). In the following case, every observation whose first difference has an absolute z-score above 3 is considered an outlier.
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    # 1. Get some info about the original data:
    ##%%
    #df = df_merged
    #level = 3
    #keepFirst = True
    ##%%
    firstVal = df[:1]
    colNames = df.columns
    colNumber = len(df.columns)
    #cleanBy = 'Series1'
    # 2. Take the first difference
    df_diff = df.diff()
    # 3. Remove missing values
    df_clean = df_diff.dropna()
    # 4. Select a level for a Z-score to identify and remove outliers
    df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
    ix_keep = df_Z.index
    # 5. Subset the raw dataframe with the indexes you'd like to keep
    df_keep = df.loc[ix_keep]
    # 6.
    # df_keep will be missing some indexes.
    # Do the following if you'd like to keep those indexes
    # and, for example, fill missing values with the previous values
    df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)
    # 7. Keep only the original columns (drop the diffs)
    df_out = df_out.iloc[:, :colNumber]
    # 8. Fill missing values
    df_complete = df_out.fillna(axis=0, method='ffill')
    # 9. Reset column names
    df_complete.columns = colNames
    # Keep the first value
    if keepFirst:
        df_complete.iloc[0] = firstVal.iloc[0]
    return df_complete

df_clean = noSpikes(df=df_merged, level=3, keepFirst=True)
df_clean.plot()
Let me know how this works out for you.
Here's the whole thing for an easy copy-paste:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
np.random.seed(22)
# A function for noisy data with a trend element
def sample(colname):
    base = 100
    nsample = 20
    sigma = 10

    # Basic df with trend and sinus seasonality
    trend1 = np.linspace(0, 1, nsample)
    y1 = np.sin(trend1)
    dates = pd.date_range('2016-01-01', periods=nsample).tolist()
    df = pd.DataFrame({'dates': dates, 'trend1': trend1, 'y1': y1})
    df = df.set_index(['dates'])
    df.index = pd.to_datetime(df.index)

    # Gaussian Noise with amplitude sigma
    df['y2'] = sigma * np.random.normal(size=nsample)
    df['y3'] = df['y2'] + base + (np.sin(trend1))
    df['trend2'] = 1 / (np.cos(trend1) / 1.05)
    df['y4'] = df['y3'] * df['trend2']
    df = df['y4'].to_frame()
    df.columns = [colname]
    return df

df_sample1 = sample(colname='series1')
df_sample2 = sample(colname='series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    # 1. Get some info about the original data:
    firstVal = df[:1]
    colNames = df.columns
    colNumber = len(df.columns)
    #cleanBy = 'Series1'
    # 2. Take the first difference
    df_diff = df.diff()
    # 3. Remove missing values
    df_clean = df_diff.dropna()
    # 4. Select a level for a Z-score to identify and remove outliers
    df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
    ix_keep = df_Z.index
    # 5. Subset the raw dataframe with the indexes you'd like to keep
    df_keep = df.loc[ix_keep]
    # 6.
    # df_keep will be missing some indexes.
    # Do the following if you'd like to keep those indexes
    # and, for example, fill missing values with the previous values
    df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)
    # 7. Keep only the original columns (drop the diffs)
    df_out = df_out.iloc[:, :colNumber]
    # 8. Fill missing values
    df_complete = df_out.fillna(axis=0, method='ffill')
    # 9. Reset column names
    df_complete.columns = colNames
    # Keep the first value
    if keepFirst:
        df_complete.iloc[0] = firstVal.iloc[0]
    return df_complete

df_clean = noSpikes(df=df_merged, level=3, keepFirst=True)
df_clean.plot()
