Collapse certain columns horizontally - python

I have:
haves = pd.DataFrame({'Product': ['R123', 'R234'],
                      'Price': [1.18, 0.23],
                      'CS_Medium': [1, 0],
                      'CS_Small': [0, 1],
                      'SC_A': [1, 0],
                      'SC_B': [0, 1],
                      'SC_C': [0, 0]})
print(haves)
given a list of columns, like so:
list_of_starts_with = ["CS_", "SC_"]
I would like to arrive here:
wants = pd.DataFrame({'Product': ['R123', 'R234'],
                      'Price': [1.18, 0.23],
                      'CS': ['Medium', 'Small'],
                      'SC': ['A', 'B']})
print(wants)
I am aware of wide_to_long but don't think it is applicable here?

We could build a boolean mask over the column names (those containing '_'), select those indicator columns as booleans, map each row's True columns back to prefix/value pairs, and then join the result back to the rest of the original DataFrame:
msk = haves.columns.str.contains('_')
s = haves.loc[:, msk].astype(bool)
s = s.apply(lambda x: dict(s.columns[x].str.split('_')), axis=1)
out = haves.loc[:, ~msk].join(pd.DataFrame(s.tolist(), index=s.index))
Output:
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B
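For the sample data, the intermediate s produced by the apply step is a Series holding one dict per row, which pd.DataFrame(s.tolist(), index=s.index) then expands into the CS and SC columns:
print(s.tolist())
# [{'CS': 'Medium', 'SC': 'A'}, {'CS': 'Small', 'SC': 'B'}]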

Based on the list of columns (assuming the starts_with prefixes are enough to identify them), it is possible to do the changes in bulk:
def preprocess_column_names(list_of_starts_with, column_names):
    """Return a list of tuples (merged_column_name, options, columns)."""
    columns_to_transform = []
    for starts_with in list_of_starts_with:
        len_of_start = len(starts_with)
        columns = [col for col in column_names if col.startswith(starts_with)]
        options = [col[len_of_start:] for col in columns]
        merged_column_name = starts_with[:-1]  # assuming that the last char is not needed
        columns_to_transform.append((merged_column_name, options, columns))
    return columns_to_transform

def merge_columns(df, merged_column_name, options, columns):
    for col, option in zip(columns, options):
        df.loc[df[col] == 1, merged_column_name] = option
    return df.drop(columns=columns)

def merge_all(df, columns_to_transform):
    for merged_column_name, options, columns in columns_to_transform:
        df = merge_columns(df, merged_column_name, options, columns)
    return df
And to run:
columns_to_transform = preprocess_column_names(list_of_starts_with, haves.columns)
wants = merge_all(haves, columns_to_transform)
If your column names hold no surprises (such as an Index_ prefix in list_of_starts_with matching unrelated columns), the above code should solve the problem with reasonable performance.
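For the sample data above, the intermediate columns_to_transform comes out as:
# [('CS', ['Medium', 'Small'], ['CS_Medium', 'CS_Small']),
#  ('SC', ['A', 'B', 'C'], ['SC_A', 'SC_B', 'SC_C'])]
merge_all then uses these tuples to write 'Medium'/'Small' and 'A'/'B'/'C' into the new CS and SC columns wherever the corresponding indicator column equals 1, and drops the indicator columns.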

One option is to convert the data to a long form, filter for rows that have a value of 1, then convert back to wide form. We can use pivot_longer from pyjanitor for the wide to long part, and pivot to return to wide form:
# pip install pyjanitor
import pandas as pd
import janitor
(haves
 .pivot_longer(index=["Product", "Price"],
               names_to=("main", "other"),
               names_sep="_")
 .query("value == 1")
 .pivot(index=["Product", "Price"],
        columns="main",
        values="other")
 .rename_axis(columns=None)
 .reset_index()
)
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B
You can avoid pyjanitor entirely by transforming the columns before reshaping (it still involves wide to long, then long to wide):
index = [col for col in haves
         if not col.startswith(tuple(list_of_starts_with))]
temp = haves.set_index(index)
temp.columns = (temp
                .columns.str.split("_", expand=True)
                .set_names(["main", "other"]))
# reshape to get final dataframe
(temp
 .stack(["main", "other"])
 .loc[lambda df: df == 1]
 .reset_index("other")
 .drop(columns=0)
 .unstack()
 .droplevel(0, 1)
 .rename_axis(columns=None)
 .reset_index()
)
Product Price CS SC
0 R123 1.18 Medium A
1 R234 0.23 Small B

Related

KEGG Drug database Python script

I have a drug database saved in a SINGLE column of a CSV file that I can read with Pandas. The file contains 750000 rows and its elements are divided by "///". The column also ends with "///", and every row seems to end with ";".
I would like to split it into multiple columns in order to create a structured database. Capitalized words (drug information) like "ENTRY", "NAME" etc. will be the headers of these new columns.
So it has some structure, although the elements can be described by a different number and sort of information, meaning some elements will just have NaN in some cells. I have never worked with such an SQL-like format, and it is difficult to reproduce it as Pandas code, too. Please see the screenshots for more information.
An example of the desired output would look like this:
df = pd.DataFrame({
    "ENTRY": ["001", "002", "003"],
    "NAME": ["water", "ibuprofen", "paralen"],
    "FORMULA": ["H2O", "C5H16O85", "C14H24O8"],
    "COMPONENT": [np.nan, np.nan, "paracetamol"]})
I am guessing there will be .split() involved, based on the CAPITALIZED words? A Python 3 code solution would be appreciated; it could help a lot of people. Thanks!
I helped as much as I could:
import pandas as pd
cols = ['ENTRY', 'NAME', 'FORMULA', 'COMPONENT']
# We create an additional dataframe.
dfi = pd.DataFrame()
# We read the file, get two columns and leave only the necessary lines.
df = pd.read_fwf(r'drug', header=None, names=['Key', 'Value'])
df = df[df['Key'].isin(cols)]
# To "flip" the dataframe, we first prepare an additional column
# with indexing by groups from one 'ENTRY' row to another.
dfi['Key1'] = dfi['Key'] = df[(df['Key'] == 'ENTRY')].index
dfi = dfi.set_index('Key1')
df = df.join(dfi, lsuffix='_caller', rsuffix='_other')
df.fillna(method="ffill", inplace=True)
df = df.astype({"Key_other": "Int64"})
# Change the shape of the table.
df = df.pivot(index='Key_other', columns='Key_caller', values='Value')
df = df.reindex(columns=cols)
# We clean up the resulting dataframe a little.
df['ENTRY'] = df['ENTRY'].str.split(r'\s+', expand=True)[0]
df.reset_index(drop=True, inplace=True)
pd.set_option('display.max_columns', 10)
Small code refactoring:
import pandas as pd
cols = ['ENTRY', 'NAME', 'FORMULA', 'COMPONENT']
# We read the file, get two columns and leave only the necessary lines.
df = pd.read_fwf(r'C:\Users\ф\drug\drug', header=None, names=['Key', 'Value'])
df = df[df['Key'].isin(cols)]
# To "flip" the dataframe, we first prepare an additional column
# with indexing by groups from one 'ENTRY' row to another.
df['Key_other'] = None
df.loc[(df['Key'] == 'ENTRY'), 'Key_other'] = df[(df['Key'] == 'ENTRY')].index
df['Key_other'].fillna(method="ffill", inplace=True)
# Change the shape of the table.
df = df.pivot(index='Key_other', columns='Key', values='Value')
df = df.reindex(columns=cols)
# We clean up the resulting dataframe a little.
df['ENTRY'] = df['ENTRY'].str.split(r'\s+', expand=True)[0]
df['NAME'] = df['NAME'].str.split(r'\(', expand=True)[0]
df.reset_index(drop=True, inplace=True)
pd.set_option('display.max_columns', 10)
print(df)
Key ENTRY NAME FORMULA \
0 D00001 Water H2O
1 D00002 Nadide C21H28N7O14P2
2 D00003 Oxygen O2
3 D00004 Carbon dioxide CO2
4 D00005 Flavin adenine dinucleotide C27H33N9O15P2
... ... ... ...
11983 D12452 Fostroxacitabine bralpamide hydrochloride C22H30BrN4O8P. HCl
11984 D12453 Guretolimod C24H34F3N5O4
11985 D12454 Icenticaftor C12H13F6N3O3
11986 D12455 Lirafugratinib C28H24FN7O2
11987 D12456 Lirafugratinib hydrochloride C28H24FN7O2. HCl
Key COMPONENT
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
11983 NaN
11984 NaN
11985 NaN
11986 NaN
11987 NaN
[11988 rows x 4 columns]
It still needs a little more polishing; I leave that to you.
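For completeness, here is a rough sketch (not the answer above, and the exact line layout of the file is an assumption) of the split-based idea the question hints at: cut the raw text on the "///" record separator and treat each all-uppercase word at the start of a line as a field name:
import pandas as pd

def parse_kegg(text, keys=('ENTRY', 'NAME', 'FORMULA', 'COMPONENT')):
    """Very rough parser: one record per '///' block, 'KEY value' lines."""
    records = []
    for block in text.split('///'):
        record, current = {}, None
        for line in block.splitlines():
            if not line.strip():
                continue
            head = line.split(maxsplit=1)
            if head[0].isalpha() and head[0].isupper():
                current = head[0]  # a new field, e.g. ENTRY or NAME
                record[current] = head[1].strip() if len(head) > 1 else ''
            elif current is not None:  # indented continuation line
                record[current] += ' ' + line.strip()
        if record:
            records.append(record)
    return pd.DataFrame(records).reindex(columns=list(keys))
Note that ENTRY would still carry its trailing annotation (e.g. "D00001  Drug"), so a cleanup similar to the str.split used above would still be needed.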

How to merge multiple columns having same column name in one dataframe with rules python pandas

I have a CSV file with multiple columns having the same name. I want to merge the values and keep only the unique columns in the output, with certain rules to choose between two conflicting values; if the values are the same, just select one. This is how my CSV looks. (P.S. My CSV doesn't have headings separated with an underscore; I only added the underscores so that the example dataframe could be created.)
df_data_hworkforce = pd.DataFrame({"Country": ['Afghanistan', 'Bahrain', 'Djibouti', 'Egypt', 'Iran'],
                                   "2019": [2.9, 28, 2.1, 8.5, 15.2],
                                   "2019_1": [np.nan, 27.9, np.nan, np.nan, np.nan],
                                   "2018": [2.9, 27.3, 1.1, 6.5, 5.2],
                                   "2018_1": [2.9, 27, 2.1, 6, np.nan],
                                   "2017": [3, 26, 1.9, np.nan, np.nan],
                                   })
Directly creating a dataframe with duplicate column names was not possible, so I rename the columns to present the example:
df_data_hworkforce.rename(columns={'2019_1': '2019', '2018_1': '2018'}, inplace=True)
This is how the dataframe looks.
I join the columns with the same name the following way:
def sjoin(x): return ';'.join(x[x.notnull()].astype(str))
df_data_hworkforce.groupby(level=0, axis=1).apply(lambda x: x.apply(sjoin, axis=1))
This combines the values of the two columns and gives the following result.
However, my desired output is to keep only one value when both columns hold the same data, and when they differ by less than 0.5, to keep the non-rounded value. Below is my desired output.
This is a very peculiar data transformation and cannot be implemented very efficiently.
However, an approach you can take is:
1. groupby each pairing of duplicate columns
2. aggregate according to your desired threshold & transformation
3. update the original data
def combine(df, threshold=.5):
    arr = df.astype(float).to_numpy().T
    # pairwise differences between the duplicate columns (NaNs treated as 0)
    diffs = np.nan_to_num(arr - arr[:, None, :], 0)
    max_diffs = abs(diffs).max(axis=(0, 1))
    # per row: prefer the first non-rounded value, fall back to a rounded one
    non_rounded = (arr.round() != arr) & ~np.isnan(arr)
    rounded = (arr.round() == arr) & ~np.isnan(arr)
    preferred_values = np.where(non_rounded.any(axis=0),
                                non_rounded.argmax(axis=0),
                                rounded.argmax(axis=0))
    # all distinct values joined by ';' (used when the difference reaches the threshold)
    cat_values = (
        df.astype(str)
        .where(df.notnull())
        .apply(lambda row: ';'.join(row.dropna().drop_duplicates()), axis=1)
    )
    # below threshold -> keep a single value, otherwise keep the joined values
    choices = np.where(max_diffs >= threshold, len(df.columns), preferred_values)
    return choices.choose([*arr, cat_values])
import pandas as pd
import numpy as np

df = pd.DataFrame(
    data=zip(*[
        ['Afghanistan', 'Bahrain', 'Djibouti', 'Egypt', 'Iran'],
        [2.9, 28, 2.1, 8.5, 15.2],
        [2.0, 28, 2.0, 8.5, 15.2],
        [np.nan, 27.9, np.nan, np.nan, np.nan],
        [2.9, 27.3, 1.1, 6.5, 5.2],
        [2.9, 27, 2.1, 6, np.nan],
        [3, 26, 1.9, np.nan, np.nan],
    ]),
    columns=['Country', '2019', '2019', '2019', '2018', '2018', '2017']
)
to_update = (
    df.select_dtypes('number')
    .loc[:, lambda d: d.columns.duplicated(keep=False)]
    .groupby(level=0, axis=1).agg(combine, threshold=.5)
)
out = df.loc[:, ~df.columns.duplicated()].copy()
out.update(to_update)
print(out.fillna('').sort_index(axis=1))
       Country     2019     2018  2017
0  Afghanistan  2.9;2.0      2.9   3.0
1      Bahrain     27.9     27.3  26.0
2     Djibouti      2.1  1.1;2.1   1.9
3        Egypt      8.5  6.5;6.0   NaN
4         Iran     15.2      5.2   NaN
Update: simplified the code since the column names are exact matches.

How to write complicated function to aggregate DataFrame

I have a DataFrame in Python like the one below, which represents clients' agreements:
df = pd.DataFrame({"ID" : [1,2,1,1,3],
"amount" : [100,200,300,400,500],
"status" : ["active", "finished", "finished",
"active", "finished"]})
I need to write a function in Python which will calculate:
1. Number (NumAg) and amount (AmAg) of contracts per ID
2. Number (NumAct) and amount (AmAct) of active contracts per ID
3. Number (NumFin) and amount (AmFin) of finished contracts per ID
To be more precise, the function should create a DataFrame like the one below:
The below solution should fit your use case.
import pandas as pd

def summarise_df(df):
    # Mask to filter df by 'active' status for the NumAct, AmAct, NumFin and AmFin columns
    active_mask = df['status'].str.contains('active')
    return df.groupby('ID').agg(  # First columns in the output df (no mask needed)
        NumAg=pd.NamedAgg(column='amount', aggfunc='count'),
        AmAg=pd.NamedAgg(column='amount', aggfunc='sum')
    ).join(  # Add columns using values with 'active' status
        df[active_mask].groupby('ID').agg(
            NumAct=pd.NamedAgg(column='amount', aggfunc='count'),
            AmAct=pd.NamedAgg(column='amount', aggfunc='sum')
        )
    ).join(  # Add columns using values with NOT 'active' (i.e. 'finished') status
        df[~active_mask].groupby('ID').agg(
            NumFin=pd.NamedAgg(column='amount', aggfunc='count'),
            AmFin=pd.NamedAgg(column='amount', aggfunc='sum')
        )
    ).fillna(0)  # Replace NaN values with 0
I would recommend reading over this function and its comments alongside documentation for groupby() and join() so that you can develop a better understanding of exactly what is being done here. It is seldom a wise decision to rely upon code that you don't have a good grasp on.
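A quick usage sketch on the question's sample frame (the exact int/float formatting of the result may differ slightly):
print(summarise_df(df))
# ID 1 -> NumAg=3, AmAg=800, NumAct=2, AmAct=500, NumFin=1, AmFin=300
# ID 2 -> NumAg=1, AmAg=200, NumAct=0, AmAct=0,   NumFin=1, AmFin=200
# ID 3 -> NumAg=1, AmAg=500, NumAct=0, AmAct=0,   NumFin=1, AmFin=500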
You could use groupby on ID with agg, after adding two bool columns that make the aggregation easier:
df['AmAct'] = df.amount[df.status.eq('active')]
df['AmFin'] = df.amount[df.status.eq('finished')]
df = df.groupby('ID').agg(
    NumAg=('ID', 'count'),
    AmAg=('amount', 'sum'),
    NumAct=('status', lambda col: col.eq('active').sum()),
    AmAct=('AmAct', 'sum'),
    NumFin=('status', lambda col: col.eq('finished').sum()),
    AmFin=('AmFin', 'sum')
)
Result:
    NumAg  AmAg  NumAct  AmAct  NumFin  AmFin
ID
1       3   800       2  500.0       1  300.0
2       1   200       0    0.0       1  200.0
3       1   500       0    0.0       1  500.0
Or add some more columns to df to do a simpler groupby on ID with sum:
df.insert(1, 'NumAg', 1)
df['NumAct'] = df.status.eq('active')
df['AmAct'] = df.amount[df.NumAct]
df['NumFin'] = df.status.eq('finished')
df['AmFin'] = df.amount[df.NumFin]
df.drop(columns=['status'], inplace=True)
df = df.groupby('ID').sum().rename(columns={'amount': 'AmAg'})
with the same result.
Or, maybe the easiest way, let pivot_table do most of the work, after adding a count column to df, and some column-rearranging afterwards:
df['count'] = 1
df = df.pivot_table(index='ID', columns='status', values=['count', 'amount'],
                    aggfunc='sum', fill_value=0, margins=True).drop('All')
df.columns = ['AmAct', 'AmFin', 'AmAg', 'NumAct', 'NumFin', 'NumAg']
df = df[['NumAg', 'AmAg', 'NumAct', 'AmAct', 'NumFin', 'AmFin']]

pandas multiindex column styler

Versions: Python 3.7.6, pandas 1.0.0
Input dataframe
df = pd.DataFrame(dict(
    recruit_dt=["1/1/2017"]*3 + ["1/1/2018"]*3 + ["1/1/2019"]*3,
    label=[1, 3, 4]*3,
    nmem=np.random.choice(list(range(10000, 3000000)), 9),
    pct_fem=np.random.sample(9),
    mean_age=50 + 10*np.random.sample(9),
    sd_age=8 + 2*np.random.sample(9)
))
I would like to present this after the following transformations:
dfp = pd.pivot_table(df, values=["nmem","pct_fem","mean_age","sd_age"], index="recruit_dt", columns="label")
dfp = dfp.reindex(columns=['nmem', 'pct_fem', 'mean_age', 'sd_age'], level=0)
How do I write the styler so that all the nmem columns have thousand separators {:,}, 'pct_fem' are percentages to two decimal places, mean_age and sd_age are floating point numbers with two decimal places? Is there an approach which uses styler.format or styler.apply with IndexSlice?
EDIT: this seems to work. Is there a more concise solution?
dfp.columns.names = ["metrics","label"]
dfp.style.format("{:,}", subset=pd.IndexSlice[:,'nmem']) \
.format("{:.2%}", subset=pd.IndexSlice[:,'pct_fem']) \
.format("{:.2f}", subset=pd.IndexSlice[:,['mean_age','sd_age']])
You can specify an argument to the subset parameter using a list comprehension to select the relevant columns.
>>> (dfp
     .style
     .format('{:,.0f}', na_rep='-', subset=[col for col in dfp.columns if col[0] == 'nmem'])
     .format('{:.2%}', na_rep='-', subset=[col for col in dfp.columns if col[0] == 'pct_fem'])
     .format('{:,.2f}', na_rep='-', subset=[col for col in dfp.columns if col[0] in {'mean_age', 'sd_age'}])
    )
A more general solution:
# Styles.
pct_two = '{:.2%}'
comma_float = '{:,.0f}'
comma_float_2 = '{:.2f}'

# Styling to be applied to specified columns.
formats = {
    'nmem': comma_float,
    'pct_fem': pct_two,
    'mean_age': comma_float_2,
    'sd_age': comma_float_2,
}

# Create dictionary of multi-index columns with specified styling.
format_dict = {
    midx: formats[level_val]
    for level_val in formats
    for midx in [col for col in dfp if col[0] == level_val]
}

# Apply styling to dataframe.
dfp.style.format(format_dict)
Let's try this:
idx = pd.IndexSlice
formatter_dict = {i:"{:,}" for i in dfp.loc[:, idx['nmem', :]].columns}
formatter_dict2 = {i:"{:.2%}" for i in dfp.loc[:, idx['pct_fem', :]].columns}
formatter_dict3 = {i:"{:.2f}" for i in dfp.loc[:, idx[['mean_age', 'sd_age'], :]].columns}
formatter_dict.update(formatter_dict2)
formatter_dict.update(formatter_dict3)
dfp.style.format(formatter_dict)
Output: (rendered styled table not shown)

Deleting the same outliers in two timeseries

I have a question about eliminating outliers from two time series. One series contains spot market prices and the other power outputs. Both run from 2012 to 2016 and are CSV files with a timestamp and then a value. For example, the power output: 2012-01-01 00:00:00,2335.2152646951617 and the price: 2012-01-01 00:00:00,17.2
Because the spot market prices are very volatile and have a lot of outliers, I have filtered them. In the second time series I now have to delete the values whose timestamps were eliminated from the price series. I thought about generating a list of the deleted values and writing a loop to delete the values with the same timestamps in the second time series, but so far that has not worked and I'm not really getting anywhere. Does anyone have an idea?
My python code looks as follow:
import pandas as pd
import matplotlib.pyplot as plt
power_output = pd.read_csv("./data/external/power_output.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(power_output.head())
plt.plot(power_output)
spotmarket = pd.read_csv("./data/external/spotmarket_dhp.csv", delimiter=",", parse_dates=[0], index_col=[0])
print(spotmarket.head())
r = spotmarket['price'].pct_change().dropna() * 100
print(r)
plt.plot(r)
Q1 = r.quantile(.25)
Q3 = r.quantile(.75)
q1 = Q1-2*(Q3-Q1)
q3 = Q3+2*(Q3-Q1)
a = r[r.between(q1, q3)]
print(a)
plt.plot(a)
Can somebody help me?
If your question is about how to compare two timestamps you can have a look at this.
Basically you could do:
out = r[~r.between(q1, q3)]  # negation of your between() to get the outliers
df = pd.merge(spotmarket, out, on=['date'], how="outer", indicator=True)
df = df[df['_merge'] == 'left_only']
This is a merge operation that keeps only those rows present solely in the left dataframe.
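Alternatively, a minimal sketch (not part of the answer above, and assuming both CSVs share the same timestamps in their DatetimeIndex): reuse the index of the filtered price series to subset the power output directly:
a = r[r.between(q1, q3)]  # filtered price returns, as in the question
power_output_clean = power_output.loc[power_output.index.intersection(a.index)]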
The following suggestion is based on an answer of mine from a previous post.
You can solve your problem by merging both of your series and storing them in a pandas DataFrame. Then you can use any desired technique to identify and remove outliers. Take a look at the post mentioned above.
Here is my take on your particular problem using a snippet that can handle more than one series:
Since I don't have access to your data, the following snippet will produce two series where one of them has a distinctive outlier:
def sample(colname):
    base = 100
    nsample = 20
    sigma = 10

    # Basic df with trend and sinus seasonality
    trend1 = np.linspace(0, 1, nsample)
    y1 = np.sin(trend1)
    dates = pd.date_range(pd.datetime(2016, 1, 1).strftime('%Y-%m-%d'), periods=nsample).tolist()
    df = pd.DataFrame({'dates': dates, 'trend1': trend1, 'y1': y1})
    df = df.set_index(['dates'])
    df.index = pd.to_datetime(df.index)

    # Gaussian noise with amplitude sigma
    df['y2'] = sigma * np.random.normal(size=nsample)
    df['y3'] = df['y2'] + base + (np.sin(trend1))
    df['trend2'] = 1/(np.cos(trend1)/1.05)
    df['y4'] = df['y3'] * df['trend2']

    df = df['y4'].to_frame()
    df.columns = [colname]
    return df
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
Series 1 - No outliers
Series 2 - A distinctive outlier
Now you can merge those series like this:
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
What is considered an outlier will depend entirely on the nature of your dataset. In this case, you can set the level for identifying outliers using scipy.stats.zscore(). In the following snippet, every observation whose first difference has a z-score above 3 is considered an outlier.
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    # 1. Get some info about the original data:
    # (uncomment to step through interactively)
    # df = df_merged
    # level = 3
    # keepFirst = True
    firstVal = df[:1]
    colNames = df.columns
    colNumber = len(df.columns)
    # cleanBy = 'Series1'

    # 2. Take the first difference
    df_diff = df.diff()

    # 3. Remove missing values
    df_clean = df_diff.dropna()

    # 4. Select a level for a Z-score to identify and remove outliers
    df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
    ix_keep = df_Z.index

    # 5. Subset the raw dataframe with the indexes you'd like to keep
    df_keep = df.loc[ix_keep]

    # 6. df_keep will be missing some indexes.
    #    Do the following if you'd like to keep those indexes
    #    and, for example, fill missing values with the previous values
    df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)

    # 7. Keep only the original columns (drop the diffs)
    df_out = df_out.iloc[:, :colNumber]

    # 8. Fill missing values
    df_complete = df_out.fillna(axis=0, method='ffill')

    # 9. Reset column names
    df_complete.columns = colNames

    # Keep the first value
    if keepFirst:
        df_complete.iloc[0] = firstVal.iloc[0]
    return df_complete
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)
df_clean.plot()
Let me know how this works out for you.
Here's the whole thing for an easy copy-paste:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
np.random.seed(22)
# A function for noisy data with a trend element
def sample(colname):
    base = 100
    nsample = 20
    sigma = 10

    # Basic df with trend and sinus seasonality
    trend1 = np.linspace(0, 1, nsample)
    y1 = np.sin(trend1)
    dates = pd.date_range(pd.datetime(2016, 1, 1).strftime('%Y-%m-%d'), periods=nsample).tolist()
    df = pd.DataFrame({'dates': dates, 'trend1': trend1, 'y1': y1})
    df = df.set_index(['dates'])
    df.index = pd.to_datetime(df.index)

    # Gaussian noise with amplitude sigma
    df['y2'] = sigma * np.random.normal(size=nsample)
    df['y3'] = df['y2'] + base + (np.sin(trend1))
    df['trend2'] = 1/(np.cos(trend1)/1.05)
    df['y4'] = df['y3'] * df['trend2']

    df = df['y4'].to_frame()
    df.columns = [colname]
    return df
df_sample1 = sample(colname = 'series1')
df_sample2 = sample(colname = 'series2')
df_sample2['series2'].iloc[10] = 800
df_sample1.plot()
df_sample2.plot()
# Merge dataframes
df_merged = pd.merge(df_sample1, df_sample2, how='outer', left_index=True, right_index=True)
df_merged.plot()
# A function for removing outliers
def noSpikes(df, level, keepFirst):
    # 1. Get some info about the original data:
    firstVal = df[:1]
    colNames = df.columns
    colNumber = len(df.columns)
    # cleanBy = 'Series1'

    # 2. Take the first difference
    df_diff = df.diff()

    # 3. Remove missing values
    df_clean = df_diff.dropna()

    # 4. Select a level for a Z-score to identify and remove outliers
    df_Z = df_clean[(np.abs(stats.zscore(df_clean)) < level).all(axis=1)]
    ix_keep = df_Z.index

    # 5. Subset the raw dataframe with the indexes you'd like to keep
    df_keep = df.loc[ix_keep]

    # 6. df_keep will be missing some indexes.
    #    Do the following if you'd like to keep those indexes
    #    and, for example, fill missing values with the previous values
    df_out = pd.merge(df_keep, df, how='outer', left_index=True, right_index=True)

    # 7. Keep only the original columns (drop the diffs)
    df_out = df_out.iloc[:, :colNumber]

    # 8. Fill missing values
    df_complete = df_out.fillna(axis=0, method='ffill')

    # 9. Reset column names
    df_complete.columns = colNames

    # Keep the first value
    if keepFirst:
        df_complete.iloc[0] = firstVal.iloc[0]
    return df_complete
df_clean = noSpikes(df = df_merged, level = 3, keepFirst = True)
df_clean.plot()
