Merge_asof but only let the nearest merge on key - python

I am currently trying to merge two data frames using the merge_asof method. However, when using this method I stumbled upon the issue that if I have an empty gap in any of my data, there will be duplicate cells in the merged dataframe. For clarification, I have two dataframes that look like this:
df1:
index  Meter_Indication (km)  Fuel1 (l)
0      35493                  245
1      35975                  267
2      36000                  200
3      36303                  160
4      36567                  300
5      38653                  234

df2:
index  Meter_Indication (km)  Fuel2 (l)
0      35494                  300
1      35980                  203
2      36573                  323
3      38656                  233
These two dataframes contain data about refueling vehicles, where the fuel column is the refueled amount in liters and Meter_Indication is how many km the car has driven in total (a value that can never decrease over time, which is why it is a great key to merge on). However, as you can see, there are fewer rows in df2 than in df1, which in my case makes the values merge on the nearest value, like this:
(merged df)
index Meter_Indication (km) Fuel1 (l) Fuel2(l)
0 35493 245 300
1 35975 267 203
2 36000 200 203
3 36303 160 323
4 36567 300 323
5 38653 234 233
As you can see, the values 203 and 323 are duplicated. Instead of the dataframe containing all the rows, my goal is to exclude the ones that don't have a "nearest" match, so that each value from df2 is merged only with its actual nearest row. In other words, my desired dataframe is:
index Meter_Indication (km) Fuel1 (l) Fuel2(l)
0 35493 245 300
1 35975 267 203
3 36567 300 323
4 38653 234 233
You can see here that the rows whose values were not the closest match to any other value were dropped.
I have tried looking for this everywhere but can't find anything that matches my desired outcome.
My current code is:
#READS PROVIDED DOCUMENTS.
df1 = pd.read_excel(
    filepathname1, "CWA107 Event", na_values=["NA"], skiprows=1, usecols="A, B, C, D, E, F")
df2 = pd.read_excel(
    filepathname2,
    na_values=["NA"],
    skiprows=1,
    usecols=["Fuel2 (l)", "Unnamed: 3", "Meter_Indication"],
)
# Drop NaN rows.
df2.dropna(inplace=True)
df1.dropna(inplace=True)
#Filters out rows with the keywords listed in 'blacklist'.
df1.rename(columns={"Bränslenivå (%)": "Bränsle"}, inplace=True)
df1 = df1[~df1.Bränsle.isin(blacklist)]
df1.rename(columns={"Bränsle": "Bränslenivå (%)"}, inplace=True)
#Creates new column for the difference in fuellevel column.
df1["Difference (%)"] = df1["Bränslenivå (%)"]
df1["Difference (%)"] = df1.loc[:, "Bränslenivå (%)"].diff()
# Renames time-column so that they match.
df2.rename(columns={"Unnamed: 3": "Tid"}, inplace=True)
# Drops rows where the difference is equal to 0.
df1filt = df1[(df1["Difference (%)"] != 0)]
# Converts time-column to only year, month and date.
df1filt["Tid"] = pd.to_datetime(df1filt["Tid"]).dt.strftime("%Y%m%d").astype(str)
df1filt.reset_index(level=0, inplace=True)
#Renames the index column to "row" in order to later use the "row" column
df1filt.rename(columns={"index": "row"}, inplace=True)
# Creates a new column for the difference in total driven kilometers (used for matching)
df1filt["Match"] = df1filt["Vägmätare (km)"]
df1filt["Match"] = df1filt.loc[:, "Vägmätare (km)"].diff()
#Merges refuels that are previously seperated because of the timeintervals. For example when a refuel takes a lot of time and gets split into two different refuels.
ROWRANGE = len(df1filt)+1
thevalue = 0
for currentrow in range(ROWRANGE-1):
    if df1filt.loc[currentrow, 'Difference (%)'] >= 0.0 and df1filt.loc[currentrow-1, 'Difference (%)'] <= 0:
        thevalue = 0
        thevalue += df1filt.loc[currentrow, 'Difference (%)']
        df1filt.loc[currentrow, 'Match'] = "SUMMED"
    if df1filt.loc[currentrow, 'Difference (%)'] >= 0.0 and df1filt.loc[currentrow-1, 'Difference (%)'] >= 0:
        thevalue += df1filt.loc[currentrow, 'Difference (%)']
    if df1filt.loc[currentrow, 'Difference (%)'] <= 0.0 and df1filt.loc[currentrow-1, 'Difference (%)'] >= 0:
        df1filt.loc[currentrow-1, 'Difference (%)'] = thevalue
        df1filt.loc[currentrow-1, 'Match'] = "OFFICIAL"
        thevalue = 0
#Removes single "refuels" that are lower than 5
df1filt = df1filt[(df1filt['Difference (%)'] > 5)]
#Creates a new dataframe for the summed values
df1filt2 = df1filt[(df1filt['Match'] == "OFFICIAL")]
#Creates a estimated refueled amount column for the automatic
df1filt2["Fuel1 (l)"] = df1filt2["Difference (%)"]
df1filt2["Fuel1 (l)"] = df1filt2.loc[:, "Difference (%)"]/100 *fuelcapacity
#Renames total kilometer column so that the two documents can match
df1filt2.rename(columns={"Vägmätare (km)": "Meter_Indication"}, inplace=True)
#Filters out rows where refuel and kilometer = NaN (Manual)
df2filt = df2[df2['Fuel2 (l)'].notna() & df2['Meter_Indication'].notna()]  # comparing with NaN never matches, so use notna()
#Drops first row
df2filt.drop(df2filt.index[0], inplace=True)
#Adds prefix for the time column so that they match (not used anymore because km is used to match)
df2filt['Tid'] = '20' + df2filt['Tid'].astype(str)
#Rounds numeric columns
decimals = 0
df2filt['Meter_Indication'] = pd.to_numeric(df2filt['Meter_Indication'],errors='coerce')
df2filt['Fuel2 (l)'] = pd.to_numeric(df2filt['Fuel2 (l)'],errors='coerce')
df2filt['Meter_Indication'] = df2filt['Meter_Indication'].apply(lambda x: round(x, decimals))
df2filt['Fuel2 (l)'] = df2filt['Fuel2 (l)'].apply(lambda x: round(x, decimals))
#Removes last number (makes the two excels matchable)
df2filt['Meter_Indication'] //= 10
df1filt2['Meter_Indication'] //= 10
#Creates merged dataframe with the two
merged_df = df1filt2.merge(df2filt, on='Meter_Indication')
Hopefully this was enough information! Thank you in advance.

Try this:
# Assign new column to keep meter indication from df2
df = pd.merge_asof(
    df1,
    df2.assign(meter_indication_2=df2['Meter_Indication (km)']),
    on='Meter_Indication (km)',
    direction='nearest',
)
# Calculate absolute difference
df['meter_indication_diff'] = df['Meter_Indication (km)'].sub(df['meter_indication_2']).abs()
# Sort values, drop duplicates (keep the ones with the smallest diff) and do some clean up
df = (df.sort_values(by=['meter_indication_2', 'meter_indication_diff'])
        .drop_duplicates(subset=['meter_indication_2'])
        .sort_index()
        .drop(['meter_indication_2', 'meter_indication_diff'], axis=1))
# Output
Meter_Indication (km) Fuel1 (l) Fuel2 (l)
0 35493 245 300
1 35975 267 203
4 36567 300 323
5 38653 234 233
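For reference, here is a minimal sketch that builds the two input frames from the question's data, so the snippet above can be run as-is (column names are assumed to match the question exactly):
import pandas as pd

# Sample frames built from the question's data (already sorted on the key,
# which merge_asof requires)
df1 = pd.DataFrame({
    'Meter_Indication (km)': [35493, 35975, 36000, 36303, 36567, 38653],
    'Fuel1 (l)': [245, 267, 200, 160, 300, 234],
})
df2 = pd.DataFrame({
    'Meter_Indication (km)': [35494, 35980, 36573, 38656],
    'Fuel2 (l)': [300, 203, 323, 233],
})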

Related

Comparing pandas DataFrames where column values are lists

I have some chemical data that I'm trying to process using Pandas. I have two dataframes:
C_atoms_all.head()
id_all index_all label_all species_all position
0 217 1 C C [6.609, 6.6024, 19.3301]
1 218 2 C C [4.8792, 11.9845, 14.6312]
2 219 3 C C [4.8373, 10.7563, 13.9466]
3 220 4 C C [4.7366, 10.9327, 12.5408]
4 6573 5 C C [1.9482, -3.8747, 19.6319]
C_atoms_a.head()
id_a index_a label_a species_a position
0 55 1 C C [6.609, 6.6024, 19.3302]
1 56 2 C C [4.8792, 11.9844, 14.6313]
2 57 3 C C [4.8372, 10.7565, 13.9467]
3 58 4 C C [4.7367, 10.9326, 12.5409]
4 59 5 C C [5.1528, 15.5976, 14.1249]
What I want to do is get a mapping of all of the id_all values to the id_a values where their positions match. You can see that for C_atoms_all.iloc[0] (where id_all is 217) and the same query on C_atoms_a (where id_a is 55), the position values match (within a small fudge factor), which I should also account for in the query.
The problem I'm having is that I can't merge or filter on the position columns because lists aren't hashable in Python.
I'd ideally like to return a dataframe that looks like so:
id_all id_a position
217 55 [6.609, 6.6024, 19.3301]
... ... ...
for every row where the position values match.
You can do it like below:
I named your C_atoms_all as df_all and C_atoms_a as df_a:
# First we try to extract different values in "position" columns for both dataframes.
df_all["val0"] = df_all["position"].str[0]
df_all["val1"] = df_all["position"].str[1]
df_all["val2"] = df_all["position"].str[2]
df_a["val0"] = df_a["position"].str[0]
df_a["val1"] = df_a["position"].str[1]
df_a["val2"] = df_a["position"].str[2]
# Then because the position values match (within a small fudge factor)
# we round them with three decimal
df_all.loc[:, ["val0", "val1", "val2"]] = df_all[["val0", "val1", "val2"]].round(3)
df_a.loc[:, ["val0", "val1", "val2"]]= df_a[["val0", "val1", "val2"]].round(3)
# We use loc to modify the original dataframe, instead of a copy of it.
# Then we use merge on three extracted values from position column
df = df_all.merge(df_a, on=["val0", "val1", "val2"], left_index=False, right_index=False,
                  suffixes=(None, "_y"))
# Finally we just keep the the desired columns
df = df[["id_all", "id_a", "position"]]
print(df)
id_all id_a position
0 217 55 [6.609, 6.6024, 19.3301]
1 218 56 [4.8792, 11.9845, 14.6312]
2 219 57 [4.8373, 10.7563, 13.9466]
3 220 58 [4.7366, 10.9327, 12.5408]
This isn't pretty, but it might work for you:
import numpy as np
import pandas as pd

def do(x, df_a):
    try:
        return next(df_a.iloc[i]['id_a'] for i in df_a.index if df_a.iloc[i]['position'] == x)
    except StopIteration:
        return np.nan

match = pd.DataFrame(C_atoms_all[['id_all', 'position']])
match['id_a'] = C_atoms_all['position'].apply(do, args=(C_atoms_a,))
You can create a new column in both datasets that contains the hash of the position column and then merge both datasets by that new column.
# Custom hash function
def hash_position(position):
    return hash(tuple(position))
# Create the hash column "hashed_position"
C_atoms_all['hashed_position'] = C_atoms_all['position'].apply(hash_position)
C_atoms_a['hashed_position'] = C_atoms_a['position'].apply(hash_position)
# merge datasets
C_atoms_a.merge(C_atoms_all, how='inner', on='hashed_position')
# ... keep the columns you need
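Note that hashing the raw float lists only lines up positions that are exactly equal. To also allow for the small fudge factor mentioned in the question, one option (a sketch, not part of the original answer) is to round the coordinates before hashing; rounding is a quick bucketing trick and can still miss pairs that straddle a rounding boundary:
# Hypothetical variant: round to 3 decimals (assumed tolerance) before hashing
def hash_rounded_position(position, decimals=3):
    return hash(tuple(round(x, decimals) for x in position))

C_atoms_all['hashed_position'] = C_atoms_all['position'].apply(hash_rounded_position)
C_atoms_a['hashed_position'] = C_atoms_a['position'].apply(hash_rounded_position)
C_atoms_a.merge(C_atoms_all, how='inner', on='hashed_position')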
Your question is not clear. It seems to me an interesting question, though. For that reason I have reproduced your data in a more useful format, just in case there is someone who can help more than I can.
Data
C_atoms_all = pd.DataFrame({
    'id_all': [217, 218, 219, 220, 6573],
    'index_all': [1, 2, 3, 4, 5],
    'label_all': ['C', 'C', 'C', 'C', 'C'],
    'species_all': ['C', 'C', 'C', 'C', 'C'],
    'position': [[6.609, 6.6024, 19.3301], [4.8792, 11.9845, 14.6312], [4.8373, 10.7563, 13.9466], [4.7366, 10.9327, 12.5408], [1.9482, -3.8747, 19.6319]]})
C_atoms_a = pd.DataFrame({
    'id_a': [55, 56, 57, 58, 59],
    'index_a': [1, 2, 3, 4, 5],
    'label_a': ['C', 'C', 'C', 'C', 'C'],
    'species_a': ['C', 'C', 'C', 'C', 'C'],
    'position': [[6.609, 6.6024, 19.3302], [4.8792, 11.9844, 14.6313], [4.8372, 10.7565, 13.9467], [4.7367, 10.9326, 12.5409], [5.1528, 15.5976, 14.1249]]})
Solution
# New dataframe bringing together the position columns
df3 = C_atoms_all.set_index('index_all').join(
    C_atoms_a.set_index('index_a').loc[:, 'position'].to_frame(), rsuffix='_r').reset_index()
# Create temp column that gives you the comparison tolerances
df3['temp'] = df3.filter(regex='^position').apply(
    lambda x: np.round(np.array(x[0]) - np.array(x[1]), 4), axis=1)
# Assume tolerance is where only one of the values is over 0.0
C_atoms_all[df3.explode('temp').groupby(level=0)['temp'].apply(lambda x: x.eq(0).sum()).gt(1)]
id_all index_all label_all species_all position
0 217 1 C C [6.609, 6.6024, 19.3301]

How to find the first row with min value of a column in dataframe

I have a data frame where I take the absolute difference of two columns to make a third column, and I am trying to get the row with the first (or second) minimum value of that column, but I get an error. Is there a better method to get the row with the minimum value of that column?
df2 = df[[2,3]]
df2[4] = np.absolute(df[2] - df[3])
#lowest = df.iloc[df[6].min()]
2 3 4
0 -111 -104 7
1 -130 110 240
2 -105 -112 7
3 -118 -100 18
4 -147 123 270
5 225 -278 503
6 102 -122 224
desired result:
     2    3  4
2 -105 -112  7
Get the difference as a Series, take Series.abs, and then compare with the minimal value using boolean indexing:
s = (df[2] - df[3]).abs()
df = df[s == s.min()]
If you want a new column for the difference:
df['diff'] = (df[2] - df[3]).abs()
df = df[df['diff'] == df['diff'].min()]
Another idea is to get the index of the minimal value with Series.idxmin and then select it with DataFrame.loc; for a one-row DataFrame the double brackets [[]] are necessary:
s = (df[2] - df[3]).abs()
df = df.loc[[s.idxmin()]]
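A quick check of both ideas on the sample data from the question (a sketch; the column labels are assumed to be the integers 2 and 3, as in the question's code):
import pandas as pd

df = pd.DataFrame({2: [-111, -130, -105, -118, -147, 225, 102],
                   3: [-104, 110, -112, -100, 123, -278, -122]})

s = (df[2] - df[3]).abs()
print(df[s == s.min()])      # every row with the minimal absolute difference
#      2    3
# 0 -111 -104
# 2 -105 -112
print(df.loc[[s.idxmin()]])  # only the first such row
#      2    3
# 0 -111 -104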
EDIT:
For more dynamic code that converts to integers where possible, use:
def int_if_possible(x):
    try:
        return x.astype(int)
    except Exception:
        return x
df = df.apply(int_if_possible)

Pandas: get pairs of rows with similar (the difference being within some bound) column values

I have a Pandas dataframe with 1M rows and 3 columns (TrackP, TrackPt, NumLongTracks), and I want to find pairs of 'matching' rows such that, for two 'matching' rows, the differences between their values in column 1 (TrackP), column 2 (TrackPt) and column 3 (NumLongTracks) are all within some bound, i.e. no more than ±1:
TrackP TrackPt NumLongTracks
1 2801 544 102
2 2805 407 65
3 2802 587 70
4 2807 251 145
5 2802 543 101
6 2800 545 111
For this particular case you would only retain the pair row 1 and row 5, because for this pair
TrackP(row 1) - TrackP(row 5) = -1,
TrackPt(row 1) - TrackPt(row 5) = +1,
NumLongTracks(row 1) - NumLongTracks(row 5) = +1
This is trivial when the values are exactly the same between rows, but I'm having trouble figuring out the best way to do this for this particular case.
I think it is easier to handle the columns as a single value for comparison.
# New Series: the three columns handled as one concatenated string per row
# (this assumes fixed digit widths: 4 for TrackP, 3 for TrackPt)
tr = track.TrackP.astype(str) + track.TrackPt.astype(str) + track.NumLongTracks.astype(str)
# finding matching rows
matching = []
for r, i in zip(tr, tr.index):
    if r[0:4]:  # first 4 positions are TrackP
        close = (int(r[0:4]) - 1, int(r[0:4]) + 1)    # range 1 up/down
        ptRange = (int(r[4:7]) - 1, int(r[4:7]) + 1)
        nLRange = (int(r[7:]) - 1, int(r[7:]) + 1)
        for r2 in tr:
            if int(r2[0:4]) in close:           # TrackP in range
                if int(r2[4:7]) in ptRange:     # TrackPt in range
                    if int(r2[7:]) in nLRange:  # NumLongTracks in range
                        pair = [r, r2]
                        matching.append(pair)
# back to the original format
# [['2801544102', '2802543101'], ['2802543101', '2801544102']]
import collections
routes = collections.defaultdict(list)
for seq in matching:
    routes['TrackP'].append(int(seq[0][0:4]))
    routes['TrackPt'].append(int(seq[0][4:7]))
    routes['NumLongTracks'].append(int(seq[0][7:]))
Now you can easily decompress it back into a dataframe:
df = pd.DataFrame.from_dict(dict(routes))
print(df)
TrackP TrackPt NumLongTracks
0 2801 544 102
1 2802 543 101
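A different sketch of the same matching criterion that avoids fixed-width string slicing: compute pairwise absolute differences with NumPy broadcasting and keep the pairs where every column differs by at most 1. This is only practical for small frames; the 1M-row case from the question would need chunking or a sort-based approach.
import numpy as np
import pandas as pd

# Sample data from the question, with the same row labels
track = pd.DataFrame({
    'TrackP': [2801, 2805, 2802, 2807, 2802, 2800],
    'TrackPt': [544, 407, 587, 251, 543, 545],
    'NumLongTracks': [102, 65, 70, 145, 101, 111],
}, index=range(1, 7))

vals = track.to_numpy()
# (N, N, 3) array of pairwise absolute differences between all rows
diff = np.abs(vals[:, None, :] - vals[None, :, :])
# keep index pairs (i < j) where all three columns differ by at most 1
i, j = np.where((diff <= 1).all(axis=2))
pairs = [(track.index[a], track.index[b]) for a, b in zip(i, j) if a < b]
print(pairs)  # [(1, 5)]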

Adding new records to a dataframe for variables extracted from the same dataframe

I am trying to consolidate variables in a data set.
I have something like this:
import pandas as pd
import numpy as np
data = np.array([[160,90,'skirt_trousers', 'tight_comfy'],[180,100,'trousers_skirt', 'long_short']])
dford = pd.DataFrame(data, columns = ['height','size','order', 'preference'])
and am trying to get it to something like this:
dataForTarget = np.array([['o1',160,90,'skirt', 'tight'],['o2', 180,100,'trousers', 'long'],['o1',160,90,'trousers', 'comfy'],['o2', 180,100,'skirt', 'short']])
Targetdford = pd.DataFrame(dataForTarget, columns = ['orderID','height','size','order', 'preference'])
As a first step, I have extracted as much data as possible from the strings,
then cleaned them:
variables = dford.columns.tolist()
variables.append('ord1')
secondord = dford.order.str.extractall(r'_(.*)')
secondord = secondord.unstack()
secondord.columns = secondord.columns.droplevel()
dford1 = dford.join(secondord)
dford1.columns = variables
dford1.order = dford1.order.str.replace(r'(_.*)', '')
variables = dford1.columns.tolist()
variables.append('pref1')
secondpref = dford.preference.str.extractall(r'_(.*)')
secondpref = secondpref.unstack()
secondpref.columns = secondpref.columns.droplevel()
dford2 = dford1.join(secondpref)
dford2.columns = variables
dford2.order = dford2.order.str.replace(r'(_.*)', '')
Which gets me here:
At this stage I am at a loss on how to add this new information as observations (in rows).
The best I could come up with follows, but it fails because the index contains duplicate entries. Even if it did not fail, I suspect it would only be useful if I were trying to fill in missing values, so I got nowhere.
dford3 = dford2.rename(columns = {'ord1': 'order', 'pref1': 'preference'})
dford3= dford3.stack()
dford3= dford3.unstack()
Use Series.str.split with DataFrame.stack and concat to build a new DataFrame, then add it to the original with DataFrame.join:
df = pd.concat([dford.pop('order').str.split('_', expand=True).stack().rename('order'),
                dford.pop('preference').str.split('_', expand=True).stack().rename('preference')], axis=1)
dford = (dford.join(df.reset_index(level=1)).rename_axis('orderID')
              .reset_index()
              .sort_values(['level_1', 'orderID'])
              .drop('level_1', 1)
              .reset_index(drop=True)
              .assign(orderID=lambda x: 'o' + x['orderID'].add(1).astype('str')))
print (dford)
orderID height size order preference
0 o1 160 90 skirt tight
1 o2 180 100 trousers long
2 o1 160 90 trousers comfy
3 o2 180 100 skirt short
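On newer pandas (1.3 or later, where DataFrame.explode accepts several columns at once), a shorter sketch of the same split-and-reshape idea, starting from the original dford from the question (this is not the original answer's code, and the rows come out in a slightly different order than Targetdford):
out = (dford.assign(order=dford['order'].str.split('_'),
                    preference=dford['preference'].str.split('_'))
            .explode(['order', 'preference'])
            .rename_axis('orderID')
            .reset_index()
            .assign(orderID=lambda x: 'o' + (x['orderID'] + 1).astype(str)))
print(out)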
Use DataFrame.apply + Series.str.split.
Concatenate the resulting dataframes with pd.concat and use Series.map to create the height and size Series:
df = pd.concat(
    [df.T for df in dford[['order', 'preference']].apply(lambda x: x.str.split('_', expand=True), axis=1)]
).rename_axis(index='OrderID').reset_index()
df['height']=df['OrderID'].map(dford['height'])
df['size']=df['OrderID'].map(dford['size'])
print(df)
OrderID order preference height size
0 0 skirt tight 160 90
1 1 trousers comfy 180 100
2 0 trousers long 160 90
3 1 skirt short 180 100
Finally, add one to the OrderID column and prepend the character 'o':
df['OrderID']='o'+df['OrderID'].add(1).astype('str')
print(df)
OrderID order preference height size
0 o1 skirt tight 160 90
1 o2 trousers comfy 180 100
2 o1 trousers long 160 90
3 o2 skirt short 180 100

intersection of two columns of pandas dataframe

I have 2 pandas dataframes: dataframe1 and dataframe2 that look like this:
mydataframe1
Out[15]:
Start End
100 200
300 450
500 700
mydataframe2
Out[16]:
Start End Value
0 400 0
401 499 -1
500 1000 1
1001 1698 1
Each row correspond to a segment (start-end).
For each segment in dataframe1 I would like to assign a value depending on the values assigned to the segments in dataframe2.
For example:
the first segment in dataframe1, 100 200, is included in the first segment of dataframe2, 0 400, so I should assign the value 0
the second segment in dataframe1, 300 450, is contained in both the first (0 400) and the second (401 499) segments of dataframe2. In this case I need to split this segment in 2 and assign the 2 corresponding values, i.e. 300 400 -> value 0 and 401 450 -> value -1
the final dataframe1 should look like
mydataframe1
Out[15]:
Start End Value
100 200 0
300 400 0
401 450 -1
500 700 1
I hope I was clear. Can you help me?
I doubt that there is a Pandas method that you can use to solve this directly.
You have to calculate the intersections manually to get the result you want. The intervaltree library makes the interval overlap calculation easier and more efficient at least.
IntervalTree.search() returns the (full) intervals that overlap with the provided one but does not calculate their intersection. This is why I also apply the intersect() function I have defined.
import pandas as pd
from intervaltree import Interval, IntervalTree
def intersect(a, b):
    """Intersection of two intervals."""
    intersection = max(a[0], b[0]), min(a[1], b[1])
    if intersection[0] > intersection[1]:
        return None
    return intersection

def interval_df_intersection(df1, df2):
    """Calculate the intersection of two sets of intervals stored in DataFrames.
    The intervals are defined by the "Start" and "End" columns.
    The data in the rest of the columns of df1 is included with the resulting
    intervals."""
    tree = IntervalTree.from_tuples(zip(
        df1.Start.values,
        df1.End.values,
        df1.drop(["Start", "End"], axis=1).values.tolist()
    ))
    intersections = []
    for row in df2.itertuples():
        i1 = Interval(row.Start, row.End)
        intersections += [list(intersect(i1, i2)) + i2.data for i2 in tree[i1]]
    # Make sure the column names are in the correct order
    data_cols = list(df1.columns)
    data_cols.remove("Start")
    data_cols.remove("End")
    return pd.DataFrame(intersections, columns=["Start", "End"] + data_cols)
interval_df_intersection(mydataframe2, mydataframe1)
The result is identical to what you were after.
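For completeness, the sample frames from the question can be built like this, so that the call above runs (a minimal sketch):
import pandas as pd

mydataframe1 = pd.DataFrame({'Start': [100, 300, 500],
                             'End': [200, 450, 700]})
mydataframe2 = pd.DataFrame({'Start': [0, 401, 500, 1001],
                             'End': [400, 499, 1000, 1698],
                             'Value': [0, -1, 1, 1]})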
Here is an answer using the NCLS library. It does not do the splitting, but rather answers the question in the title and does so really quickly.
Setup:
from ncls import NCLS
contents = """Start End
100 200
300 450
500 700"""
import pandas as pd
from io import StringIO
df = pd.read_table(StringIO(contents), sep=r"\s+")
contents2 = """Start End Value
0 400 0
401 499 -1
500 1000 1
1001 1698 1"""
df2 = pd.read_table(StringIO(contents2), sep=r"\s+")
Execution:
n = NCLS(df.Start.values, df.End.values, df.index.values)
x, x2 = n.all_overlaps_both(df2.Start.values, df2.End.values, df2.index.values)
dfx = df.loc[x]
# Start End
# 0 100 200
# 0 100 200
# 1 300 450
# 2 500 700
df2x = df2.loc[x2]
# Start End Value
# 0 0 400 0
# 1 401 499 -1
# 1 401 499 -1
# 2 500 1000 1
dfx.insert(dfx.shape[1], "Value", df2x.Value.values)
# Start End Value
# 0 100 200 0
# 0 100 200 0
# 1 300 450 -1
# 2 500 700 1
