Related
dt = {'id': {0: 'x1', 1: 'x2', 2: 'x3', 3: 'x4', 4: 'x5', 5: 'x6', 6: 'x7', 7: 'x8', 8: 'x9', 9: 'x10'}, 'trt': {0: 'cnt', 1: 'cnt', 2: 'tr', 3: 'tr', 4: 'tr', 5: 'cnt', 6: 'tr', 7: 'tr', 8: 'cnt', 9: 'cnt'}, 'work.T1': {0: 0.6516556669957936, 1: 0.567737752571702, 2: 0.1135089821182191, 3: 0.5959253052715212, 4: 0.3580499750096351, 5: 0.4288094183430075, 6: 0.0519033221062272, 7: 0.2641776674427092, 8: 0.3987907308619469, 9: 0.8361341434065253}, 'play.T1': {0: 0.8647212258074433, 1: 0.6153524168767035, 2: 0.7751098964363337, 3: 0.3555686913896352, 4: 0.4058499720413238, 5: 0.7066469138953835, 6: 0.8382876652758569, 7: 0.2395891312044114, 8: 0.7707715332508087, 9: 0.3558977444190532}, 'talk.T1': {0: 0.5355970377568156, 1: 0.0930881295353174, 2: 0.169803041499108, 3: 0.8998324507847428, 4: 0.4226376069709658, 5: 0.7477464678231627, 6: 0.8226525799836963, 7: 0.9546536463312804, 8: 0.6854445093777031, 9: 0.5005032296758145}, 'work.T2': {0: 0.2754838624969125, 1: 0.2289039448369294, 2: 0.0144339059479534, 3: 0.7289645625278354, 4: 0.2498804717324674, 5: 0.1611832766793668, 6: 0.0170426501426845, 7: 0.4861003451514989, 8: 0.1029001718852669, 9: 0.8015470046084374}, 'play.T2': {0: 0.3543280649464577, 1: 0.9364325392525644, 2: 0.2458663922734558, 3: 0.4731414613779634, 4: 0.191560871200636, 5: 0.5832219698932022, 6: 0.4594731898978352, 7: 0.467434047954157, 8: 0.3998325555585325, 9: 0.5052855962421745}, 'talk.T2': {0: 0.0318881559651345, 1: 0.1144675880204886, 2: 0.468935475917533, 3: 0.3969867376144975, 4: 0.8336191941052675, 5: 0.7611217433586717, 6: 0.5733564489055425, 7: 0.447508045937866, 8: 0.0838020080700516, 9: 0.2191385473124683}}
mydt = pd.DataFrame(dt, columns = ['id', 'trt', 'work.T1', '', 'play.T1', 'talk.T1','work.T2', '', 'play.T2', 'talk.T2'])
So I have the above dataset and need to tidy it up. I have used the following code but it returns "ValueError: stubname can't be identical to a column name." How can I fix the code to avoid this problem?
# Stub names: the column prefixes before the '.' separator (e.g. 'work' in 'work.T1').
names = ['play', 'talk', 'work']
# NOTE(review): this call passes the raw dict `dt`, not the DataFrame `mydt`
# built from it above -- confirm which object was intended.
activities = pd.wide_to_long(dt, stubnames=names, i='id', j='time', sep='.', suffix='T\d').sort_index().reset_index()
activities
Note: I am trying to get the dataframe to look like the following.
Changed :
activities = pd.wide_to_long(activities, stubnames=names, i='id', j='time', sep='.', suffix='T\d').sort_index().reset_index()
To:
# Reshape from wide to long: columns such as 'work.T1' are split at '.' into
# the stub ('work') and the suffix ('T1'), which ends up in the new 'time'
# level. The suffix pattern is a raw string so that the backslash in \d is
# passed to the regex engine unchanged instead of being treated as an
# (invalid) string escape sequence.
activities = (
    pd.wide_to_long(mydt, stubnames=names, i='id', j='time', sep='.', suffix=r'T\d')
    .sort_index()
    .reset_index()
)
and then it works.
I want to create plots by Step Typ (Traction and Stribeck). Each load stage should have its own plot, and within each load level the line plots should be broken down by temperature. The y-axis is Traction (-) and the x-axis is the respective counterpart: SRR (%) for Traction and Rolling Speed (mm/s) for Stribeck. In the end, I should have four different plots.
Example of how it should look:
My attempt so far, which leads to an empty plot.
import pandas as pd
import matplotlib.pyplot as plt
data = {'Step 1': {'Step Typ': 'Traction', 'SRR (%)': {1: 8.384, 2: 9.815, 3: 7.531, 4: 10.209, 5: 7.989, 6: 7.331, 7: 5.008, 8: 2.716, 9: 9.6, 10: 7.911}, 'Traction (-)': {1: 5.602, 2: 6.04, 3: 2.631, 4: 2.952, 5: 8.162, 6: 9.312, 7: 4.994, 8: 2.959, 9: 10.075, 10: 5.498}, 'Temperature': 30, 'Load': 40}, 'Step 3': {'Step Typ': 'Traction', 'SRR (%)': {1: 2.909, 2: 5.552, 3: 5.656, 4: 9.043, 5: 3.424, 6: 7.382, 7: 3.916, 8: 2.665, 9: 4.832, 10: 3.993}, 'Traction (-)': {1: 9.158, 2: 6.721, 3: 7.787, 4: 7.491, 5: 8.267, 6: 2.985, 7: 5.882, 8: 3.591, 9: 6.334, 10: 10.43}, 'Temperature': 80, 'Load': 40}, 'Step 5': {'Step Typ': 'Traction', 'SRR (%)': {1: 4.765, 2: 9.293, 3: 7.608, 4: 7.371, 5: 4.87, 6: 4.832, 7: 6.244, 8: 6.488, 9: 5.04, 10: 2.962}, 'Traction (-)': {1: 6.656, 2: 7.872, 3: 8.799, 4: 7.9, 5: 4.22, 6: 6.288, 7: 7.439, 8: 7.77, 9: 5.977, 10: 9.395}, 'Temperature': 30, 'Load': 70}, 'Step 7': {'Step Typ': 'Traction', 'SRR (%)': {1: 9.46, 2: 2.83, 3: 3.249, 4: 9.273, 5: 8.792, 6: 9.673, 7: 6.784, 8: 3.838, 9: 8.779, 10: 4.82}, 'Traction (-)': {1: 5.245, 2: 8.491, 3: 10.088, 4: 9.988, 5: 4.886, 6: 4.168, 7: 8.628, 8: 5.038, 9: 7.712, 10: 3.961}, 'Temperature': 80, 'Load': 70}, 'Step 2': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 4.862, 2: 4.71, 3: 4.537, 4: 6.35, 5: 6.691, 6: 5.337, 7: 8.419, 8: 10.303, 9: 5.018, 10: 10.195}, 'Traction (-)': {1: 6.674, 2: 10.137, 3: 2.822, 4: 5.494, 5: 9.986, 6: 9.095, 7: 3.53, 8: 6.96, 9: 8.251, 10: 7.836}, 'Temperature': 30, 'Load': 40}, 'Step 4': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 4.04, 2: 8.288, 3: 3.731, 4: 10.137, 5: 5.32, 6: 8.504, 7: 5.917, 8: 9.677, 9: 8.641, 10: 7.685}, 'Traction (-)': {1: 9.522, 2: 4.749, 3: 3.46, 4: 3.21, 5: 5.005, 6: 9.886, 7: 8.023, 8: 5.935, 9: 8.74, 10: 5.117}, 'Temperature': 80, 'Load': 40}, 'Step 6': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 6.244, 2: 7.015, 3: 5.998, 4: 4.894, 5: 6.117, 6: 6.644, 7: 7.619, 8: 10.477, 9: 9.61, 10: 2.958}, 'Traction (-)': 
{1: 7.353, 2: 7.98, 3: 6.675, 4: 8.853, 5: 7.537, 6: 5.256, 7: 4.923, 8: 10.293, 9: 2.873, 10: 10.407}, 'Temperature': 30, 'Load': 70}, 'Step 8': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 3.475, 2: 2.756, 3: 7.809, 4: 9.449, 5: 2.72, 6: 4.133, 7: 10.139, 8: 10.0, 9: 3.71, 10: 8.267}, 'Traction (-)': {1: 6.307, 2: 2.83, 3: 9.258, 4: 3.405, 5: 9.659, 6: 6.662, 7: 6.413, 8: 6.488, 9: 7.972, 10: 6.288}, 'Temperature': 80, 'Load': 70} }
# NOTE(review): indentation was lost in this listing; the comments below
# describe each statement on its own and do not assume a particular nesting.
# This first df is overwritten by the pd.concat(...) result further down.
df = pd.DataFrame(data)
items = list()
series = list()
# Build one DataFrame per step and remember the step name as its concat key.
for item, d in data.items():
items.append(item)
series.append(pd.DataFrame.from_dict(d))
# Stack the per-step frames; steps that lack a column (e.g. Traction steps
# have no 'Rolling Speed (mm/s)') get NaN in that column.
df = pd.concat(series, keys=items)
df.set_index(['Step Typ', 'Load', 'Temperature'], inplace=True)
# The result of this lookup is not assigned or used.
df.loc[('Stribeck')]
# Level 1 of the index is 'Load', so this visits each distinct load value.
for force, _ in df.groupby(level=1):
plt.figure(figsize=(15, 12))
# iterrows() yields one measurement row at a time, so every plt.plot call
# below receives a single (x, y) pair rather than a whole curve.
for i, row in df.loc[('Traction'), force].iterrows():
# The 'Traction (-)' values in `data` lie roughly between 2.6 and 10.4,
# i.e. outside this y-range.
plt.ylim(0, 0.1)
plt.ylabel('Traction Coeff (-)')
plt.xlabel('Rolling Speed (mm/s)')
plt.title('Title comes later', loc='left')
# The rows selected above are 'Traction' rows, whose x-data in `data` is
# stored under 'SRR (%)', not 'Rolling Speed (mm/s)'.
plt.plot(row['Rolling Speed (mm/s)'], row['Traction (-)'], label=f"{i} - {force}")
print(f"{i} - {force}")
plt.show()
I have changed your plotting loop. The code below generates two plots for Traction (one for each Load value), each with two curves (one for each temperature). I have commented out the line where you set ylim(a, b), because it leads to an empty plot if the data fall outside the (a, b) range.
import pandas as pd
import matplotlib.pyplot as plt
data = {'Step 1': {'Step Typ': 'Traction', 'SRR (%)': {1: 8.384, 2: 9.815, 3: 7.531, 4: 10.209, 5: 7.989, 6: 7.331, 7: 5.008, 8: 2.716, 9: 9.6, 10: 7.911}, 'Traction (-)': {1: 5.602, 2: 6.04, 3: 2.631, 4: 2.952, 5: 8.162, 6: 9.312, 7: 4.994, 8: 2.959, 9: 10.075, 10: 5.498}, 'Temperature': 30, 'Load': 40}, 'Step 3': {'Step Typ': 'Traction', 'SRR (%)': {1: 2.909, 2: 5.552, 3: 5.656, 4: 9.043, 5: 3.424, 6: 7.382, 7: 3.916, 8: 2.665, 9: 4.832, 10: 3.993}, 'Traction (-)': {1: 9.158, 2: 6.721, 3: 7.787, 4: 7.491, 5: 8.267, 6: 2.985, 7: 5.882, 8: 3.591, 9: 6.334, 10: 10.43}, 'Temperature': 80, 'Load': 40}, 'Step 5': {'Step Typ': 'Traction', 'SRR (%)': {1: 4.765, 2: 9.293, 3: 7.608, 4: 7.371, 5: 4.87, 6: 4.832, 7: 6.244, 8: 6.488, 9: 5.04, 10: 2.962}, 'Traction (-)': {1: 6.656, 2: 7.872, 3: 8.799, 4: 7.9, 5: 4.22, 6: 6.288, 7: 7.439, 8: 7.77, 9: 5.977, 10: 9.395}, 'Temperature': 30, 'Load': 70}, 'Step 7': {'Step Typ': 'Traction', 'SRR (%)': {1: 9.46, 2: 2.83, 3: 3.249, 4: 9.273, 5: 8.792, 6: 9.673, 7: 6.784, 8: 3.838, 9: 8.779, 10: 4.82}, 'Traction (-)': {1: 5.245, 2: 8.491, 3: 10.088, 4: 9.988, 5: 4.886, 6: 4.168, 7: 8.628, 8: 5.038, 9: 7.712, 10: 3.961}, 'Temperature': 80, 'Load': 70}, 'Step 2': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 4.862, 2: 4.71, 3: 4.537, 4: 6.35, 5: 6.691, 6: 5.337, 7: 8.419, 8: 10.303, 9: 5.018, 10: 10.195}, 'Traction (-)': {1: 6.674, 2: 10.137, 3: 2.822, 4: 5.494, 5: 9.986, 6: 9.095, 7: 3.53, 8: 6.96, 9: 8.251, 10: 7.836}, 'Temperature': 30, 'Load': 40}, 'Step 4': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 4.04, 2: 8.288, 3: 3.731, 4: 10.137, 5: 5.32, 6: 8.504, 7: 5.917, 8: 9.677, 9: 8.641, 10: 7.685}, 'Traction (-)': {1: 9.522, 2: 4.749, 3: 3.46, 4: 3.21, 5: 5.005, 6: 9.886, 7: 8.023, 8: 5.935, 9: 8.74, 10: 5.117}, 'Temperature': 80, 'Load': 40}, 'Step 6': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 6.244, 2: 7.015, 3: 5.998, 4: 4.894, 5: 6.117, 6: 6.644, 7: 7.619, 8: 10.477, 9: 9.61, 10: 2.958}, 'Traction (-)': 
{1: 7.353, 2: 7.98, 3: 6.675, 4: 8.853, 5: 7.537, 6: 5.256, 7: 4.923, 8: 10.293, 9: 2.873, 10: 10.407}, 'Temperature': 30, 'Load': 70}, 'Step 8': {'Step Typ': 'Stribeck', 'Rolling Speed (mm/s)': {1: 3.475, 2: 2.756, 3: 7.809, 4: 9.449, 5: 2.72, 6: 4.133, 7: 10.139, 8: 10.0, 9: 3.71, 10: 8.267}, 'Traction (-)': {1: 6.307, 2: 2.83, 3: 9.258, 4: 3.405, 5: 9.659, 6: 6.662, 7: 6.413, 8: 6.488, 9: 7.972, 10: 6.288}, 'Temperature': 80, 'Load': 70} }
# Build one DataFrame per step (the scalar fields 'Step Typ', 'Temperature'
# and 'Load' are repeated on every measurement row) and stack them, keyed by
# step name.
step_frames = {step: pd.DataFrame.from_dict(fields) for step, fields in data.items()}
df = pd.concat(step_frames)
df.set_index(['Step Typ', 'Load', 'Temperature'], inplace=True)

# One figure per load level, restricted to the 'Traction' steps.
for force, df_load in df.loc['Traction'].groupby(level='Load'):
    fig, ax = plt.subplots(figsize=(8, 6))

    # One curve per temperature within this load level.
    for temperature, df_temp in df_load.groupby(level='Temperature'):
        # Sort by the x-variable so the line is drawn left to right instead
        # of in measurement order.
        df_temp = df_temp.sort_values('SRR (%)')
        ax.plot(
            df_temp['SRR (%)'],
            df_temp['Traction (-)'],
            label=f'T = {temperature}°C - Load = {force}',
        )

    # No fixed y-limits here: ax.set_ylim(0, 0.1) would leave the plot empty
    # because the traction values lie far outside that range.
    ax.set_ylabel('Traction Coeff (-)')
    ax.set_xlabel('SRR (%)')
    ax.set_title('Title comes later', loc='left')
    ax.legend(frameon=True)
    plt.show()
I am trying to plot a line graph to show the trends of each key of a dictionary in Jupyter Notebook with Python. This is what I have in the k_rmse_values variable as shown below:
k_rmse_values =
{'bore': {1: 8423.759328233446,
3: 6501.928933614838,
5: 6807.187615513473,
7: 6900.29659028346,
9: 7134.8868708101645},
'city-mpg': {1: 4265.365592771621,
3: 3865.0178306330113,
5: 3720.409335758634,
7: 3819.183283405616,
9: 4219.677972675927},
'compression-rate': {1: 7016.906657495168,
3: 7319.354017489066,
5: 6301.624922763969,
7: 6133.006310754547,
9: 6417.253959732598},
'curb-weight': {1: 3950.9888180049306,
3: 4201.343428000144,
5: 4047.052502155118,
7: 3842.0974736649846,
9: 3943.9478256384205},
'engine-size': {1: 2853.7338453331627,
3: 2793.6254775629623,
5: 3123.320055069605,
7: 2941.73029681235,
9: 2931.996240628853},
'height': {1: 6330.178232877807,
3: 7049.500497198366,
5: 6869.570862695864,
7: 6738.641089739572,
9: 6344.062937760911},
'highway-mpg': {1: 4826.0580187146525,
3: 3510.253629329685,
5: 3379.2250123364083,
7: 4044.271135312068,
9: 4462.027046251678},
'horsepower': {1: 3623.6389886411143,
3: 4294.825669466819,
5: 4778.254807521257,
7: 4730.538701514935,
9: 4662.8601512508885},
'length': {1: 4952.798701744297,
3: 5403.624431188139,
5: 5500.731909846179,
7: 5103.4515274528885,
9: 4471.077661709427},
'normalized-losses': {1: 9604.929081466453,
3: 7494.820436511842,
5: 6391.912634697067,
7: 6699.853883298577,
9: 6861.6389834002875},
'peak-rpm': {1: 8041.2366213164005,
3: 7502.080095843049,
5: 6521.863037752326,
7: 6869.602542315512,
9: 6884.533017667794},
'stroke': {1: 10330.231237489314,
3: 8947.585146097614,
5: 6973.912792744113,
7: 7266.333478250421,
9: 7026.017456146411},
'wheel-base': {1: 2797.4144312203725,
3: 3392.8627620671928,
5: 4238.25624378706,
7: 4456.687059524217,
9: 4426.032222634904},
'width': {1: 2849.2691940215127,
3: 4076.59327053035,
5: 3979.9751617315405,
7: 3845.3326184519606,
9: 3687.926625900343}}
When I used this code to plot
# Draw one RMSE-vs-k line per feature in k_rmse_values.
for k,v in k_rmse_values.items():
# x and y follow the inner dict's own iteration order, so the points are
# connected in that order rather than by ascending k.
x = list(v.keys())
y = list(v.values())
plt.plot(x,y)
plt.xlabel('k value')
plt.ylabel('RMSE')
and it doesn't plot from 1 to 9 in order; it gives this graph
it plots in this k-value order 1, 3, 9 , 5, 7
I have spent hours on this problem and still can't figure out a way to do it. Your help with this would be greatly appreciated.
One solution is to sort the keys and get the matching values:
for k, v in k_rmse_values.items():
    # sorted() returns a new list of the k values in ascending order.
    # list.sort() sorts in place and returns None, so its result must not be
    # assigned to xs.
    xs = sorted(v)
    # Look up each RMSE by its k so xs and ys stay aligned.
    ys = [v[x] for x in xs]
    # Note I renamed these arrays so following uses should be changed accordingly
I am trying to rename the keys and subkeys in a Python nested dictionary. However, I haven't got the result that I expected yet. Below is the original nested dictionary that I have.
nested_dict = {
0: {0: 33.97, 1: 55.32, 2: 57.31, 3: 71.56},
1: {0: 27.31, 1: 23.32, 2: 32.25, 3: 60.21},
2: {0: 65.38, 1: 36.88, 2: 70.88, 3: 21.93},
3: {0: 35.44, 1: 21.21, 2: 40.72, 3: 51.35}
}
I am trying to change the keys and subkeys to other values, so that the dictionary becomes this:
nested_dict = {
4: {4: 33.97, 5: 55.32, 6: 57.31, 7: 71.56},
5: {4: 27.31, 5: 23.32, 6: 32.25, 7: 60.21},
6: {4: 65.38, 5: 36.88, 6: 70.88, 7: 21.93},
7: {4: 35.44, 5: 21.21, 6: 40.72, 7: 51.35}
}
What I have in mind is renaming the key using a list. I have tried to replace the key and subkey with a list below:
# The replacement keys 4, 5, 6, 7, built directly from the range instead of
# appending one element at a time.
new_key = list(range(4, 8))
However, I still haven't got it. Another idea is using pandas DataFrame to rename both key and subkey. I am not sure whether using lists or pandas is suitable for the given problem.
Code for renaming a key from here:
mydict[new_key] = mydict.pop(old_key)
You could use a (nested) dict comprehension ([Python]: PEP 274 -- Dict Comprehensions). Note that it generates a new dictionary (but you can assign it to the old variable):
>>> from pprint import pprint as pp
>>>
>>> nested_dict = {
... 0: {0: 33.97, 1: 55.32, 2: 57.31, 3: 71.56},
... 1: {0: 27.31, 1: 23.32, 2: 32.25, 3: 60.21},
... 2: {0: 65.38, 1: 36.88, 2: 70.88, 3: 21.93},
... 3: {0: 35.44, 1: 21.21, 2: 40.72, 3: 51.35}
... }
>>>
>>> pp(nested_dict)
{0: {0: 33.97, 1: 55.32, 2: 57.31, 3: 71.56},
1: {0: 27.31, 1: 23.32, 2: 32.25, 3: 60.21},
2: {0: 65.38, 1: 36.88, 2: 70.88, 3: 21.93},
3: {0: 35.44, 1: 21.21, 2: 40.72, 3: 51.35}}
>>>
>>> modified_nested_dict = {k0 + 4: {k1 + 4: v1 for k1, v1 in v0.items()} for k0, v0 in nested_dict.items()}
>>>
>>> pp(modified_nested_dict)
{4: {4: 33.97, 5: 55.32, 6: 57.31, 7: 71.56},
5: {4: 27.31, 5: 23.32, 6: 32.25, 7: 60.21},
6: {4: 65.38, 5: 36.88, 6: 70.88, 7: 21.93},
7: {4: 35.44, 5: 21.21, 6: 40.72, 7: 51.35}}
You can use Pandas Dataframe for the desired task, as follows:
import pandas as pd
nested_dict = {
    0: {0: 33.97, 1: 55.32, 2: 57.31, 3: 71.56},
    1: {0: 27.31, 1: 23.32, 2: 32.25, 3: 60.21},
    2: {0: 65.38, 1: 36.88, 2: 70.88, 3: 21.93},
    3: {0: 35.44, 1: 21.21, 2: 40.72, 3: 51.35},
}
print("Dictionary before renaming: ", nested_dict)

# Outer keys become the column labels, inner keys the row labels.
my_dataframe = pd.DataFrame(nested_dict)

# Replacement labels, applied positionally to both axes.
new_keys = [*range(4, 8)]
my_dataframe.columns = new_keys
my_dataframe.index = new_keys

# Back to {outer key: {inner key: value}} form.
nested_dict = my_dataframe.to_dict()
print("Dictionary after renaming: ", nested_dict)
This gives you the following expected output:
Dictionary before renaming: {0: {0: 33.97, 1: 55.32, 2: 57.31, 3: 71.56}, 1: {0: 27.31, 1: 23.32, 2: 32.25, 3: 60.21}, 2: {0: 65.38, 1: 36.88, 2: 70.88, 3: 21.93}, 3: {0: 35.44, 1: 21.21, 2: 40.72, 3: 51.35}}
Dictionary after renaming: {4: {4: 33.97, 5: 55.32, 6: 57.31, 7: 71.56}, 5: {4: 27.31, 5: 23.32, 6: 32.25, 7: 60.21}, 6: {4: 65.38, 5: 36.88, 6: 70.88, 7: 21.93}, 7: {4: 35.44, 5: 21.21, 6: 40.72, 7: 51.35}}
I am trying to create a function that creates a dataframe based on different lists of words that come up in a certain column of another dataframe.
In my example, I want a dataframe created on the basis of the words "tfl" and "electronics" appearing in the "description" column of the "uncategorised" dataframe.
The point of the function is that I want to be able to run this on different lists of words so I end up with different dataframes containing just the words I want.
# Search terms for this category.
words_Telephone = ["tfl", "electronics"]
# Empty frame intended to receive the matching rows.
df_Telephone = pd.DataFrame(columns=['date','description','paid out'])
def categorise(word_list, df_name):
""" takes the denoted terms from the "uncategorised" df and puts it into new df"""
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the return statement cannot be confirmed from here.
for word in word_list:
# The df_name argument is never read: the name is rebound here to the rows
# matching the current word only, so matches are not accumulated across words.
df_name = uncategorised[uncategorised['description'].str.contains(word)]
return(df_name)
#apply the function
# The returned frame is not assigned to anything here.
categorise(words_Telephone, df_Telephone)
I am expecting a dataframe that contains:
# Expected result: the four 'tfl' rows followed by the two 'ac electronics'
# rows. Every column has the same six row labels; 'index' holds the 'index'
# values those rows have in the reproducible frame.
d = {'date': {0: '05/04/2017',
              1: '06/04/2017',
              2: '08/04/2017',
              3: '08/04/2017',
              4: '08/04/2017',
              5: '10/04/2017'},
     'description': {0: 'tfl',
                     1: 'tfl',
                     2: 'tfl',
                     3: 'tfl',
                     4: 'ac electronics ',
                     5: 'ac electronics '},
     'index': {0: 1, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8},
     'paid out': {0: 3.0,
                  1: 4.3,
                  2: 6.1,
                  3: 1.5,
                  4: 16.39,
                  5: 20.4}}
Reproducible df:
# Reproducible input: nine transactions. Every column carries the same nine
# row labels (0-8) so that pd.DataFrame(d) does not gain an extra row filled
# with NaN.
d = {'date': {0: '05/04/2017',
              1: '06/04/2017',
              2: '06/04/2017',
              3: '08/04/2017',
              4: '08/04/2017',
              5: '08/04/2017',
              6: '10/04/2017',
              7: '10/04/2017',
              8: '10/04/2017'},
     'description': {0: 'tfl',
                     1: 'mu subscription',
                     2: 'tfl',
                     3: 'tfl',
                     4: 'tfl',
                     5: 'ac electronics ',
                     6: 'itunes',
                     7: 'ac electronics ',
                     8: 'google adwords'},
     'index': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9},
     'paid out': {0: 3.0,
                  1: 16.9,
                  2: 4.3,
                  3: 6.1,
                  4: 1.5,
                  5: 16.39,
                  6: 12.99,
                  7: 20.4,
                  8: 39.68}}
SOLUTION:
def categorise(word_list):
    """Move the rows matching *word_list* out of the global ``uncategorised`` frame.

    For each word in turn, the rows of ``uncategorised`` whose 'description'
    contains the word are collected and then removed from ``uncategorised``,
    so a row is claimed by the first word that matches it. Each word is
    passed to ``Series.str.contains`` unchanged.

    Args:
        word_list: Iterable of search terms.

    Returns:
        A DataFrame of all collected rows, grouped in word order, with the
        original row labels moved into an 'index' column by ``reset_index()``.
        An empty frame with the same columns is returned when *word_list*
        is empty.
    """
    global uncategorised
    new_dfs = []
    for word in word_list:
        # Compute the mask once and use it for both selecting and removing.
        matches = uncategorised['description'].str.contains(word)
        new_dfs.append(uncategorised[matches])
        uncategorised = uncategorised[~matches]
    if not new_dfs:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the same layout instead.
        return uncategorised.iloc[0:0].reset_index()
    return pd.concat(new_dfs).reset_index()
#apply the function
df_Telephone = categorise(words_Telephone)
df_Telephone
words_Telephone = ["tfl", "electronics"]
# Build the frame directly from the column dict; no throwaway empty
# DataFrame instance is needed to reach from_dict.
original_df = pd.DataFrame({
    'date': ['05/04/2017', '06/04/2017', '06/04/2017', '08/04/2017', '08/04/2017',
             '08/04/2017', '10/04/2017', '10/04/2017', '10/04/2017'],
    'description': ['tfl', 'mu subscription', 'tfl', 'tfl', 'tfl', 'ac electronics',
                    'itunes', 'ac electronics', 'google adwords'],
    'paid out': [3.0, 16.9, 4.3, 6.1, 1.5, 16.39, 12.99, 20.4, 39.68],
})


def categorise(word_list, original_df):
    """Collect the rows of *original_df* whose description matches any word.

    Args:
        word_list: Iterable of search terms; each is passed to
            ``Series.str.contains`` unchanged.
        original_df: DataFrame with a 'description' column. It is not
            modified.

    Returns:
        A new DataFrame with the matching rows grouped in word order (a row
        matching several words appears once per matching word). The original
        row labels are moved into an 'index' column by ``reset_index()``.
        An empty frame with the same columns is returned when *word_list*
        is empty.
    """
    descriptions = original_df['description']
    new_dfs = [original_df[descriptions.str.contains(word)] for word in word_list]
    if not new_dfs:
        # pd.concat raises ValueError on an empty list.
        return original_df.iloc[0:0].reset_index()
    return pd.concat(new_dfs).reset_index()


# apply the function
df_Telephone = categorise(words_Telephone, original_df)
print(df_Telephone)
date description paid out
0 05/04/2017 tfl 3.00
1 06/04/2017 tfl 4.30
2 08/04/2017 tfl 6.10
3 08/04/2017 tfl 1.50
4 08/04/2017 ac electronics 16.39
5 10/04/2017 ac electronics 20.40