Columns error while converting to float - python

# Question code (fails as described below). Read the ';'-separated file,
# treating the string 'ND' as missing, then zero-fill the gaps.
df = pd.read_csv('TestdataNALA.csv', sep=';', na_values='ND')
df.fillna(0, inplace=True)
# Keep only the named columns, in this order.
df = df[['ID','VegType','Elevation','C','N','C/N','Soil13C','Soil15N','pH','Na_[mg/g]','K_[mg/g]','Mg_[mg/g]','Ca_[mg/g]','P_[µg/g]']]
# NOTE: new_list is a DataFrame, not a list of values.
new_list = df[['Elevation','C','N','C/N','Soil13C','Soil15N','pH','Na_[mg/g]','K_[mg/g]','Mg_[mg/g]','Ca_[mg/g]','P_[µg/g]']]
# BUG (the subject of the question): iterating over a DataFrame yields its
# column LABELS, so float() is called on the header strings and raises
# ValueError on 'Elevation'.
nnew_list = [float(i) for i in new_list]
print(type(df['ID']))
#--> <class 'pandas.core.series.Series'>
print('ID' in df.columns)
#-->True
Some data from the CSV:
ID;VegType;Elevation;C;N;C/N;Soil13C;Soil15N;pH;Na_[mg/g];K_[mg/g];Mg_[mg/g];Ca_[mg/g];P_[µg/g]
A2425WC;Cyt;2425;2,78;0,23;12,14;-24,43;4,85;4,88;0,0005;0,0772;0,0646;0,3588;13,8400
C2489WCG;Cyt_Gen;2489;2,18;0,17;12,50;-24,51;5,53;4,21;0,0010;0,0639;0,0286;0,0601;0,6800
C2591WCG;Cyt_Gen;2591;5,13;0,36;14,29;-25,52;3,41;4,30;0,0046;0,0854;0,1169;0,7753;5,7000
E2695WC;Cyt;2695;1,43;0,14;10,55;-25,71;5,75;4,95;ND;0,0766;0,0441;0,0978;8,8500
A2900WC;Cyt;2900;6,95;0,53;13,11;-25,54;3,38;5,35;0,0032;0,1119;0,2356;1,8050;14,7100
A2800WC;Cyt;2800;1,88;0,16;11,62;-23,68;5,88;5,28;0,0025;0,0983;0,0770;0,3777;5,4200
A3050WC;Cyt;3050;1,50;0,12;12,50;-24,62;2,23;5,97;ND;0,0696;0,0729;0,5736;9,4000
When I run this, I get the error:
ValueError: could not convert string to float: 'Elevation'
It seems as if it's trying to convert the column headers, but I only wanted to convert the column values.

To convert columns to numeric, you should use pd.to_numeric:
cols = ['Elevation','C','N','C/N','Soil13C','Soil15N','pH','Na_[mg/g]','K_[mg/g]','Mg_[mg/g]','Ca_[mg/g]','P_[µg/g]']
# Convert each measurement column in place; anything unparseable becomes NaN.
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
Your code will not work because, just as iterating over a dictionary yields only its keys, iterating over a DataFrame yields only the column labels — so float() was being called on the header strings.
Update 1
Try also using the options below:
import pandas as pd
from io import StringIO
# A RAW string is essential here: the pasted DataFrame repr ends its first
# line with a bare backslash, which inside an ordinary triple-quoted string
# escapes the newline and merges the first data row into the header line
# (skiprows=1 would then silently drop that row, yielding 2 rows, not the
# 3 shown below).
mystr = StringIO(r""" <bound method NDFrame.head of ID VegType Elevation C N C/N Soil13C Soil15N pH \
1 C2489WCG Cyt_Gen 2489 2,18 0,17 12,50 -24,51 5,53 4,21
2 C2591WCG Cyt_Gen 2591 5,13 0,36 14,29 -25,52 3,41 4,30
3 E2695WC Cyt 2695 1,43 0,14 10,55 -25,71 5,75 4,95
""")
# sep=r'\s+' replaces delim_whitespace=True (deprecated in pandas >= 2.2)
# with identical behavior; decimal=',' handles the European decimals.
df = pd.read_csv(mystr, skiprows=1, decimal=',', header=None, sep=r'\s+')
# 0 1 2 3 4 5 6 7 8 9
# 0 1 C2489WCG Cyt_Gen 2489 2.18 0.17 12.50 -24.51 5.53 4.21
# 1 2 C2591WCG Cyt_Gen 2591 5.13 0.36 14.29 -25.52 3.41 4.30
# 2 3 E2695WC Cyt 2695 1.43 0.14 10.55 -25.71 5.75 4.95
Update 2
import pandas as pd
from io import StringIO
mystr = StringIO("""ID;VegType;Elevation;C;N;C/N;Soil13C;Soil15N;pH;Na_[mg/g];K_[mg/g];Mg_[mg/g];Ca_[mg/g];P_[µg/g]
A2425WC;Cyt;2425;2,78;0,23;12,14;-24,43;4,85;4,88;0,0005;0,0772;0,0646;0,3588;13,8400
C2489WCG;Cyt_Gen;2489;2,18;0,17;12,50;-24,51;5,53;4,21;0,0010;0,0639;0,0286;0,0601;0,6800
C2591WCG;Cyt_Gen;2591;5,13;0,36;14,29;-25,52;3,41;4,30;0,0046;0,0854;0,1169;0,7753;5,7000
E2695WC;Cyt;2695;1,43;0,14;10,55;-25,71;5,75;4,95;ND;0,0766;0,0441;0,0978;8,8500
A2900WC;Cyt;2900;6,95;0,53;13,11;-25,54;3,38;5,35;0,0032;0,1119;0,2356;1,8050;14,7100
A2800WC;Cyt;2800;1,88;0,16;11,62;-23,68;5,88;5,28;0,0025;0,0983;0,0770;0,3777;5,4200
A3050WC;Cyt;3050;1,50;0,12;12,50;-24,62;2,23;5,97;ND;0,0696;0,0729;0,5736;9,4000
""")
# na_values='ND' (as in the original question) is required: without it the
# 'ND' markers keep Na_[mg/g] as an object column, so decimal=',' is never
# applied there and values like '0,0005' survive as strings.
df = pd.read_csv(mystr, decimal=',', delimiter=';', na_values='ND')
# ID VegType Elevation C N C/N Soil13C Soil15N pH \
# 0 A2425WC Cyt 2425 2.78 0.23 12.14 -24.43 4.85 4.88
# 1 C2489WCG Cyt_Gen 2489 2.18 0.17 12.50 -24.51 5.53 4.21
# 2 C2591WCG Cyt_Gen 2591 5.13 0.36 14.29 -25.52 3.41 4.30
# 3 E2695WC Cyt 2695 1.43 0.14 10.55 -25.71 5.75 4.95
# 4 A2900WC Cyt 2900 6.95 0.53 13.11 -25.54 3.38 5.35
# 5 A2800WC Cyt 2800 1.88 0.16 11.62 -23.68 5.88 5.28
# 6 A3050WC Cyt 3050 1.50 0.12 12.50 -24.62 2.23 5.97
# Na_[mg/g] K_[mg/g] Mg_[mg/g] Ca_[mg/g] P_[µg/g]
# 0 0.0005 0.0772 0.0646 0.3588 13.84
# 1 0.0010 0.0639 0.0286 0.0601 0.68
# 2 0.0046 0.0854 0.1169 0.7753 5.70
# 3 NaN 0.0766 0.0441 0.0978 8.85
# 4 0.0032 0.1119 0.2356 1.8050 14.71
# 5 0.0025 0.0983 0.0770 0.3777 5.42
# 6 NaN 0.0696 0.0729 0.5736 9.40

Related

Create a csv from website by python scrapping

I want to have a csv file of the table in this website. https://www.moneycontrol.com/stocks/fno/marketstats/arbitrage/futures-spot-near-2.html
The desired output is a pandas dataframe with columns..
Company,Future,Spot,Basis,Basis%,Previous,Basis,Change,Lot Size
# Question code: scrape the futures/spot table and save it as CSV.
import pandas as pd
url = 'https://www.moneycontrol.com/stocks/fno/marketstats/arbitrage/futures-spot-near-2.html'
# [0] takes the FIRST table read_html finds on the page. Per the asker's
# report ("only get the column names"), that table holds no data rows —
# presumably the data sits in a later table on the page; verify.
df = pd.read_html(url, header=0)[0]
# Renames only apply where the keys match existing labels; mismatched keys
# are silently ignored by rename().
df.rename(columns={'Unnamed: 0': 'Company',
'Futures': 'Future',
'Spot': 'Spot',
'Basis': 'Basis',
'Basis%': 'Basis%',
'Previous': 'Previous',
'Basis.1': 'Change',
'Lot Size': 'Lot Size'}, inplace=True)
df.to_csv('futures_spot_near.csv', index=False)
I tried this but only get the column names in csv
Here is how you can get that data:
import pandas as pd

url = 'https://www.moneycontrol.com/stocks/fno/marketstats/arbitrage/futures-spot-near-2.html'
# The data rows live in the page's second table, not the first.
tables = pd.read_html(url)
df = tables[1]
# Assign clean labels directly instead of renaming one by one.
df.columns = ['Company','Future','Spot','Basis','Basis %','Previous Basis','Change','Lot Size']
df.to_csv('futures_spot_near.csv', index=False)
print(df)
Result in terminal (also saved as csv):
Company Future Spot Basis Basis % Previous Basis Change Lot Size
0 LTIM 5150.00 4691.60 458.40 9.77 379.35 79.05 150
1 INDIACEM 201.40 197.90 3.50 1.77 0.80 2.70 2900
2 ADANIENT 1860.00 1846.95 13.05 0.71 3.00 10.05 250
3 BIOCON 244.00 242.35 1.65 0.68 0.50 1.15 2300
4 SHRIRAMFIN 1287.10 1278.60 8.50 0.66 -14.95 23.45 600
... ... ... ... ... ... ... ... ...
188 MRF 88340.05 89070.90 -730.85 -0.82 -463.40 -267.45 10
189 MGL 893.60 901.35 -7.75 -0.86 -6.35 -1.40 800
190 BALRAMCHIN 359.55 363.55 -4.00 -1.10 -0.65 -3.35 1600
191 ITC 366.45 371.35 -4.90 -1.32 -4.25 -0.65 1600
192 CHAMBLFERT 293.80 298.20 -4.40 -1.48 -3.60 -0.80 1500
193 rows × 8 columns

transpose multiple columns in a pandas dataframe

AD AP AR MD MS iS AS
0 169.88 0.00 50.50 814.0 57.3 32.3 43.230
1 12.54 0.01 84.75 93.0 51.3 36.6 43.850
2 321.38 0.00 65.08 986.0 56.7 28.9 42.070
I would like to change the dataframe above to a transposed version where for each column, the values are put in a single row, so e.g. for columns AD and AP, it will look like this
d1_AD d2_AD d3_AD d1_AP d2_AP d3_AP
169.88 12.54 321.38 0.00 0.01 0.00
I can do a transpose, but how do I get the column names and output structure like above?
NOTE: The output is truncated for legibility but the actual output should include all the other columns like AR MD MS iS AS
We can rename to make the index of the correct form, then stack and sort_index, then Collapse the MultiIndex and to_frame and transpose
# Relabel rows d1..dn, stack into a (row-label, column-label) MultiIndex
# Series, then order by the original column label.
stacked = df.rename(lambda i: f'd{i + 1}').stack().sort_index(level=1)
# Collapse the MultiIndex into flat labels like 'd1_AD'.
stacked.index = stacked.index.map('_'.join)
new_df = stacked.to_frame().T
Input df:
# Same input frame, built row-wise with an explicit column list.
df = pd.DataFrame(
    [
        [169.88, 0.00, 50.50, 814.0, 57.3, 32.3, 43.230],
        [12.54, 0.01, 84.75, 93.0, 51.3, 36.6, 43.850],
        [321.38, 0.00, 65.08, 986.0, 56.7, 28.9, 42.070],
    ],
    columns=['AD', 'AP', 'AR', 'MD', 'MS', 'iS', 'AS'],
)
new_df:
d1_AD d2_AD d3_AD d1_AP d2_AP ... d2_MS d3_MS d1_iS d2_iS d3_iS
0 169.88 12.54 321.38 0.0 0.01 ... 51.3 56.7 32.3 36.6 28.9
[1 rows x 21 columns]
If lexicographic sorting does not work we can wait to convert the MultiIndex to string until after sort_index:
# Sort while the index is still the (row-number, column-label) MultiIndex,
# so level 1 keeps its original (non-lexicographic) order.
tidy = df.stack().sort_index(level=1)
# Flatten to labels like 'd1_AD' only after sorting.
tidy.index = [f'd{num + 1}_{label}' for num, label in tidy.index]
new_df = tidy.to_frame().T
Larger frame:
df = pd.concat([df] * 4, ignore_index=True)
Truncated output:
d1_AD d2_AD d3_AD d4_AD d5_AD ... d8_iS d9_iS d10_iS d11_iS d12_iS
0 169.88 12.54 321.38 169.88 12.54 ... 36.6 28.9 32.3 36.6 28.9
[1 rows x 84 columns]
If needing columns in same order as df, use melt using ignore_index=False to not have to recalculate groups and let melt handle the ordering:
# ignore_index=False keeps the original row numbers in the melted index.
melted = df.melt(value_name=0, ignore_index=False)
# Build labels d<row>_<column> from the preserved index and variable column.
labels = 'd' + (melted.index + 1).astype(str) + '_' + melted['variable']
new_df = melted[[0]].set_axis(labels).T
Truncated output on the larger frame:
d1_AD d2_AD d3_AD d4_AD d5_AD ... d8_AS d9_AS d10_AS d11_AS d12_AS
0 169.88 12.54 321.38 169.88 12.54 ... 43.85 42.07 43.23 43.85 42.07
[1 rows x 84 columns]
You could try melt and set_index with groupby:
# One row per (column, value) pair, transposed so columns repeat: AD, AD, AD...
melted = df.melt()
x = melted.set_index('variable').rename_axis(index=None).T.set_axis([0])
# Number the repeats per label: AD, AD, AD -> AD1, AD2, AD3.
occurrence = x.columns.to_series().groupby(level=0).transform('cumcount').add(1).astype(str)
x.set_axis(x.columns + occurrence, axis=1)
AD1 AD2 AD3 AP1 AP2 AP3 AR1 AR2 AR3 ... MS1 MS2 MS3 iS1 iS2 iS3 AS1 AS2 AS3
0 169.88 12.54 321.38 0.0 0.01 0.0 50.5 84.75 65.08 ... 57.3 51.3 56.7 32.3 36.6 28.9 43.23 43.85 42.07
[1 rows x 21 columns]

Slicing pandas dataframe by ordered values into clusters

I have a pandas dataframe like there is longer gaps in time and I want to slice them into smaller dataframes where time "clusters" are together
Time Value
0 56610.41341 8.55
1 56587.56394 5.27
2 56590.62965 6.81
3 56598.63790 5.47
4 56606.52203 6.71
5 56980.44206 4.75
6 56592.53327 6.53
7 57335.52837 0.74
8 56942.59094 6.96
9 56921.63669 9.16
10 56599.52053 6.14
11 56605.50235 5.20
12 57343.63828 3.12
13 57337.51641 3.17
14 56593.60374 5.69
15 56882.61571 9.50
I tried sorting this and taking time difference of two consecutive points with
df = df.sort_values("Time")
# Time[i] - Time[i+1]: gaps come out negative; the last row gets NaN.
df['t_dif'] = df['Time'].diff(periods=-1)
And it gives
Time Value t_dif
1 56587.56394 5.27 -3.06571
2 56590.62965 6.81 -1.90362
6 56592.53327 6.53 -1.07047
14 56593.60374 5.69 -5.03416
3 56598.63790 5.47 -0.88263
10 56599.52053 6.14 -5.98182
11 56605.50235 5.20 -1.01968
4 56606.52203 6.71 -3.89138
0 56610.41341 8.55 -272.20230
15 56882.61571 9.50 -39.02098
9 56921.63669 9.16 -20.95425
8 56942.59094 6.96 -37.85112
5 56980.44206 4.75 -355.08631
7 57335.52837 0.74 -1.98804
13 57337.51641 3.17 -6.12187
12 57343.63828 3.12 NaN
Lets say I want to slice this dataframe to smaller dataframes where time difference between two consecutive points is smaller than 40 how would I go by doing this?
I could loop over the rows, but explicit row loops are frowned upon in pandas, so is there a smarter, vectorised solution?
Edit: Here is a example:
df1:
Time Value t_dif
1 56587.56394 5.27 -3.06571
2 56590.62965 6.81 -1.90362
6 56592.53327 6.53 -1.07047
14 56593.60374 5.69 -5.03416
3 56598.63790 5.47 -0.88263
10 56599.52053 6.14 -5.98182
11 56605.50235 5.20 -1.01968
4 56606.52203 6.71 -3.89138
df2:
0 56610.41341 8.55 -272.20230
df3:
15 56882.61571 9.50 -39.02098
9 56921.63669 9.16 -20.95425
8 56942.59094 6.96 -37.85112
...
etc.
I think you can just
# Small-gap rows vs. large-gap rows. Note the NaN on the last row fails
# both comparisons, so it lands in neither frame — same as the original.
gaps = df['t_dif']
df1 = df[gaps.lt(30)]
df2 = df[gaps.ge(30)]
def split_dataframe(df, value):
    """Split *df* into time clusters.

    Rows are sorted by the 'Time' column and cut wherever the gap between
    two consecutive times exceeds *value*. Returns a list of DataFrames,
    one per cluster, in ascending time order.
    """
    df = df.sort_values("Time")
    df = df.reset_index()
    # Absolute gap to the next (later) sample; NaN on the final row.
    df['t_dif'] = (df['Time'] - df['Time'].shift(-1)).abs()
    # Positions AFTER which a new cluster begins.
    indxs = df.index[df['t_dif'] > value].tolist()
    indxs.append(-1)        # sentinel before the first row
    indxs.append(len(df))   # sentinel after the last row
    indxs.sort()
    frames = []
    for i in range(1, len(indxs)):
        # Rows from just past the previous break up to and including the
        # current break. (The original sliced indxs[i] + 1 : indxs[i],
        # which is always empty.)
        frames.append(df.iloc[indxs[i - 1] + 1: indxs[i] + 1])
    return frames
Returns the correct dataframes as a list

Remove values when reading a csv and return to a list

I have a csv file of subjects' XY coordinates. Some XY pairs have been removed if the X coordinate is less than 5. This can happen for any player and changes over time (see the example dataset).
At the start of this file P2, P7, P12 and P17 have removed data, although throughout the file each player will have data missing at some point. For about 90% of the file there will be at least 4 players with missing data at any time point.
Frame Time P1_X P2_Y P2_X P2_Y P3_X P3_Y P4_X P4_Y P5_X P5_Y P6_X P6_Y P7_X P7_Y P8_X P8_Y P9_X P9_Y P10_X P10_Y P11_X P11_Y P12_X P12_Y
0 10:39.2 65.75 45.10 73.74 -3.52 61.91 41.80 67.07 -24.62 77.14 -22.98 93.95 3.51 56.52 28.44 70.21 11.06 73.08 -35.54 69.79 45.73 73.34 29.26 64.73 -40.69 70.90 6.11 70.94 -45.11 42.78 3.00 61.77 -1.05 72.07 38.62
1 10:39.3 65.77 45.16 73.69 -3.35 61.70 41.79 67.19 -24.59 77.17 -23.03 93.90 3.53 56.54 28.38 70.20 11.00 73.15 -35.48 69.79 45.86 73.20 29.30 64.96 -40.77 70.91 6.10 71.04 -45.29 42.84 3.02 61.82 -0.99 72.12 38.71
2 10:39.4 65.78 45.24 73.63 -3.17 61.70 41.79 67.32 -24.56 77.20 -23.05 93.83 3.55 56.59 28.31 70.20 10.92 73.20 -35.41 69.79 45.86 73.03 29.36 65.19 -40.84 70.91 6.10 71.15 -45.50 42.91 3.04 61.89 -0.91 72.16 38.80
3 10:39.5 65.78 45.33 73.57 -3.00 61.49 41.78 67.45 -24.50 77.25 -23.07 93.75 3.57 56.59 28.31 70.21 10.83 73.25 -35.33 69.77 46.01 72.86 29.43 65.45 -40.86 70.90 6.09 71.15 -45.50 43.01 3.08 61.98 -0.81 72.19 38.86
4 10:39.6 65.78 45.33 73.51 -2.86 61.32 41.76 67.45 -24.50 77.31 -23.09 93.64 3.60 56.65 28.22 70.23 10.72 73.29 -35.22 69.72 46.17 72.69 29.51 65.75 -40.84 70.88 6.08 71.24 -45.71 43.11 3.12 62.06 -0.70 72.22 38.90
5 10:39.7 65.75 45.44 73.51 -2.86 61.20 41.73 67.59 -24.37 77.38 -23.10 93.52 3.63 56.73 28.09 70.25 10.59 73.29 -35.22 69.68 46.33 72.49 29.60 66.06 -40.84 70.86 6.05 71.31 -45.91 43.22 3.14 62.13 -0.59 72.26 38.92
6 10:39.8 65.72 45.56 73.45 -2.72 61.08 41.71 67.72 -24.19 77.44 -23.12 93.39 3.69 56.80 27.91 70.27 10.45 73.34 -35.08 69.66 46.48 72.27 29.67 66.36 -40.87 70.86 6.01 71.39 -46.09 43.35 3.17 62.20 -0.47 72.29 38.93
7 10:39.9 65.72 45.56 73.34 -2.48 60.97 41.72 67.92 -23.76 77.51 -23.13 93.23 3.75 56.80 27.91 70.30 10.31 73.40 -34.76 69.64 46.63 72.01 29.74 66.62 -40.93 70.85 5.96 71.39 -46.09 43.51 3.18 62.27 -0.35 72.31 38.93
8 10:40.0 65.73 45.90 73.34 -2.48 60.86 41.72 67.92 -23.76 77.51 -23.13 93.05 3.80 56.91 27.47 70.30 10.31 73.40 -34.76 69.63 46.76 72.01 29.74 66.82 -41.06 70.83 5.88 71.53 -46.45 43.68 3.20 62.27 -0.35 72.29 38.92
9 10:40.1 65.73 46.09 73.29 -2.39 60.74 41.70 68.00 -23.52 77.60 -23.12 92.83 3.86 56.99 27.23 70.35 10.17 73.43 -34.58 69.64 46.88 71.72 29.80 66.99 -41.22 70.80 5.79 71.60 -46.63 43.86 3.23 62.34 -0.22 72.22 38.89
10 10:40.2 65.76 46.27 73.22 -2.32 60.60 41.65 68.07 -23.24 77.71 -23.05 92.83 3.86 57.14 26.98 70.43 10.05 73.47 -34.38 69.68 46.96 71.42 29.85 67.16 -41.38 70.77 5.70 71.64 -46.80 44.04 3.28 62.43 -0.08 72.13 38.86
11 10:40.3 65.81 46.43 73.12 -2.28 60.43 41.60 68.12 -22.93 77.83 -22.94 92.58 3.89 57.32 26.72 70.54 9.92 73.50 -34.16 69.75 46.99 71.08 29.89 67.16 -41.38 70.74 5.62 71.67 -46.96 44.21 3.33 62.54 0.09 72.03 38.84
12 10:40.4 65.87 46.58 72.98 -2.29 60.24 41.55 68.15 -22.57 77.94 -22.76 92.30 3.93 57.52 26.45 70.67 9.78 73.50 -33.91 69.85 47.00 70.72 29.91 67.31 -41.57 70.70 5.52 71.73 -47.15 44.37 3.40 62.66 0.24 72.03 38.84
13 10:40.5 65.91 46.69 72.80 -2.32 60.07 41.49 68.17 -22.18 78.01 -22.53 91.99 3.98 57.71 26.18 70.81 9.60 73.49 -33.68 69.97 47.03 70.33 29.92 67.45 -41.78 70.64 5.38 71.81 -47.35 44.37 3.40 62.80 0.40 71.96 38.81
14 10:40.6 65.94 46.80 72.60 -2.34 59.93 41.43 68.19 -21.77 78.05 -22.27 91.69 4.03 57.89 25.90 70.96 9.42 73.47 -33.47 70.10 47.09 69.93 29.93 67.54 -41.96 70.56 5.20 71.86 -47.53 44.54 3.50 62.98 0.58 71.91 38.77
15 10:40.7 65.95 46.93 72.36 -2.36 59.80 41.38 68.18 -21.32 78.08 -21.99 91.38 4.09 58.11 25.63 71.11 9.26 73.41 -33.26 70.24 47.15 69.50 29.91 67.58 -42.15 70.56 5.20 71.86 -47.69 44.54 3.50 63.16 0.77 71.91 38.77
16 10:40.8 65.93 47.09 72.10 -2.34 59.65 41.36 68.16 -20.86 78.11 -21.68 91.09 4.17 58.35 25.38 71.23 9.13 73.31 -33.05 70.38 47.20 69.07 29.84 67.56 -42.32 70.44 5.00 71.81 -47.84 45.00 3.79 63.34 0.97 71.80 38.60
17 10:40.9 65.92 47.23 71.85 -2.28 59.47 41.37 68.11 -20.41 78.11 -21.37 90.81 4.27 58.59 25.12 71.33 9.00 73.22 -32.84 70.52 47.26 68.63 29.75 67.47 -42.51 70.28 4.78 71.75 -47.97 45.26 3.94 63.52 1.14 71.73 38.46
Because there is missing data I tried to read the csv file as such. If I removed the try: except: function I received a Type Error stating I couldn't convert string to float.
# Question code, re-indented: the paste had lost its indentation and was
# not valid Python. Logic is unchanged. Requires `import csv` and a
# pre-initialized `visuals` (e.g. visuals = [[], []]) — defined elsewhere.
with open('NoBench.csv') as csvfile :
    readCSV = csv.reader(csvfile, delimiter=',')
    n=0
    for row in readCSV :
        if n == 0 :
            # Only bumps the counter; the header row still falls through to
            # the float() calls below and is discarded by the except.
            n+=1
        try:
            # Odd columns (3, 5, 7, ...) are the Y coordinates.
            visuals[0].append([float(row[3]),float(row[5]),float(row[7]),float(row[9]),float(row[11]),float(row[13]),float(row[15]),float(row[17]),float(row[19]),float(row[21]),float(row[23]),float(row[25]),float(row[27]),float(row[29]),float(row[31]),float(row[33]),float(row[35]),float(row[37]),float(row[39]),float(row[41]),float(row[43])])
            # Even columns (2, 4, 6, ...) are the X coordinates.
            visuals[1].append([float(row[2]),float(row[4]),float(row[6]),float(row[8]),float(row[10]),float(row[12]),float(row[14]),float(row[16]),float(row[18]),float(row[20]),float(row[22]),float(row[24]),float(row[26]),float(row[28]),float(row[30]),float(row[32]),float(row[34]),float(row[36]),float(row[38]),float(row[40]),float(row[42])])
        except ValueError:
            # One empty field discards the WHOLE row — this is why only
            # complete rows (~10% of the file) ever reach the lists.
            continue
However, when I use this code, it only returns the values to the list when every row of data is present. As mentioned, this only occurs for about 10% of the file. I am using the xy's to create a scatter plot at each point so cannot change to 0,0 as that will create a false data point. How do I alter the code so it returns the xy values when players data isn't removed.
You can define your own convert before the loop:
def convert_float(x):
    """Convert one CSV field to float; an empty field (missing data) -> 0.0."""
    if x:  # true for any NON-empty string (the original comment had this backwards)
        return float(x)
    else:
        return 0.0  # or set the default value you expect to replace the missing data with.
In combination with #juanpa.arrivillaga's excellent suggestion, change the visual.append lines to these:
# Parentheses balanced (the original dropped one ')'). Y coordinates sit in
# the odd columns starting at 3, X coordinates in the even columns from 2.
visual[0].append(list(map(convert_float, row[3::2])))
visual[1].append(list(map(convert_float, row[2::2])))
Also I'm not sure what your n+=1 line is supposed to do... if you merely wanted to skip the first row (headers), simply do this:
def convert_float(x):
    """Convert one CSV field to float; an empty field (missing data) -> 0.0."""
    if x:
        return float(x)
    else:
        return 0.0

# enumerate supplies the row counter directly; the original tested an
# undefined `n` here — `i > 0` skips only the header row.
for i, row in enumerate(readCSV):
    if i > 0:
        visual[0].append(list(map(convert_float, row[3::2])))
        visual[1].append(list(map(convert_float, row[2::2])))

How to compare 3 pairs of pandas dataframe columns which is of float type?

I want to compare 3 different pairs of one data-frame. In my data-frame's column some values are not float type i.e. "," is in between that values, so I want to remove "," from that values and then I want to convert that columns into float type. Last step is to compare three different pairs of data-frame columns.
Dataframe:
aaa float_type1 float_type2 float_type3 float_type4 float_type5 float_type6
0 abc 1.12 1.120 1.20 1.2 1,67 167
1 xyz 1,2.5 2.35 1.25 125 12,5 12.5
2 pqr 3.56 3.58 35.6 3.78 3.90 5.56
3 pqr 5.5 5.8 5.05 5.005 5.500 5,5.78
4 pqr 6.6 6.9 6.06 6.06 6.60 6.600
Program :
def float_type_format(arg):
    """Return *arg* with every ',' (stray thousands separator) removed."""
    return arg.replace(',', '')
data = {
    'aaa': {0: 'abc', 1: 'xyz', 2: 'pqr', 3: 'pqr', 4: 'pqr'},
    'float_type1': {0: '1.12', 1: '1,2.5', 2: '3.56', 3: '5.5', 4: '6.6'},
    'float_type2': {0: '1.120', 1: '2.35', 2: '3.58', 3: '5.8', 4: '6.9'},
    'float_type3': {0: '1.20', 1: '1.25', 2: '35.6', 3: '5.05', 4: '6.06'},
    'float_type4': {0: '1.2', 1: '125', 2: '3.78', 3: '5.005', 4: '6.06'},
    'float_type5': {0: '1,67', 1: '12,5', 2: '3.90', 3: '5.500', 4: '6.60'},
    'float_type6': {0: '167', 1: '12.5', 2: '5.56', 3: '5,5.78', 4: '6.600'},
}
df1 = pd.DataFrame(data)
# Strip the stray commas and cast to float, one pass per measurement column
# (everything except the label column 'aaa').
for col in df1.columns:
    if col != 'aaa':
        df1[col] = df1[col].str.replace(',', '', regex=False).astype('float')
For removing the "," from the column values I'm followed above logic.
Question 1:
Is their any fast performance and good way to remove "," from columns.
Now I want to compare float_type1 with float_type2, float_type3 with float_type4, float_type5 with float_type6 and if all 3 pairs are equal then only result column contain true and the expected output as follows:
aaa float_type1 float_type2 float_type3 float_type4 float_type5 \
0 abc 1.12 1.12 1.2 1.2 167.0
float_type6 result
0 167.0 True
Question 2:
I want a robust way of to perform this comparison.
You can greatly simplify your code by using replace with regex=True. You can then convert to numeric with pd.to_numeric and then you can get the result column with some boolean logic.
# Strip the stray thousands separators everywhere, then convert whatever
# parses as numeric.
df2 = df1.replace(',','',regex=True)
# pd.to_numeric's errors='ignore' is deprecated (pandas >= 2.2); convert
# column-by-column and leave non-numeric columns (like 'aaa') untouched —
# same behavior, no deprecated API.
for col in df2.columns:
    try:
        df2[col] = pd.to_numeric(df2[col])
    except (ValueError, TypeError):
        pass
# All three pairs must match for result to be True.
df2['result'] = ((df2['float_type1'] == df2['float_type2']) &
                 (df2['float_type3'] == df2['float_type4']) &
                 (df2['float_type5'] == df2['float_type6']))
aaa float_type1 float_type2 float_type3 float_type4 float_type5 \
0 abc 1.12 1.12 1.20 1.200 167.0
1 xyz 12.50 2.35 1.25 125.000 125.0
2 pqr 3.56 3.58 35.60 3.780 3.9
3 pqr 5.50 5.80 5.05 5.005 5.5
4 pqr 6.60 6.90 6.06 6.060 6.6
float_type6 result
0 167.00 True
1 12.50 False
2 5.56 False
3 55.78 False
4 6.60 False
import numpy as np
import pandas as pd


def float_type_format(arg):
    """Return *arg* with every ',' (stray thousands separator) removed."""
    return arg.replace(',', '')


data = {'aaa' :{0:'abc',1:'xyz',2:'pqr',3:'pqr',4:'pqr'},
'float_type1' :{0:'1.12',1:'1,2.5',2:'3.56',3:'5.5',4:'6.6'},
'float_type2' :{0:'1.120',1:'2.35',2:'3.58',3:'5.8',4:'6.9'},
'float_type3' :{0:'1.20',1:'1.25',2:'35.6',3:'5.05',4:'6.06'},
'float_type4' :{0:'1.2',1:'125',2:'3.78',3:'5.005',4:'6.06'},
'float_type5' :{0:'1,67',1:'12,5',2:'3.90',3:'5.500',4:'6.60'},
'float_type6' :{0:'167',1:'12.5',2:'5.56',3:'5,5.78',4:'6.600'}}
df1 = pd.DataFrame(data)
print(df1.head(6))

# Clean and convert every float_type column in one loop instead of six
# copy-pasted apply/astype pairs — the repetition is exactly what invites
# copy-paste mistakes.
float_cols = [c for c in df1.columns if c.startswith('float_type')]
for col in float_cols:
    df1[col] = df1[col].apply(float_type_format).astype('float')

# Pairwise equality: (1,2), (3,4) and (5,6) must all match.
df1['test'] = ((df1['float_type1'] == df1['float_type2'])
               & (df1['float_type3'] == df1['float_type4'])
               & (df1['float_type5'] == df1['float_type6']))
print(df1.head(6))
It took me a while to realize that the original code repeated df1['float_type1'] in every replace line, which made all the converted columns identical.
This is the way I came up with — probably not the best way.
aaa float_type1 float_type2 float_type3 float_type4 float_type5 \
0 abc 1.12 1.12 1.20 1.200 167.0
1 xyz 12.50 2.35 1.25 125.000 125.0
2 pqr 3.56 3.58 35.60 3.780 3.9
3 pqr 5.50 5.80 5.05 5.005 5.5
4 pqr 6.60 6.90 6.06 6.060 6.6
float_type6 test
0 167.00 True
1 12.50 False
2 5.56 False
3 55.78 False
4 6.60 False

Categories

Resources