I am hoping to write a program that will run through multiple columns of data and create a new dataframe based on the values that are found to be outliers and those that are blank. Currently, I have the code below, which replaces the values with "Outlier" and "No Data", but I am struggling to convert this into a new dataframe.
import pandas as pd
from pandas import ExcelWriter
# Remove Initial Data Quality
outl = ['.',0,' ']
# Pull in Data
path = r"C:\Users\robert.carmody\desktop\Python\PyTest\PyTGPS.xlsx"
sheet = 'Raw Data'
df = pd.read_excel(path,sheet_name=sheet)
data = pd.read_excel(path,sheet_name=sheet)
j = 0
while j < len(df.keys()):  # run through total number of columns
    list(df.iloc[:,j])  # create a list of all values within the 'j' column
    if type(list(df.iloc[:,j])[0]) == float:
        q1 = df.iloc[:,j].quantile(q=.25)
        med = df.iloc[:,j].quantile(q=.50)
        q3 = df.iloc[:,j].quantile(q=.75)
        iqr = q3 - q1
        ub = q3 + 1.5*iqr
        lb = q1 - 1.5*iqr
        mylist = []  # outlier list is defined
        for i in df.iloc[:,j]:  # identify outliers and add to the list
            if i > ub or i < lb:
                mylist.append(float(i))
        if mylist == []:
            mylist = ['Outlier']
    else:
        mylist = ['Outlier']
    data.iloc[:,j].replace(mylist,'Outlier',inplace=True)
    j = j + 1
data = data.fillna('No Data')
#Excel
path2 = r"C:\Users\robert.carmody\desktop\Python\PyTest\PyTGPS.xlsx"
writer = ExcelWriter(path2)
df.to_excel(writer,'Raw Data')
data.to_excel(writer,'Adjusted Data')
writer.save()
Suppose your data looks like this, and for simplicity the upper bound is 2 and the lower bound is 0:
df = pd.DataFrame({'group':'A B C D E F'.split(' '), 'Q1':[1,1,5,2,2,2], 'Q2':[1,5,5,2,2,2],'Q3':[2,2,None,2,2,2]})
df.set_index('group', inplace=True)
i.e.:
Q1 Q2 Q3
group
A 1 1 2.0
B 1 5 2.0
C 5 5 NaN
D 2 2 2.0
E 2 2 2.0
F 2 2 2.0
Then the following might give what you want:
newData = []
for quest in df.columns:  # run through the columns
    q1 = df[quest].quantile(q=.25)
    med = df[quest].quantile(q=.50)
    q3 = df[quest].quantile(q=.75)
    iqr = q3 - q1
    #ub = q3 + 1.5*iqr
    ub = 2  # fixed upper bound for this example
    #lb = q1 - 1.5*iqr
    lb = 0  # fixed lower bound for this example
    for group in df.index:
        i = df.loc[group, quest]
        if i > ub or i < lb:  # identify outliers and add to the list
            newData += [[group, quest, 'Outlier', i]]
        elif (i > 0 or i <= 0) == False:  # neither comparison is True, so i is NaN
            newData += [[group, quest, 'None', None]]
This creates a two-dimensional list, which can easily be turned into a DataFrame with
pd.DataFrame(newData)
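If you also want descriptive column names, you can pass them in; the names below are only an assumption about what each field represents:

result = pd.DataFrame(newData, columns=['group', 'question', 'status', 'value'])
print(result)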
I have an Excel file with two columns titled Lat and Lon, with some data such as:
Lat Lon
36.19553° N 95.90918° W
36.19550° N 95.93592° W
36.20277° N 95.94484° W
36.20277° N 95.95381° W
36.22436° N 95.98023° W
36.21005° N 95.94487° W
36.21006° N 95.93594° W
35.99968° N 96.09681° W
35.97043° N 95.98949° W
35.96317° N 95.98951° W
35.99968° N 96.11459° W
35.99967° N 96.10568° W
35.96318° N 95.99839° W
35.96315° N 96.00728° W
35.99239° N 96.13247° W
I am trying to read in the Excel file and group together rows whose Lat or Lon values are within a distance of 0.00004 (in the last decimal place) of each other. It should start with the first row, check each row looking for another one that is within 0.00004, and insert a number (starting at 1) in a column called 'Drive' for each grouping.
The expected output is supposed to be:
Lat Lon Drive
0 36.19553 95.90918 1
1 36.19550 95.93592 1
2 36.20277 95.94484 2
3 36.20277 95.95381 2
4 36.22436 95.98023 3
5 36.21005 95.94487 2
6 36.21006 95.93594 1
7 35.99968 96.09681 4
8 35.97043 95.98949 5
9 35.96317 95.98951 5
10 35.99968 96.11459 4
11 35.99967 96.10568 4
12 35.96318 95.99839 5
13 35.96315 96.00728 5
14 35.99239 96.13247 6
I have made several attempts with no success. Here is one of them:
# Read the data into a pandas DataFrame
df = pd.read_excel('data.xlsx')
# Convert Lat and Lon to absolute values for easy comparison
df['Lat'] = df['Lat'].abs()
df['Lon'] = df['Lon'].abs()
# Initialize the counter and group column
counter = 1
df['Drive'] = 0
# Loop over the DataFrame rows
for i in range(len(df)):
    if df['Drive'][i] == 0:
        df.loc[(df['Lat'].between(df['Lat'][i] - 4, df['Lat'][i] + 4)) &
               (df['Lon'].between(df['Lon'][i] - 4, df['Lon'][i] + 4)), 'Drive'] = counter
        counter += 1
# Print the result
print(df)
I get the error bad operand type for abs(): 'str', because the Lat and Lon values are read in as strings like '36.19553° N'. Here is another attempt, which extracts the numeric part first:
# Read the data into a pandas DataFrame
df = pd.read_excel(workbook_path)
# Extract the degrees from the string value
df['Lat'] = df['Lat'].str.extract(r'(\d+\.\d+)')
df['Lon'] = df['Lon'].str.extract(r'(\d+\.\d+)')
df['Lat'] = df['Lat'].astype(float)
df['Lon'] = df['Lon'].astype(float)
df['Drive'] = 0
drive = 1
for i in range(len(df)):
    if df.loc[i, 'Drive'] == 0:
        df.loc[i, 'Drive'] = drive
        for j in range(i + 1, len(df)):
            if (abs(df.loc[i, 'Lat'] - df.loc[j, 'Lat']) <= 0.00004) or (abs(df.loc[i, 'Lon'] - df.loc[j, 'Lon']) <= 0.00004):
                df.loc[j, 'Drive'] = drive
        drive += 1
print(df)
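For what it's worth, here is a minimal sketch of one reading of the grouping rule, assuming Lat and Lon have already been converted to floats as above and the DataFrame has a default 0..N-1 index: each row joins the group of the first earlier row whose Lat or Lon is within 0.00004, otherwise it starts a new group. This reproduces the expected output shown above, but it is only a sketch of that interpretation, not a definitive answer:

TOL = 0.00004  # tolerance in the last decimal place

def assign_drives(df):
    drives = []
    for i in range(len(df)):
        drive = None
        for j in range(i):  # compare against every earlier row, in order
            if (abs(df.loc[i, 'Lat'] - df.loc[j, 'Lat']) <= TOL or
                    abs(df.loc[i, 'Lon'] - df.loc[j, 'Lon']) <= TOL):
                drive = drives[j]  # take the group of the first match
                break
        if drive is None:
            drive = max(drives) + 1 if drives else 1  # no match: start a new group
        drives.append(drive)
    df['Drive'] = drives
    return df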
My toy example is as follows:
import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
### prepare data
Xy = np.c_[load_iris(return_X_y=True)]
mycol = ['x1','x2','x3','x4','group']
df = pd.DataFrame(data=Xy, columns=mycol)
dat = df.iloc[:100,:] #only consider two species
dat['group'] = dat.group.apply(lambda x: 1 if x ==0 else 2) #two species means two groups
dat.shape
dat.head()
### Linear discriminant analysis procedure
G1 = dat.iloc[:50,:-1]; x1_bar = G1.mean(); S1 = G1.cov(); n1 = G1.shape[0]
G2 = dat.iloc[50:,:-1]; x2_bar = G2.mean(); S2 = G2.cov(); n2 = G2.shape[0]
Sp = (n1-1)/(n1+n2-2)*S1 + (n2-1)/(n1+n2-2)*S2
a = np.linalg.inv(Sp).dot(x1_bar-x2_bar); u_bar = (x1_bar + x2_bar)/2
m = a.T.dot(u_bar); print("Linear discriminant boundary is {} ".format(m))
def my_lda(x):
    y = a.T.dot(x)
    pred = 1 if y >= m else 2
    return y.round(4), pred
xx = dat.iloc[:,:-1]
xxa = xx.agg(my_lda, axis=1)
xxa.shape
type(xxa)
Now xxa is a pandas.core.series.Series with shape (100,). Note that each element of xxa holds two values in parentheses (a tuple). I want to convert xxa to a pd.DataFrame with 100 rows x 2 columns, so I try
xxa_df1 = pd.DataFrame(data=xxa, columns=['y','pred'])
which gives ValueError: Shape of passed values is (100, 1), indices imply (100, 2).
Then I continue to try
xxa2 = xxa.to_frame()
# xxa2 = pd.DataFrame(xxa) #equals `xxa.to_frame()`
xxa_df2 = pd.DataFrame(data=xxa2, columns=['y','pred'])
and xxa_df2 comes out as all NaN, with 100 rows x 2 columns. What should I do next?
Let's try Series.tolist()
xxa_df1 = pd.DataFrame(data=xxa.tolist(), columns=['y','pred'])
print(xxa_df1)
y pred
0 42.0080 1
1 32.3859 1
2 37.5566 1
3 31.0958 1
4 43.5050 1
.. ... ...
95 -56.9613 2
96 -61.8481 2
97 -62.4983 2
98 -38.6006 2
99 -61.4737 2
[100 rows x 2 columns]
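As a side note, on recent pandas versions DataFrame.apply with result_type='expand' can build the two columns directly and keeps the original row index; the column names here are just the ones used above:

xxa_df3 = xx.apply(my_lda, axis=1, result_type='expand')
xxa_df3.columns = ['y', 'pred']
xxa_df3.shape  # (100, 2)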
I need help. I have to merge these DataFrames (examples below) by adding a new column and putting percentages there. If the value is below 5000 it's NaN, if 5000 < value <= 7000 it's 5%, if 7000 < value <= 10000 it's 7%, and so on.
import pandas as pd
levels = pd.DataFrame({'lev':[5000,7000,10000],'proc':['5%','7%','10%']})
data = pd.DataFrame({'name':['A','B','C','D','E','F','G'],'sum':[6500,3000,15000,1400,8600,5409,9999]})
Here are my efforts to solve this... It doesn't work and I don't understand how to fix it.
temp = data[data['sum'] >= levels['lev'][2]]
temp['proc']=levels['proc'][2]
lev3 = temp
temp = data[levels['lev'][1]<=data['sum'] and data['sum']<=levels['lev'][2]]
temp['proc']=levels['proc'][1]
lev2 = temp
temp = data[levels['lev'][0]<=data['sum'] and data['sum']<=levels['lev'][1]]
temp['proc']=levels['proc'][0]
lev1 = temp
data = pd.concat([lev1,lev2,lev3,data])
You can apply a function to each row like this:
import pandas as pd
def levels(s):
    if 5000 < s <= 7000:
        return '5%'
    elif 7000 < s <= 10000:
        return '7%'
    elif s > 10000:
        return '10%'
df = pd.DataFrame({'name':['A','B','C','D','E','F','G'],'sum':[6500,3000,15000,1400,8600,5409,9999]})
df['Percent'] = df.apply(lambda x: levels(x['sum']), axis=1)
print(df)
name sum Percent
0 A 6500 5%
1 B 3000 None
2 C 15000 10%
3 D 1400 None
4 E 8600 7%
5 F 5409 5%
6 G 9999 7%
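Just as another option (a sketch, not the only way), pd.cut builds the same Percent column in a single vectorised call; values of 5000 or below fall outside the bins and become NaN, which mirrors the None rows above:

import numpy as np

df['Percent'] = pd.cut(df['sum'], bins=[5000, 7000, 10000, np.inf],
                       labels=['5%', '7%', '10%'])
print(df)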
I currently have a dataframe in the following format:
step tag_id x_pos y_pos
1 1 5 3
1 2 3 4
2 1 2 2
2 3 1 6
.........................
.........................
N 1 5 7
For each row in the df, I am aiming to add an additional m rows oversampling from a Gaussian distribution for the x and y values (independent). Thus, a df of N = 100 and m = 10 would result in a df length 1010, including the original and oversampled values.
The code I have for this works, but it is extremely slow over a large dataset (N > 100k). There are many operations (creating new arrays/ dfs, use of itertuples, etc.) that I'm sure are hampering performance; I would appreciate any help as to how I can improve the performance so I can generate higher m values over the whole dataset. For instance: input data is from a pandas dataframe, but the multi-variate normal function operates on numpy arrays. Is there a more natural way to implement this through pandas without the copying between numpy arrays and dataframes? Thanks!
Reproducible example:
import pandas as pd
import numpy as np
import random
def gaussianOversample2(row, n):
    sigma = 2
    mean_x = float(getattr(row,'x_pos'))
    mean_y = float(getattr(row,'y_pos'))
    step = getattr(row, 'step')
    tag_id = getattr(row, 'tag_id')
    sigma = np.array([1,1])
    cov = np.diag(sigma ** 2)
    x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
    x = np.concatenate(([mean_x], x))
    y = np.concatenate(([mean_y], y))
    steps = np.empty(n+1)
    tags = np.empty(n+1)
    steps.fill(step)
    tags.fill(tag_id)
    return x, y, steps, tags
def oversampleDf(df, n):
    oversampled_arr = np.empty((0,4), float)
    # with input df with step, tag_id, x_pos, y_pos
    data = pd.DataFrame(columns = df.columns)
    count = 0
    for row in df.itertuples(index=False):
        count = count + 1
        temp = np.zeros((len(row), n+1))
        oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
        temp[0] = steps
        temp[1] = tags
        temp[2] = oversample_x
        temp[3] = oversample_y
        temp = pd.DataFrame(temp.T, columns = df.columns)
        data = data.append(temp)
        if count % 1000 == 0:
            print("Row: ", count)
    return data
df = pd.DataFrame([[1, 1, 5, 3], [1, 2, 3, 4], [2, 1, 2, 2], [2, 3, 1, 6]], columns = ['step', 'tag_id', 'x_pos', 'y_pos'])
res = oversampleDf(df, 20)
"""
# Result should be:
step tag_id x_pos y_pos
0 1.0 1.0 5.000000 3.000000
1 1.0 1.0 3.423492 3.886602
2 1.0 1.0 5.404581 2.177559
3 1.0 1.0 4.023274 2.883737
4 1.0 1.0 3.390710 3.038782
.. ... ... ... ...
16 2.0 3.0 1.894151 5.510321
17 2.0 3.0 1.110932 5.281578
18 2.0 3.0 1.623538 4.529825
19 2.0 3.0 -0.576756 7.476872
20 2.0 3.0 -0.866123 5.898048
"""
This is the solution I have found for myself; it is more of a workaround than a genuinely faster technique. I instead write out to a csv file, which I then read back in once complete, like so:
def gaussianOversample3(row, n):
    mean_x = float(getattr(row,'x_pos'))
    mean_y = float(getattr(row,'y_pos'))
    step = getattr(row, 'step')
    tag_id = getattr(row, 'tag_id')
    sigma = np.array([1,1])
    cov = np.diag(sigma ** 2)
    x,y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
    x = np.concatenate(([mean_x], x))
    y = np.concatenate(([mean_y], y))
    steps = np.empty(n+1)
    tags = np.empty(n+1)
    steps.fill(step)
    tags.fill(tag_id)
    pd.DataFrame(data = np.column_stack((steps,tags,x,y))).to_csv("oversample.csv", mode = 'a', header = False)
def oversampleDf2(df, n):
    filename = "oversample.csv"
    d = pd.DataFrame(list())
    d.to_csv(filename)
    #count = 0
    for row in df.itertuples(index=False):
        #count = count + 1
        gaussianOversample3(row, n)
        #if count % 10000 == 0:
        #    print("Row: ", count)
Because of how it is reading the file, I have to do the following:
oversampleDf2(defensive_df2, num_oversamples)
oversampled_df = pd.read_csv("oversample_10.csv", sep= ' ')
oversampled_df.columns = ['col']
oversampled_df = oversampled_df.col.str.split(",",expand=True)
oversampled_df.columns = ['temp', 'step', 'tag_id', 'x_pos', 'y_pos']
oversampled_df = oversampled_df.drop(['temp'], axis = 1)
oversampled_df = oversampled_df.astype(float)
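For completeness, here is a rough sketch of a fully vectorised alternative (not the approach above; it assumes the column order step, tag_id, x_pos, y_pos and independent x/y noise, which holds here because the covariance matrix is diagonal). All the noise is drawn with one np.random.normal call, avoiding itertuples and the repeated DataFrame.append entirely:

def oversample_vectorised(df, n, sigma=1.0):
    base = df[['step', 'tag_id', 'x_pos', 'y_pos']].to_numpy(dtype=float)
    repeated = np.repeat(base, n, axis=0)                      # n copies of every original row
    noise = np.random.normal(0.0, sigma, size=(len(repeated), 2))
    repeated[:, 2:] += noise                                    # jitter x_pos and y_pos only
    out = np.vstack([base, repeated])                           # keep the originals as well
    return pd.DataFrame(out, columns=['step', 'tag_id', 'x_pos', 'y_pos'])

The oversampled rows are appended after the originals rather than interleaved; a stable sort on ['step', 'tag_id'] afterwards would restore that grouping if it matters.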
I have this dataframe;
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'Client': np.random.choice(['Customer_A', 'Customer_B'], 1000),
    'Product': np.random.choice(['Guns', 'Ammo', 'Armour'], 1000),
    'Value': (np.random.randn(1000))
})
Categoricals = ['Client', 'Product']
df[Categoricals] = df[Categoricals].astype('category')
df = df.drop_duplicates()
df
And I want this result;
# Non-anonymous function for Anomaly limit
def Anomaly(x):
    Q3 = np.nanpercentile(x, q = 75)
    Q1 = np.nanpercentile(x, q = 25)
    IQR = (Q3 - Q1)
    return (Q3 + (IQR * 2.0))

# Non-anonymous function for CriticalAnomaly limit
def CriticalAnomaly(x):
    Q3 = np.nanpercentile(x, q = 75)
    Q1 = np.nanpercentile(x, q = 25)
    IQR = (Q3 - Q1)
    return (Q3 + (IQR * 3.0))
# Define metrics
Metrics = {'Value':['count', Anomaly, CriticalAnomaly]}
# Groupby has more than 1 grouping column, so agg can only accept non-anonymous functions
Limits = df.groupby(['Client', 'Product']).agg(Metrics)
Limits
But it's slow on large datasets because the functions Anomaly and CriticalAnomaly each recalculate Q1, Q3 and the IQR, so that work is done twice instead of once. Combining both functions into one makes it much faster, but then the results end up in a single column of tuples instead of two columns.
# Combined anomaly functions
def CombinedAnom(x):
    Q3 = np.nanpercentile(x, q = 75)
    Q1 = np.nanpercentile(x, q = 25)
    IQR = (Q3 - Q1)
    Anomaly = (Q3 + (IQR * 2.0))
    CriticalAnomaly = (Q3 + (IQR * 3.0))
    return (Anomaly, CriticalAnomaly)
# Define metrics
Metrics = {'Value':['count', CombinedAnom]}
# Groupby has more than 1 grouping column, so agg can only accept non-anonymous functions
Limits = df.groupby(['Client', 'Product']).agg(Metrics)
Limits
How can I make a combined function so the results go into 2 columns?
If you use apply instead of agg, you can return a Series that gets unpacked into columns:
def f(g):
    return pd.Series({
        'c1': np.sum(g.b),
        'c2': np.prod(g.b)
    })
df = pd.DataFrame({'a': list('aabbcc'), 'b': [1,2,3,4,5,6]})
df.groupby('a').apply(f)
This goes from:
a b
0 a 1
1 a 2
2 b 3
3 b 4
4 c 5
5 c 6
to
c1 c2
a
a 3 2
b 7 12
c 11 30
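Applied to the Client/Product example from the question, a sketch along the same lines (computing Q1, Q3 and the IQR once per group while still getting separate columns) might look like this:

def combined_limits(g):
    Q3 = np.nanpercentile(g['Value'], q = 75)
    Q1 = np.nanpercentile(g['Value'], q = 25)
    IQR = Q3 - Q1
    return pd.Series({'count': g['Value'].count(),
                      'Anomaly': Q3 + (IQR * 2.0),
                      'CriticalAnomaly': Q3 + (IQR * 3.0)})

Limits = df.groupby(['Client', 'Product']).apply(combined_limits)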