Python Loop, Need to / Can't Retain Value from Original Dataframe

I'm attempting to loop through groups of phrases and score every pair of members within each group. Even when two phrases are the same, they may carry different codes, which is why I trim the code off the loop inputs - but I still need to retain it in the final df2. The comparison has to be made without the code; the problem is tying each result back to the original df that contains the code, so I can identify which rows need to be flagged.
The code below works, but I need the original DESCR values in df2 - appending a and b only captures the trimmed strings.
I've tried df.at[], but got mixed, incorrect results. Thank you.
import pandas as pd
from fuzzywuzzy import fuzz as fz
import itertools

data = [[1,'Oneab'],[1,'Onebc'],[1,'Twode'],[2,'Threegh'],[2,'Threehi'],[2,'Fourjk'],[3,'Fivekl'],[3,'Fivelm'],[3,'Fiveyz']]
df = pd.DataFrame(data, columns=['Ids','DESCR'])

n_list = []
a_list = []
b_list = []
pr_list = []
tsr_list = []

groups = df.groupby('Ids')
for n, g in groups:
    for a, b in itertools.product(g['DESCR'].str[:-2], g['DESCR'].str[:-2]):
        if str(a) < str(b):
            try:
                n_list.append(n)
                a_list.append(a)
                b_list.append(b)
                pr_list.append(fz.partial_ratio(a, b))
                tsr_list.append(fz.token_set_ratio(a, b))
            except:
                pass

df2 = pd.DataFrame({'Group': n_list, 'First Comparator': a_list, 'Second Comparator': b_list, 'Partial Ratio': pr_list, 'Token Set Ratio': tsr_list})
Instead of:
ab bc 50 50
ab de 0 0
bc de 0 0
gh hi 50 50
gh jk 0 0
hi jk 50 50
...
I'd like to see:
Oneab Onebc 50 50
Oneab Twode 0 0
Onebc Twode 0 0
Threegh Threehi 50 50
Threegh Fourjk 0 0
Threehi Fourjk 50 50
...

In case anyone else runs into a similar issue - I figured it out. Instead of trimming the inputs at the top of the inner loop, I now bring the full value into the inner loop and strip it there:
a2 = a[:-2]
b2 = b[:-2]
So:
import pandas as pd
from fuzzywuzzy import fuzz as fz
import itertools

data = [[1,'Oneab'],[1,'Onebc'],[1,'Twode'],[2,'Threegh'],[2,'Threehi'],[2,'Fourjk'],[3,'Fivekl'],[3,'Fivelm'],[3,'Fiveyz']]
df = pd.DataFrame(data, columns=['Ids','DESCR'])

n_list = []
a_list = []
b_list = []
pr_list = []
tsr_list = []

groups = df.groupby('Ids')
for n, g in groups:
    for a, b in itertools.product(g['DESCR'], g['DESCR']):
        if str(a) < str(b):
            try:
                a2 = a[:-2]
                b2 = b[:-2]
                n_list.append(n)
                a_list.append(a)
                b_list.append(b)
                pr_list.append(fz.partial_ratio(a2, b2))
                tsr_list.append(fz.token_set_ratio(a2, b2))
            except:
                pass

df2 = pd.DataFrame({'Group': n_list, 'First Comparator': a_list, 'Second Comparator': b_list, 'Partial Ratio': pr_list, 'Token Set Ratio': tsr_list})
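As a side note, here is a rough sketch of an equivalent approach under the same setup (the same df and fuzzywuzzy import as above): itertools.combinations yields each unordered pair of full DESCR values exactly once, so the str(a) < str(b) guard becomes unnecessary while the untrimmed values stay available for df2.
import itertools
import pandas as pd
from fuzzywuzzy import fuzz as fz

rows = []
for n, g in df.groupby('Ids'):
    # each unordered pair of full DESCR values appears exactly once
    for a, b in itertools.combinations(g['DESCR'], 2):
        a2, b2 = a[:-2], b[:-2]  # trim the code only for scoring
        rows.append({'Group': n,
                     'First Comparator': a,
                     'Second Comparator': b,
                     'Partial Ratio': fz.partial_ratio(a2, b2),
                     'Token Set Ratio': fz.token_set_ratio(a2, b2)})
df2 = pd.DataFrame(rows)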

Join rows in pandas, shift by one and create new data frame

I have a data frame as follows
I/P
date,low,high,close
d1,l1,h1,c1
d2,l2,h2,c2
d3,l3,h3,c3
d4,l4,h4,c4
d5,l5,h5,c5
d6,l6,h6,c6
d7,l7,h7,c7
O/P
d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
....
Basically, join all rows, split them into subarrays of size 3 starting at each index, and create the output data frame.
The following code works, but it's too verbose and slow. Does pandas have something built in for this?
def flatten(df):
    candles = []
    i = 0
    while i < len(df):
        candles.append(df.iloc[i])
        i = i + 1
    return candles

def slide_and_expand(candles, k):
    return [candles[i:i+k] for i in range(len(candles) - k + 1)]

def candle_to_dict(col_name_prefix, candle_series):
    candle_dict = {}
    for index, val in candle_series.iteritems():
        col_name = col_name_prefix + index
        candle_dict[col_name] = val
    return candle_dict

def candle_group_to_feature_vector(candle_group):
    feature_vector_dict = {}
    i = 0
    for candle in candle_group:
        col_name_prefix = f"c{i}_"
        candle_dict = candle_to_dict(col_name_prefix, candle)
        feature_vector_dict.update(candle_dict)
        i = i + 1
    return feature_vector_dict

def candle_groups_to_feature_vectors(candle_groups):
    feature_vectors = []
    for candle_group in candle_groups:
        feature_vector = candle_group_to_feature_vector(candle_group)
        feature_vectors.append(feature_vector)
    return feature_vectors

fv_len = 3
candles = flatten(data)
candle_groups = slide_and_expand(candles, fv_len)
feature_vectors = candle_groups_to_feature_vectors(candle_groups)
data_fv = pd.DataFrame.from_dict(feature_vectors, orient='columns')
data_fv
You could do something like this:
import pandas as pd

n = len(df.index)  # number of rows in the original dataframe 'df'
df_0 = df.loc[0:n-3].reset_index(drop=True)
df_1 = df.loc[1:n-2].reset_index(drop=True)
df_2 = df.loc[2:n-1].reset_index(drop=True)
# reset_index lets concat stack the three shifted slices side by side positionally
df_final = pd.concat([df_0, df_1, df_2], axis=1)
You can save a few steps using pandas' rolling function, with the window size set to the desired subarray length (window=SUBARR_SZ). Within each window, join each row's values with a comma, then join those row strings together again, so each window collapses into a single combined string.
import pandas as pd

df = pd.read_csv('sample.csv')

SUBARR_SZ = 3  # subarray size

df_list = []
for w in df.rolling(window=SUBARR_SZ):
    if len(w) == SUBARR_SZ:
        s = w.apply(','.join, axis=1).apply(pd.Series).apply(','.join)
        df_list.append(s)
dff = pd.concat(df_list).reset_index(drop=True)
print(dff)
Output from dff
0 d1,l1,h1,c1,d2,l2,h2,c2,d3,l3,h3,c3
1 d2,l2,h2,c2,d3,l3,h3,c3,d4,l4,h4,c4
2 d3,l3,h3,c3,d4,l4,h4,c4,d5,l5,h5,c5
3 d4,l4,h4,c4,d5,l5,h5,c5,d6,l6,h6,c6
4 d5,l5,h5,c5,d6,l6,h6,c6,d7,l7,h7,c7
dtype: object
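A shorter variant, offered only as a sketch under the same assumptions (every column holds strings and the window size is 3), builds the windows with shift instead of rolling:
import pandas as pd

df = pd.read_csv('sample.csv')
SUBARR_SZ = 3

# place the frame next to itself shifted by 1 and 2 rows, then drop incomplete windows
wide = pd.concat([df.shift(-i) for i in range(SUBARR_SZ)], axis=1).dropna()
dff = wide.apply(','.join, axis=1).reset_index(drop=True)
print(dff)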

Why does my original dataframe change as well?

The dataset I am using is available on Kaggle at this link.
I am doing this to it:
import pandas as pd
df = pd.read_csv('./survey_results_public.csv')
df = df.dropna(subset=['Salary'], axis = 0).drop(['Respondent','ExpectedSalary','Salary'], axis = 1)
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
# Method 1
df1 = df
df1 = df1.select_dtypes(include=['float']).fillna(df1.mean())
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
print(df1['HoursPerWeek'].mean())
print(sum(df1['HoursPerWeek'].isnull()))
# Method 2
df2 = df
num_vars = df2.select_dtypes(include = ['float']).columns
for col in num_vars:
    df2[col].fillna(df2[col].mean(), inplace=True)
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
print(df2['HoursPerWeek'].mean())
print(sum(df2['HoursPerWeek'].isnull()))
My question is: why does "Method 2" change df as well, as observed in the last four print statements, where the mean and the number of empty values are the same for df and df2?
When I do something similar with normal variables in Python, this does not happen:
a=2
b=a
c=a
print(a,b,c)
b += 2
print(a,b,c)
c += 3
print(a,b,c)
In this example, a is unchanged.
What you want to do is copy the dataframes - df2 = df only binds a second name to the same underlying object, so in-place changes show up under both names:
...
# Method 1
df1 = df.copy()
df1 = df1.select_dtypes(include=['float']).fillna(df1.mean())
....
# Method 2
df2 = df.copy()
num_vars = df2.select_dtypes(include = ['float']).columns
...
Hope this helps :D
A good example is lists:
a = [1,2,3]
b = a
a.append(4)
print("b is",b)
# output: b is [1, 2, 3, 4]
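The same aliasing can be demonstrated with DataFrames directly; a minimal sketch (the variable names are purely illustrative):
import pandas as pd

df = pd.DataFrame({'a': [1.0, None, 3.0]})

alias = df          # a second name for the same object
indep = df.copy()   # an independent copy

print(alias is df, indep is df)   # True False

alias.fillna(0, inplace=True)     # mutates the shared object
print(df)      # df reflects the change made through alias
print(indep)   # the copy still contains the NaN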

Pandas check which substring is in column of strings

I'm trying to create a function that builds a new column in a pandas dataframe: it figures out which substring appears in a column of strings and uses that substring as the value of the new column.
The problem is that the text to find does not appear at the same position within variable x.
df = pd.DataFrame({'x': ["var_m500_0_somevartext","var_m500_0_vartextagain",
"varwithsomeothertext_0_500", "varwithsomext_m150_0_text"], 'x1': [4, 5, 6,8]})
finds = ["m500_0","0_500","m150_0"]
I need to know which element of finds occurs in a given df["x"] row.
I've made a function that works, but it is terribly slow for large datasets:
def pd_create_substring_var(df, new_var_name="new_var", substring_list=["1"], var_ori="x"):
    import re
    df[new_var_name] = "na"
    cols = list(df.columns)
    for ix in range(len(df)):
        for find in substring_list:
            for m in re.finditer(find, df.iloc[ix][var_ori]):
                df.iat[ix, cols.index(new_var_name)] = df.iloc[ix][var_ori][m.start():m.end()]
    return df
df = pd_create_substring_var(df,"t",finds,var_ori="x")
df
x x1 t
0 var_m500_0_somevartext 4 m500_0
1 var_m500_0_vartextagain 5 m500_0
2 varwithsomeothertext_0_500 6 0_500
3 varwithsomext_m150_0_text 8 m150_0
Does this accomplish what you need?
finds = ["m500_0", "0_500", "m150_0"]
df["t"] = df["x"].str.extract(f"({'|'.join(finds)})")
Use pandas Series.str.findall:
df['x'].str.findall("|".join(finds))
0 [m500_0]
1 [m500_0]
2 [0_500]
3 [m150_0]
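If a plain string per row is wanted rather than a list, the findall result can be reduced with .str[0] - this line is an addition to the answer above, not part of it:
df['t'] = df['x'].str.findall("|".join(finds)).str[0]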
Probably not the best way:
df['t'] = df['x'].apply(lambda x: ''.join([i for i in finds if i in x]))
And now:
print(df)
Is:
x x1 t
0 var_m500_0_somevartext 4 m500_0
1 var_m500_0_vartextagain 5 m500_0
2 varwithsomeothertext_0_500 6 0_500
3 varwithsomext_m150_0_text 8 m150_0
And now, just adding to @pythonjokeun's answer, you can do:
df["t"] = df["x"].str.extract("(%s)" % '|'.join(finds))
Or:
df["t"] = df["x"].str.extract("({})".format('|'.join(finds)))
Or:
df["t"] = df["x"].str.extract("(" + '|'.join(finds) + ")")
I don't know how large your dataset is, but you can use the map function like below:
import pandas

def subset_df_test():
    df = pandas.DataFrame({'x': ["var_m500_0_somevartext", "var_m500_0_vartextagain",
                                 "varwithsomeothertext_0_500", "varwithsomext_m150_0_text"],
                           'x1': [4, 5, 6, 8]})
    finds = ["m500_0", "0_500", "m150_0"]
    df['t'] = df['x'].map(lambda x: compare(x, finds))
    print(df)

def compare(x, finds):
    for f in finds:
        if f in x:
            return f
Try this
df["t"] = df["x"].apply(lambda x: [i for i in finds if i in x][0])

Pandas dataframe trying to retrieve integer in dataframe

I have a pandas dataframe which is as follows:
s = index_df[(index_df['id2'].values == result[z][3])]
print(s.iloc[:, [0]])
which will give me the result
id1
36 14559
I'm trying to store the value 14559 into a variable with the following:
value = s.iloc[:, [0]]
But it keeps giving me an error:
ValueError: Incompatible indexer with DataFrame
Any idea how i could solve this?
EDIT:
My dataframes are declared as follows:
result:
result = [(fuzz.WRatio(n, n2), n2, sdf.index[x], bdf.index[y])
          for y, n2 in enumerate(Col2['CSGNE_NAME'])
          if fuzz.WRatio(n, n2) > 80 and len(n2) >= 2]
And this is how I declare and append to the dataframe:
index_df = pd.DataFrame(columns=['id1','id2', 'score'])
index_df = index_df.append({'id1':result[z][2], 'id2':result[z][3], 'score':result[z][0]}, ignore_index=True)
I believe you need:
s.iloc[:, 0]
Or:
s.iloc[0, 0]
Or convert values to list and use next for extract first value:
L = index_df[(index_df['id2'].values == result[z][3])].values.tolist()
#use parameter if not matched condition and returned empty val
out = next(iter(L), 'no matched value')
Sample:
index_df = pd.DataFrame({'id2': [1, 2, 3, 2],
                         'id1': [10, 20, 30, 40]})
print (index_df)
   id2  id1
0    1   10
1    2   20
2    3   30
3    2   40

#if possible, specify the column name with .loc (`id1`)
L = index_df.loc[index_df['id2'].values == 2, 'id1']
#the second argument is the fallback if nothing matched the condition
out = next(iter(L), 'no matched value')
print (out)
20
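For context, a small sketch of why the original attempt misbehaves: passing a list to .iloc keeps the result two-dimensional, while plain integer indexers reduce the dimensionality, so the scalar is reached with iloc[0, 0] (the frame below is only illustrative):
import pandas as pd

s = pd.DataFrame({'id1': [14559]}, index=[36])

print(type(s.iloc[:, [0]]))  # DataFrame - a list indexer keeps the 2-D shape
print(type(s.iloc[:, 0]))    # Series    - an integer indexer drops a dimension
print(s.iloc[0, 0])          # 14559     - two integer indexers give the scalar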

stratified sampling in numpy

In numpy I have a dataset like the one below. The first two columns are indices. I can divide my dataset into blocks via the indices, i.e. the first block is 0 0, the second block is 0 1, the third block 0 2, then 1 0, 1 1, 1 2 and so on. Each block has at least two elements. The numbers in the index columns can vary.
I need to split the dataset along these blocks 80%-20% at random, such that after the split each block has at least one element in both datasets. How could I do that?
indices | real data
|
0 0 | 43.25 665.32 ... } 1st block
0 0 | 11.234 }
0 1 ... } 2nd block
0 1 }
0 2 } 3rd block
0 2 }
1 0 } 4th block
1 0 }
1 0 }
1 1 ...
1 1
1 2
1 2
2 0
2 0
2 1
2 1
2 1
...
See how you like this. To introduce randomness, I am shuffling the entire dataset. It is the only way I have figured out how to do the splitting vectorized. Maybe you could simply shuffle an indexing array, but that was one indirection too many for my brain today. I have also used a structured array, for ease in extracting the blocks. First, let's create a sample dataset:
from __future__ import division
import numpy as np

# Create a sample data set
c1, c2 = 10, 5
idx1, idx2 = np.arange(c1), np.arange(c2)
idx1, idx2 = np.repeat(idx1, c2), np.tile(idx2, c1)

items = 1000
i = np.random.randint(c1*c2, size=(items - 2*c1*c2,))
d = np.random.rand(items+5)
dataset = np.empty((items+5,), [('idx1', int), ('idx2', int),
                                ('data', float)])
dataset['idx1'][:2*c1*c2] = np.tile(idx1, 2)
dataset['idx1'][2*c1*c2:-5] = idx1[i]
dataset['idx2'][:2*c1*c2] = np.tile(idx2, 2)
dataset['idx2'][2*c1*c2:-5] = idx2[i]
dataset['data'] = d
# Add blocks with only 2 and only 3 elements to test the corner case
dataset['idx1'][-5:] = -1
dataset['idx2'][-5:] = [0]*2 + [1]*3
And now the stratified sampling:
# For randomness, shuffle the entire array
np.random.shuffle(dataset)
blocks, _ = np.unique(dataset[['idx1', 'idx2']], return_inverse=True)
block_count = np.bincount(_)
where = np.argsort(_)
block_start = np.concatenate(([0], np.cumsum(block_count)[:-1]))
# If we have n elements in a block, and we assign 1 to each array, we
# are left with only n-2. If we randomly assign a fraction x of these
# to the first array, the expected ratio of items will be
# (x*(n-2) + 1) : ((1-x)*(n-2) + 1)
# Setting the ratio equal to 4 (80/20) and solving for x, we get
# x = 4/5 + 3/5/(n-2)
x = 4/5 + 3/5/(block_count - 2)
x = np.clip(x, 0, 1) # if n in (2, 3), the ratio is larger than 1
threshold = np.repeat(x, block_count)
threshold[block_start] = 1 # first item goes to A
threshold[block_start + 1] = 0 # second item goes to B
a_idx = threshold > np.random.rand(len(dataset))
A = dataset[where[a_idx]]
B = dataset[where[~a_idx]]
After running it, the split is roughly 80/20, and all blocks are represented in both arrays:
>>> len(A)
815
>>> len(B)
190
>>> np.all(np.unique(A[['idx1', 'idx2']]) == np.unique(B[['idx1', 'idx2']]))
True
Here's an alternative solution. I'm open to a code review if it is possible to implement this in a more numpy-like way (without for loops). @Jaime's answer is really good; it's just that sometimes it produces skewed ratios within blocks of data.
ratio = 0.8
IDX1 = 0
IDX2 = 1

idx1s = np.arange(len(np.unique(data[:, IDX1])))
idx2s = np.arange(len(np.unique(data[:, IDX2])))

valid = None
train = None

for i1 in idx1s:
    for i2 in idx2s:
        mask = np.nonzero((data[:, IDX1] == i1) & (data[:, IDX2] == i2))[0]
        curr_data = data[mask, :]
        np.random.shuffle(curr_data)
        start = np.min(mask)
        end = np.max(mask)
        thres = start + np.around((end - start) * ratio).astype(int)
        selected = mask < thres
        train_idx = mask[selected]
        valid_idx = mask[~selected]
        if train is not None:
            train = np.vstack((train, data[train_idx]))
            valid = np.vstack((valid, data[valid_idx]))
        else:
            train = data[train_idx]
            valid = data[valid_idx]
I'm assuming that each block has at least two entries and also that if it has more than two you want them assigned as closely as possible to 80/20. The easiest way to do this seems to be to assign a random number to all rows, and then choose based on percentiles within each stratified sample. Say this is the data in file strat_sample.csv:
Index_1,Index_2,Data_1,Data_2
0,0,0.614583182,0.677644482
0,0,0.321384981,0.598450854
0,0,0.303029607,0.300593782
0,0,0.646010758,0.612006715
0,0,0.484572883,0.30052535
0,1,0.010625416,0.118671475
0,1,0.428967984,0.23795173
0,1,0.523440618,0.457275922
0,1,0.379612652,0.337640868
0,1,0.338180659,0.206399031
1,0,0.079386,0.890939911
1,0,0.572864624,0.725615079
1,0,0.045891404,0.300128917
1,0,0.578792198,0.100698871
1,0,0.776485138,0.475135948
1,0,0.401850419,0.784835723
1,1,0.087660923,0.497299605
1,1,0.8460978,0.825774802
1,1,0.526015021,0.581905971
1,1,0.23324672,0.299475291
Then this code (using Pandas data structures) works as desired
import numpy as np
import random as rnd
import pandas as pd
from math import floor, ceil

#sample data strat_sample.csv, contents to follow
def TreatmentOneCount(n, *args):
    #assign a minimum of one to each group, but as close as possible to fraction OptimalRatio in group 1.
    OptimalRatio = args[0]
    if n < 2:
        print("N too small, assignment not defined.")
        a = np.nan
    elif n == 2:
        a = 1
    else:
        """
        There is one of two numbers that is close to the target ratio, one above, the other below.
        If the number above is N and it is closest to optimal, set the count to N-1 to ensure both groups have at least one member (recall n > 2).
        If the number below is 0 and it is closest to optimal, set the count to 1 to ensure both groups have at least one member (recall n > 2).
        """
        targetassigment = OptimalRatio * n
        if targetassigment - floor(targetassigment) > 0.5:
            a = min(ceil(targetassigment), n-1)
        else:
            a = max(floor(targetassigment), 1)
    return a

df = pd.read_csv('strat_sample.csv', sep=',', header=0)

#assign a random number to each entry
df['RandScore'] = np.random.uniform(0, 1, df.shape[0])
df.sort_values(by=['Index_1', 'Index_2', 'RandScore'], inplace=True)

#Within each block, assign a rank based on the random number.
df['RandRank'] = df.groupby(['Index_1', 'Index_2'])['RandScore'].rank()

#make a group index
df['MasterIdx'] = df['Index_1'].apply(str) + df['Index_2'].apply(str)

#Store the counts for members of each block
seriestest = df.groupby('MasterIdx')['RandRank'].count()
seriestest.name = "Counts"
dftest = pd.DataFrame(seriestest)

#Add the block counts to the data
df = df.merge(dftest, how='left', left_on='MasterIdx', right_index=True)

#Make the actual assignments to the two groups
df['Assignment'] = (df['RandRank'] <= df['Counts'].apply(TreatmentOneCount, args=(0.8,))) * -1 + 2
df.drop(['MasterIdx', 'Counts', 'RandRank', 'RandScore'], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)
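For completeness, a rough per-block split sketch using pandas groupby, under the assumptions of the question (a DataFrame df with Index_1/Index_2 block columns and every block holding at least two rows); this is only an illustration, not the accepted approach above:
import numpy as np
import pandas as pd

def split_block(g, frac=0.8):
    # shuffle the block, then send between 1 and len(g)-1 rows to the train part
    g = g.sample(frac=1)
    n_train = int(np.clip(round(frac * len(g)), 1, len(g) - 1))
    return g.iloc[:n_train], g.iloc[n_train:]

parts = [split_block(g) for _, g in df.groupby(['Index_1', 'Index_2'])]
train = pd.concat(p[0] for p in parts)
test = pd.concat(p[1] for p in parts)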
