Removing value from a DataFrame column which repeats over 15 times - python

I'm working on forex data like this:
          0                       1       2       3
1   AUD/JPY  20040101 00:01:00.000  80.598  80.598
2   AUD/JPY  20040101 00:02:00.000  80.595  80.595
3   AUD/JPY  20040101 00:03:00.000  80.562  80.562
4   AUD/JPY  20040101 00:04:00.000  80.585  80.585
5   AUD/JPY  20040101 00:05:00.000  80.585  80.585
I want to go through columns 2 and 3 and remove the rows in which the value is repeated more than 15 times in a row. So far I have managed to produce this piece of code:
price = 0
drop_start = 0
counter = 0
df_new = df
for i, r in df.iterrows():
    if r.iloc[2] != price:
        if counter >= 15:
            df_new = df_new.drop(df_new.index[drop_start:i])
        price = r.iloc[2]
        counter = 1
        drop_start = i
    if r.iloc[2] == price:
        counter = counter + 1

price = 0
drop_start = 0
counter = 0
df = df_new
for i, r in df.iterrows():
    if r.iloc[3] != price:
        if counter >= 15:
            df_new = df_new.drop(df_new.index[drop_start:i])
        price = r.iloc[3]
        counter = 1
        drop_start = i
    if r.iloc[3] == price:
        counter = counter + 1

print(df_new.info())
df_new.to_csv('df_new.csv', index=False, header=None)
Unfortunately, when I check the output file there are some mistakes: some weekends have not been removed by the program. How should I build my algorithm so that it removes the duplicated values correctly?
The first 250k rows of my initial dataset are available here: https://ufile.io/omg5h
The output of this program for that sample data is available here:
https://ufile.io/2gc3d
You can see that in the output file the rows 6931+ were not successfully removed.

The problem with your algorithm is that you are not holding separate counter values for the row values, but rather incrementing a single counter through the loop. I believe this is what causes the false results. Also, the comparison r.iloc[2] != price does not make sense, because you are changing the value of price every iteration, so if there are other elements between the duplicates this check does not serve a proper function. I wrote a small piece of code to copy the behavior you asked for.
import pandas as pd

df = pd.DataFrame([[0, 0.5, 2.5], [0, 1, 2], [0, 1.5, 2.5], [0, 2, 3], [0, 2, 3],
                   [0, 3, 4], [0, 4, 5]], columns=['A', 'B', 'C'])
df_new = df
counts = {}
print('Initial DF')
print(df)
print()

for i, r in df.iterrows():
    counter = counts.get(r.iloc[1])
    if counter is None:
        counter = 0
    counts[r.iloc[1]] = counter + 1
    if counts[r.iloc[1]] >= 2:
        df_new = df_new[df_new.B != r.iloc[1]]

print('2nd col. deleted DF')
print(df_new)
print()

df_fin = df_new
counts2 = {}
for i, r in df_new.iterrows():
    counter = counts2.get(r.iloc[2])
    if counter is None:
        counter = 0
    counts2[r.iloc[2]] = counter + 1
    if counts2[r.iloc[2]] >= 2:
        df_fin = df_fin[df_fin.C != r.iloc[2]]

print('3rd col. deleted DF')
print(df_fin)
Here, I hold a counter value for each unique value in the rows of columns 2 and 3. Then, according to the threshold (which is 2 in this case), I remove the rows which exceed the threshold. I first eliminate values according to the 2nd column, then forward this modified frame to the next loop, eliminate values according to the 3rd column, and finish the process.
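For the consecutive reading of the question (drop rows whose value repeats 15 or more times in a row), a vectorized sketch with a shift/cumsum run id can replace the iterrows loops entirely. It assumes, as in the question, that the price columns sit at integer positions 2 and 3:

import pandas as pd

def drop_long_runs(df, col, threshold=15):
    s = df.iloc[:, col]
    # a new run starts wherever the value differs from the previous row
    run_id = (s != s.shift()).cumsum()
    # length of the run each row belongs to
    run_len = s.groupby(run_id).transform('size')
    # keep only rows belonging to runs shorter than the threshold
    return df[run_len < threshold]

df_new = drop_long_runs(df, 2)
df_new = drop_long_runs(df_new, 3)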

Related

Is it possible to dynamically assign condition statements to a list in Python?

I am trying to create a list of conditions to use the numpy select statement to create a 'Week #' column depending on the date a record was created. However, it doesn't quite seem to work. Any suggestions?
#Creating list for start dates
weekStartDay = []
weekValues = []
weekConditions = []
counter = 1
demoStartDate = min(demographic['Date'])
demoEndDate = max(demographic['Date'])

while demoStartDay := None or demoStartDate <= demoEndDate:
    weekStartDay.append(demoStartDate)
    demoStartDate += timedelta(days=7)
weekStartDay.append(demoStartDate)

while counter <= len(weekConditions):
    weekValues.append(counter + 1)
    counter += 1

#Assigning condition statement for numpy conditions
for i in range(len(weekStartDay)):
    weekConditions.append((demographic['Date'] >= weekStartDay[i]) & (demographic['Date'] < weekStartDay[i+1]))

#Creating week value assignment column
demographic['Week'] = np.select(weekConditions, weekValues)
I believe I've found a solution to the problem.
#Creating list for start dates
weekStartDay = []
weekValues = []
weekConditions = []
counter = 1
i = 0
demoStartDate = min(demographic['Date'])
demoEndDate = max(demographic['Date'])

while demoStartDate <= demoEndDate:
    weekStartDay.append(demoStartDate)
    demoStartDate += timedelta(days=7)
weekStartDay.append(demoStartDate)

while counter <= len(weekStartDay):
    weekValues.append(counter)
    counter += 1

#Assigning condition statement for numpy conditions
while i != len(weekStartDay):
    for i in range(len(weekStartDay)):
        weekConditions.append((demographic['Date'] >= weekStartDay[i-1]) & (demographic['Date'] < weekStartDay[i]))
    i += 1

#Creating week value assignment column
demographic['Week'] = np.select(weekConditions, weekValues)
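For what it's worth, if 'Date' is a datetime column, the week number can also be computed directly from each date's offset to the earliest date, with no condition list at all. A minimal sketch, assuming the 'demographic' frame and 'Date' column from the question:

start = demographic['Date'].min()
# integer number of whole weeks since the first date, 1-based
demographic['Week'] = (demographic['Date'] - start).dt.days // 7 + 1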

Compute number of word distance of 1 for every word in pandas column

For each string in a list, I need to find the number of strings in that list that are one levenshtein-distance away. The levenshtein-distance is smallest number of character substitutions, additions, or removals necessary to derive one word from another. For illustration, please see the following DataFrame:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'word': ['can', 'cans', 'canse', 'canpe', 'canp', 'camp'],
    'code': ['k#n', 'k#n}', 'k#(z', np.nan, 'k#()', np.nan]})
    word  code
0    can   k#n
1   cans  k#n}
2  canse  k#(z
3  canpe
4   canp  k#()
5   camp
My current implementation is way too slow:
from Levenshtein import distance as lev

df = df.fillna('')
# get unique strings
wordAll = df['word'].dropna().to_list()
codeAll = list(set(df['code'].dropna().to_list()))
# prepare dataframe for storage
df['wordLev'] = np.nan
df['codeLev'] = np.nan
# find neighbors
for idx, row in df.iterrows():
    i = 0
    j = 0
    # get word and code
    word = row['word']
    code = row['code']
    # remove word and code from all-strings-list
    wordSubset = [w for w in wordAll if w != word]
    codeSubset = [c for c in codeAll if c != code]
    # compute number of neighbors
    for item in wordSubset:
        if lev(word, item) == 1:
            i += 1
    for item in codeSubset:
        if lev(code, item) == 1:
            j += 1
    # add number of neighbors to df
    df.loc[df['code'] == code, 'wordLev'] = i
    if code != '':
        df.loc[df['code'] == code, 'codeLev'] = j
    else:
        df.loc[df['code'] == code, 'codeLev'] = ''
df
    word  code  wordLev codeLev
0    can   k#n        2       1
1   cans  k#n}        3       1
2  canse  k#(z        2       1
3  canpe              2
4   canp  k#()        3       1
5   camp              1
How can I speed it up? The DataFrame has ~500k rows...
The following code seems to be ~5x faster than your code at 1.8ms vs 9.6ms (at least on the df you've provided).
df = df.fillna('')
df['wordLev'] = [sum(1 for item in df['word'] if item!=word and lev(word, item)==1) for word in df['word']]
df['codeLev'] = [sum(1 for item in df['code'] if item!=code and lev(code, item)==1) or '' for code in df['code']]
This code is really very similar to yours. The biggest difference is that instead of creating wordSubset or codeSubset and then iterating over them again to apply the Levenshtein distance function, it does it all in a single generator expression. Since you're checking each word against every word in the column, you can't escape a double loop imo.
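If the quadratic scan is still too slow at ~500k rows, one standard pruning is to bucket the strings by length first: two strings at Levenshtein distance 1 can differ in length by at most 1, so each string only needs to be compared against three buckets. A sketch of that idea (not benchmarked on your data):

from collections import defaultdict
from Levenshtein import distance as lev

def neighbor_counts(strings):
    # bucket the strings by length
    by_len = defaultdict(list)
    for s in strings:
        by_len[len(s)].append(s)
    counts = []
    for s in strings:
        # only strings whose length differs by at most 1 can be at distance 1
        candidates = (c for n in (len(s) - 1, len(s), len(s) + 1)
                      for c in by_len[n])
        counts.append(sum(1 for c in candidates if c != s and lev(s, c) == 1))
    return counts

df['wordLev'] = neighbor_counts(df['word'].tolist())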

Climatology frequencies and duration

I have a 10-year climatological dataset as follows.
dt          T    P
01-01-2010  3    0
02-01-2010  5    11
03-01-2010  10   50
....
31-12-2020  -1   0
I want to estimate the total number of days in each month where T and P continuously stayed greater than 0 for three days or more.
I would want these columns as an output:
month  Number of days / Duration T&P>0  T  P
I have never used loops in Python; I can write a simple loop, but nothing beyond that when the data has to be grouped by month and year first and the condition then applied. I would really appreciate any hints on the construction of the loop.
A = dataset
A['dt'] = pd.to_datetime(A['dt'], format='%Y-%m-%d')
for column in A[['P', 'T']]:
    for i in range(len('P')):
        if i > 0:
            P.value_counts()
            print(i)
    for j in range(len('T')):
        if i > 0:
            T.value_counts()
            print(j)
Here is a really naive way you could set it up by simply iterating over the rows:
df['valid'] = (df['T'] > 0) & (df['P'] > 0)

def count_total_days(df):
    i = 0
    total = 0
    for idx, row in df.iterrows():
        if row.valid:
            i += 1
        else:
            # a run just ended; count it if it lasted 3 days or more
            if i >= 3:
                total += i
            i = 0
    # don't lose a qualifying run that reaches the end of the frame
    if i >= 3:
        total += i
    return total
Since you want it per month, you would first have to create new month and year columns to group by:
df['month'] = df['dt'].dt.month
df['year'] = df['dt'].dt.year

for (month, year), df_subset in df.groupby(['month', 'year']):
    print(month, year, count_total_days(df_subset))
You can use resample and sum to get the number of days in each month where the condition is true.
import pandas as pd

dt = ["01-01-2010", "01-02-2010", "01-03-2010", "01-04-2010", "03-01-2010", "12-31-2020"]
t = [3, 66, 100, 5, 10, -1]
P = [0, 77, 200, 11, 50, 0]
A = pd.DataFrame(list(zip(dt, t, P)), columns=['dtx', 'T', 'P'])
A['dtx'] = pd.to_datetime(A['dtx'], format='%m-%d-%Y')

A['Mask'] = A.dtx.diff().dt.days.ne(1).cumsum()
dict_freq = A['Mask'].value_counts().to_dict()
newdict = dict((k, v) for k, v in dict_freq.items() if v >= 3)
A = A[A['Mask'].isin(list(newdict.keys()))]
A['Mask'] = (A['T'] >= 1) & (A['P'] >= 1)
df_summary = A.query('Mask').resample(rule='M', on='dtx')['Mask'].sum()
which produces:
2010-01-31 3
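A fully vectorized sketch of the original per-month question (count the days where T and P stayed above 0 for three or more consecutive days) can combine the run idea with resample. It assumes one row per calendar day with no gaps, and that A['dt'] has already been converted to datetime as in the question:

valid = (A['T'] > 0) & (A['P'] > 0)
# label each run of consecutive equal values of `valid`
run_id = (valid != valid.shift()).cumsum()
# size of the run each row belongs to
run_len = valid.groupby(run_id).transform('size')
# keep valid days that sit inside a run of 3 or more days
keep = valid & (run_len >= 3)
per_month = A.loc[keep].resample('M', on='dt').size()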

How to create a new column as a result of comparing two nested consecutive rows in a Pandas dataframe?

I need to write code for a Pandas DataFrame. The values in the ID column will be checked sequentially for whether they are the same or not. Three situations arise here. Case 1: if the ID is not the same as that of the next row, write "unique" in the Comment column. Case 2: if the ID is the same as that of the next row but different from the one after it, write "ring" in the Comment column. Case 3: if the ID is the same across several consecutive rows, write "multi" in the Comment column. Case 4: do this until the rows in the ID column are exhausted.
import pandas as pd

df = pd.read_csv('History-s.csv')

a = len(df['ID'])
c = 0
while a != 0:
    c += 1
    while df['ID'][i] == df['ID'][i + 1]:
        if c == 2:
            if df['Nod 1'][i] == df['Nod 2'][i + 1]:
                df['Comment'][i] = "Ring"
                df['Comment'][i + 1] = "Ring"
            else:
                df['Comment'][i] = "Multi"
                df['Comment'][i + 1] = "Multi"
        elif c > 2:
            df['Comment'][i] = "Multi"
            df['Comment'][i + 1] = "Multi"
        i += 1
    else:
        df['Comment'][i] = "Unique"
    a = a - 1

print(df, '\n')
The data looks like the linked "Data" image, and after processing the data frame should look like the linked "Result" image.
From the input dataframe you have provided, my first impression was that since you are checking the next line in a while loop, you are strictly considering just the next coming line, for example:
ID  value  comment
1   2      MULTI
1   3      RING
3   4      UNIQUE
But if that is not the case, you can simply use the pandas groupby function.
def func(df):
    if len(df) > 2:
        df['comment'] = 'MULTI'
    elif len(df) == 2:
        df['comment'] = 'RING'
    else:
        df['comment'] = 'UNIQUE'
    return df

df = df.groupby(['ID']).apply(func)
Output:
   ID  value comment
0   1      2    RING
1   1      3    RING
2   3      4  UNIQUE
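The same labelling can also be done without apply; a short sketch on the original frame, using group sizes mapped through numpy.select:

import numpy as np

sizes = df.groupby('ID')['ID'].transform('size')
df['comment'] = np.select([sizes > 2, sizes == 2],
                          ['MULTI', 'RING'], default='UNIQUE')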

Counting data within ranges in csv

I have some data which I need to break down into manageable chunks. With the following data I need to count the number of times x occurs in column 11 with column 7 being a 1, and how many times x occurs in column 11 overall. I need to put those two counts into the first line of a csv. After that I need to do the same counts but with column 11 falling into the following brackets:
0
">0 but <0.05"
">0.05 but <0.10"
">0.1 but <0.15... all the way up to 1.00"
All of these would ideally be appended to the same new.csv, i.e. not the main data csv.
Some example raw data that fits the above description (please note a lot of the brackets will contain no data, in which case they would need to return 0,0):
01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3
If anybody can help me achieve this I will be truly grateful. If this requires more detail, please ask.
One last note: the main data csv in question has 800k rows.
EDIT
Currently the output file appears as follows using the code supplied by @user650654:
data1,data2
If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (commas separate the fields in each row):
title row labeling the row (e.g. "x" or "0:0.05"),calculated average of values within each bracket (e.g. "0.02469"),data1,data2
So in reality it would probably look like this:
x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....
....
Column1 = Row label (the data ranges that are being counted in the original question, i.e. from 0 to 0.05)
Column2 = Calculated average of values that fell within a particular range. I.e. If the
Note the data1 & data2 are the two values the question initially asked for.
Many thanks AEA
Here is a solution for adding the two new fields:
import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    bins = numpy.arange(0, 1.05, 0.05)
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)
    all_array = numpy.array(all_array)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)
    bin_ranges = ['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])]
    digitized = numpy.digitize(all_array, bins)
    bin_means = [all_array[digitized == i].mean() if hist_all[i - 1] else 'n/a'
                 for i in range(1, len(bins))]
    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for row in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
This might work:
import numpy as np
import pandas as pd

# names to be used as column labels. If no names are specified then columns can
# be referred to by number, eg. df[0], df[1] etc.
column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
                'col7', 'col8', 'col9', 'col10', 'col11']
df = pd.read_csv('data.csv', header=None, names=column_names)  # header=None means there are no column headings in the csv file
# trick so that 'x' rows will be grouped into a category >-0.1 and <=-0.05.
# This will allow all of col11 to be treated as numbers.
df.loc[df.col11 == 'x', 'col11'] = -0.08
# bins to put col11 values in. >-0.1 and <=-0.05 will be our special 'x' rows,
# >-0.05 and <=0 will capture all the '0' values.
bins = np.arange(-0.1, 1.0, 0.05)
labels = np.array(['%s:%s' % (x, y) for x, y in zip(bins[:-1], bins[1:])])  # create labels for the bins
labels[0] = 'x'  # change first bin label to 'x'
labels[1] = '0'  # change second bin label to '0'
df['col11'] = df['col11'].astype(float)  # convert col11 to numbers so we can do math on them
# make another column 'bin' and put in an integer representing what bin the
# number falls into. Later we'll map the integer to the bin label.
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)
df.set_index('bin', inplace=True, drop=False, append=False)  # groupby is meant to run faster with an index

def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)

# groupby the bin number and apply aggregate functions to the specified columns
dfg = df[['bin', 'col7', 'col11']].groupby('bin').agg({'col11': [np.mean], 'col7': [count_ones, len]})
dfg.index = labels[dfg.index]  # apply labels to bin numbers
dfg.loc['x', ('col11', 'mean')] = 'N/A'  # mean of 'x' rows is meaningless
print(dfg)
dfg.to_csv('new.csv')
which gave me
                col7             col11
          count_ones  len         mean
x                  1    7          N/A
0                  2   21            0
0.15:0.2           2    2          0.2
0.2:0.25           9   22    0.2478632
0.25:0.3           0   13    0.2840755
0.3:0.35           0    5    0.3333333
0.45:0.5           0    4          0.5
This solution uses numpy.histogram. See below.
import csv
import numpy

def count(infile='data.csv', outfile='new.csv'):
    total_x = 0
    col7one_x = 0
    total_zeros = 0
    col7one_zeros = 0
    all_array = []
    col7one_array = []
    with open(infile, 'r') as fobj:
        reader = csv.reader(fobj)
        for line in reader:
            if line[10] == 'x':
                total_x += 1
                if line[6] == '1':
                    col7one_x += 1
            elif line[10] == '0':
                # assumes zero is represented as "0" and not as say, "0.0"
                total_zeros += 1
                if line[6] == '1':
                    col7one_zeros += 1
            else:
                val = float(line[10])
                all_array.append(val)
                if line[6] == '1':
                    col7one_array.append(val)
    bins = numpy.arange(0, 1.05, 0.05)
    hist_all, edges = numpy.histogram(all_array, bins=bins)
    hist_col7one, edges = numpy.histogram(col7one_array, bins=bins)
    with open(outfile, 'w') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for row in zip(hist_col7one, hist_all):
            writer.writerow(row)

if __name__ == '__main__':
    count()
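One detail worth knowing when reading the histogram counts above: numpy.histogram bins are half-open [edge, next_edge), except for the last bin, which also includes its right edge. A quick check:

import numpy

vals = [0.0, 0.02, 0.05, 0.10, 1.0]
hist, edges = numpy.histogram(vals, bins=numpy.arange(0, 1.05, 0.05))
# 0.05 lands in the 0.05:0.1 bin, and 1.0 lands in the last (0.95:1.0) bin
print(list(zip(edges[:-1], hist)))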
