I have a dataframe whose index is a datetime, and it is sorted. Basically, I want to create columns rolling_time1, rolling_time2, etc., where each value is the number of rows after the current row that fall within timex of it. I wrote the following, but it is very slow. Is there any way to make it faster?
def sum_window_wd(row, wd_file, wd, df, num):
    """Count the consecutive rows after ``row`` that lie within ``wd`` of it.

    Meant to be used with ``df.apply(..., axis=1)``.

    Args:
        row: The current row; must expose ``start_index`` (its position in
            ``df``) and carry its index label in ``row.name``.
        wd_file: Passed through unchanged to ``GetWinddownLeft``.
        wd: Window size; a following row is counted while
            ``GetWinddownLeft(...) < wd``.
        df: The full frame, sorted by its index.
        num: Rows whose ``start_index`` exceeds this value are not
            recomputed; their existing ``rolling_<wd>`` value is returned.

    Returns:
        The number of directly following rows inside the window, or the
        stored ``rolling_<wd>`` value when ``row.start_index > num``.
    """
    if row.start_index > num:
        return row['rolling_' + str(wd)]

    # Read labels straight from the index: ``df.iloc[i].name`` builds a whole
    # row Series on every iteration only to look up the same label.
    labels = df.index
    anchor = row.name
    count = 0
    for i in range(row.start_index + 1, len(df)):
        # GetWinddownLeft is defined outside this snippet.
        # NOTE(review): presumably it returns the distance between the two
        # labels; confirm against its definition.
        if GetWinddownLeft(wd_file, labels[i], anchor) < wd:
            count += 1
        else:
            # The frame is sorted, so the first miss ends the window.
            break
    return count
# First pass: make sure every rolling_<window> column exists (filled with 0),
# because sum_window_wd may read it back for rows it does not recompute.
for rolling in rollings:
    df[f'rolling_{rolling}'] = 0

# Second pass: compute the per-row counts, one window size at a time.
# NOTE(review): the last positional argument is the builtin ``len`` and is
# received by sum_window_wd as ``num``; confirm this is the intended value.
for rolling in rollings:
    df[f'rolling_{rolling}'] = df.apply(
        sum_window_wd,
        axis=1,
        args=(winddown, rolling, df, len),
    )
Related
I am quite new to programming in Python.
I have a dataset which needs to be modified. I tried a few methods for the sum part, but I don't get the exact results.
Dataset : My data table
Requirements:
To categorize the debit and credit values into the following ranges/bins :
a) 2000-4000
b) 5000-8000
c) 9000-20000
The sum of debit should be for 20 days period like
if the transaction happened on 2020-01-01 then
the sum of credit should be from 2020-01-01 to 2020-01-20
I also want the record of occurrences i.e
the number of times the value from the bins lies in the category
Required Result : Result]2
The code I tried for credit values:
# Window bounds for every transaction: its own date and the date 20 days later.
StartDate = BM['transaction_date']
EndDate = StartDate + pd.to_timedelta(20, unit='D')

# NOTE: dfx is another name for BM, not a copy, so the columns added below
# are added to BM as well.
dfx = BM
dfx['EndDate'] = EndDate
dfx['StartDate'] = StartDate

# For each row, total the Debit of every transaction dated inside
# [StartDate, EndDate]. Both halves of the mask are built from BM; the
# original mixed in an unrelated ``df`` for the lower bound.
dfx['Debit'] = dfx.apply(
    lambda x: BM.loc[
        (BM['transaction_date'] >= x.StartDate)
        & (BM['transaction_date'] <= x.EndDate),
        'Debit',
    ].sum(),
    axis=1,
)
Code1-
Code2-
error :
I have created a lot of functions and broke the problem into smaller tasks. Hope the comments make this understandable.
def sum20Days(df, debitORCredit):
    """Sum the ``debitORCredit`` amounts over consecutive 20-day windows.

    Only rows with a positive amount take part. The first window starts at
    the earliest such date and spans 20 days (both ends inclusive); the next
    window starts at the first date not covered by the previous one, and so
    on. Each window's total is written to the column
    ``sum_<debitORCredit>_20days`` on the last row (highest index label) of
    the latest transaction date inside that window; all other rows hold NaN.

    Args:
        df: pandas DataFrame for a single name (i.e. already grouped by
            name) with a ``transaction_date`` column.
        debitORCredit: Name of the amount column, e.g. "debit" or "credit".

    Returns:
        A copy of ``df`` with the ``sum_<debitORCredit>_20days`` column
        added. The input frame is not modified.
    """
    df = df.copy()
    sum_col = f"sum_{debitORCredit}_20days"
    if sum_col not in df.columns:
        # Create the column up front so it exists even when there is nothing
        # to sum (the original crashed on ``dates[0]`` in that case).
        df[sum_col] = float("nan")

    temp_df = df[df[debitORCredit] > 0]
    dates = sorted(temp_df["transaction_date"].dropna().drop_duplicates())

    i = 0
    while i < len(dates):
        window_start = dates[i]
        window_end = window_start + pd.Timedelta(20, unit="D")
        in_window = temp_df["transaction_date"].between(window_start, window_end)
        window = temp_df.loc[in_window]

        last_date = window["transaction_date"].max()
        target_row = df.index[df["transaction_date"] == last_date].max()
        df.loc[target_row, sum_col] = window[debitORCredit].sum()

        # Advance by the number of distinct dates just consumed. The window
        # always contains window_start, so this is >= 1 and the loop
        # terminates. (The original assigned ``new_i + 1`` as an absolute
        # position, which skipped a date and could move backwards.)
        i += window["transaction_date"].nunique()
    return df
def groupListUsingList(inp, groupby):
    """Split the sorted values of ``inp`` at the thresholds in ``groupby``.

    Each threshold closes a group: a value belongs to the first group whose
    threshold is greater than or equal to it, and values above every
    threshold form one final group. Empty groups are not emitted.

    Args:
        inp: Iterable of sortable values.
        groupby: Iterable of thresholds, comparable with the values.

    Returns:
        A list of lists, in ascending order.

    Example:
        inp = [0, 1, 2, 3, 4, 5, 6, 7], groupby = [3, 6]
        -> [[0, 1, 2, 3], [4, 5, 6], [7]]
    """
    # Local imports keep the snippet self-contained. itertools.groupby is
    # aliased because the parameter is itself called ``groupby``.
    from bisect import bisect_left
    from itertools import groupby as consecutive_runs

    thresholds = sorted(groupby)
    # bisect_left gives the position of the first threshold >= value, which
    # is exactly the group number. This also fixes the original tail
    # handling, which sliced ``inp`` by value and duplicated trailing items.
    return [
        list(run)
        for _, run in consecutive_runs(
            sorted(inp), key=lambda value: bisect_left(thresholds, value)
        )
    ]
def count_amounts_in_category(df, debitORCredit, category_info):
    """Count the amounts that fall inside the range of the assigned category.

    The category is read from the ``category_<debitORCredit>`` column of the
    last matching row of ``df``.

    Args:
        df: pandas DataFrame holding the transactions of one 20-day group.
            Needs the columns ``debitorcredit`` ("D"/"C"), the amount column
            named by ``debitORCredit`` and ``category_<debitORCredit>``.
        debitORCredit: "debit" or "credit" (case-insensitive for the row
            filter; used as written for the column names).
        category_info: Dict mapping category -> (lower, upper) inclusive
            amount bounds.

    Returns:
        The number of matching amounts, or NaN when there are no rows of the
        requested kind or the group's category has no rule (e.g. it is NaN).

    Raises:
        ValueError: If ``debitORCredit`` is neither "debit" nor "credit".
    """
    flags = {"debit": "D", "credit": "C"}
    kind = debitORCredit.lower()
    if kind not in flags:
        # The original fell through with ``temp_df`` unbound here.
        raise ValueError(
            f"debitORCredit must be 'debit' or 'credit', got {debitORCredit!r}"
        )

    temp_df = df.loc[df["debitorcredit"] == flags[kind]]
    if len(temp_df) == 0:
        return np.nan

    category = temp_df.iloc[-1].loc[f"category_{debitORCredit}"]
    amount_range = category_info.get(category)
    if amount_range is None:
        # No rule for this category (typically a NaN category): nothing to count.
        return np.nan

    # Series.between is inclusive on both ends, like the original lambda.
    return temp_df[debitORCredit].between(amount_range[0], amount_range[1]).sum()
def assign_category(amount, category_info):
    """Return the first category whose inclusive range contains ``amount``.

    Args:
        amount: Float/Int amount to classify; may be missing (NaN).
        category_info: Dict mapping category -> (lower, upper) bounds,
            checked in the dict's iteration order.

    Returns:
        The matching category key, or NaN when ``amount`` is missing or no
        range contains it.
    """
    if not pd.isna(amount):
        for label, bounds in category_info.items():
            if bounds[0] <= amount <= bounds[1]:
                return label
    return np.nan
# Categorization rules: category -> (lower, upper) inclusive amount bounds.
category_info = {
    "A": (2000, 4000),
    "B": (5000, 8000),
    "C": (9000, 20000),
}
debitORCredit = "debit"

# Group by name and compute the 20-day sums per group. The per-group frames
# are collected and concatenated once: concatenating inside the loop re-copies
# everything accumulated so far on every iteration.
frames = [
    sum20Days(group_df, debitORCredit=debitORCredit)
    for _, group_df in df.groupby("name")
]
new_df = pd.concat(frames) if frames else pd.DataFrame()
new_df = new_df.reset_index(drop=True)

# Based on the 20-day sum, use the categorization rules to assign a category.
new_df[f"category_{debitORCredit}"] = new_df[f"sum_{debitORCredit}_20days"].apply(
    lambda x: assign_category(x, category_info)
)

# Within each name, split the rows into their 20-day groups and count the
# transactions that fall in the category assigned to that group.
for _, group_df in new_df.groupby("name"):
    # Rows carrying a 20-day sum are the last row of each 20-day group, so
    # their index labels are the split points.
    window_ends = group_df[group_df[f"sum_{debitORCredit}_20days"].notna()].index
    indices = groupListUsingList(inp=group_df.index, groupby=window_ends)
    for index in indices:
        count = count_amounts_in_category(
            df=new_df.loc[index],
            debitORCredit=debitORCredit,
            category_info=category_info,
        )
        # Store the count on the last row of the 20-day group.
        new_df.loc[index[-1], f"count_{debitORCredit}"] = count

new_df
I have a Pandas dataframe with ~100,000,000 rows and 3 columns (Names str, Time int, and Values float), which I compiled from ~500 CSV files using glob.glob(path + '/*.csv').
Given that two different names alternate, the job is to go through the data and count the number of times a value associated with a specific name ABC deviates from its preceding value by ±100, given that the previous 50 values for that name did not deviate by more than ±10.
I initially solved it with a for loop function that iterates through each row, as shown below. It checks for the correct name, then checks the stability of the previous values of that name, and finally adds one to the count if there is a large enough deviation.
# Counts how often an "ABC" value jumps by more than 100 from the previous
# "ABC" value after a stable stretch.
# NOTE(review): the indentation of this snippet was lost; the nesting below is
# the only reading in which every branch is reachable. Confirm against the
# original.
count = 0
stabilityTime = 0

# Seed the previous value and the 50-slot history from the first entry if it
# is "ABC", otherwise from the second. The original read ``value[0]`` /
# ``value[1]`` here, before ``value`` was ever assigned.
first_abc = 0 if names[0] == "ABC" else 1
j = values[first_abc]
stability = np.full(50, values[first_abc])

# zip pairs every name with its value, replacing the manual index counter.
for name, value in zip(names, values):
    if name != "ABC":
        continue
    # Within +/-10 of the previous ABC value: one more stable step.
    if j - 10 < value < j + 10:
        stabilityTime += 1
    # Stable for at least 50 steps with a history spread below 10: a jump of
    # more than 100 counts as a deviation and restarts the stability clock.
    if stabilityTime >= 50 and np.std(stability) < 10:
        if value > j + 100 or value < j - 100:
            stabilityTime = 0
            count += 1
    # Slide the history window by one and remember the current value.
    stability = np.roll(stability, -1)
    stability[-1] = value
    j = value
Naturally, this process takes a very long computing time. I have looked at NumPy vectorization, but do not see how I can apply it in this case. Is there some way I can optimize this?
Thank you in advance for any advice!
Bonus points if you can give me a way to concatenate all the data from every CSV file in the directory that is faster than glob.glob(path + '/*.csv').
# Sample frame: 28 labelled rows whose data is a 1..13 ramp, a short mixed
# stretch (1, 0, 0, 0, 2) and finally a run of ten zeros.
df = pd.DataFrame(
    {
        'label': ["subj_" + str(i) for i in range(28)],
        'data': list(range(1, 14)) + [1, 0, 0, 0, 2] + [0] * 10,
    }
)
I have a dataset something like that. It looks like:
I want to cut it at where the longest repetitions of 0s occur, so I want to cut at index 18, but I want to leave index 14-16 intact. So far I've tried stuff like:
# Counters
# Running total of rows whose 'cadence' is exactly 0.
cad_recorder = 0
# Index labels of those rows, in iteration order.
new_index = []
# NOTE(review): indentation was lost in this snippet; both updates are assumed
# to sit inside the ``if``. The counter is never reset, so separate runs of
# zeros are not distinguished from one another.
for i,row in tqdm(temp_df.iterrows()):
    if row['cadence'] == 0:
        cad_recorder += 1
        new_index.append(i)
* But obviously that won't work since the indices will be rewritten at each occurrance of zero.
I also tried a dictionary, but I'm not sure how to compare previous and next values using iterrows.
I also took the rolling mean for X rows at a time, and if its zero then I got an index. But then I got stuck at actually inferring the range of indices. Or finding the longest sequence of zeroes.
Edit: A friend of mine suggested the following logic, which gave the same results as #shubham-sharma. The poster's solution is much more pythonic and elegant.
def find_longest_zeroes(df, column='data', threshold=1):
    '''
    Finds the position at which the longest run of near-zero values begins.

    A value belongs to a run when it is ``<= threshold``. When several runs
    share the maximum length, the first one wins.

    Args:
        df: pandas DataFrame containing ``column``.
        column: Name of the column to scan. The original read the hard-coded
            column position 9 while sizing the loop from ``df['data']``; the
            column is now looked up by name.
        threshold: Largest value still treated as part of a run.

    Returns:
        The 0-based row position where the longest run starts, or 0 when no
        value qualifies (including an empty frame).
    '''
    best_start = 0
    best_length = 0
    run_start = 0
    run_length = 0
    for position, value in enumerate(df[column].to_numpy()):
        if value <= threshold:
            if run_length == 0:
                run_start = position
            run_length += 1
            # Strict comparison keeps the earliest of equally long runs.
            if run_length > best_length:
                best_length = run_length
                best_start = run_start
        else:
            run_length = 0
    return best_start
The code I went with following #shubham-sharma's solution:
# label -> row position at which that label's data gets cut.
cut_us_sof = {}
# Per-label frames, collected in lists and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every call.
og_frames = []
cut_frames = []
for lab in df['label'].unique():
    temp_df = df[df['label'] == lab].reset_index(drop=True)
    mask = temp_df['data'] <= 1  # some values in actual dataset were 0.0000001
    if not mask.any():
        # No near-zero values at all: idxmax would raise on the empty selection.
        continue
    # Size of the consecutive near-zero block each qualifying row belongs to.
    counts = temp_df[mask].groupby((~mask).cumsum()).transform('count')['data']
    # First row of the longest block.
    idx = counts.idxmax()
    # Only cut when the block starts late and is long enough.
    # NOTE(review): the original comment mentioned the "200th index" while the
    # code checks 2000; the code's value is kept. Confirm which is intended.
    if idx > 2000 and counts.loc[idx] > 500:
        cut_us_sof[lab] = idx
        # NOTE(review): indentation was lost; both frames are assumed to be
        # recorded only for labels that are actually cut.
        og_frames.append(temp_df)
        cut_frames.append(temp_df.iloc[:idx, :])
og_df_sof = pd.concat(og_frames) if og_frames else pd.DataFrame()
cut_df_sof = pd.concat(cut_frames) if cut_frames else pd.DataFrame()
We can use boolean masking and cumsum to identify the blocks of zeros, then groupby and transform these blocks using count followed by idxmax to get the starting index of the block having the maximum consecutive zeros
# True where the value is exactly zero.
m = df['data'] == 0
# Every non-zero row bumps the running total, so all rows of one consecutive
# zero block share the same id.
block_id = (~m).cumsum()
# Size of the block each zero row belongs to; idxmax then returns the label of
# the first row of the longest block.
block_sizes = m[m].groupby(block_id).transform('count')
idx = block_sizes.idxmax()
print(idx)
18
I would like to convert my dataframe from one format of values (X:XX:XX:XX) to another (X.X seconds).
Here is my dataframe looks like:
Start End
0 0:00:00:00
1 0:00:00:00 0:07:37:80
2 0:08:08:56 0:08:10:08
3 0:08:13:40
4 0:08:14:00 0:08:14:84
And I would like to transform it in seconds, something like that
Start End
0 0.0
1 0.0 457.80
2 488.56 490.08
3 493.40
4 494.0 494.84
To do that I did:
# Attempt to convert the "H:MM:SS:CC" strings to seconds in place by walking
# a fixed 10 x 10 block of cells, column by column.
# NOTE(review): indentation was lost in this snippet; ``data.update(NewValue)``
# is assumed to run once per cell, after either branch.
i = 0
j = 0
while j < 10:
    while i < 10:
        if data.iloc[i, j] != "":
            # Fixed-width slicing: 1-digit hours, then 2-digit minutes,
            # seconds and centiseconds.
            Value = (int(data.iloc[i, j][0]) * 3600) + (int(data.iloc[i, j][2:4]) *60) + int(data.iloc[i, j][5:7]) + (int(data.iloc[i, j][8: 10])/100)
            # Series.replace returns a new Series; ``data`` itself is not
            # changed by this line.
            NewValue = data.iloc[:, j].replace([data.iloc[i, j]], Value)
            i += 1
        else:
            NewValue = data.iloc[:, j].replace([data.iloc[i, j]], "")
            i += 1
        # Only place where the result is written back into ``data``.
        data.update(NewValue)
    # Restart at the first row for the next column.
    i = 0
    j += 1
But I failed to write the new values back into my original dataframe permanently. When I do:
print(data)
I still get my old data frame in the wrong format.
Could someone help me? I tried so hard!
Thank you so so much!
You are using pandas.DataFrame.update that requires a pandas dataframe as an argument. See the Example part of the update function documentation to really understand what update does https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
If I may suggest a more idiomatic solution; you can directly map a function to all values of a pandas Series
def parse_timestring(s):
    """Convert an "H:MM:SS:CC" string to seconds.

    The last field is centiseconds (hundredths of a second), not
    milliseconds.

    Args:
        s: The time string. Empty strings and non-string values (such as NaN
            for a missing cell) are returned unchanged.

    Returns:
        The duration in seconds as a float, or ``s`` itself when there is
        nothing to parse.
    """
    if not isinstance(s, str) or s == "":
        return s
    # fields is [hours, minutes, seconds, centiseconds]
    fields = [int(nbr) for nbr in s.split(":")]
    # Add everything up in whole centiseconds and divide once at the end, so
    # no rounding error accumulates from repeated multiplication by 0.01.
    centiseconds = sum(a * b for a, b in zip(fields, (360000, 6000, 100, 1)))
    return centiseconds / 100
# Apply parse_timestring to every cell of the "Start" column and store the
# result back in place; other columns need the same call.
df["Start"] = df["Start"].map(parse_timestring)
You can remove the if ... else ... from parse_timestring if you replace all empty string with nan values in your dataframe with df = df.replace("", numpy.nan) then use df["Start"] = df["Start"].map(parse_timestring, na_action='ignore')
see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
The datetime library is made to deal with such data. You should also use the apply function of pandas to avoid iterating over the dataframe like that.
You should proceed as follows:
from datetime import datetime, timedelta
def to_seconds(date):
    """Convert an "H:MM:SS:CC" string (last field = centiseconds) to seconds.

    The original treated the first field as days and parsed the remaining
    three as hours:minutes:seconds, which raises for centisecond values above
    61 and returns the wrong duration otherwise.

    Args:
        date: The time string. Empty strings and non-string values (such as
            NaN for a missing cell) are returned unchanged.

    Returns:
        The duration in seconds as a float, or ``date`` itself when there is
        nothing to parse.

    Raises:
        ValueError: If the string does not consist of four integer fields.
    """
    if not isinstance(date, str) or date == "":
        return date
    hours, minutes, seconds, centiseconds = (int(part) for part in date.split(':'))
    delta = timedelta(
        hours=hours,
        minutes=minutes,
        seconds=seconds,
        milliseconds=10 * centiseconds,  # 1 centisecond = 10 milliseconds
    )
    return delta.total_seconds()
# Convert both time columns to seconds, one after the other.
for column in ('Start', 'End'):
    data[column] = data[column].apply(to_seconds)
Thank you so much for your help.
Your method was working. I also found a method using loop:
To summarize, my general problem was that I had an ugly CSV file that I wanted to transform into a CSV usable for doing statistics, and I wanted to use Python to do that.
my csv file was like:
MiceID = 1 Beginning End Type of behavior
0 0:00:00:00 Video start
1 0:00:01:36 grooming type 1
2 0:00:03:18 grooming type 2
3 0:00:06:73 0:00:08:16 grooming type 1
So in my ugly csv file I was writing only the moment of the begining of the behavior type without the end when the different types of behaviors directly followed each other, and I was writing the moment of the end of the behavior when the mice stopped to make any grooming, that allowed me to separate sequences of grooming. But this type of csv was not usable for easily making statistics.
So I wanted to 1) transform all my values into seconds to have a correct format, 2) fill the gaps in the End column (a gap has to be filled with the following Beginning value, since the end of a behavior within a sequence is the beginning of the next one), 3) create a column for the duration of each behavior, and finally 4) fill this new column with the durations.
My questionning was about the first step, but I put here the code for each step separately:
step 1: transform the values in a good format
import pandas as pd
import numpy as np
def _timestamp_to_seconds_text(cell):
    """Return an "H:MM:SS:CC" cell as seconds (as text); pass anything else through."""
    if not isinstance(cell, str):
        return cell
    fields = cell.split(":")
    # Only convert real timestamps; other text that merely contains ":" is
    # left untouched instead of crashing in int().
    if len(fields) != 4 or not all(field.isdigit() for field in fields):
        return cell
    hours, minutes, seconds, centiseconds = (int(field) for field in fields)
    return str((hours * 3600) + (minutes * 60) + seconds + (centiseconds / 100))


data = pd.read_csv("D:/Python/TestPythonTraitementDonnéesExcel/RawDataBatch2et3.csv", engine = "python")
# Empty cells are represented by "" in all following steps.
data.replace(np.nan, "", inplace = True)
# Convert every cell in one pass per column. The original re-ran a
# whole-frame replace plus a no-op ``data.update(data)`` for every single
# timestamp cell.
data = data.apply(lambda column: column.map(_timestamp_to_seconds_text))
print(data)
step 2: fill the gaps
# Fill the gaps of every End column (positions 2, 6, 10, ...): an empty End
# takes the value found one row below in the column immediately to its left
# (the next Beginning). The last row has no following row and is left alone.
# Dropped from the original: the per-cell ``data.update(data)`` (a no-op) and
# an ``elif`` that could never be true (it was only reached when the current
# cell was non-empty, yet required the current cell to be empty).
last_row = len(data.index) - 1
for j in range(2, len(data.columns), 4):
    for i in range(last_row):
        if data.iloc[i, j] == "":
            data.iloc[i, j] = data.iloc[i + 1, j - 1]
print(data)
step 3: create a new column for each mouse:
# Insert an empty "Duree<k>" column at positions 4, 9, 14, ... The column
# count grows with every insert, so the loop condition is re-evaluated against
# the current width each time.
# Dropped from the original: ``data.update(data)``, which updates the frame
# with itself and changes nothing.
j = 1
k = 0
while k < len(data.columns) - 1:
    k = (j * 4) + (j - 1)
    data.insert(k, f"Duree{k}", "")
    j += 1
print(data)
step 4: fill the new columns with the durations
# Fill each duration column (positions 4, 9, 14, ...) with the difference of
# the columns two and three places to its left, stored as text. Filling
# stops at the first row whose cell two places to the left is empty.
# Dropped from the original: the per-cell ``data.update(data)`` (a no-op).
for j in range(4, len(data.columns), 5):
    for i in range(len(data.index)):
        if data.iloc[i, j - 2] == "":
            break
        data.iloc[i, j] = str(float(data.iloc[i, j - 2]) - float(data.iloc[i, j - 3]))
print(data)
And of course, export my new usable dataframe
# Write the cleaned frame to disk with a header row and without the index.
data.to_csv(r"D:/Python/TestPythonTraitementDonnéesExcel/FichierPropre.csv", index = False, header = True)
here are the transformations:
click on the links for the pictures
before step1
after step 1
after step 2
after step 3
after step 4
Here is my code
# Module-level counter the function below tries to increment.
count = 0
def selectionSort(data):
    # Sorts ``data`` in place with selection sort and returns it.
    # NOTE: ``count += 1`` assigns to ``count`` inside the function, which
    # makes it a local name here; it is read before any local assignment and
    # there is no ``global count``, so the call fails with "local variable
    # 'count' referenced before assignment".
    for index in range(len(data)):
        # Position of the smallest element seen so far (shadows builtin min).
        min = index
        count += 1
        # Find the index'th smallest element
        for scan in range(index + 1, len(data)):
            if (data[scan] < data[min]):
                min = scan
        if min != index: # swap the elements
            data[index], data[min] = data[min], data[index]
    return data
data = selectionSort([3,4,5,2,6])
print(count, data)
Your code as-is should not run. You should get local variable 'count' referenced before assignment.
To fix this, add the following to the top of selectionSort(data):
global count
A better way is to scrap the global variable and return count alongside the sorted data:
def selectionSort(data):
    """Sort ``data`` in place with selection sort.

    Args:
        data: A mutable sequence of mutually comparable items.

    Returns:
        A ``(count, data)`` tuple: ``count`` is the number of passes of the
        outer loop (one per position, not one per comparison) and ``data`` is
        the same, now sorted, sequence object.
    """
    count = 0
    size = len(data)
    for index in range(size):
        count += 1
        # Position of the first smallest element in the unsorted tail.
        smallest = min(range(index, size), key=data.__getitem__)
        if smallest != index:  # swap the elements
            data[index], data[smallest] = data[smallest], data[index]
    return count, data


count, data = selectionSort([3, 4, 5, 2, 6])
print(count, data)
Last but not least, you are counting something other than comparisons. I leave fixing that as an exercise for the reader.