excluding count from dataFrame if value is 0 - python

I have the code below:
import pandas as pd
import numpy as np
# Read data from file 'filename.csv'
# (in the same directory that your python process is based)
# Control delimiters, rows, column names with read_csv (see later)
df = pd.read_csv("../example.csv", parse_dates=['DateTime'])
# Preview the first 5 lines of the loaded data
df = df.assign(Burned = df['Quantity'])
df.loc[df['To'] != '0x0000000000000000000000000000000000000000', 'Burned'] = 0.0
# OR:
df['cum_sum'] = df['Burned'].cumsum()
df['percent_burned'] = df['cum_sum']/df['Quantity'].max()*100.0
per_day = df.groupby(df['DateTime'].dt.date)['Burned'].count().reset_index(name='Trx')
The last line creates a DataFrame called per_day which groups all the transactions from the df DataFrame by day. Many of the transactions in df have 'Burned' = 0. I want to count the total number of transactions, which is what my current code does, but I want to exclude transactions in which 'Burned' = 0.
Also, how can I consolidate these three lines of code? I don't even want to create per_day_burned. How do I just make this its own column in per_day without doing it the way I did it?
per_day = dg.groupby(dg['DateTime'].dt.date)['Burned'].count().reset_index(name='Trx')
per_day_burned = dg.groupby(dg['DateTime'].dt.date)['Burned'].sum().reset_index(name='Burned')
per_day['Burned'] = per_day_burned['Burned']
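One way to do both at once (a minimal sketch, not from the original thread, assuming the column names used above and df in place of dg) is a single groupby with named aggregation, where the transaction count only looks at rows whose Burned value is non-zero:
# group once per day; count only non-zero Burned rows and sum Burned in the same pass
per_day = (
    df.groupby(df['DateTime'].dt.date)
      .agg(Trx=('Burned', lambda s: (s != 0).sum()),   # transactions with Burned != 0
           Burned=('Burned', 'sum'))                    # total Burned per day
      .reset_index()
)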


updated question
By using the code below I am able to access the dataframe only after the for loop completes, but I want to use the most recently created column of the dataframe at intermediate times, i.e. after every 5 minutes, whichever is the last column of the dataframe. How can I achieve this?
@app.route("/sortbymax")
def sortbymax():
    df = updated_data()
    #### here i want to use most recently created column
    df = create_links(df)
    df = df.sort_values(by=['perc_change'], ascending=False)
    return render_template('sortbymax.html', tables=[df.to_html(escape=False)], titles=df.columns.values)

def read_data():
    filename = r'c:\Users\91956\Desktop\bk.xlsm'
    df = pd.read_excel(filename)
    return df

def updated_data():
    df = read_data()
    for i in range(288):
        temp = read_data()
        x = datetime.datetime.now().strftime("%H:%M:%S")
        df['perc_change_' + x] = temp['perc_change']
        time.sleep(300)
    return df
I see you have an .xlsm file, which is a macro-enabled Excel workbook. You can read it, but if you want to change it with Python you will most probably lose the macro part of your Excel file.
For the Python part:
This will copy the perc_change column every 5 minutes, under a timestamped name. However, bear in mind that this will only work for one day (it will start replacing existing columns after that). If you want it to work over longer periods, let me know and I will add the day-month-year (whatever you want) to the column names.
import datetime
import time
import pandas as pd

def read_data():
    filename = r'c:\Users\91956\Desktop\bk.xlsm'
    df = pd.read_excel(filename)
    return df

def write_data(df):
    filename = r'c:\Users\91956\Desktop\bk.xlsm'
    df.to_excel(filename)

df = read_data()  # read the Excel file for the first time
for i in range(288):  # this will run for exactly one day
    temp = read_data()
    x = datetime.datetime.now().strftime("%H:%M")
    df['perc_change_' + x] = temp['perc_change']
    time.sleep(300)
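The route in the question cannot see intermediate results because updated_data() only returns after the whole loop has finished. One way around that (a minimal sketch, not from the original thread, assuming read_data() as defined above) is to run the updating loop in a background thread and keep the latest snapshot in a shared variable; the most recently created column is then simply df.columns[-1]:
import threading
import datetime
import time

latest_df = None            # shared state holding the most recent DataFrame
lock = threading.Lock()     # protects access from the Flask route

def update_loop():
    global latest_df
    df = read_data()        # read_data() as defined in the question
    for i in range(288):
        temp = read_data()
        x = datetime.datetime.now().strftime("%H:%M:%S")
        df['perc_change_' + x] = temp['perc_change']
        with lock:
            latest_df = df.copy()   # publish the current snapshot
        time.sleep(300)

# start the updater once, e.g. at application start-up
threading.Thread(target=update_loop, daemon=True).start()

# inside the /sortbymax route, read the snapshot instead of calling updated_data():
#     with lock:
#         df = latest_df.copy()
#     last_col = df.columns[-1]     # most recently created column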

How to obtain the mean of selected columns from multiple sheets within same Excel File

I am working with a large Excel file that has 22 sheets, where each sheet has the same column headings but not the same number of rows. I would like to obtain the mean values (excluding zeros) for columns AA to AX for all 22 sheets. The columns have titles, which I use in my code.
Rather than reading each sheet separately, I want to loop through the sheets and get the mean values as output.
With help from answers to other posts, I have this:
import pandas as pd
xls = pd.ExcelFile('myexcelfile.xlsx')
xls.sheet_names
#print(xls.sheet_names)
out_df = pd.DataFrame()
for sheets in xls.sheet_names:
    df = pd.read_excel('myexcelfile.xlsx', sheet_names=None)
    df1 = df[df[:] != 0]
    df2 = df1.loc[:, 'aa':'ax'].mean()
    out_df.append(df2)  ## This will append rows of one dataframe to another (just like your expected output)
print(out_df2)
## out_df will have data from all the sheets
The code works so far, but only for one of the sheets. How do I get it to work for all 22 sheets?
You can use NumPy to perform basic math on a pandas.DataFrame or pandas.Series. Take a look at my code below:
import pandas as pd, numpy as np
XL_PATH = r'C:\Users\YourName\PythonProject\Book1.xlsx'
xlFile = pd.ExcelFile(XL_PATH)
xlSheetNames = xlFile.sheet_names
dfList = [] # variable to store all DataFrame
for shName in xlSheetNames:
    df = pd.read_excel(XL_PATH, sheet_name=shName)  # read sheet X as DataFrame
    dfList.append(df)                               # put DataFrame into a list
for df in dfList:
    print(df)
    dfAverage = np.average(df)  # use numpy to get DataFrame average
    print(dfAverage)
#Try the code below
import pandas as pd, numpy as np, os

XL_PATH = "YOUR EXCEL FULL PATH"
SH_NAMES = []   # will contain the list of Excel sheet names
DF_DICT = {}    # will contain a dictionary of DataFrames

def readExcel():
    if not os.path.isfile(XL_PATH):
        raise FileNotFoundError(XL_PATH)
    sh_names = pd.ExcelFile(XL_PATH).sheet_names
    # pandas.read_excel() has a 'sheet_name' argument;
    # when you pass a list to 'sheet_name',
    # pandas returns a dictionary of DataFrames with the sheet names as keys
    df_dict = pd.read_excel(XL_PATH, sheet_name=sh_names)
    return sh_names, df_dict

SH_NAMES, DF_DICT = readExcel()
#Now you have DF_DICT, which contains a DataFrame for each sheet in the Excel file
#Next step is to append all rows of data from Sheet1 to SheetX
#This will only work if all DataFrames have the same columns
def appendAllSheets():
    dfAp = pd.DataFrame()
    for sheet in DF_DICT:
        df = DF_DICT[sheet]
        dfAp = dfAp.append(df)
    return dfAp

#you can now call the function as below:
dfWithAllData = appendAllSheets()
#now you have one DataFrame with all rows combined from Sheet1 to SheetX
#you can clean the data, for example drop all rows which contain 0
dfZero_Removed = dfWithAllData[dfWithAllData['Column_Name'] != 0]
dfNA_Removed = dfWithAllData[dfWithAllData['Column_Name'].notna()]
#last step, to find the average or another math operation, just let numpy do the job
average_of_all_1 = np.average(dfZero_Removed)
average_of_all_2 = np.average(dfNA_Removed)
#show result: the average of all rows of data from Sheet1 to SheetX of your target Excel file
print(average_of_all_1, average_of_all_2)
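For the specific question (a per-sheet mean of columns AA to AX, excluding zeros), a more direct route is to read every sheet at once with sheet_name=None and let pandas do the averaging. A minimal sketch, assuming the column labels really are 'aa' through 'ax' as in the question's code:
import pandas as pd
import numpy as np

# read every sheet into a dict of DataFrames keyed by sheet name
sheets = pd.read_excel('myexcelfile.xlsx', sheet_name=None)

out_df = pd.DataFrame({
    name: df.loc[:, 'aa':'ax']      # keep only the columns of interest
            .replace(0, np.nan)     # treat zeros as missing so they are excluded
            .mean()                 # column-wise mean; NaNs are skipped by default
    for name, df in sheets.items()
}).T                                # one row per sheet, one column per heading

print(out_df)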

Pandas DataFrame takes too long

I am running the code below on a file with close to 300k lines. I know my code is not very efficient, as it takes forever to finish; can anyone advise me on how I can speed it up?
import sys
import numpy as np
import pandas as pd
file = sys.argv[1]
df = pd.read_csv(file, delimiter=' ',header=None)
df.columns = ["ts", "proto", "orig_bytes", "orig_pkts", "resp_bytes", "resp_pkts", "duration", "conn_state"]
orig_bytes = np.array(df['orig_bytes'])
resp_bytes = np.array(df['resp_bytes'])
size = np.array([])
ts = np.array([])
for i in range(len(df)):
    if orig_bytes[i] > resp_bytes[i]:
        size = np.append(size, orig_bytes[i])
        ts = np.append(ts, df['ts'][i])
    else:
        size = np.append(size, resp_bytes[i])
        ts = np.append(ts, df['ts'][i])
The aim is to only record instances where one of the two (orig_bytes or resp_bytes) is the larger one.
Thank you all for your help.
I can't guarantee that this will run faster than what you have, but it is a more direct route to where you want to go. Also, I'm assuming based on your example that you don't want to keep instances where the two byte values are equal and that you want a separate DataFrame in the end, not a new column in the existing df:
After you've created your DataFrame and renamed the columns, you can use query to drop all the instances where orig_bytes and resp_bytes are the same, create a new column with the max value of the two, and then narrow the DataFrame down to just the two columns you want.
df = pd.read_csv(file, delimiter=' ',header=None)
df.columns = ["ts", "proto", "orig_bytes", "orig_pkts", "resp_bytes", "resp_pkts", "duration", "conn_state"]
df_new = df.query("orig_bytes != resp_bytes")
df_new['biggest_bytes'] = df_new[['orig_bytes', 'resp_bytes']].max(axis=1)
df_new = df_new[['ts', 'biggest_bytes']]
If you do want to include the entries where they are equal to each other, then just skip the query step.
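As a usage sketch of that last variant (keeping ties, which is what the original loop does), the whole loop collapses to two vectorized lines, using the column names from the question:
# 'biggest_bytes' is the larger of the two byte counts for every row
df['biggest_bytes'] = df[['orig_bytes', 'resp_bytes']].max(axis=1)
result = df[['ts', 'biggest_bytes']]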

add column to a dataframe in python pandas

How do I loop through my Excel sheets and add each 'Adjusted Close' to a dataframe? I want to combine all the Adj Close values and make a stock index.
When I try with the code below, the dataframe Percent_Change is empty.
xls = pd.ExcelFile('databas.xlsx')
countSheets = len(xls.sheet_names)
Percent_Change = pd.DataFrame()
x = 0
for x in range(countSheets):
    data = pd.read_excel('databas.xlsx', sheet_name=x, index_col='Date')
    # Calculate the percent change from day to day
    Percent_Change[x] = pd.Series(data['Adj Close'].pct_change()*100, index=Percent_Change.index)
    stock_index = data['Percent_Change'].cumsum()
Unfortunately I do not have the data to replicate your complete example. However, there appears to be a bug in your code.
You are looping over "x", where "x" is just a range of integers. You probably want to loop over the sheet names and append them to your DataFrame. If you want to do that, your code should be:
import pandas as pd
xls = pd.ExcelFile('databas.xlsx')
# pep8 unto thyself only, it is conventional to use "_" instead of camelCase or to avoid longer names if at all possible
sheets = xls.sheet_names
Percent_Change = pd.DataFrame()
# using sheet instead of x is more "pythonic"
for sheet in sheets:
    data = pd.read_excel('databas.xlsx', sheet_name=sheet, index_col='Date')
    # Calculate the percent change from day to day
    Percent_Change[sheet] = pd.Series(data['Adj Close'].pct_change()*100, index=Percent_Change.index)
    stock_index = data['Percent_Change'].cumsum()
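Two details from the question carry over into that snippet: index=Percent_Change.index re-indexes each series against an initially empty index (which is why Percent_Change ends up empty), and data['Percent_Change'] is not a column of data, so the last line would raise a KeyError. A minimal sketch of one way around both, assuming each sheet is indexed by Date and that an equal-weight index (cumulative sum of the mean daily percent change) is what is wanted:
import pandas as pd

xls = pd.ExcelFile('databas.xlsx')
changes = {}
for sheet in xls.sheet_names:
    data = pd.read_excel('databas.xlsx', sheet_name=sheet, index_col='Date')
    changes[sheet] = data['Adj Close'].pct_change() * 100

# align every sheet on its dates; columns are named after the sheets
Percent_Change = pd.concat(changes, axis=1)
# one possible equal-weight stock index: cumulative sum of the mean daily percent change
stock_index = Percent_Change.mean(axis=1).cumsum()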

How to get rid of "changing" rows above headers (length changes every time but headers and data are always the same)

I have a CSV file (shown as a screenshot in the original post) with about 6-8 rows at the top before the headers. I know how to make a new DataFrame in Pandas and filter the data:
df = pd.read_csv('payments.csv')
df = df[df["type"] == "Order"]
print df.groupby('sku').size()
df = df[df["marketplace"] == "amazon.com"]
print df.groupby('sku').size()
df = df[df["promotional rebates"] > ((df["product sales"] + df["shipping credits"])*-.25)]
print df.groupby('sku').size()
df.to_csv("out.csv")
My issue is with the headers. I need to:
1. Look for the row that has date/time and another field, so that I do not have to change my code if the file keeps changing the number of rows before the headers.
2. Make a new DataFrame excluding those rows.
What is the best approach to make sure the code does not break with these changes, as long as the header row exists and has a few matching fields? Open to any suggestions.
Considering a CSV file like this:
random line content
another random line
yet another one
datetime, settelment id, type
dd, dd, dd
You can use the following to compute the header's line number:
#load the first 20 rows of the csv file as a one column dataframe
#to look for the header
df = pd.read_csv("csv_file.csv", sep="|", header=None, nrows=20)
# use a regular expression to check which row contains the header;
# the following will generate an array of booleans,
# with True if the row contains the regex "datetime.+settelment id.+type"
indices = df.iloc[:,0].str.contains("datetime.+settelment id.+type")
# get the row index of the header
header_index = df[indices].index.values[0]
and read the csv file starting from the header's index:
# to read the csv file, use the following:
df = pd.read_csv("csv_file.csv", skiprows=header_index+1)
Reproducible example:
import pandas as pd
from StringIO import StringIO
st = """
random line content
another random line
yet another one
datetime, settelment id, type
dd, dd, dd
"""
df = pd.read_csv(StringIO(st), sep="|", header=None, nrows=20)
indices = df.iloc[:,0].str.contains("datetime.+settelment id.+type")
header_index = df[indices].index.values[0]
df = pd.read_csv(StringIO(st), skiprows=header_index+1)
print(df)
print("columns")
print(df.columns)
print("shape")
print(df.shape)
Output:
datetime settelment id type
0 dd dd dd
columns
Index([u'datetime', u' settelment id', u' type'], dtype='object')
shape
(1, 3)
