KeyError for Column When Trying to Subset - python

I'm trying to subset my data by the value of a column using the typical pandas protocol:
df[df[column_name] == "value"]
But I keep getting a KeyError for "Product (ACQ Search) - ONC". I also found that checking the column names with df.columns shows only the 4 columns I renamed at a different point in the script. Why do I keep getting a KeyError?
Here's my code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# NOTE(review): `pd` is used throughout but pandas is never imported in this
# quote — presumably `import pandas as pd` was dropped when pasting. Confirm.
# Load the excel file into a dataframe
df = pd.read_excel("Marginal CPA data - NOV.xlsx")
# Delete the bottom row
# (assumes the last row is a totals/footer row — TODO confirm against the file)
df = df[:-1]
# Filter the column labeled "Campaign Type (Search ACQ) - ONC" to keep only rows with value "NonBrand"
df = df[df["Campaign Type (Search ACQ) - ONC"] == "NonBrand"]
# Parse "Day" strings like "11 Jan 2023" into datetimes, then drop the raw column
df["Date"] = pd.to_datetime(df["Day"], format="%d %b %Y")
df = df.drop("Day", axis=1)
# Make a pivot table
# The three `index=` columns become LEVELS OF THE ROW INDEX of the result —
# after this call they are no longer ordinary columns.
pivot_table = pd.pivot_table(df, values=["Media Cost", "CAFE Approvals"],
                             index=["Campaign Type (Search ACQ) - ONC", "Product (ACQ Search) - ONC", "Date"],
                             columns=["CDJ"], aggfunc="sum")
df_pivot = pivot_table.fillna(value=0)
# Reset the column index to a single level
df_pivot.columns = ["_".join(col) for col in df_pivot.columns]
# Friendly names for the flattened "value_CDJ" column labels
cols = {
    "Media Cost_CPA": "CPA Spend",
    "Media Cost_Non CPA (CDJ)": "CDJ Spend",
    "CAFE Approvals_CPA": "CPA Approvals",
    "CAFE Approvals_Non CPA (CDJ)": "CDJ Approvals"
}
df_pivot = df_pivot.rename(columns=cols)
# Add two new columns for Total Spend and Total Approvals
df_pivot["Total Approvals"] = df_pivot["CPA Approvals"] + df_pivot["CDJ Approvals"]
df_pivot["Total Spend"] = df_pivot["CPA Spend"] + df_pivot["CDJ Spend"]
#Remove data for days where spend is zero
df_pivot = df_pivot[df_pivot["CPA Spend"] != 0]
df_pivot = df_pivot[df_pivot["Total Approvals"] != 0]
#Sort Date and Product
# NOTE(review): two successive single-key sorts do not give a combined
# (Product, Date) ordering — consider sort_values(["Product (ACQ Search) - ONC", "Date"]).
df_pivot = df_pivot.sort_values("Date", ascending=True)
df_pivot = df_pivot.sort_values("Product (ACQ Search) - ONC", ascending=True)
df_pivot.to_excel("Marginal CPA data - NOV (processed).xlsx")
# filter the data to only include rows where "Product (ACQ Search) - ONC" is "Consumer"
# This line raises KeyError: "Product (ACQ Search) - ONC" is an index LEVEL of
# df_pivot (set by pivot_table's index= argument), not a column, so
# df_pivot[...] cannot look it up — this is the bug the question is about.
consumer_data = df_pivot[df_pivot["Product (ACQ Search) - ONC"] == "Consumer"]
Data:
Campaign Type (Search ACQ) - ONC
Product (ACQ Search) - ONC
CDJ
Day
Media Cost
CAFE Approvals
NonBrand
Consumer
CPA
11 Jan 2023
29019.77415
94
NonBrand
Consumer
Non CPA (CDJ)
17 Jan 2023
24640.36448
86
NonBrand
Consumer
Non CPA (CDJ)
12 Jan 2023
23627.78256
78
NonBrand
Student
CPA
17 Jan 2023
29863.95447
152
NonBrand
Miles
CPA
23 Jan 2023
380.94
1
NonBrand
Miles
CPA
07 Jan 2023
1786.51
5
NonBrand
Consumer
CPA
19 Jan 2023
26745.81705
64
NonBrand
Secured
CPA
20 Jan 2023
1551.35
19
NonBrand
Consumer
Non CPA (CDJ)
02 Feb 2023
41185.11225
66
NonBrand
Student
CPA
08 Jan 2023
42822.8508
171
NonBrand
Student
CPA
16 Jan 2023
29408.66012
160
NonBrand
Consumer
CPA
17 Jan 2023
29378.05227
85
NonBrand
Miles
CPA
10 Jan 2023
2019.25
4
NonBrand
Miles
CPA
11 Jan 2023
1604.98
4
NonBrand
Secured
CPA
21 Jan 2023
1704.13419
22

The problem is you are trying to slice your data using a column that is actually an index.
You can slice your MultiIndex data frame by using pd.IndexSlice and passing "Consumer" to the second level:
# Slice on the second level ("Product (ACQ Search) - ONC") of the three-level
# row MultiIndex; ':' keeps every value of the other two levels.
idx = pd.IndexSlice
df_pivot.loc[idx[:, "Consumer", :]]
Which returns the following:
CPA Approvals
CDJ Approvals
CPA Spend
CDJ Spend
Total Approvals
Total Spend
('NonBrand', Timestamp('2023-01-11 00:00:00'))
94
0
29019.8
0
94
29019.8
('NonBrand', Timestamp('2023-01-17 00:00:00'))
85
86
29378.1
24640.4
171
54018.4
('NonBrand', Timestamp('2023-01-19 00:00:00'))
64
0
26745.8
0
64
26745.8
See more about advanced indexing here: https://pandas.pydata.org/docs/user_guide/advanced.html#
You can also reset the index and then subset the data in a manner similar to your last line:
# Alternative: move all index levels back to ordinary columns, then
# boolean-mask exactly as the question attempted.
df = df_pivot.reset_index()
df.loc[df["Product (ACQ Search) - ONC"] == "Consumer"]
Which returns the following:
Campaign Type (Search ACQ) - ONC
Product (ACQ Search) - ONC
Date
CPA Approvals
CDJ Approvals
CPA Spend
CDJ Spend
Total Approvals
Total Spend
0
NonBrand
Consumer
2023-01-11 00:00:00
94
0
29019.8
0
94
29019.8
1
NonBrand
Consumer
2023-01-17 00:00:00
85
86
29378.1
24640.4
171
54018.4
2
NonBrand
Consumer
2023-01-19 00:00:00
64
0
26745.8
0
64
26745.8
The first method loses the first two indices, while the second method preserves all of the data.

Related

Python // Pandas: Column Won't Rename

The "Day" column won't rename. Could it have something to do with the "Day" column being an index and not a column? Here's my code and a sample of the unprocessed data:
import pandas as pd
import numpy as np
# Load the excel file into a dataframe
df = pd.read_excel("Marginal CPA data - NOV.xlsx")
# Delete the bottom row
df = df[:-1]
# Filter the column labeled "Campaign Type (Search ACQ) - ONC" to keep only rows with value "NonBrand"
df = df[df["Campaign Type (Search ACQ) - ONC"] == "NonBrand"]
# Make a pivot table.  "Day" is passed via index=, so in the result it is a
# ROW-INDEX LEVEL, not a column.
pivot_table = pd.pivot_table(df, values=["Media Cost", "CAFE Approvals"],
                             index=["Campaign Type (Search ACQ) - ONC", "Product (ACQ Search) - ONC", "Day"],
                             columns=["CDJ"], aggfunc="sum")
df_pivot = pivot_table.fillna(value=0)
# Reset the column index to a single level
df_pivot.columns = ["_".join(col) for col in df_pivot.columns]
# Rename columns
# NOTE(review): rename(columns=...) only relabels COLUMN labels; the
# "Day": "Date" entry targets an index level, so it is silently ignored —
# this is the bug the question is about (rename_axis handles index levels).
df_pivot = df_pivot.rename(columns={"Media Cost_CPA": "CPA Spend", "Media Cost_Non CPA (CDJ)": "CDJ Spend",
                                    "CAFE Approvals_CPA": "CPA Approvals", "CAFE Approvals_Non CPA (CDJ)": "CDJ Approvals",
                                    "Day": "Date"})
Campaign Type (Search ACQ) - ONC
Product (ACQ Search) - ONC
CDJ
Day
Media Cost
CAFE Approvals
NonBrand
Consumer
CPA
11 Jan 2023
29019.77415
94
NonBrand
Consumer
Non CPA (CDJ)
17 Jan 2023
24640.36448
86
NonBrand
Consumer
Non CPA (CDJ)
12 Jan 2023
23627.78256
78
NonBrand
Student
CPA
17 Jan 2023
29863.95447
152
NonBrand
Miles
CPA
23 Jan 2023
380.94
1
NonBrand
Miles
CPA
07 Jan 2023
1786.51
5
NonBrand
Consumer
CPA
19 Jan 2023
26745.81705
64
NonBrand
Secured
CPA
20 Jan 2023
1551.35
19
NonBrand
Consumer
Non CPA (CDJ)
02 Feb 2023
41185.11225
66
rename_axis might work better for a MultiIndex:
# Column-label renames go through rename(columns=...); the index-LEVEL name
# ("Day" -> "Date") must go through rename_axis(index=...) instead.
cols = {
    "Media Cost_CPA": "CPA Spend",
    "Media Cost_Non CPA (CDJ)": "CDJ Spend",
    "CAFE Approvals_CPA": "CPA Approvals",
    "CAFE Approvals_Non CPA (CDJ)": "CDJ Approvals"
}
indx = {
    "Day": "Date"
}
df_pivot = df_pivot.rename(columns=cols).rename_axis(index=indx)

Splitting single text column into multiple columns Pandas

I am working on extraction of raw data from various sources. After a process, I could form a dataframe that looked like this.
data
0 ₹ 16,50,000\n2014 - 49,000 km\nJaguar XF 2.2\nJAN 16
1 ₹ 23,60,000\n2017 - 28,000 km\nMercedes-Benz CLA 200 CDI Style, 2017, Diesel\nNOV 26
2 ₹ 26,00,000\n2016 - 44,000 km\nMercedes Benz C-Class Progressive C 220d, 2016, Diesel\nJAN 03
I want to split this raw dataframe into relevant columns in order of the raw data occurence: Price, Year, Mileage, Name, Date
I have tried to use df.data.split('-', expand=True) with other delimiter options sequentially along with some lambda functions to achieve this, but haven't gotten much success.
Need assistance in splitting this data into relevant columns.
Expected output:
price year mileage name date
16,50,000 2014 49000 Jaguar 2.2 XF Luxury Jan-17
23,60,000 2017 28000 CLA CDI Style Nov-26
26,00,000 2016 44000 Mercedes C-Class C220d Jan-03
Try split on '\n' then on '-'
# First split on '\n' (four parts: price, "year - mileage", name, date),
# then split the middle part on '-' into Year and Mileage.
df[["Price","Year-Mileage","Name","Date"]] =df.data.str.split('\n', expand=True)
df[["Year","Mileage"]] =df ["Year-Mileage"].str.split('-', expand=True)
# Drop the raw column and the intermediate combined column.
df.drop(columns=["data","Year-Mileage"],inplace=True)
print(df)
Price Name Date Year Mileage
0 ₹ 16,50,000 Jaguar XF 2.2 JAN 16 2014 49,000 km
2 ₹ 26,00,000 Mercedes Benz C-Class Progressive C 220d, 2016, Diesel JAN 03 2016 44,000 km
1 ₹ 23,60,000 Mercedes-Benz CLA 200 CDI Style, 2017, Diesel NOV 26 2017 28,000 km

Pandas.read_html only getting header of html table

So I'm using pandas.read_html to try to get a table from a website. For some reason it's not giving me the entire table and it's just getting the header row. How can I fix this?
Code:
import pandas as pd
term_codes = {"fall":"10", "spring":"20", "summer":"30"}
# year must be last number in school year: 2021-2022 so we pick 2022
year = "2022"
department = "CSCI"
term_code = year + term_codes["fall"]
url = "https://courselist.wm.edu/courselist/courseinfo/searchresults?term_code=" + term_code + "&term_subj=" + department + "&attr=0&attr2=0&levl=0&status=0&ptrm=0&search=Search"
def findCourseTable():
    # read_html fetches the page and returns a list of DataFrames, one per
    # <table>.  Here only the header row comes back — presumably the page's
    # HTML defeats the default parser (TODO confirm).
    dfs = pd.read_html(url)
    print(dfs[0])
    #df = dfs[1]
    #df.to_csv(r'courses.csv', index=False)
if __name__ == "__main__":
    findCourseTable()
Output:
Empty DataFrame
Columns: [CRN, COURSE ID, CRSE ATTR, TITLE, INSTRUCTOR, CRDT HRS, MEET DAY:TIME, PROJ ENR, CURR ENR, SEATS AVAIL, STATUS]
Index: []
The page contains malformed HTML code, so use flavor="html5lib" in pd.read_html to read it correctly:
import pandas as pd

term_codes = {"fall": "10", "spring": "20", "summer": "30"}
# year must be last number in school year: 2021-2022 so we pick 2022
year = "2022"
department = "CSCI"
term_code = year + term_codes["fall"]
url = (
    "https://courselist.wm.edu/courselist/courseinfo/searchresults?term_code="
    + term_code
    + "&term_subj="
    + department
    + "&attr=0&attr2=0&levl=0&status=0&ptrm=0&search=Search"
)
# flavor="html5lib" selects the more forgiving html5lib parser, which copes
# with this page's malformed HTML (requires the html5lib package installed).
df = pd.read_html(url, flavor="html5lib")[0]
print(df)
Prints:
CRN COURSE ID CRSE ATTR TITLE INSTRUCTOR CRDT HRS MEET DAY:TIME PROJ ENR CURR ENR SEATS AVAIL STATUS
0 16064 CSCI 100 01 C100, NEW Reading#Russia Willner, Dana; Prokhorova, Elena 4 MWF:1300-1350 10 10 0* CLOSED
1 14614 CSCI 120 01 NaN A Career in CS? And Which One? Kemper, Peter 1 M:1700-1750 36 20 16 OPEN
2 16325 CSCI 120 02 NEW Concepts in Computer Science Deverick, James 3 TR:0800-0920 36 25 11 OPEN
3 12372 CSCI 140 01 NEW, NQR Programming for Data Science Khargonkar, Arohi 4 MWF:0900-0950 36 24 12 OPEN
4 14620 CSCI 140 02 NEW, NQR Programming for Data Science Khargonkar, Arohi 4 MWF:1100-1150 36 27 9 OPEN
5 13553 CSCI 140 03 NEW, NQR Programming for Data Science Khargonkar, Arohi 4 MWF:1300-1350 36 25 11 OPEN
...and so on.

Combine rows from different tables based on common columns pandas

I have 3 tables/df. All have same column names. Bascially they are df for data from different months
October (df1 name)
Sales_value Sales_units Unique_Customer_id Countries Month
1000 10 4 1 Oct
20 2 4 3 Oct
November (df2 name)
Sales_value Sales_units Unique_Customer_id Countries Month
2000 1000 40 14 Nov
112 200 30 10 Nov
December (df3 name)
Sales_value Sales_units Unique_Customer_id Countries Month
20009090 4809509 4500 30 Dec
etc. This is dummy data. Each table has thousands of rows in reality. How to combine all these 3 tables such that columns come only once and all rows are displayed such that rows from October df come first, followed by November df rows followed by December df rows. When i use joins I am getting column names repeated.
Expected output:
Sales_value Sales_units Unique_Customer_id Countries Month
1000 10 4 1 Oct
20 2 4 3 Oct
2000 1000 40 14 Nov
112 200 30 10 Nov
20009090 4809509 4500 30 Dec
Concat combines rows from different tables based on common columns
pd.concat([df1, df2, df3])

In a pandas dataframe, count the number of times a condition occurs in one column?

Background
I have five years of NO2 measurement data, in csv files-one file for every location and year. I have loaded all the files into pandas dataframes in the same format:
Date Hour Location NO2_Level
0 01/01/2016 00 Street 18
1 01/01/2016 01 Street 39
2 01/01/2016 02 Street 129
3 01/01/2016 03 Street 76
4 01/01/2016 04 Street 40
Goal
For each dataframe count the number of times NO2_Level is greater than 150 and output this.
So I wrote a loop that creates all the dataframes from the right directories and cleans them appropriately.
Problem
Whatever I've tried produces results I know on inspection are incorrect, e.g :
-the count value for every location on a given year is the same (possible but unlikely)
-for a year when I know the count should be a positive number, every location returns 0
What I've tried
I have tried a lot of approaches to getting this value for each dataframe, such as making the column a series:
# Wrap the column in a Series (redundant — df['NO2_Level'] already is one)
# and count entries above the threshold: summing a boolean mask counts Trues.
# FIX: the original line ended with a stray ''' which opens an unterminated
# triple-quoted string and makes the snippet a SyntaxError.
NO2_Level = pd.Series(df['NO2_Level'])
count = (NO2_Level > 150).sum()
Using pd.count():
count = df[df['NO2_Level'] >= 150].count()
These two approaches have gotten closest to what I want to output
Example to test on
# Build a small demo frame and count NO2 readings above 150.
data = {'Date': ['01/01/2016','01/02/2016',' 01/03/2016', '01/04/2016', '01/05/2016'], 'Hour': ['00', '01', '02', '03', '04'], 'Location': ['Street','Street','Street','Street','Street',], 'NO2_Level': [18, 39, 129, 76, 40]}
# FIX: the original passed data=d, but the dict is named `data` — that
# raises NameError: name 'd' is not defined.
df = pd.DataFrame(data=data)
NO2_Level = pd.Series(df['NO2_Level'])
# Boolean mask summed: True counts as 1, so this is "# of readings > 150".
count = (NO2_Level > 150).sum()
count
Expected Outputs
So from this I'm trying to get it to output a single line for each dataframe that was made in the format Location, year, count (of condition):
Kirkstall Road,2013,47
Haslewood Close,2013,97
...
Jack Lane Hunslet,2015,158
So the above example would produce
Street, 2016, 1
Actual
Every year produces the same result for each location, for some years (2014) the count doesn't seem to work at all when on inspection there should be:
Kirkstall Road,2013,47
Haslewood Close,2013,47
Tilbury Terrace,2013,47
Corn Exchange,2013,47
Temple Newsam,2014,0
Queen Street Morley,2014,0
Corn Exchange,2014,0
Tilbury Terrace,2014,0
Haslewood Close,2015,43
Tilbury Terrace,2015,43
Corn Exchange,2015,43
Jack Lane Hunslet,2015,43
Norman Rows,2015,43
Hopefully this helps.
import pandas as pd

# Small demo frame: one reading per row; exactly one N02_Level exceeds 150.
ddict = {
    'Date':['2016-01-01','2016-01-01','2016-01-01','2016-01-01','2016-01-01','2016-01-02',],
    'Hour':['00','01','02','03','04','02'],
    'Location':['Street','Street','Street','Street','Street','Street',],
    'N02_Level':[19,39,129,76,40, 151],
}
df = pd.DataFrame(ddict)
# Convert dates to datetime
df['Date'] = pd.to_datetime(df['Date'])
# Make a Year column (vectorized .dt accessor instead of a per-row apply)
df['Year'] = df['Date'].dt.strftime('%Y')
# Group by location and year, count rows where N02_Level > 150
df1 = df[df['N02_Level'] > 150].groupby(['Location','Year']).size().reset_index(name='Count')
# Iterate the results (itertuples replaces the C-style range(len(...)) loop)
for row in df1.itertuples(index=False):
    print(f'{row.Location},{row.Year},{row.Count}')
### To not use f-strings
for row in df1.itertuples(index=False):
    print('{loc},{yr},{cnt}'.format(loc=row.Location, yr=row.Year, cnt=row.Count))
Sample data:
Date Hour Location N02_Level
0 2016-01-01 00 Street 19
1 2016-01-01 01 Street 39
2 2016-01-01 02 Street 129
3 2016-01-01 03 Street 76
4 2016-01-01 04 Street 40
5 2016-01-02 02 Street 151
Output:
Street,2016,1
here is a solution with a sample generated (randomly):
def random_dates(start, end, n):
    """Return n random Timestamps uniformly drawn between start and end."""
    # Timestamp.value is nanoseconds since the epoch; // 10**9 gives seconds.
    start_u = start.value // 10 ** 9
    end_u = end.value // 10 ** 9
    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

# NOTE(review): no RNG seed is set, so this demo produces different rows on
# every run.
location = ['street', 'avenue', 'road', 'town', 'campaign']
df = pd.DataFrame({'Date' : random_dates(pd.to_datetime('2015-01-01'), pd.to_datetime('2018-12-31'), 20),
                   'Location' : np.random.choice(location, 20),
                   'NOE_level' : np.random.randint(low=130, high= 200, size=20)})
#Keep only year for Date
df['Date'] = df['Date'].dt.strftime("%Y")
print(df)
# Per (Location, year) group: count readings strictly above 150.
df = df.groupby(['Location', 'Date'])['NOE_level'].apply(lambda x: (x>150).sum()).reset_index(name='count')
print(df)
Example df generated:
Date Location NOE_level
0 2018 town 191
1 2017 campaign 187
2 2017 town 137
3 2016 avenue 148
4 2017 campaign 195
5 2018 town 181
6 2018 road 187
7 2018 town 184
8 2016 town 155
9 2016 street 183
10 2018 road 136
11 2017 road 171
12 2018 street 165
13 2015 avenue 193
14 2016 campaign 170
15 2016 street 132
16 2016 campaign 165
17 2015 road 161
18 2018 road 161
19 2015 road 140
output:
Location Date count
0 avenue 2015 1
1 avenue 2016 0
2 campaign 2016 2
3 campaign 2017 2
4 road 2015 1
5 road 2017 1
6 road 2018 2
7 street 2016 1
8 street 2018 1
9 town 2016 1
10 town 2017 0
11 town 2018 3

Categories

Resources