trying to make "inplace" work in my extension for pandas - python

I wrote the following class to remove trend from my pandas dataframe.
import pandas as pd

@pd.api.extensions.register_dataframe_accessor("detrend")
class detrend:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def percent(self, inplace=False):
        change_m = self._obj.copy()
        for name in change_m.columns:
            col = self._obj[name]
            change = (col[col.first_valid_index()] /
                      col[col.last_valid_index()]) ** (1 / (col.count() - 1))
            change_m[name] = change
            change_m[name] = change_m[name].cumprod()
        result = change_m * self._obj
        if inplace:
            self._obj = result
        return result
The method works fine and returns the correct dataframe.
The only part that is confusing me is the following:
        if inplace:
            self._obj = result
When I pass inplace=True, the changes are not saved to the original dataframe.
What am I doing wrong? How should I make inplace work in my method?
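A likely explanation, with a sketch that is not from the original thread: assigning self._obj = result only rebinds the accessor's own attribute; the DataFrame the caller holds is never touched. To honour inplace=True, the method has to mutate the existing object, for example by writing the result back column by column (and, following the pandas convention for inplace methods, returning None):

import pandas as pd

@pd.api.extensions.register_dataframe_accessor("detrend")
class detrend:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    def percent(self, inplace=False):
        change_m = self._obj.copy()
        for name in change_m.columns:
            col = self._obj[name]
            change = (col[col.first_valid_index()] /
                      col[col.last_valid_index()]) ** (1 / (col.count() - 1))
            change_m[name] = change
            change_m[name] = change_m[name].cumprod()
        result = change_m * self._obj
        if inplace:
            # Mutate the caller's DataFrame instead of rebinding self._obj:
            # column-wise assignment writes into the existing object.
            for name in result.columns:
                self._obj[name] = result[name]
            return None
        return result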

Related

Getting Records from A DataFrame, one by one

I'm implementing an API in order to get data from a database and return it according to some conditions.
I managed to create a dataframe using Pandas. Now my task is to implement a function that returns the records of the dataframe one by one, like an iterator: each time the user calls the method getRecord(self), they get the next record.
I'm having trouble implementing this method and I'd really like some help. I looked for a way to do it with Pandas functions and couldn't find one. I also thought about implementing __iter__ and __next__ in my class, but it didn't work.
Wondering if you are looking for something like this:
import pandas as pd

class DfIter:
    def __init__(self):
        self.df: pd.DataFrame = pd.DataFrame({"a": [1, 2, 3, 4, 5],
                                              "b": [2, 4, 8, 9, 5],
                                              "c": [3, 4, 5, 6, 7]})
        self.state = 0

    def getRecord(self):
        # while/else: the else branch runs once the index is exhausted.
        while self.state < self.df.shape[0]:
            _data = self.df.iloc[self.state]
            self.state += 1
            return _data
        else:
            raise IndexError("No more datapoints to return")

iter_obj = DfIter()
iter_obj.getRecord()
Perhaps you are looking for something as follows:
import pandas as pd

class LazyPandas:
    def __init__(self, df):
        self.df = df
        self.max = df.shape[0]

    def __next__(self):
        # Strict comparison: iloc[self.max] would be out of bounds.
        if self.n < self.max:
            result = self.df.iloc[self.n]
            self.n += 1
            return result
        else:
            raise StopIteration

    def __iter__(self):
        self.n = 0
        return self

df = pd.read_csv("test.csv")
lazy_df = LazyPandas(df)
i = iter(lazy_df)
print(next(i))
print(next(i))
print(next(i))
print(next(i))
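For completeness, and not from the original answers: pandas already ships row iterators, so itertuples gives the same one-record-at-a-time behaviour without any manual index bookkeeping:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
rows = df.itertuples(index=False)  # built-in lazy row iterator
print(next(rows))  # Pandas(a=1, b=4)
print(next(rows))  # Pandas(a=2, b=5)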

How to return data from one definition to another one? [duplicate]

This question already has answers here:
How do I get a result (output) from a function? How can I use the result later?
(4 answers)
Closed 1 year ago.
I am messing around with Python and am trying to make a simple data cleaning program. I'm trying to pass the title values from the read_excel function to the output function. But it keeps saying name 'title' is not defined. Here is my code:
import os
import pandas as pd
import math

class Item():
    __name = ""
    __cost = 0
    __gender = ""
    __prime = ""

    def has_all_properties(self):
        return bool(self.__name and not math.isnan(self.__cost) and self.__gender and self.__prime)

    def clean(self, wanted_cost, wanted_gender, wanted_prime):
        return bool(self.__name and self.__gender == wanted_gender and self.__cost <= wanted_cost and self.__prime == wanted_prime)

    def __init__(self, name, cost, gender, prime):
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def __eq__(self, other):
        return (self.__name == other.__name and self.__cost == other.__cost and self.__gender == other.__gender and self.__prime == other.__prime)

    def __hash__(self):
        return hash((self.__name, self.__cost, self.__gender, self.__prime))

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def tuple(self):
        return self.__name, self.__cost, self.__gender, self.__prime

def read_excel(filetype):
    cwd = os.path.abspath('')
    files = os.listdir(cwd)
    df = pd.DataFrame()
    for file in files:
        if file.endswith(filetype):
            df = df.append(pd.read_excel(file), ignore_index=True)
    df = df.where(df.notnull(), None)
    df = df[['name', 'cost', 'used_by', 'prime']]
    title = list(df.columns.values)
    print(title)
    array = df.values.tolist()
    print(array)
    return array
    return output(title)

def process(array):
    mylist = {Item(*k) for k in array}
    print(mylist)
    filtered = {obj for obj in mylist if obj.has_all_properties()}
    clean = {obj for obj in filtered if obj.clean(20, "male", "yes")}
    result = list(clean)
    print(result)

def output(where, sort_data, title):
    t_list = [obj.tuple() for obj in sort_data]
    output = pd.DataFrame(t_list, columns=title)
    output.to_excel(where, index=False, header=True)

if __name__ == "__main__":
    inputfile = read_excel('.XLSX')
    processdata = process(inputfile)
    result = output('clean_data.xlsx', processdata, title)
Can you show me what to do instead? Thank you for the help.
After you call return, your function exits, so you can't put any statement after the return.
You can return both, like this:
def read_excel(filetype):
    cwd = os.path.abspath('')
    files = os.listdir(cwd)
    df = pd.DataFrame()
    for file in files:
        if file.endswith(filetype):
            df = df.append(pd.read_excel(file), ignore_index=True)
    df = df.where(df.notnull(), None)
    df = df[['name', 'cost', 'used_by', 'prime']]
    title = list(df.columns.values)
    print(title)
    array = df.values.tolist()
    print(array)
    return array, output(title)
This will return a tuple of your values:
(array, output(title))
Right after a return statement executes, the function exits. This means that return output(title) will never actually run in your code. Also, output() doesn't return anything, and DataFrame.to_excel() only writes to an Excel file. What you want to do in read_excel() is:
def read_excel(filetype):
    cwd = os.path.abspath('')
    files = os.listdir(cwd)
    df = pd.DataFrame()
    for file in files:
        if file.endswith(filetype):
            df = df.append(pd.read_excel(file), ignore_index=True)
    df = df.where(df.notnull(), None)
    df = df[['name', 'cost', 'used_by', 'prime']]
    title = list(df.columns.values)
    print(title)
    array = df.values.tolist()
    print(array)
    output(title)
    return array
I found one of the easiest-to-understand ways of solving my issue: I broke the read_excel definition down into separate get_header and get_list definitions. Here is my solution:
import os
import pandas as pd
import math

class Item():
    __name = ""
    __cost = 0
    __gender = ""
    __prime = ""

    def has_all_properties(self):
        return bool(self.__name and not math.isnan(self.__cost) and self.__gender and self.__prime)

    def clean(self, wanted_cost, wanted_gender, wanted_prime):
        return bool(self.__name and self.__gender == wanted_gender and self.__cost <= wanted_cost and self.__prime == wanted_prime)

    def __init__(self, name, cost, gender, prime):
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def __eq__(self, other):
        return (self.__name == other.__name and self.__cost == other.__cost and self.__gender == other.__gender and self.__prime == other.__prime)

    def __hash__(self):
        return hash((self.__name, self.__cost, self.__gender, self.__prime))

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def tuple(self):
        return self.__name, self.__cost, self.__gender, self.__prime

def read_excel(filetype):
    cwd = os.path.abspath('')
    files = os.listdir(cwd)
    df = pd.DataFrame()
    for file in files:
        if file.endswith(filetype):
            df = df.append(pd.read_excel(file), ignore_index=True)
    df = df.where(df.notnull(), None)
    df = df[['name', 'cost', 'used_by', 'prime']]
    return df

def get_list(dataframe):
    array = dataframe.values.tolist()
    print(array)
    return array

def get_header(dataframe):
    title = list(dataframe.columns.values)
    print(title)
    return title

def process(array):
    mylist = {Item(*k) for k in array}
    print(mylist)
    filtered = {obj for obj in mylist if obj.has_all_properties()}
    clean = {obj for obj in filtered if obj.clean(20, "male", "yes")}
    result = list(clean)
    print(result)
    t_list = [obj.tuple() for obj in result]
    return t_list

def output(where, sort_data, title):
    output = pd.DataFrame(sort_data, columns=title)
    output.to_excel(where, index=False, header=True)

if __name__ == "__main__":
    inputfile = read_excel('.XLSX')
    array = get_list(inputfile)
    header = get_header(inputfile)
    processdata = process(array)
    result = output('clean_data.xlsx', processdata, header)
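One side note the thread itself does not raise: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on current pandas the file-collection loop needs pd.concat instead. A minimal sketch of that replacement:

import os
import pandas as pd

# Collect matching Excel files first, then concatenate once; this replaces
# the df = df.append(...) pattern used in the snippets above.
cwd = os.path.abspath('')
frames = [pd.read_excel(f) for f in os.listdir(cwd) if f.endswith('.XLSX')]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()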

How do I delete repeated elements in a dataframe made by a class-method function?

I have simple Python code for data cleaning. It imports data from Excel files that have a format like this:

product    cost     used_by    prime
name       price    gender     yes or no
name       price    gender     yes or no
... and so on
Afterward I get a mylist using the class function that looks something like this:

mylist = [Item(comic,20.0,male,yes),
          Item(paint,14.0,male,no),
          Item(pen,5.0,female,nan),
          Item(phone case,9.0,nan,no),
          Item(headphone,40.0,male,yes),
          Item(coat,nan,male,no),
          Item(comic,15.0,male,yes),
          Item(nan,15.0,male,no),
          ... and so on]
and after all the filter and cleaning I will get a result that looks like this:
result = [Item(comic,20.0,male,yes),
          Item(underwear,15.0,male,yes),
          Item(comic,15.0,male,yes),
          ...
          Item(underwear,15.0,male,yes),
          ... and so on]
Here is the code I got so far:
import os
import pandas as pd
import math

cwd = os.path.abspath('')
files = os.listdir(cwd)
df = pd.DataFrame()
for file in files:
    if file.endswith('.XLSX'):
        df = df.append(pd.read_excel(file), ignore_index=True)
df = df.where(df.notnull(), None)
array = df.values.tolist()
print(array)

class Item():
    def has_all_properties(self):
        return bool(self.__name and not math.isnan(self.__cost) and self.__gender and self.__prime)

    def clean(self):
        return bool(self.__name and self.__cost <= 20 and self.__gender == "male" and self.__prime == "yes")

    def __init__(self, name, cost, gender, prime):
        self.__name = name
        self.__cost = cost
        self.__gender = gender
        self.__prime = prime

    def __repr__(self):
        return f"Item({self.__name},{self.__cost},{self.__gender},{self.__prime})"

    def __tuple__(self):
        return self.__name, self.__cost, self.__gender, self.__prime

mylist = [Item(*k) for k in array]
filtered = filter(Item.has_all_properties, mylist)
clean = filter(Item.clean, filtered)
result = list(clean)
t_list = [obj.__tuple__() for obj in result]
print(t_list)
output = pd.DataFrame(t_list, columns=['name', 'cost', 'gender', 'prime'])
print(output)
output.to_excel('clean_data.xlsx', index=False, header=True)
In the result there are two types of repetitive data: one is like the underwear, which has two exactly identical lines; the other is like the comic, with different cost values.
So what I want to do is remove one of the exactly identical lines for case one, and keep the line with the smaller cost value for case two.
For case two, I am thinking of reading the product name to identify whether two items are the same and, if they are, comparing their costs and keeping the one with the smaller value. But I am not sure if that is the right way of doing it.
I am aware that doing it all in pandas would work, but I wish to explore the class approach and use the customised data frame.
Can you suggest how to do this?
You could use a Set instead of a list, i.e., changing myList = [Item(*k) for k in array] to mySet = {Item(*k) for k in array}.
Sets do not allow duplicates.
EDIT
Make sure to include implementations of __hash__ and __eq__ in your Item class so that set can know how to determine whether an item has a duplicate.
In your case, the __eq__ would look something like the following:
def __eq__(self, other):
    return (self.__name == other.__name and
            self.__cost == other.__cost and
            self.__gender == other.__gender and
            self.__prime == other.__prime)
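The set handles case one (exact duplicates) but not case two. A sketch for case two, not from the original answer, which keeps the cheapest item per product name using the __tuple__() helper from the question:

def dedupe_keep_cheapest(items):
    # For each product name, keep only the item with the smallest cost.
    # Assumes each item exposes (name, cost, gender, prime) via __tuple__().
    best = {}
    for item in items:
        name, cost, gender, prime = item.__tuple__()
        if name not in best or cost < best[name].__tuple__()[1]:
            best[name] = item
    return list(best.values())

result = dedupe_keep_cheapest(result)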

New pandas column based on whether functions are called

I have a class which calls functions depending on whether initial keywords are true or false. The intention is to be able to control how I create a new column in dataframe df. An abridged version of the class and functions is as follows:
class DFSetter:
    def __init__(self, justify=True, caps=True, table=True):
        if justify:
            self.set_justify()
        if caps:
            self.set_all_caps()
        if table:
            self.set_table()

    def set_justify(self):
        self.justify = (self.df['jc'] != self.df['jc'].shift())

    def set_all_caps(self):
        self.all_caps = ((self.df['caps'] == True) & (self.df['cap_diffs'] > 5))

    def set_table(self):
        self.table = ((self.df['table'] == True) & (self.df['table'].shift() == False))
Suppose I want to make a new column, row_break, in this dataframe, which will be set to True if any of the conditions are met. How can I create this new column if the call to one of the methods is switched off by initialising its keyword as False?
This is currently how I'm doing it with everything set to True:
self.df['row_break'] = (self.justify | self.all_caps | self.table | pStyle)
* UPDATE WITH ANSWER *
Initialise with an additional self.switches = {} and add self.switches.update({'item': self.item}) to each method.
Create a new dataframe from the self.switches dictionary: switches_df = pd.DataFrame(self.switches)
Set the 'row_break' column on the main self.df dataframe by checking whether any of the columns are True: self.df['row_break'] = switches_df.any(axis='columns')
i.e.:
import pandas as pd

class DFSetter:
    def __init__(self, justify=True, caps=True, table=True):
        self.switches = {}
        if justify:
            self.set_justify()
        if caps:
            self.set_all_caps()
        if table:
            self.set_table()

    def set_justify(self):
        self.justify = (self.df['jc'] != self.df['jc'].shift())
        self.switches.update({'justify': self.justify})

    def set_all_caps(self):
        self.all_caps = ((self.df['caps'] == True) & (self.df['cap_diffs'] > 5))
        self.switches.update({'caps': self.all_caps})

    def set_table(self):
        self.table = ((self.df['table'] == True) & (self.df['table'].shift() == False))
        self.switches.update({'table': self.table})

    def set_row_break(self):
        switches_df = pd.DataFrame(self.switches)
        self.df['row_break'] = switches_df.any(axis='columns')
You could initialize the new column to False and then update it for each of the conditions that are set to True.
I'm assuming here that the dataframe is one of the inputs to the class (since you use self.df).
class DFSetter:
    def __init__(self, df, justify=True, caps=True, table=True):
        self.df = df
        self.df['row_break'] = False
        if justify:
            self.set_justify()
        if caps:
            self.set_all_caps()
        if table:
            self.set_table()

    def set_justify(self):
        self.justify = (self.df['jc'] != self.df['jc'].shift())
        self.df['row_break'] = self.df['row_break'] | self.justify

    def set_all_caps(self):
        self.all_caps = ((self.df['caps'] == True) & (self.df['cap_diffs'] > 5))
        self.df['row_break'] = self.df['row_break'] | self.all_caps

    def set_table(self):
        self.table = ((self.df['table'] == True) & (self.df['table'].shift() == False))
        self.df['row_break'] = self.df['row_break'] | self.table
Alternatively, keeping the same idea, the self.df['row_break'] assignments could be done inside each if statement. That way, the set_ methods would stay cleaner.
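A hypothetical usage sketch of this class, with made-up column data, to show the combined flag:

import pandas as pd

# Toy frame with the columns the class expects; the values are invented.
df = pd.DataFrame({
    "jc": ["left", "left", "right"],
    "caps": [True, False, True],
    "cap_diffs": [6, 2, 9],
    "table": [False, True, True],
})
DFSetter(df, justify=True, caps=True, table=True)
print(df["row_break"])  # True wherever any enabled condition fired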

iterate over pyspark dataframe columns

I have the following pyspark.dataframe:
age   state  name   income
21    DC     john   30-50K
NaN   VA     gerry  20-30K
I'm trying to achieve the equivalent of df.isnull().sum() (from pandas) which produces:
age       1
state     0
name      0
income    0
At first I tried something along the lines of:
null_counter = [df[c].isNotNull().count() for c in df.columns]
but this produces the following error:
TypeError: Column is not iterable
Similarly, this is how I'm currently iterating over columns to get the minimum value:
from pyspark.sql import functions as fn

class BaseAnalyzer:
    def __init__(self, report, struct):
        self.report = report
        self._struct = struct
        self.name = struct.name
        self.data_type = struct.dataType
        self.min = None
        self.max = None

    def __repr__(self):
        return '<Column: %s>' % self.name

class BaseReport:
    def __init__(self, df):
        self.df = df
        self.columns_list = df.columns
        self.columns = {f.name: BaseAnalyzer(self, f) for f in df.schema.fields}

    def calculate_stats(self):
        find_min = self.df.select([fn.min(self.df[c]).alias(c) for c in self.df.columns]).collect()
        min_row = find_min[0]
        for column, min_value in min_row.asDict().items():
            self[column].min = min_value

    def __getitem__(self, name):
        return self.columns[name]

    def __repr__(self):
        return '<Report>'

report = BaseReport(df)
calc = report.calculate_stats()
for column in report.columns.values():
    if hasattr(column, 'min'):
        print("{}:{}".format(column, column.min))
which allows me to 'iterate over the columns'
<Column: age>: 1
<Column: name>: Alan
<Column: state>: ALASKA
<Column: income>: 0-1k
I think this method has become way too complicated. How can I properly iterate over ALL columns to provide various summary statistics (min, max, isnull, notnull, etc.)? The distinction between pyspark.sql.Row and pyspark.sql.Column seems strange coming from pandas.
Have you tried something like this:

names = df.schema.names
for name in names:
    # count() returns an int, so cast it to str before concatenating
    print(name + ': ' + str(df.where(df[name].isNull()).count()))

You can see how this could be modified to put the information into a dictionary or some other more useful format.
You can try this one:

from pyspark.sql.functions import col, count, when

nullDf = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
nullDf.show()

It will give you, for each column, the count of its null values.
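For completeness, a sketch that is not in the original answers: the same aggregation can be collected into a plain {column: count} dictionary, which mirrors pandas' df.isnull().sum() more directly:

from pyspark.sql import functions as F

# One aggregation pass; collect()[0] is a Row, and asDict() turns it
# into an ordinary {column_name: null_count} dictionary.
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]
).collect()[0].asDict()
print(null_counts)  # e.g. {'age': 1, 'state': 0, 'name': 0, 'income': 0}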
