Apply a binomial function to every row in a pandas Dataframe

Apply a binomial function to every row in a pandas Dataframe - python

I am using the following code on that dataframe:
Here picture of the dataframe used
not_rescued = df["not_rescued"]
rescued = df["rescued"]
breed = df["breed"]
def get_attribute(breed, attribute):
if breed in df.breed.unique():
if attribute in df.columns:
return df[df["breed"] == breed][attribute]
else:
raise NameError('Attribute {} does not exist.'.format(attribute))
else:
raise NameError('Breed {} does not exist.'.format(breed))
def get_not_rescued(breed):
return get_attribute(breed, 'not_rescued')
def get_rescued(breed):
return get_attribute(breed, 'rescued')
def get_total(breed):
return get_attribute(breed, 'total')
from scipy.stats import binom_test
binomial_test = binom_test(get_rescued(breed), get_total(breed), 0.08)
print(binomial_test)
I would like to apply the binomial function without having to call the function each time for each dog.. How could I use a list without having the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [18], in <cell line: 2>()
1 from scipy.stats import binom_test
----> 2 binomial_test = binom_test(get_rescued(breed), get_total(breed), 0.08)
3 print(binomial_test)
File ~/opt/anaconda3/lib/python3.9/site-packages/scipy/stats/morestats.py:2667, in binom_test(x, n, p, alternative)
2665 n = np.int_(n)
2666 else:
-> 2667 raise ValueError("Incorrect length for x.")
2669 if (p > 1.0) or (p < 0.0):
2670 raise ValueError("p must be in range [0,1]")
ValueError: Incorrect length for x.

Related

Stuck on Python "KeyError: <exception str() failed>" in BFS code of a water jug scenario

Intended Function of code: Takes a user input for the volume of 3 jars(1-9) and output the volumes with one of the jars containing the target length. jars can be Emptied/Filled a jar, or poured from one jar to another until one is empty or full.
With the code I have, i'm stuck on a key exception error .
Target length is 4 for this case
Code:
`
class Graph:
class GraphNode:
def __init__(self, jar1 = 0, jar2 = 0, jar3 = 0, color = "white", pi = None):
self.jar1 = jar1
self.jar2 = jar2
self.jar3 = jar3
self.color = color
self.pi = pi
def __repr__(self):
return str(self)
def __init__(self, jl1 = 0, jl2 = 0, jl3 = 0, target = 0):
self.jl1 = jl1
self.jl2 = jl2
self.jl3 = jl3
self.target = target
self.V = {}
for x in range(jl1 + 1):
for y in range(jl2 + 1):
for z in range(jl3 + 1):
node = Graph.GraphNode(x, y, z, "white", None)
self.V[node] = None
def isFound(self, a: GraphNode) -> bool:
if self.target in [a.jar1, a.jar2, a.jar3]:
return True
return False
pass
def isAdjacent(self, a: GraphNode, b: GraphNode) -> bool:
if self.V[a]==b:
return True
return False
pass
def BFS(self) -> [] :
start = Graph.GraphNode(0, 0, 0, "white")
queue=[]
queue.append(start)
while len(queue)>0:
u=queue.pop(0)
for v in self.V:
if self.isAdjacent(u,v):
if v.color =="white":
v.color == "gray"
v.pi=u
if self.isFound(v):
output=[]
while v.pi is not None:
output.insert(0,v)
v=v.pi
return output
else:
queue.append(v)
u.color="black"
return []
#######################################################
j1 = input("Size of first jar: ")
j2 = input("Size of second jar: ")
j3 = input("Size of third jar: ")
t = input("Size of target: ")
jar1 = int(j1)
jar2 = int(j2)
jar3 = int(j3)
target = int(t)
graph1 = Graph(jar1, jar2, jar3, target)
output = graph1.BFS()
print(output)
`
**Error: **
line 37, in isAdjacent
if self.V[a]==b:
KeyError: <exception str() failed>

Strange but when I first ran this in the IPython interpreter I got a different exception:
... :35, in Graph.isAdjacent(self, a, b)
34 def isAdjacent(self, a: GraphNode, b: GraphNode) -> bool:
---> 35 if self.V[a]==b:
36 return True
37 return False
<class 'str'>: (<class 'RecursionError'>, RecursionError('maximum recursion depth exceeded while getting the str of an object'))
When I run it as a script or in the normal interpreter I do get the same one you had:
... line 35, in isAdjacent
if self.V[a]==b:
KeyError: <exception str() failed>
I'm not sure what this means so I ran the debugger and got this:
File "/Users/.../stackoverflow/bfs1.py", line 1, in <module>
class Graph:
File "/Users/.../stackoverflow/bfs1.py", line 47, in BFS
if self.isAdjacent(u,v):
File "/Users/.../stackoverflow/bfs1.py", line 35, in isAdjacent
if self.V[a]==b:
KeyError: <unprintable KeyError object>
Uncaught exception. Entering post mortem debugging
Running 'cont' or 'step' will restart the program
> /Users/.../stackoverflow/bfs1.py(35)isAdjacent()
-> if self.V[a]==b:
(Pdb) type(a)
<class '__main__.Graph.GraphNode'>
(Pdb) str(a)
*** RecursionError: maximum recursion depth exceeded while calling a Python object
So it does seem like a maximum recursion error. (The error message you originally got is not very helpful). But the words <unprintable KeyError object> are a clue. It looks like it was not able to display the KeyError exception...
The culprit is this line in your class definition:
def __repr__(self):
return str(self)
What were you trying to do here?
The __repr__ function is called when the class is asked to produce a string representation of itself. But yours calls the string function on the instance of the class so it will call itself! So I think you actually generated a second exception while the debugger was trying to display the first!!!.
I replaced these lines with
def __repr__(self):
return f"GraphNode({self.jar1}, {self.jar2}, {self.jar3}, {self.color}, {self.pi})"
and I don't get the exception now:
Size of first jar: 1
Size of second jar: 3
Size of third jar: 6
Size of target: 4
Traceback (most recent call last):
File "/Users/.../stackoverflow/bfs1.py", line 77, in <module>
output = graph1.BFS()
File "/Users/.../stackoverflow/bfs1.py", line 45, in BFS
if self.isAdjacent(u,v):
File "/Users/.../stackoverflow/bfs1.py", line 33, in isAdjacent
if self.V[a]==b:
KeyError: GraphNode(0, 0, 0, white, None)
This exception is easier to interpret. Now it's over to you to figure out why this GraphNode was not found in the keys of self.V!

ValueError: "Columns must be same length as key" > Python

I'm using Colab to run the code but I get this error and I can't fix it. Could you help me out?
I don't know what to do in order to fix it because I have tried to change upper case or lower case.
#Inativos: ajustar nomes das colunas
dfInativos = dfInativos.rename(columns={'userId': 'id'})
dfInativos = dfInativos.rename(columns={'classId': 'ClasseId'})
dfInativos[['id','ClasseId','lastActivityDate','inactivityDaysCount','sevenDayInactiveStatus']] = dfInativos
#dfInativos['id'] = dfInativos['id'].astype(int, errors = 'ignore')
#dfInativos['ClasseId'] = dfInativos['ClasseId'].astype(int, errors = 'ignore')
dfInativos['id'] = pd.to_numeric(dfInativos['id'],errors = 'coerce')
dfInativos['ClasseId'] = pd.to_numeric(dfInativos['ClasseId'],errors = 'coerce')
#dfInativos.dropna(subset = ['lastActivityDate'], inplace=True)
dfInativos.drop_duplicates(subset = ['id','ClasseId'], inplace=True)
dfInativos['seven DayInactiveStatus'] = dfInativos['sevenDayInactiveStatus'].replace(0,'')
#Add Inactive data to main data frame
df = df.merge(dfInativos, on=['id','ClasseId'], how='left')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-79-10fe94c48d1f> in <module>()
2 dfInativos = dfInativos.rename(columns={'userId': 'id'})
3 dfInativos = dfInativos.rename(columns={'classId': 'ClasseId'})
----> 4 dfInativos[['id','ClasseId','lastActivityDate','inactivityDaysCount','sevenDayInactiveStatus']] = dfInativos
5
6
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexers.py in check_key_length(columns, key, value)
426 if columns.is_unique:
427 if len(value.columns) != len(key):
--> 428 raise ValueError("Columns must be same length as key")
429 else:
430 # Missing keys in columns are represented as -1
ValueError: Columns must be same length as key

Python/pandas KeyError issue with user-defined functions

Why do these two functions work properly and run with no errors?
def avgSalespWpY(df, weekStart_col_name, sku_col_name, order_col_name):
df['SKU_WEEK'] = pd.DatetimeIndex(df[weekStart_col_name]).week
grouped_data = df.groupby(['SKU_WEEK']).mean().reset_index()
return grouped_data[['SKU_WEEK', 'R_ORDER_QT']]
def avg_sales(df, prediction_window):
earliest_date = prediction_window['WEEK_START_DT'].min()
filter_date = df[df['WEEK_START_DT'] < earliest_date]
avg_sales_df = avgSalespWpY(filter_date, 'WEEK_START_DT', 'SKU', 'R_ORDER_QT')
prediction_window['SKU_WEEK'] = pd.DatetimeIndex(prediction_window['WEEK_START_DT']).week
avg_sales_df.columns = ['SKU_WEEK', 'OA_SPW']
avg_sales_total = avg_sales_df['OA_SPW'].mean()
prediction_window = pd.merge(prediction_window, avg_sales_df, how = 'left', on = 'SKU_WEEK')
prediction_window = prediction_window.loc[:, prediction_window.columns != 'SKU_WEEK']
prediction_window['OA_SPW'] = prediction_window['OA_SPW'].fillna(avg_sales_total)
return prediction_window
But these two do not work and run an error?
def avgSalespMpY(df, weekStart_col_name, order_col_name):
df['SKU_MONTH'] = pd.DatetimeIndex(df[weekStart_col_name]).month
grouped_data = df.groupby(['SKU_MONTH'])[order_col_name].mean()
return grouped_data
def avg_salesMY(df, prediction_window):
earliest_date = prediction_window['WEEK_START_DT'].min()
filter_date = df[df['WEEK_START_DT'] < earliest_date]
avg_sales_df = avgSalespMpY(filter_date, 'WEEK_START_DT', 'R_ORDER_QT')
prediction_window['SKU_MONTH'] = pd.DatetimeIndex(prediction_window['WEEK_START_DT']).month
avg_sales_df.columns = ['SKU_MONTH', 'OA_SPW']
avg_sales_total = avg_sales_df['OA_SPW'].mean()
prediction_window = pd.merge(prediction_window, avg_sales_df, how = 'left', on = 'SKU_MONTH')
prediction_window = prediction_window.loc[:, prediction_window.columns != 'SKU_MONTH']
prediction_window['OA_SPW'] = prediction_window['OA_SPW'].fillna(avg_sales_total)
print(prediction_window['OA_SPW'])
return prediction_window
This is the error I am getting when I run the second function (avg_salesMY):
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
5 frames
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'OA_SPW'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 'OA_SPW'
The output for the first function (avgSalespMpY) is what we expect and is as follows:
SKU_MONTH
1 535.687921
2 577.925649
3 611.837803
4 678.377140
5 496.411170
6 601.806244
7 688.770197
8 574.510967
9 636.203457
10 876.896305
11 719.614757
12 553.642329
Name: R_ORDER_QT, dtype: float64
I am very confused what is going on because the two segments are almost identical. There is an issue happening accessing the OA_SPW column but the one code is above the other, so why does it all of a sudden not want to run properly?
The first segment is supposed to take in data and find the overall average sales per week of year in the first segment and the second is supposed to take in prediction data and add in the output of the previous data. The second segment is supposed to take in data and find the overall average sales per month in the first segment and the second is supposed to take in prediction data and add in the output of the previous data. The biggest difference is that one's results is week by week and the other is based on month.

sqrt and float data in python [duplicate]

This question already has answers here:
Change type of pandas series/dataframe column inplace
(4 answers)
Closed 1 year ago.
I have some problem in my code
its result error "AttributeError: 'Series' object has no attribute 'float'"
i need solution to solve it
my code
def calculateED (pdX, pdY):
arg = (((pdX['NormalTemp']-pdY[0])**2)+
((pdX['NormalHumidity']-pdY[1])**2)+
((pdX['NormalOutlook']-pdY[2])**2)+
((pdX['NormalWindy']-pdY[3])**2)).float()
pdX['ED']= math.sqrt(arg1)
return pdX
dataTest = [64.0,65.0,1.0,1.0]
ngitung = calculateED(dataset, dataTest)
ngitung.sort_values(by=['ED'])
but after run this code i have some problem
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-21-034dd0366562> in <module>
1 dataTest = (64.0,65.0,1.0,1.0)
2 #dataTest2 = [72.0,95.0,3.0,0.0]
----> 3 ngitung = calculateED(dataset, dataTest)
4 #ngitung2 = calculateED2(dataset, dataTest2)
5 ngitung.sort_values(by=['ED'])
<ipython-input-15-5ae8a365490e> in calculateED(pdX, pdY)
1 def calculateED (pdX, pdY):
----> 2 arg = (((pdX['NormalTemp']-pdY[0])**2)+
3 ((pdX['NormalHumidity']-pdY[1])**2)+
4 ((pdX['NormalOutlook']-pdY[2])**2)+
5 ((pdX['NormalWindy']-pdY[3])**2)).float()
c:\users\windows\appdata\local\programs\python\python39\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5463 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5464 return self[name]
-> 5465 return object.__getattribute__(self, name)
5466
5467 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'float'

According to this topic:
numpy functions require float type argument explicity
Example:
import numpy as np
your_array = your_array.float()
output = np.exp(your_array)
In your case:
def calculateED (pdX, pdY):
argument = ((pdX['NormalTemp']-pdY[0])**2)+
((pdX['NormalHumidity']-pdY[1])**2)+
((pdX['NormalOutlook']-pdY[2])**2)+
((pdX['NormalWindy']-pdY[3])**2).float()
pdX['ED']= np.sqrt(argument)
return pdX

Why I have this problem with index range?why does it not work?

I have got this error when try split my one column to few columns. But it split on just on one or two columns.If you wanna split on 3,4,5 columns it writes:
ValueError Traceback (most recent call last)
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
349 try:
--> 350 return self._range.index(new_key)
351 except ValueError:
ValueError: 2 is not in range
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-19-d4e6a4d03e69> in <module>
22 data_old[Col_1_Label] = newz[0]
23 data_old[Col_2_Label] = newz[1]
---> 24 data_old[Col_3_Label] = newz[2]
25 #data_old[Col_4_Label] = newz[3]
26 #data_old[Col_5_Label] = newz[4]
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
2798 if self.columns.nlevels > 1:
2799 return self._getitem_multilevel(key)
-> 2800 indexer = self.columns.get_loc(key)
2801 if is_integer(indexer):
2802 indexer = [indexer]
/usr/local/Cellar/jupyterlab/2.1.5/libexec/lib/python3.8/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
350 return self._range.index(new_key)
351 except ValueError:
--> 352 raise KeyError(key)
353 return super().get_loc(key, method=method, tolerance=tolerance)
354
KeyError: 2
There is my code.I have csv file.And when pandas read it - create one column with value 'Контракт'.Then. I split it on another columns. But it split till two columns.I wanna 7 columns!Help please to understand this logic!
import pandas as pd
from pandas import Series, DataFrame
import re
dframe1 = pd.read_csv('po.csv')
columns = ['Контракт']
data_old = pd.read_csv('po.csv', header=None, names=columns)
data_old
# The thing you want to split the column on
SplitOn = ':'
# Name of Column you want to split
Split_Col = 'Контракт'
newz = data_old[Split_Col].str.split(pat=SplitOn, n=-1, expand=True)
# Column Labels (you can add more if you will have more)
Col_1_Label = 'Номер телефону'
Col_2_Label = 'Тарифний пакет'
Col_3_Label = 'Вихідні дзвінки з України за кордон'
Col_4_Label = 'ВАРТІСТЬ ПАКЕТА/ЩОМІСЯЧНА ПЛАТА'
Col_5_Label = 'ЗАМОВЛЕНІ ДОДАТКОВІ ПОСЛУГИ ЗА МЕЖАМИ ПАКЕТА'
Col_6_Label = 'Вартість послуги "Корпоративна мережа'
Col_7_Label = 'ЗАГАЛОМ ЗА КОНТРАКТОМ (БЕЗ ПДВ ТА ПФ)'
data_old[Col_1_Label] = newz[0]
data_old[Col_2_Label] = newz[1]
data_old[Col_3_Label] = newz[2]
#data_old[Col_4_Label] = newz[3]
#data_old[Col_5_Label] = newz[4]
#data_old[Col_6_Label] = newz[5]
#data_old[Col_7_Label] = newz[6]
data_old

Pandas does not support "unstructured text", you should convert it to a standard format or python objects and then create a dataframe from it
Imagine that you have a file with this text named data.txt:
Contract № 12345679 Number of phone: +7984563774
Total price for month : 00.00000
Total price: 10.0000
You can load an process it with Python like this:
with open('data.txt') as f:
content = list(data.readlines())
# First line contains the contract number and phone information
contract, phone = content[0].split(':')
# find contract number using regex
contract = re.findall('\d+', contract)[0]
# The phone is strightforward
phone = phone.strip()
# Second line and third line for prices
total_price = float(content[1].split(':')[1].strip())
total_month_price = float(content[2].split(':')[1].strip())
Then with these variables you can create a dataframe
df = pd.DataFrame([dict(N_of_contract=contract, total_price=total_price, total_month_price =total_month_price )])
Repeat the same for all files.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Apply a binomial function to every row in a pandas Dataframe - python

Related

Stuck on Python "KeyError: <exception str() failed>" in BFS code of a water jug scenario

ValueError: "Columns must be same length as key" > Python

Python/pandas KeyError issue with user-defined functions

sqrt and float data in python [duplicate]

Why I have this problem with index range?why does it not work?

Categories

Resources