Two-way ANOVA is not recognizing brackets and symbols - Python

Here is my working code at the moment
import os as os
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
#extract specific columns from dataframe
F1_para1 = 'ROI' #x-axis
F2_para2 = 'Diet'
value = 'Time(s)'
df['comb'] = df[F1_para1].map(str) + "+" + df[F2_para2].map(str)
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
aov = anova_lm(mod, type=2)
comparison=MultiComparison(df[value], df['comb'])
tdf = pd.read_html(comparison.tukeyhsd().summary().as_html())[0] #unfiltered
I am getting the following error on this line:
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
The error says it doesn't recognize 'Time(s)', which I'm presuming is because of the brackets, since the formula works without them. However, the original dataframe needs to keep those brackets in the column heading.
Error code:
Exception has occurred: PatsyError
Error evaluating factor: NameError: name 'Time' is not defined
Time(s)~ROI+Diet+ROI*Diet
^^^^^^^
File "", line 1, in
NameError: name 'Time' is not defined
The above exception was the direct cause of the following exception:
File "C:\Users\temp.py", line 55, in bar_2para
mod = ols(value+'~'+F1_para1+'+'+F2_para2+'+'+F1_para1+'*'+F2_para2, data = df).fit()
File "C:\Users\temp.py", line 215, in
bar_2para(df,F1_para1,F2_para2,para3,flt_para3a,x,x,x)
patsy.PatsyError: Error evaluating factor: NameError: name 'Time' is not defined
Time(s)~ROI+Diet+ROI*Diet
^^^^^^^
I don't know what to try, as sometimes I also need to include '(%)' as part of the value column name in the two-way ANOVA formula.
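One workaround that may apply here (a sketch, using patsy's Q() quoting helper, which is part of the formula language that statsmodels uses): quote any column name that contains brackets or other special characters so it is treated as a single variable rather than evaluated as Python code.

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Q("...") quotes a column name containing characters such as '(', ')' or '%'
formula = 'Q("{dv}") ~ Q("{f1}") * Q("{f2}")'.format(dv=value, f1=F1_para1, f2=F2_para2)
# f1 * f2 expands to f1 + f2 + f1:f2, matching the original formula
mod = ols(formula, data=df).fit()
aov = anova_lm(mod, typ=2)  # note: the keyword is typ, not type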

Related

Getting TypeError when trying to convert an R dataframe to JSON with rpy2

import json
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
biocPkgTools = importr('BiocPkgTools')
biocPkgList = biocPkgTools.biocPkgList()
biocPkgList = json.loads(ro.conversion.rpy2py(biocPkgList))
The dataframe looks great and I'm just trying to convert it to a json object with column names as keys but I receive this error:
Traceback (most recent call last):
File "/bioconductor/bioconductor.py", line 11, in <module>
json = json.loads(ro.conversion.rpy2py(biocPkgList))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/json/__init__.py", line 339, in loads
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not DataFrame
Another step I've tried is converting it to a pandas dataframe and then to JSON, but that also gives an error. I appreciate any help I can get.
Pandas method:
import rpy2.robjects.numpy2ri as rpyn
import json
import pandas as pd
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
biocPkgTools = importr('BiocPkgTools')
biocPkgList = biocPkgTools.biocPkgList()
columns = list(biocPkgList.colnames)
biocPkgList_df = pd.DataFrame(biocPkgList)
biocPkgList_df = biocPkgList_df.T
biocPkgList_df.columns = columns
biocPkgList_json = biocPkgList_df.to_json(orient='records')
print(biocPkgList_json)
I get these R errors:
R[write to console]: Error: unimplemented type 'char' in 'eval'
R[write to console]: Error: cannot have attributes on a CHARSXP
R[write to console]: Fatal error: unable to initialize the JIT
To convert an R data frame to a JSON-formatted Python dict/list structure (which seems to be what you are attempting), you need to either:
(a) convert it to a JSON string in R and then parse the JSON string in Python, or
(b) convert it to a pandas DataFrame and then convert that to JSON.
For the solution (a), I would recommend using rjson R package:
import json
from rpy2.robjects.packages import importr
bioc_pkg_tools = importr('BiocPkgTools')
rjson = importr('rjson')
bioc_pkg_data_frame = bioc_pkg_tools.biocPkgList()
r_json_string_vector = rjson.toJSON(bioc_pkg_data_frame)
py_json_string = r_json_string_vector[0]
py_json_structure = json.loads(py_json_string)
print(py_json_structure.keys())
# dict_keys(['Package', 'Version', 'Depends', 'Suggests', 'License', 'MD5sum', 'NeedsCompilation', 'Title', 'Description', 'biocViews', 'Author', 'Maintainer', 'git_url', 'git_branch', 'git_last_commit', 'git_last_commit_date', 'Date/Publication', 'source.ver', 'win.binary.ver', 'mac.binary.ver', 'vignettes', 'vignetteTitles', 'hasREADME', 'hasNEWS', 'hasINSTALL', 'hasLICENSE', 'Rfiles', 'dependencyCount', 'Imports', 'Enhances', 'dependsOnMe', 'VignetteBuilder', 'suggestsMe', 'LinkingTo', 'Archs', 'URL', 'SystemRequirements', 'BugReports', 'importsMe', 'PackageStatus', 'Video', 'linksToMe', 'License_restricts_use', 'organism', 'OS_type', 'License_is_FOSS'])
Now, as for (b) the code would be along these lines:
from rpy2.robjects import pandas2ri
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter, rpy2py
base = importr('base')
with localconverter(default_converter + pandas2ri.converter):
    pandas_dataframe = base.as_data_frame(bioc_pkg_data_frame)
py_json_string = pandas_dataframe.to_json()
py_json_structure = json.loads(py_json_string)
However, it does not work in this case (raising TypeError: 'NULLType' object is not iterable), because the R data frame contains lists (e.g. in the Depends column) and conversion of data frames with embedded lists is not yet supported by rpy2 (https://github.com/rpy2/rpy2/issues/773 and https://github.com/rpy2/rpy2/issues/860).
You can still extract a subset of the data frame that does not include the list columns:
import rpy2.robjects as ro
get_r_class = ro.r['class']  # R's class() function (defined here for completeness; not shown in the original answer)
list_columns = []
i = 1
columns_to_keep = []
for column_name in bioc_pkg_data_frame.names:
    # rx2 is the equivalent of `bioc_pkg_data_frame[[column_name]]` in R
    column = bioc_pkg_data_frame.rx2(column_name)
    r_class = get_r_class(column)[0]
    if r_class == 'list':
        list_columns.append(column_name)
    else:
        columns_to_keep.append(i)
    i += 1
# we will exclude these:
print(list_columns)
# Depends, Suggests, biocViews, Author, Maintainer, vignettes, vignetteTitles, Rfiles, Imports, Enhances, dependsOnMe, suggestsMe, LinkingTo, Archs, importsMe, linksToMe
# subset the R data frame to the non-list columns (added for completeness; R-style 1-based column indexing)
bioc_pkg_data_frame_no_lists = bioc_pkg_data_frame.rx(ro.IntVector(columns_to_keep))
And then get a pandas dataframe and JSON (string/structure) with:
with localconverter(default_converter + pandas2ri.converter):
    pandas_dataframe = base.as_data_frame(bioc_pkg_data_frame_no_lists)
py_json_string = pandas_dataframe.to_json()
py_json_structure = json.loads(py_json_string)
(or you could convert the lists to a concatenated string)
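A sketch of that last idea (the small helper defined via ro.r below is an assumption, not part of BiocPkgTools or rpy2): collapse each list column to a comma-separated string in R first, then convert as in variant (b).

import rpy2.robjects as ro

# hypothetical helper: paste the elements of each list column into one string per row
flatten_lists = ro.r('''
function(df, cols) {
  for (col in cols) df[[col]] <- sapply(df[[col]], paste, collapse = ", ")
  df
}
''')
flattened = flatten_lists(bioc_pkg_data_frame, ro.StrVector(list_columns))
with localconverter(default_converter + pandas2ri.converter):
    pandas_dataframe = base.as_data_frame(flattened)
py_json_structure = json.loads(pandas_dataframe.to_json())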

NameError: name 'TabularList' is not defined

from fastai import *
from fastai.tabular import *
from fastai.tabular.all import *
import pandas as pd
# set seed for reproducibility
custom_set_seed(42)
df = pd.read_csv('credit_card_default.csv', index_col=0, na_values='')
df.head()
DEP_VAR = 'default_payment_next_month'
num_features = list(df.select_dtypes('number').columns)
num_features.remove(DEP_VAR)
cat_features = list(df.select_dtypes('object').columns)
preprocessing = [FillMissing, Categorify, Normalize]
data = (TabularList.from_df(df, cat_names=cat_features, cont_names=num_features, procs=preprocessing).split_by_rand_pct(valid_pct=0.2, seed=42).label_from_df(cols=DEP_VAR).databunch())
I have been trying to run this piece of code but it keeps running into this error:
NameError Traceback (most recent call last)
<ipython-input-42-5ca7e57a8e36> in <module>
1 # Create a TabularDataBunch from the DataFrame
2
----> 3 data = (TabularList.from_df(df, cat_names=cat_features, cont_names=num_features, procs=preprocessing).split_by_rand_pct(valid_pct=0.2, seed=42).label_from_df(cols=DEP_VAR).databunch())
NameError: name 'TabularList' is not defined
I believe I have imported all the modules that were needed. Can someone suggest a solution for this?
Try importing it with the full path, as below:
from fastai.tabular.data import TabularList
I got this working by installing an older fastai, i.e.
pip install fastai==1.0.61
then
from fastai.tabular.data import TabularList
works with no problems.
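If you are on fastai v2, where TabularList no longer exists, a rough equivalent uses TabularPandas and DataLoaders instead. This is a sketch following the fastai v2 tabular tutorial, not the original answer, so the arguments may need adjusting for your data:

from fastai.tabular.all import *
import pandas as pd

df = pd.read_csv('credit_card_default.csv', index_col=0, na_values='')
DEP_VAR = 'default_payment_next_month'
num_features = [c for c in df.select_dtypes('number').columns if c != DEP_VAR]
cat_features = list(df.select_dtypes('object').columns)

# RandomSplitter + range_of replace split_by_rand_pct; TabularPandas replaces TabularList
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
to = TabularPandas(df,
                   procs=[Categorify, FillMissing, Normalize],
                   cat_names=cat_features,
                   cont_names=num_features,
                   y_names=DEP_VAR,
                   splits=splits)
dls = to.dataloaders(bs=64)  # the DataLoaders object replaces the old DataBunch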

Need to print a new column for a pandas data frame

I am using the following code to print a ratio by applying a function, but I am getting the following error.
Code
import investpy
import pandas as pd
import numpy as np
import sys
def main(stock1_name, stock2_name):
    stock1 = investpy.get_stock_historical_data(stock=stock1_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
    stock2 = investpy.get_stock_historical_data(stock=stock2_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
    new_df = pd.merge(stock1, stock2, on='Date')
    new_df = new_df.drop(['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x', 'Low_y','Volume_y', 'Currency_y', 'Open_y', 'High_y'], axis = 1)
    new_df['ratio'] = np.log10(new_df['Close_x']/new_df['Close_y'])
    return new_df
    x = main("IOC","HPCL")
print(x)
Error
NameError Traceback (most recent call last)
<ipython-input-2-c17535375449> in <module>
12 return new_df
13 x = main("IOC","HPCL")
---> 14 print(x)
NameError: name 'x' is not defined
You are calling x = main("IOC","HPCL") inside the function main.
This makes x defined only inside the scope of the function main.
When you call print(x) outside the function main, the interpreter throws an error, as it should, because x is not defined there.
Does this correction solve the issue?
import investpy
import pandas as pd
import numpy as np
import sys
def main(stock1_name, stock2_name):
    stock1 = investpy.get_stock_historical_data(stock=stock1_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
    stock2 = investpy.get_stock_historical_data(stock=stock2_name,country='india', from_date='01/01/2020',to_date='08/03/2021')
    new_df = pd.merge(stock1, stock2, on='Date')
    new_df = new_df.drop(['Open_x', 'High_x', 'Low_x', 'Volume_x', 'Currency_x', 'Low_y','Volume_y', 'Currency_y', 'Open_y', 'High_y'], axis = 1)
    new_df['ratio'] = np.log10(new_df['Close_x']/new_df['Close_y'])
    return new_df
x = main("IOC","HPCL") # Edit: moved this line outside the function main()
print(x)

Tableau error "All Fields must be aggregate or constant" when invoking TabPy SCRIPT_REAL

I am calling a TabPy server via a calculated field in a Tableau worksheet to run a hypothesis test: does the rate of Bookings vary significantly by Group?
I have a table such as:
Group Bookings
0 A 1
1 A 0
3998 B 1
3999 B 0
In Python, on the same server (using the python 2.7 docker image) the test I want is simply:
from scipy.stats import fisher_exact
df_cont_tbl = pd.crosstab(df['Group'], df['Bookings'])
prop_test = fisher_exact(df_cont_tbl)
print 'Fisher exact test: Odds ratio = {:.2f}, p-value = {:.3f}'.format(*prop_test)
Returns: Fisher exact test: Odds ratio = 1.21, p-value = 0.102
I connected Tableau to the TabPy server and can execute a hello-world calculated field. For example, I get 42 back with the calculated field: SCRIPT_REAL("return 42", ATTR([Group]),ATTR([Bookings]) )
However, when I try to invoke the stats function above with a calculated field to extract the p-value:
SCRIPT_REAL("
import pandas as pd
from scipy.stats import fisher_exact
df_cont_tbl = pd.crosstab(_arg1, _arg2)
prop_test = fisher_exact(df_cont_tbl)
return prop_test[1]
", [Group], [Bookings] )
I get the notification "The calculation contains errors", with the drop-down message "All fields must be aggregate or constant when using table calculation functions or fields from multiple data sources".
I tried wrapping the inputs with ATTR(), as in:
SCRIPT_REAL("
import pandas as pd
from scipy.stats import fisher_exact
df_cont_tbl = pd.crosstab(_arg1, _arg2)
prop_test = fisher_exact(df_cont_tbl)
return prop_test[1]
",ATTR([Group]), ATTR([Bookings])
)
Which changes the notification to "The calculation is valid" but returns a Pandas ValueError from the server:
An error occurred while communicating with the External Service.
Error processing script
Error when POST /evaluate: Traceback
Traceback (most recent call last):
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tabpy_server/tabpy.py", line 467, in post
result = yield self.call_subprocess(function_to_evaluate, arguments)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tornado/gen.py", line 1008, in run
value = future.result()
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tornado/concurrent.py", line 232, in result
raise_exc_info(self._exc_info)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tornado/gen.py", line 1014, in run
yielded = self.gen.throw(*exc_info)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tabpy_server/tabpy.py", line 488, in call_subprocess
ret = yield future
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/tornado/gen.py", line 1008, in run
value = future.result()
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/concurrent/futures/_base.py", line 400, in result
return self.__get_result()
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/concurrent/futures/_base.py", line 359, in __get_result
reraise(self._exception, self._traceback)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/concurrent/futures/_compat.py", line 107, in reraise
exec('raise exc_type, exc_value, traceback', {}, locals_)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/concurrent/futures/thread.py", line 61, in run
result = self.fn(*self.args, **self.kwargs)
File "<string>", line 5, in _user_script
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/pandas/tools/pivot.py", line 479, in crosstab
df = DataFrame(data)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/pandas/core/frame.py", line 266, in __init__
mgr = self._init_dict(data, index, columns, dtype=dtype)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/pandas/core/frame.py", line 402, in _init_dict
return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/pandas/core/frame.py", line 5398, in _arrays_to_mgr
index = extract_index(arrays)
File "/opt/conda/envs/Tableau-Python-Server/lib/python2.7/site-packages/pandas/core/frame.py", line 5437, in extract_index
raise ValueError('If using all scalar values, you must pass'
ValueError: If using all scalar values, you must pass an index
Error type : ValueError
Error message : If using all scalar values, you must pass an index
Example dataset:
To generate the CSV I am connecting to:
import os
import pandas as pd
import numpy as np
from collections import namedtuple
OUTPUT_LOC = os.path.expanduser('~/TabPy_demo/ab_test_demo_results.csv')
GroupObs = namedtuple('GroupObs', ['name','n','p'])
obs = [GroupObs('A',3000,.10),GroupObs('B',1000,.13)]
# note true odds ratio = (13/87)/(10/90) = 1.345
np.random.seed(2019)
df = pd.concat([pd.DataFrame({'Group': grp.name,
                              'Bookings': pd.Series(np.random.binomial(n=1, p=grp.p, size=grp.n))})
                for grp in obs],
               ignore_index=True)
df.to_csv(OUTPUT_LOC,index=False)
Old question, but perhaps this will help someone else. There are a couple of issues. The first relates to the way the data is passed to pd.crosstab: Tableau passes a list of values to the TabPy server, so wrap each argument in an array to fix the error you are getting.
SCRIPT_REAL(
"
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
df_cont_tbl = pd.crosstab(np.array(_arg1), np.array(_arg2))
prop_test = fisher_exact(df_cont_tbl)
return prop_test[1]
",
attr([Group]), attr([Bookings])
)
Another problem is the way the table calculation is being performed. You want to send TabPy two lists of values, each as long as your table; by default Tableau wants to calculate at the row level, which is not going to work.
I included the row count F1 in the CSV that I built the workbook on and made sure the Python calculation is computed along that field.
Now, when you put F1 into the worksheet, it will return the p-value as many times as you have rows. A workaround is to wrap your calculation in another calculation that only returns the value on the first row, and place that in your worksheet, as sketched below.
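A sketch of such a wrapper as a Tableau calculated field (the field name [Fisher P-Value] stands in for the SCRIPT_REAL calculation above and is not a field from the original workbook):

IF FIRST() == 0 THEN [Fisher P-Value] END

Set this calculation to compute along F1 as well, so FIRST() == 0 picks out a single row of the partition.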

pandas and rpy2: Why does ezANOVA work via robjects.r but not robjects.packages.importr?

Like many, I'm hoping to stop straddling the R and Python worlds and just work in Python using pandas, rpy2, NumPy, etc. I'm using the R package ez for its ezANOVA facility. It works if I do things the hard way, but why doesn't it work when I do them the easy way? I don't understand the resulting error:
File "/Users/malcomreynolds/analysis/r_with_pandas.py", line 38, in <module>
res = ez.ezANOVA(data=testData, dv='score', wid='subjectid', between='block', detailed=True)
File "/usr/local/lib/python2.7/site-packages/rpy2/robjects/functions.py", line 178, in __call__
return super(SignatureTranslatedFunction, self).__call__(*args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/rpy2/robjects/functions.py", line 106, in __call__
res = super(Function, self).__call__(*new_args, **new_kwargs)
rpy2.rinterface.RRuntimeError: Error in table(temp[, names(temp) == wid]) :
attempt to set an attribute on NULL
See below for full reproducible code (requires some Python packages: rpy2, pandas, numpy):
import pandas as pd
from rpy2 import robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate() # make rpy2 accept and auto-convert pandas dataframes
from rpy2.robjects.packages import importr
base = importr('base')
ez = importr('ez')
robjects.r['options'](warn=-1) # ???
import numpy as np
"""Make pandas data from from scratch"""
score = np.random.normal(loc=10, scale=20, size=10)
subjectid = range(10)
block = ["Sugar"] * 5 + ["Salt"] * 5
testData = pd.DataFrame({'score':score, 'block':block, 'subjectid': subjectid})
# it looks just like a dataframe from R
print testData
"""HARD WAY: Use ezANOVA thorugh pyr2 *** THIS WORKS ***"""
anova1 = robjects.r("""
    library(ez)
    function(df) {
        # df gets passed in
        ezANOVA(
            data=df,
            dv=score,
            wid=subjectid,
            between=block,
            detailed=TRUE)
    }
""")
print anova1(testData)
# this command shows that ez instance is setup properly
print ez.ezPrecis(data=testData) # successful
"""EASY WAY: Import ez directly and use it """
# *** THIS APPROACH DOES NOT WORK ***
# yet, trying to use ez.ezANOVA yields an exception about the wid value
# res = ez.ezANOVA(data=testData, dv='score', wid='subjectid', between='block', detailed=True)
# print res
# *** THIS APPROACH WORKS (and also uses my options change) ***
res = ez.ezANOVA(data=testData, dv=base.as_symbol('score'), wid=base.as_symbol('subjectid'), between=base.as_symbol('block'))
print res
In the easy version you are passing symbol names as strings. This is not the same as passing a symbol.
Check the use of as_symbol in "Minimal example of rpy2 regression using pandas data frame".
