I have a Python function that I want to call from Excel. However, I'm getting #VALUE errors, which I think come from passing an Excel range to a Python list.
The Python function requires 2 inputs, with 4 more optional. The first input is a string and the second is a list of lists of strings ([rows][columns]); a score is produced for the string against each row in the list, and finally a dataframe is output:
import pythoncom
import win32com.client
import pandas as pd
from sklearn import feature_extraction, metrics
from typing import List, Any

class PythonObjectLibrary:
    _reg_clsid_ = pythoncom.CreateGuid()
    _reg_clsctx_ = pythoncom.CLSCTX_LOCAL_SERVER
    _reg_progid_ = "Python.ObjectLibrary"
    _reg_desc_ = "This is our Python object library."
    _public_methods_ = ['nlp_vlookup']

    def nlp_vlookup(value: str,
                    table: List[List[Any]],
                    col_index: int = None,
                    include_score: bool = False,
                    all_matches: bool = False,
                    threshold: float = 0.5) -> pd.DataFrame:
        words = [str(x[0]) for x in table]
        vectorizer = feature_extraction.text.CountVectorizer()
        vectors = vectorizer.fit_transform([value] + words).toarray()
        cosine_sim = metrics.pairwise.cosine_similarity(list(vectors))
        scores = cosine_sim[0][1:]
        scores_df = pd.DataFrame({"score": scores}, index=words)
        table_df = pd.DataFrame(table, index=words)
        df = table_df.join(scores_df)
        df = df[df["score"] >= threshold]
        if not len(df.index):
            raise ValueError("No matches found")
        df = df.sort_values(by="score", ascending=False)
        if not all_matches:
            df = df.head(1)
        columns = table_df.columns.to_list() if col_index is None else [col_index - 1]
        if include_score:
            columns = ["score"] + columns
        df = df.reindex(columns=columns)
        return df

if __name__ == '__main__':
    import win32com.server.register
    win32com.server.register.UseCommandLine(PythonObjectLibrary)

    # test function:
    # search_list = [['Capital bank'], ['The Little Bank'], ['The Big Bank']]
    # match = PythonObjectLibrary.nlp_vlookup("The Capital Bank", search_list, include_score=True, all_matches=True, threshold=0.5)
    # print(match)
The Excel function that maps to the Python function:
Function nlp_vlookup(value As String, table As Range, col_index As Integer, include_score As Boolean, all_matches As Boolean, threshold As Double)
    nlp_vlookup = VBA.CreateObject("PythonObjectLibrary").nlp_vlookup(value, table, col_index, include_score, all_matches, threshold)
End Function
EDITED after @norie's comment, as it seems the zero-based array issue is a red herring.
I used this test object (note the hard-coded UUID). It has a simple doubleArray() method that receives a Variant array of numbers from Excel, doubles them, and returns an array. I have created a dummy DataFrame to test the extraction to a list. Apologies if the Python is inelegant ... I am only a beginner!
import pythoncom as pc
import win32com.client as cl
import numpy as np
import pandas as pd
from win32com.client import Dispatch

class PythonComTestObject:
    _reg_clsid_ = '{BB58C07E-B9AD-4BC7-BB8C-01D2FF8FD4E9}'  # replace this
    _reg_clsctx_ = pc.CLSCTX_LOCAL_SERVER
    _reg_progid_ = "PythonComTestObject.Library"
    _reg_desc_ = "A library for doubling things."

    # A list of strings naming the public methods of the object.
    # Methods not listed here are considered private.
    _public_methods_ = ['doubleArray']

    # Double every value in the array and return it
    def doubleArray(self, vArray):
        # The VARIANT array from Excel arrives as a [[]]-style array (rows of columns)
        nRows = len(vArray)
        nCols = len(vArray[0])
        headers = ['Col{0:}'.format(n) for n in range(1, nCols + 1)]
        retDF = pd.DataFrame(np.array(vArray) * 2, columns=headers)
        return [retDF.columns.values.tolist()] + retDF.values.tolist()

if __name__ == '__main__':
    import win32com.server.register
    win32com.server.register.UseCommandLine(PythonComTestObject)
Then this is my VBA in Excel. I realize it is a bit more than the original. The conversion from a zero-based to a 1-based array could be put into a utility function. Note I've added a global object for my Python library, so that it gets created once and re-used. This makes subsequent function calls more rapid as you don't have to do the hard work of creating the object again.
Option Explicit

Dim g_PythonObj As Object

Public Function pythonDouble(rng As Variant) As Variant
    On Error GoTo comerror

    'Initialize the global object
    If g_PythonObj Is Nothing Then
        Set g_PythonObj = CreateObject("PythonComTestObject.Library")
    End If

    'Convert the rng parameter to a variant array
    Dim vIn As Variant
    vIn = rng

    'Call the Python function
    pythonDouble = g_PythonObj.doubleArray(vIn)
    Exit Function

'Catch any errors here
comerror:
    Debug.Print Err.Description
    pythonDouble = CVErr(xlErrValue)
End Function

'A test subroutine for debugging
Sub testPython()
    Dim rng As Range
    Set rng = Range("Input") 'A range on my test sheet
    Dim vIn As Variant
    vIn = rng
    Dim vRes As Variant
    vRes = pythonDouble(vIn)
End Sub
This is the spreadsheet result (screenshot not reproduced here).
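Applying the same pattern back to the original nlp_vlookup is then mostly a matter of marshalling: the Range arrives as a Variant (a tuple of tuples of cell values), and the result has to go back as plain lists, because Excel cannot consume a pandas DataFrame returned over COM. The following is only a sketch of that adaptation, not the original poster's code; build_score_dataframe is a hypothetical helper standing in for the pandas scoring logic shown in the question:
def nlp_vlookup(self, value, table, col_index=None, include_score=False,
                all_matches=False, threshold=0.5):
    # The Excel Range comes in as a tuple of tuples; convert it to a list of lists
    rows = [list(r) for r in table]
    # Hypothetical helper wrapping the question's pandas scoring logic
    df = build_score_dataframe(value, rows, col_index, include_score,
                               all_matches, threshold)
    # Return plain lists (headers + rows) so COM can marshal the result to Excel
    return [df.columns.tolist()] + df.values.tolist()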
I'm trying to read big CSV files and also work effectively on other things at the same time. That is why my solution is to add a progress bar (something that shows how far through the read I am and gives me a sense of how long is left before the read is complete). However, I have tried tqdm as well as home-made while loops, and unfortunately I have not found a solution. I have tried this thread: How to see the progress bar of read_csv
without any luck. Maybe I can apply tqdm in a different way? Are there any other solutions?
Here's the important part of the code (the one I want to add a progress bar to):
def read_from_csv(filepath: str,
                  sep: str = ",",
                  header_line: int = 43,
                  skip_rows: int = 48) -> pd.DataFrame:
    """Reads a csv file at filepath containing the vehicle trip data and
    performs a number of formatting operations
    """
    # The first call of read_csv is used to get the column names, which allows
    # the typing to take place at the same time as the second read, which is
    # faster than forcing type afterwards
    df_names: pd.Index[str] = pd.read_csv(
        filepath,
        sep=sep,
        header=header_line,
        skip_blank_lines=False,
        skipinitialspace=True,
        index_col=False,
        engine='c',
        nrows=0,
        encoding='iso-8859-1'
    ).columns
    # The "Time" and "Time_abs" columns have some inconsistent
    # "Storage group code" preceding the actual column name, so their
    # full column names are stored so they can be renamed later. Also, we want
    # to interpret "Time_abs" as a string, while the rest are floats. This is
    # stored in a dict to use in the next call to read_csv
    time_col = ""
    time_abs_col = ""
    names_dict = {}
    for name in df_names:
        if ": Time_abs" in name:
            names_dict[name] = 'str'
            time_abs_col = name
        elif ": Time" in name:
            time_col = name
        else:
            names_dict[name] = 'float'
    # A list of values that we want pandas to interpret as having no value.
    # "NOVALUE" is the only one of these that's actually used in the files,
    # the rest are copy-pasted defaults.
    na_vals = ['', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
               '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a',
               'nan', 'null', 'NOVALUE']
    # The whole file is parsed and put in a dataframe
    df: pd.DataFrame = pd.read_csv(filepath,
                                   sep=sep,
                                   skiprows=skip_rows,
                                   header=0,
                                   names=df_names,
                                   skip_blank_lines=False,
                                   skipinitialspace=True,
                                   index_col=False,
                                   engine='c',
                                   na_values=na_vals,
                                   dtype=names_dict,
                                   encoding='iso-8859-1')
    # Renames the "Time" and "Time_abs" columns so they don't include the
    # storage group part
    df.rename(columns={time_col: "Time", time_abs_col: "Time_abs"},
              inplace=True)
    # Second retyping of this column (here from string to datetime).
    # Very rarely, the Time_abs column in the csv data only has the time and
    # not the date, in which case this line throws an error. We manage this by
    # simply letting it stay as a string
    try:
        df[defs.time_abs] = pd.to_datetime(df[defs.time_abs])
    except:
        pass
    # Every row ends with an extra delimiter which python interprets as another
    # column, but it's empty so we remove it. This is not really necessary, but
    # is done to reduce confusion when debugging
    df.drop(df.columns[-1], axis=1, inplace=True)
    # Adding extra columns to the dataframe used later
    df[defs.lowest_gear] = np.nan
    df[defs.lowest_speed] = np.nan
    for i in list(defs.second_trailer_axles_dict.values()):
        df[i] = np.nan
    return df
It's the reading of the CSV that takes most of the time, which is why that is the point of interest for adding a progress bar.
Thank you in advance!
You can easily do this with Dask. For example:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ddf = dd.read_csv(path, blocksize=1e+6)
with ProgressBar():
    df = ddf.compute()
[########################################] | 100% Completed | 37.0s
And you will see the progress of the read.
The blocksize parameter controls the size of the blocks your file is read in; by tuning it you can get good performance. In addition, Dask uses several threads for reading by default, which speeds up the reading itself.
You can use tqdm.
Somewhere in your function:
from tqdm import tqdm

def read_from_csv(filepath: str,
                  sep: str = ",",
                  header_line: int = 43,
                  skip_rows: int = 48,
                  chunksize: int = 10000) -> pd.DataFrame:

    # Count the total lines of the file
    # Overhead: 3.73s for 10,000,000 lines / 4.2G on a SSD
    length = sum(1 for row in open('large.csv', 'r'))

    data = []
    with tqdm(total=1 + (length // chunksize)) as pbar:
        # Replace your 2nd pd.read_csv by this:
        for chunk in pd.read_csv('large.csv', ..., chunksize=chunksize):
            data.append(chunk)
            pbar.update(1)  # one tick per chunk (the total above is the chunk count)
    df = pd.concat(data)
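For reference, here is a self-contained sketch of the same idea with a generic file name (it does not reproduce the 43-line header handling from the question, and the chunk size is an arbitrary choice):
import pandas as pd
from tqdm import tqdm

def read_csv_with_progress(filepath: str, chunksize: int = 10_000) -> pd.DataFrame:
    # One cheap pass to count lines so tqdm knows the total number of chunks
    with open(filepath, "r", encoding="iso-8859-1") as f:
        n_lines = sum(1 for _ in f)
    chunks = []
    with tqdm(total=1 + n_lines // chunksize, unit="chunk") as pbar:
        for chunk in pd.read_csv(filepath, chunksize=chunksize, encoding="iso-8859-1"):
            chunks.append(chunk)
            pbar.update(1)
    return pd.concat(chunks, ignore_index=True)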
Since there was a question in the comments about showing progress for some pandas DataFrame methods, I will note a solution for such cases. The parallelbar library lets you track progress for the popular Pool methods of the multiprocessing module (map, imap and imap_unordered). It is easy to adapt it for parallel work on pandas dataframes (and to track progress) as follows:
# pip install parallelbar
from parallelbar import progress_map
import pandas as pd
import numpy as np
from multiprocessing import cpu_count

def parallelize_dataframe(df, func, split_size=cpu_count() * 4, **kwargs):
    df_split = np.array_split(df, split_size)
    result_df = pd.concat(progress_map(func, df_split, **kwargs),
                          ignore_index=True)
    return result_df
Here df is your dataframe; func is the function to be applied to the dataframe; split_size is how many parts df is split into for parallelization (the default value is usually a good choice); **kwargs are optional keyword arguments for the progress_map function (see the documentation).
For example:
def foo(df):
    df[col] = pd.to_datetime(df[col])
    return df

if __name__ == '__main__':
    new_df = parallelize_dataframe(df, foo)
Not only will you see the progress of execution, but the pd.to_datetime calls will also be parallelized, which will significantly speed up your work.
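For instance, applied to the Time_abs conversion from the question's read_from_csv (a sketch only; the column name comes from the question, and errors="coerce" is my assumption for handling the rows that have no date):
def convert_time_abs(chunk):
    # coerce turns rows whose Time_abs lacks a date into NaT instead of raising
    chunk["Time_abs"] = pd.to_datetime(chunk["Time_abs"], errors="coerce")
    return chunk

if __name__ == '__main__':
    df = parallelize_dataframe(df, convert_time_abs)  # df as returned by read_from_csv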
I ran the following code but Spyder returned "float division by zero":
import pandas as pd

file = pd.read_csv(r"data_ET.csv")

def normalise(df, columnName):
    for value in df[columnName]:
        df[columnName] = (value - df[columnName].min()) / (df[columnName].max() - df[columnName].min())  # this line raises the error
    return df[columnName]

#b)
normalised_RTfirstpass = normalise(file, 'RTfirstpass')
normalised_logfreq = normalise(file, 'log_freq')
file['normalised RTfirstpass'] = normalised_RTfirstpass
file['normalised logfreq'] = normalised_logfreq
print(file)
When I changed it to this, it works (the change is assigning the column values to a variable first):
import pandas as pd

file = pd.read_csv(r"data_ET.csv")

def normalise(df, columnName):
    value = df[columnName]
    df[columnName] = (value - df[columnName].min()) / (df[columnName].max() - df[columnName].min())
    return df[columnName]

#b)
normalised_RTfirstpass = normalise(file, 'RTfirstpass')
normalised_logfreq = normalise(file, 'log_freq')
file['normalised RTfirstpass'] = normalised_RTfirstpass
file['normalised logfreq'] = normalised_logfreq
print(file)
Can anybody explain why the latter works but the former does not?
df[columnName] returns a pd.Series object. In the first version you iterate over the column and, on each pass of the loop, assign a single scalar result to the whole column; after the first iteration every value in the column is identical, so df[columnName].max() - df[columnName].min() becomes 0 and the next division fails.
In the latter case, value = df[columnName] is a Series, so (value - df[columnName].min()) / (df[columnName].max() - df[columnName].min()) is a single vectorized operation: pandas broadcasts the scalar min and max values across the Series. pandas performs element-wise operations on whole columns, which is why you do not need to iterate over every value in the column.
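A minimal illustration of that broadcasting behaviour, using made-up numbers rather than the data_ET.csv columns:
import pandas as pd

def normalise(df, column_name):
    col = df[column_name]
    # col.min() and col.max() are scalars; pandas broadcasts them across the Series
    return (col - col.min()) / (col.max() - col.min())

df = pd.DataFrame({'RTfirstpass': [200.0, 350.0, 500.0]})
print(normalise(df, 'RTfirstpass'))  # 0.0, 0.5, 1.0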
I want to create my own function that scans a number of user-specified columns in a dataframe; the function will create a new variable and assign it '1' if all the specified columns equal 1, otherwise 0.
In the following code, I only accommodate the case where the user inputs exactly two columns to be scanned:
import numpy as np

class Tagger:
    def __init__(self):
        pass

    def summing_all_tagger(self, df, tag_var_list, tag_value=1):
        # This tagger creates a tag='1' if all variables in tag_var_list equal tag_value; otherwise '0'
        self.df = df
        self.tag_var_list = tag_var_list
        self.tag_value = tag_value

        self.df['temp'] = np.where((self.df[self.tag_var_list[0]] == self.tag_value) &
                                   (self.df[self.tag_var_list[1]] == self.tag_value), 1, 0)
        return self.df['temp']
Then I can call it in the main.py file:
import pandas as pd
import datetime
import feature_tagger.feature_tagger as ft
tagger_obj = ft.Tagger()
df_pin['PIN_RX&TIME_TAG'] = tagger_obj.summing_all_tagger(df_pin, tag_var_list=['PIN_RX_TAG', 'PIN_TIME_TAG'], tag_value=1)
How can I modify it so users can enter as many column names for tag_var_list as they want?
Such as
df_pin['PIN_RX&TIME_TAG'] = tagger_obj.summing_all_tagger(df_pin, tag_var_list=['PIN_RX_TAG', 'PIN_TIME_TAG', 'PIN_NAME_TAG'], tag_value=1)
# or
df_pin['PIN_RX&TIME_TAG'] = tagger_obj.summing_all_tagger(df_pin, tag_var_list=['PIN_RX_TAG'], tag_value=1)
The np.all() is your friend.
self.df['temp'] = np.where(np.all(self.df[self.tag_var_list] == self.tag_value, axis=1), 1, 0)
I think you can create a list comprehension of boolean masks and then reduce the masks to one, casting to integer for a 0/1 column:
L = [self.df[x]==self.tag_value for x in tag_var_list]
self.df['temp'] = np.logical_and.reduce(L).astype(int)
Or use DataFrame.all and cast the boolean mask to integers:
self.df['temp'] = (self.df[self.tag_var_list] == self.tag_value).all(axis=1).astype(int)
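Putting that back into the class from the question, a sketch of summing_all_tagger that accepts any number of columns (based on the DataFrame.all variant above; the toy data is made up for illustration):
import pandas as pd

class Tagger:
    def summing_all_tagger(self, df, tag_var_list, tag_value=1):
        # 1 where every column in tag_var_list equals tag_value, else 0
        return (df[tag_var_list] == tag_value).all(axis=1).astype(int)

# Works for one, two or more columns
df_pin = pd.DataFrame({'PIN_RX_TAG': [1, 1, 0],
                       'PIN_TIME_TAG': [1, 0, 1],
                       'PIN_NAME_TAG': [1, 1, 1]})
tagger_obj = Tagger()
df_pin['PIN_RX&TIME_TAG'] = tagger_obj.summing_all_tagger(
    df_pin, tag_var_list=['PIN_RX_TAG', 'PIN_TIME_TAG', 'PIN_NAME_TAG'], tag_value=1)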
I have two pandas dataframes that on inspection look identical. One was created using the Pandas builtin:
df.corr(method='pearson')
While the other was created with a custom function:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr

def cor_matrix(dataframe, method):
    coeffmat = pd.DataFrame(index=dataframe.columns,
                            columns=dataframe.columns)
    pvalmat = pd.DataFrame(index=dataframe.columns, columns=dataframe.columns)
    for i in range(dataframe.shape[1]):
        for j in range(dataframe.shape[1]):
            x = np.array(dataframe[dataframe.columns[i]])
            y = np.array(dataframe[dataframe.columns[j]])
            bad = ~np.logical_or(np.isnan(x), np.isnan(y))
            if method == 'spearman':
                corrtest = spearmanr(np.compress(bad, x), np.compress(bad, y))
            if method == 'pearson':
                corrtest = pearsonr(np.compress(bad, x), np.compress(bad, y))
            coeffmat.iloc[i, j] = corrtest[0]
            pvalmat.iloc[i, j] = corrtest[1]
    return (coeffmat, pvalmat)
Both look identical, have the same type (pandas.core.frame.DataFrame), and their entries are also of the same type (numpy.float64).
However when I try to plot these using:
import matplotlib.pyplot as plt
plt.imshow((df))
Only the dataframe created with the pandas builtin function works. For the other dataframe I receive the error: TypeError: Image data cannot be converted to float. Can anyone explain what is going on, how the two dataframes are different and what can be done to address the error?
Edit: it looks as though there is one difference: when I convert the dataframes to a numpy array, the one that doesn't work has dtype=object. Is there a way to remove this?
Amending the function to specify the dataframe as float fixed the issue:
def cor_matrix(dataframe, method):
    coeffmat = pd.DataFrame(index=dataframe.columns, columns=dataframe.columns)
    pvalmat = pd.DataFrame(index=dataframe.columns, columns=dataframe.columns)
    for i in range(dataframe.shape[1]):
        for j in range(dataframe.shape[1]):
            x = np.array(dataframe[dataframe.columns[i]])
            y = np.array(dataframe[dataframe.columns[j]])
            bad = ~np.logical_or(np.isnan(x), np.isnan(y))
            if method == 'spearman':
                corrtest = spearmanr(np.compress(bad, x), np.compress(bad, y))
            if method == 'pearson':
                corrtest = pearsonr(np.compress(bad, x), np.compress(bad, y))
            coeffmat.iloc[i, j] = corrtest[0]
            pvalmat.iloc[i, j] = corrtest[1]
    # Convert to float type, otherwise it can cause problems when e.g. plotting
    coeffmat = coeffmat.apply(pd.to_numeric, errors='ignore')
    pvalmat = pvalmat.apply(pd.to_numeric, errors='ignore')
    return (coeffmat, pvalmat)
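An alternative that is not part of the original answer, but which I believe addresses the dtype=object symptom directly, is to cast the finished matrix to float just before plotting:
import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical stand-in for the object-dtype matrix built cell by cell above
coeffmat = pd.DataFrame([[1.0, 0.5], [0.5, 1.0]], dtype=object)
plt.imshow(coeffmat.astype(float))  # casting to float avoids the TypeError
plt.show()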
I notice that this is an issue on GitHub already. Does anyone have any code that converts a Pandas DataFrame to an Orange Table?
Explicitly, I have the following table.
   user  hotel  star_rating  user  home_continent  gender
0     1     39          4.0     1               2  female
1     1     44          3.0     1               2  female
2     2     63          4.5     2               3  female
3     2      2          2.0     2               3  female
4     3     26          4.0     3               1    male
5     3     37          5.0     3               1    male
6     3     63          4.5     3               1    male
The documentation of the Orange package doesn't cover all the details. Table.__init__(Domain, numpy.ndarray) works only for int and float, according to lib_kernel.cpp.
They really should provide a C-level interface for pandas.DataFrame, or at least numpy.dtype("str") support.
Update: table2df and df2table performance improved greatly by utilizing numpy for int and float.
Keep this piece of script in your Orange python script collection; now you are equipped with pandas in your Orange environment.
Usage: a_pandas_dataframe = table2df(a_orange_table), a_orange_table = df2table(a_pandas_dataframe)
Note: this script works only in Python 2.x; refer to @DustinTang's answer for a Python 3.x compatible script.
import pandas as pd
import numpy as np
import Orange

#### For those who are familiar with pandas
#### Correspondence:
####   value               <-> Orange.data.Value
####   NaN                 <-> ["?", "~", "."]            # Don't know, Don't care, Other
####   dtype               <-> Orange.feature.Descriptor
####   category, int       <-> Orange.feature.Discrete    # category: > pandas 0.15
####   int, float          <-> Orange.feature.Continuous  # Continuous = core.FloatVariable
####                                                      # refer to feature/__init__.py
####   str                 <-> Orange.feature.String
####   object              <-> Orange.feature.Python
####   DataFrame.dtypes    <-> Orange.data.Domain
####   DataFrame.DataFrame <-> Orange.data.Table = Orange.orange.ExampleTable
####                                                      # You will need this if you are reading sources


def series2descriptor(d, discrete=False):
    if d.dtype is np.dtype("float"):
        return Orange.feature.Continuous(str(d.name))
    elif d.dtype is np.dtype("int"):
        return Orange.feature.Continuous(str(d.name), number_of_decimals=0)
    else:
        t = d.unique()
        if discrete or len(t) < len(d) / 2:
            t.sort()
            return Orange.feature.Discrete(str(d.name), values=list(t.astype("str")))
        else:
            return Orange.feature.String(str(d.name))


def df2domain(df):
    featurelist = [series2descriptor(df.icol(col)) for col in xrange(len(df.columns))]
    return Orange.data.Domain(featurelist)


def df2table(df):
    # It seems they are using native python objects/lists internally for Orange.data types (?)
    # And I didn't find a constructor suitable for pandas.DataFrame since it may carry
    # multiple dtypes
    #  --> the best approximation is Orange.data.Table.__init__(domain, numpy.ndarray),
    #  --> but the dtype of the numpy array can only be "int" or "float"
    #  --> * refer to src/orange/lib_kernel.cpp 3059:
    #  --> *   if (((*vi)->varType != TValue::INTVAR) && ((*vi)->varType != TValue::FLOATVAR))
    #  --> The documents never mentioned this >_<
    # So we use the numpy constructor for int/float columns, and the python list constructor for the others
    tdomain = df2domain(df)
    ttables = [series2table(df.icol(i), tdomain[i]) for i in xrange(len(df.columns))]
    return Orange.data.Table(ttables)


# For performance concerns, here are my results
# dtndarray = np.random.rand(100000, 100)
# dtlist = list(dtndarray)
# tdomain = Orange.data.Domain([Orange.feature.Continuous("var" + str(i)) for i in xrange(100)])
# tinsts = [Orange.data.Instance(tdomain, list(dtlist[i])) for i in xrange(len(dtlist))]
# t = Orange.data.Table(tdomain, tinsts)
#
# timeit list(dtndarray)                                                                # 45.6ms
# timeit [Orange.data.Instance(tdomain, list(dtlist[i])) for i in xrange(len(dtlist))]  # 3.28s
# timeit Orange.data.Table(tdomain, tinsts)                                             # 280ms
# timeit Orange.data.Table(tdomain, dtndarray)                                          # 380ms
#
# As illustrated above, utilizing the constructor with an ndarray can greatly improve performance
# So one may conceive a better converter based on these results


def series2table(series, variable):
    if series.dtype is np.dtype("int") or series.dtype is np.dtype("float"):
        # Use numpy
        # Table.__init__(Domain, numpy.ndarray)
        return Orange.data.Table(Orange.data.Domain(variable), series.values[:, np.newaxis])
    else:
        # Build instance list
        # Table.__init__(Domain, list_of_instances)
        tdomain = Orange.data.Domain(variable)
        tinsts = [Orange.data.Instance(tdomain, [i]) for i in series]
        return Orange.data.Table(tdomain, tinsts)
        # 5x performance


def column2df(col):
    if type(col.domain[0]) is Orange.feature.Continuous:
        return (col.domain[0].name, pd.Series(col.to_numpy()[0].flatten()))
    else:
        tmp = pd.Series(np.array(list(col)).flatten())  # type(tmp) -> np.array(dtype=list(Orange.data.Value))
        tmp = tmp.apply(lambda x: str(x[0]))
        return (col.domain[0].name, tmp)


def table2df(tab):
    # Orange.data.Table().to_numpy() cannot handle strings,
    # so we must build the array column by column;
    # when it comes to strings, a python list is used
    series = [column2df(tab.select(i)) for i in xrange(len(tab.domain))]
    series_name = [i[0] for i in series]  # To keep the order of variables unchanged
    series_data = dict(series)
    print series_data
    return pd.DataFrame(series_data, columns=series_name)
The answer below is from a closed issue on GitHub:
from Orange.data.pandas_compat import table_from_frame
out_data = table_from_frame(df)
Where df is your DataFrame. So far I've only noticed a need to manually define a domain to handle dates, if the data source wasn't 100% clean and to the required ISO standard.
I realize this is an old question and a lot has changed since it was first asked - but this question comes up at the top of Google search results on the topic.
from Orange.data.pandas_compat import table_from_frame,table_to_frame
df= table_to_frame(in_data)
#here you go
out_data = table_from_frame(df)
Based on the answer by Creo.
In order to convert pandas DataFrame to Orange Table you need to construct a domain, which specifies the column types.
For continuous variables, you only need to provide the name of the variable, but for Discrete variables, you also need to provide a list of all possible values.
The following code will construct a domain for your DataFrame and convert it to an Orange Table:
import numpy as np
from Orange.feature import Discrete, Continuous
from Orange.data import Domain, Table
domain = Domain([
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('hotel', values=[str(v) for v in np.unique(df.hotel)]),
    Continuous('star_rating'),
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('home_continent', values=[str(v) for v in np.unique(df.home_continent)]),
    Discrete('gender', values=['male', 'female'])], False)
table = Table(domain, [map(str, row) for row in df.as_matrix()])
The map(str, row) step is needed so Orange knows that the data contains values of discrete features (and not the indices of values in the values list).
This code is revised from @TurtleIzzy's answer for Python 3.
import numpy as np
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
def series2descriptor(d):
    if d.dtype is np.dtype("float") or d.dtype is np.dtype("int"):
        return ContinuousVariable(str(d.name))
    else:
        t = d.unique()
        t.sort()
        return DiscreteVariable(str(d.name), list(t.astype("str")))

def df2domain(df):
    featurelist = [series2descriptor(df.iloc[:, col]) for col in range(len(df.columns))]
    return Domain(featurelist)

def df2table(df):
    tdomain = df2domain(df)
    ttables = [series2table(df.iloc[:, i], tdomain[i]) for i in range(len(df.columns))]
    ttables = np.array(ttables).reshape((len(df.columns), -1)).transpose()
    return Table(tdomain, ttables)

def series2table(series, variable):
    if series.dtype is np.dtype("int") or series.dtype is np.dtype("float"):
        series = series.values[:, np.newaxis]
        return Table(series)
    else:
        series = series.astype('category').cat.codes.reshape((-1, 1))
        return Table(series)
Something like this?
table = Orange.data.Table(df.as_matrix())
The columns in Orange will get generic names (a1, a2, ...). If you want to copy the names and the types from the data frame, construct an Orange.data.Domain object (http://docs.orange.biolab.si/reference/rst/Orange.data.domain.html#Orange.data.Domain.init) from the data frame and pass it as the first argument above.
See the constructors in http://docs.orange.biolab.si/reference/rst/Orange.data.table.html.
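For illustration, a sketch of that approach with the Orange 2 API, assuming every column is numeric (discrete columns would need Orange.feature.Discrete descriptors, as in the other answers); the toy frame below is hypothetical:
import pandas as pd
import Orange

df = pd.DataFrame({'star_rating': [4.0, 3.0, 4.5],
                   'home_continent': [2, 2, 3]})

# Copy the column names from the data frame; False = no class variable
domain = Orange.data.Domain(
    [Orange.feature.Continuous(str(col)) for col in df.columns], False)
table = Orange.data.Table(domain, df.as_matrix())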
table_from_frame, which is available in Python 3, doesn't allow the definition of a class column, and therefore the generated table cannot be used directly to train a classification model. I tweaked the table_from_frame function so it allows the definition of a class column. Notice that the class name should be given as an additional parameter.
"""Pandas DataFrame↔Table conversion helpers"""
import numpy as np
import pandas as pd
from pandas.api.types import (
is_categorical_dtype, is_object_dtype,
is_datetime64_any_dtype, is_numeric_dtype,
)
from Orange.data import (
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
ContinuousVariable,
)
__all__ = ['table_from_frame', 'table_to_frame']
def table_from_frame(df,class_name, *, force_nominal=False):
"""
Convert pandas.DataFrame to Orange.data.Table
Parameters
----------
df : pandas.DataFrame
force_nominal : boolean
If True, interpret ALL string columns as nominal (DiscreteVariable).
Returns
-------
Table
"""
def _is_discrete(s):
return (is_categorical_dtype(s) or
is_object_dtype(s) and (force_nominal or
s.nunique() < s.size**.666))
def _is_datetime(s):
if is_datetime64_any_dtype(s):
return True
try:
if is_object_dtype(s):
pd.to_datetime(s, infer_datetime_format=True)
return True
except Exception: # pylint: disable=broad-except
pass
return False
# If df index is not a simple RangeIndex (or similar), put it into data
if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
df.index.is_monotonic_decreasing)):
df = df.reset_index()
attrs, metas,calss_vars = [], [],[]
X, M = [], []
# Iter over columns
for name, s in df.items():
name = str(name)
if name == class_name:
discrete = s.astype('category').cat
calss_vars.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
X.append(discrete.codes.replace(-1, np.nan).values)
elif _is_discrete(s):
discrete = s.astype('category').cat
attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
X.append(discrete.codes.replace(-1, np.nan).values)
elif _is_datetime(s):
tvar = TimeVariable(name)
attrs.append(tvar)
s = pd.to_datetime(s, infer_datetime_format=True)
X.append(s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
elif is_numeric_dtype(s):
attrs.append(ContinuousVariable(name))
X.append(s.values)
else:
metas.append(StringVariable(name))
M.append(s.values.astype(object))
return Table.from_numpy(Domain(attrs, calss_vars, metas),
np.column_stack(X) if X else np.empty((df.shape[0], 0)),
None,
np.column_stack(M) if M else None)
This works well
from Orange.data.pandas_compat import table_from_frame,table_to_frame
import pandas as pd
# read the input data into pandas data-frame
df= table_to_frame(in_data)
# perform all data operations / wrangling
# for example only few columns are required in output
df = df[['Col1', 'Col2']]
# Final output
out_data = table_from_frame(df)