I want to convert the following code (which runs in pandas) to code that runs in cuDF.
Sample data from .head() of Series being manipulated is plugged into OG code in the 3rd code cell down -- should be able to copy/paste run.
Original code in pandas
# Both derived columns come from the float column 'rawcensustractandblock',
# so everything below works on its string form.
def _block_from_digits(digits):
    """Turn the trailing digits of a raw value into a 4-character block id."""
    # Put a decimal point after the first four digits, then round to an integer.
    as_decimal = digits[:4] + '.' + digits[4:] + '0'
    rounded = int(round(float(as_decimal), 0))
    # Right-pad with zeros up to four characters.
    return str(rounded).ljust(4, '0')


# rawcensustractandblock as strings
s_rawcensustractandblock = df_train['rawcensustractandblock'].map(str)
# tract number: characters 4 through 10 of the string
df_train['census_tractnumber'] = s_rawcensustractandblock.str[4:11]
# block number: everything from character 11 onwards, normalised
df_train['block_number'] = s_rawcensustractandblock.str[11:].map(_block_from_digits)
Data being manipulated
# series of values from df_train.['rawcensustractandblock'].head()
data = pd.Series([60371066.461001, 60590524.222024, 60374638.00300401,
60372963.002002, 60590423.381006])
Code adjusted to start with this sample data
Here's how the code looks when using the above provided data instead of the entire dataframe.
Based on the errors encountered when trying to convert, the issue is at the Series level, so converting the cell below to execute in cuDF should solve the problem.
import pandas as pd
# Sample values taken from df_train['rawcensustractandblock'].head()
data = pd.Series([
    60371066.461001,
    60590524.222024,
    60374638.00300401,
    60372963.002002,
    60590423.381006,
])

# String form of every raw value
s_rawcensustractandblock = data.map(str)

# Tract number: characters 4 through 10 of the string
census_tractnumber = s_rawcensustractandblock.str[4:11]

# Block number: take everything from character 11 on, insert a decimal point
# after the first four digits, round to an integer and right-pad with zeros
# to four characters.
block_digits = s_rawcensustractandblock.str[11:]
block_number = pd.Series(
    [
        str(int(round(float(d[:4] + '.' + d[4:] + '0'), 0))).ljust(4, '0')
        for d in block_digits
    ],
    index=block_digits.index,
)
Expected changes (output)
df_train['census_tractnumber'].head()
# out
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
Name: census_tractnumber, dtype: object
df_train['block_number'].head()
0 1001
1 2024
2 3004
3 2002
4 1006
Name: block_number, dtype: object
You can use cuDF string methods (via nvStrings) for almost everything you're trying to do. You will lose some precision converting these floats to strings in cuDF (though it may not matter in your example above), so for this example I've simply converted beforehand. If possible, I would recommend initially creating the rawcensustractandblock as a string column rather than a float column.
import cudf
import pandas as pd
# Convert the values to strings in pandas first, then move the resulting
# string Series to cuDF.
# NOTE(review): pd_data is not defined in this snippet -- presumably the
# pandas Series of raw float values; confirm against the caller.
gdata = cudf.from_pandas(pd_data.astype('str'))
# Tract number: characters 4 through 10 of each string.
tractnumber = gdata.str.slice(4,11)
# Block number: everything from character 11 onwards.
blocknumber = gdata.str.slice(11)
# Re-join the first four characters and the remainder with '.' passed as the
# second argument of str.cat (presumably the separator -- confirm).
blocknumber = blocknumber.str.slice(0,4).str.cat(blocknumber.str.slice(4), '.')
# Parse as float, round to zero decimals and cast to integer.
blocknumber = blocknumber.astype('float').round(0).astype('int')
# Back to strings, right-padded with '0' to a width of four.
blocknumber = blocknumber.astype('str').str.ljust(4, '0')
tractnumber
0 1066.46
1 0524.22
2 4638.00
3 2963.00
4 0423.38
dtype: object
blocknumber
0 1001
1 2024
2 3004
3 2002
4 1006
dtype: object
for loop solution
pandas (original code)
import pandas as pd
# Sample values from df_train.rawcensustractandblock.head()
pd_data = pd.Series([60371066.461001, 60590524.222024, 60374638.00300401,
                     60372963.002002, 60590423.381006])


def _to_block_number(digits):
    """Normalise the trailing digits of a raw value to a 4-character string."""
    with_point = '{}.{}0'.format(digits[:4], digits[4:])
    whole = int(round(float(with_point), 0))
    return str(whole).ljust(4, '0')


# A Series is used here instead of the full dataframe.
pd_raw_block = pd_data.map(str)

# Tract number: characters 4 through 10.
pd_tractnumber = pd_raw_block.str[4:11]

# Block number: characters from position 11 on, normalised one value at a time.
pd_block_number = pd_raw_block.str[11:].map(_to_block_number)
# print(list(pd_tractnumber))
# print(list(pd_block_number))
cuDF (solution code)
import cudf
# Sample values from df_train.rawcensustractandblock.head()
cudf_data = cudf.Series([
    60371066.461001,
    60590524.222024,
    60374638.00300401,
    60372963.002002,
    60590423.381006,
])

# Tract number: keep characters 4 through 10 of every string value.
cudf_tractnumber = cudf_data.values_to_string()
for pos, raw in enumerate(cudf_tractnumber):
    cudf_tractnumber[pos] = raw[4:11]

# Block number: take the characters from position 11 on, insert a decimal
# point after the first four, round to an integer and right-pad with zeros
# to four characters.
cudf_block_number = cudf_data.values_to_string()
for pos, raw in enumerate(cudf_block_number):
    digits = raw[11:]
    with_point = digits[:4] + '.' + digits[4:] + '0'
    cudf_block_number[pos] = str(int(round(float(with_point), 0))).ljust(4, '0')
# print(cudf_tractnumber)
# print(cudf_block_number)
Related
I need to change the text variable in the data set below. Namely, each row has categorical values in the object format that need to be changed, depending on the last character in the data set. Below you can see my dataset.
import pandas as pd
import numpy as np
# Store names; every store gets a quantity of 1.
store_names = ['Lexinton1', 'ROYAl2', 'Mall1', 'Mall2', 'Levis1', 'Levis2',
               'Shark1', 'Shark', 'Lexinton']
data = {
    'stores': store_names,
    'quantity': [1] * len(store_names),
}
# Fix the column order explicitly.
df = pd.DataFrame(data, columns=['stores', 'quantity'])
df
Now I want to change this data depending on the last character. For example, if the last character is the number 1, I want to append the word open; if it is the number 2, I want to append closed. If it is not a number, nothing is added and the text stays the same. The desired output is shown below.
You can approach this by using pandas.Series.str and pandas.Series.map.
# Suffix that replaces each recognised trailing digit.
dmap = {1: "_open", 2: "_close"}
# Last character of every name as a number (NaN when it is not numeric),
# mapped to its suffix; names without a recognised trailing digit get NaN.
suffix = pd.to_numeric(df["stores"].str[-1], errors="coerce").map(dmap)
# Swap the trailing digit for its suffix only where a suffix was found.
# Every other name is kept untouched instead of losing its last character
# (e.g. "Shark" must stay "Shark", not become "Shar").
renamed = df["stores"].str[:-1] + suffix.fillna("")
df["stores"] = df["stores"].where(suffix.isna(), renamed)
Or simply by using pandas.Series.replace :
# "$" anchors each pattern at the end of the string, so only a trailing
# "1" or "2" is replaced; names matching neither pattern are left as they are.
df["stores"] = df["stores"].replace({"1$": "_open", "2$": "_close"}, regex=True)
Output :
print(df)
stores quantity
0 Lexinton_open 1
1 ROYAl_close 1
2 Mall_open 1
3 Mall_close 1
4 Levis_open 1
5 Levis_close 1
6 Shark_open 1
7 Shark 1
8 Lexinton 1
You can try this:
import pandas as pd
import numpy as np
data = {
    'stores': ['Lexinton1', 'ROYAl2', 'Mall1', 'Mall2', 'Levis1', 'Levis2',
               'Shark1', 'Shark', 'Lexinton'],
    'quantity': [1, 1, 1, 1, 1, 1, 1, 1, 1],
}

# Trailing status digit -> suffix that replaces it.
_STATUS_SUFFIX = {'1': '_open', '2': '_closed'}


def _label_store(name):
    """Return *name* with a trailing '1'/'2' replaced by '_open'/'_closed'.

    Names that end in anything else, including empty names, are returned
    unchanged.
    """
    # name[-1:] is '' for an empty string, so blank names pass through
    # instead of raising IndexError the way name[-1] would.
    suffix = _STATUS_SUFFIX.get(name[-1:])
    return name if suffix is None else name[:-1] + suffix


# Relabel every store before the DataFrame is built.
data['stores'] = [_label_store(name) for name in data['stores']]

df = pd.DataFrame(data, columns=['stores', 'quantity'])
I need to add seconds in YYYY-MM-DD-HH-MM-SS. My code works perfectly for one data point but not for the whole set. The data.txt consists of 7 columns and around 200 rows.
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
# Read the tab-separated file without a header row; the columns are then
# addressed by position (0..6) below.
df = pd.read_csv('data.txt',sep='\t',header=None)
# Copy each column into its own NumPy array.
a = np.array(list(df[0]))
b = np.array(list(df[1]))
c = np.array(list(df[2]))
d = np.array(list(df[3]))
e = np.array(list(df[4]))
f = np.array(list(df[5]))
g = np.array(list(df[6]))
# NOTE(review): a..f are whole arrays here, while datetime() expects a single
# integer per field, so this cannot build one timestamp per row.
t1=datetime(year=a, month=b, day=c, hour=d, minute=e, second=f)
# Shift the timestamp by the seconds held in the last column.
t = t1 + timedelta(seconds=g)
print(t)
You can pass the names parameter to read_csv to set the column names in the first step, then convert the first 6 columns to datetimes with to_datetime and add the seconds, converted to timedeltas with to_timedelta:
# Column labels: six date/time components followed by the seconds to add.
names = ["year","month","day","hour","minute","second","new"]
df = pd.read_csv('data.txt',sep='\t',names=names)
# Assemble the timestamp from the six date/time columns only: to_datetime
# raises on frames that carry extra, non-date columns such as "new".
# The "new" column is then added as a number of seconds.
df['out'] = pd.to_datetime(df[names[:-1]]) + pd.to_timedelta(df["new"], unit='s')
use apply with axis=1 to apply a function to every row of the dataframe.
# Build one timestamp per row: entries 0-5 of the row supply year..second and
# entry 6 the number of seconds to add; axis=1 hands each row to the lambda.
df.apply(lambda x: datetime(year=x[0],
month=x[1],
day=x[2],
hour=x[3],
minute=x[4],
second=x[5]) + timedelta(seconds=int(x[6])) , axis=1)
generating dataset
simple to do as pandas series
# Number of random rows to generate.
s = 20
# Stack seven random integer arrays (one per field) and transpose so that
# each row is one record. randint's upper bound is exclusive.
df = pd.DataFrame(np.array([np.random.randint(2015,2020,s),np.random.randint(1,12,s),np.random.randint(1,28,s),
np.random.randint(0,23,s), np.random.randint(0,59,s), np.random.randint(0,59,s),
np.random.randint(0,200,s)]).T,
columns=["year","month","day","hour","minute","second","add"])
# Assemble timestamps from the six date/time columns and add the "add" column
# as seconds, converting one value at a time with apply.
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + df["add"].apply(lambda s: pd.Timedelta(seconds=s))
without using apply()
pd.to_datetime(df.loc[:,["year","month","day","hour","minute","second"]]) + pd.to_timedelta(df["add"], unit="s")
Another question :) I need to know how to store a variable for reference at the beginning of my script. In this case, I am needing to store an FX conversion rate which I want to be able to adjust at the beginning of the script. I am also wanting to store a directory in my PC which will store the outputs of the script and is prone to changing, each month.
For reference, I have created the following example.
import pandas as pd
# Conversion rates, each wrapped in a one-element list.
FX_rate = {'AUD':[0.71442],'NZD':[0.68476]}
# NOTE(review): not a raw string, so the backslashes are read as escape
# sequences (\U in particular starts a Unicode escape).
Dir = 'C:\Users\Admin\Desktop\December\Monthly_Output.csv'
# NOTE(review): the closing brace of this dict literal is missing.
df = {'AU_SALES':[1000,2500,750,6800,1000],'NZ_SALES':[500,2200,430,100,6670]
df1 = pd.DataFrame(df)
# ISSUE HERE - Convert sales using the FX_rate dictionary
# NOTE(review): df is the plain dict, not the DataFrame df1, so both operands
# of * below are Python lists.
df_USDAUD = df['AU_SALES'] * FX_rate['AUD']
df_USDNZD = df['NZ_SALES'] * FX_rate['NZD']
df_converted = df_USDAUD.append(df_USDNZD)
# Save output in folder, using Dir directory
df_converted.to_csv(Dir)
If I were to run this script I would get an error telling me that the number of values in df['AU_SALES'] (5) and the number of values in FX_rate['AUD'] (1) do not match.
I do not know what your expected output should be, but I think this should work. Use multiply with the star operator *
import pandas as pd
# Rates are stored as one-element lists so they can be unpacked with *.
FX_rate = {'AUD':[0.71442],'NZD':[0.68476]}
# Raw string keeps the Windows backslashes literal.
Dir = r'C:\Users\Admin\Desktop\December\Monthly_Output.csv'
df = {'AU_SALES':[1000,2500,750,6800,1000],'NZ_SALES':[500,2200,430,100,6670]}
df1 = pd.DataFrame(df)
# use multiply with the star operator: *[rate] unpacks the single rate into
# the scalar argument of Series.multiply
df_USDAUD = df1['AU_SALES'].multiply(*FX_rate['AUD'])
df_USDNZD = df1['NZ_SALES'].multiply(*FX_rate['NZD'])
# Stack the two converted columns (index labels are kept as they are).
# pd.concat is used because Series.append was removed in pandas 2.0.
df_converted = pd.concat([df_USDAUD, df_USDNZD])
print(df_converted)
0 714.4200
1 1786.0500
2 535.8150
3 4858.0560
4 714.4200
0 342.3800
1 1506.4720
2 294.4468
3 68.4760
4 4567.3492
dtype: float64
Or you can create a function
# create a function
def myFun(df, aud, nzd, Dir):
    """Convert AU and NZ sales with the given rates, save and return them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'AU_SALES' and 'NZ_SALES' columns.
    aud, nzd : number
        Multipliers applied to 'AU_SALES' and 'NZ_SALES' respectively.
    Dir : str
        Path of the CSV file the converted values are written to.

    Returns
    -------
    pandas.Series
        Converted AU sales followed by converted NZ sales, each keeping its
        original index labels.
    """
    df_USDAUD = df['AU_SALES'] * aud
    df_USDNZD = df['NZ_SALES'] * nzd
    # pd.concat is used because Series.append was removed in pandas 2.0.
    df_converted = pd.concat([df_USDAUD, df_USDNZD])
    df_converted.to_csv(Dir)
    return df_converted
Dir = r'C:\Users\Admin\Desktop\December\Monthly_Output.csv'
df = {'AU_SALES':[1000,2500,750,6800,1000],'NZ_SALES':[500,2200,430,100,6670]}
df1 = pd.DataFrame(df)
myFun(df1, 0.71442, 0.68476, Dir)
or just do not store the numbers in a list inside the dict: FX_rate = {'AUD':0.71442,'NZD':0.68476}
# Rates stored as plain numbers, so no unpacking is needed.
FX_rate = {'AUD':0.71442,'NZD':0.68476}
# Raw string keeps the Windows backslashes literal.
Dir = r'C:\Users\Admin\Desktop\December\Monthly_Output.csv'
df = {'AU_SALES':[1000,2500,750,6800,1000],'NZ_SALES':[500,2200,430,100,6670]}
df1 = pd.DataFrame(df)
# multiply each sales column directly by its scalar rate
df_USDAUD = df1['AU_SALES'] * FX_rate['AUD']
df_USDNZD = df1['NZ_SALES'] * FX_rate['NZD']
# Stack the two converted columns (index labels are kept as they are).
# pd.concat is used because Series.append was removed in pandas 2.0.
df_converted = pd.concat([df_USDAUD, df_USDNZD])
print(df_converted)
Here is my code and the warning message. If I change s to be a standalone Series by using s = pd.Series(np.random.randn(5)), there are no such warnings. Using Python 2.7 on Windows.
It seems that a standalone Series and a Series created from a column of a data frame behave differently? Thanks.
My purpose is to change the Series values themselves, rather than change a copy.
Source code,
import pandas as pd
# Skip the first line of the file and read the columns by position: the
# first three as strings, the fourth as float.
sample = pd.read_csv('123.csv', header=None, skiprows=1,
dtype={0:str, 1:str, 2:str, 3:float})
# Name the four columns.
sample.columns = pd.Index(data=['c_a', 'c_b', 'c_c', 'c_d'])
# Store the last column as 64-bit integers.
sample['c_d'] = sample['c_d'].astype('int64')
# s is the 'c_d' column taken from the DataFrame.
s = sample['c_d']
#s = pd.Series(np.random.randn(5))
# Add 1 to positive values and subtract 1 from all others, one element at a
# time; these element assignments are where the warning below is reported.
for i in range(len(s)):
if s.iloc[i] > 0:
s.iloc[i] = s.iloc[i] + 1
else:
s.iloc[i] = s.iloc[i] - 1
Warning message,
C:\Python27\lib\site-packages\pandas\core\indexing.py:132: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._setitem_with_indexer(indexer, value)
Content of 123.csv,
c_a,c_b,c_c,c_d
hello,python,numpy,0.0
hi,python,pandas,1.0
ho,c++,vector,0.0
ho,c++,std,1.0
go,c++,std,0.0
Edit 1, seems lambda solution does not work, tried to print s before and after, the same value,
import pandas as pd
# Same loading steps as before: skip the first line, read columns by position.
sample = pd.read_csv('123.csv', header=None, skiprows=1,
dtype={0:str, 1:str, 2:str, 3:float})
sample.columns = pd.Index(data=['c_a', 'c_b', 'c_c', 'c_d'])
sample['c_d'] = sample['c_d'].astype('int64')
s = sample['c_d']
print s
# The value returned by apply() is not assigned to anything here, which is
# why the second print shows the same values as the first.
s.apply(lambda x:x+1 if x>0 else x-1)
print s
0 0
1 1
2 0
3 1
4 0
Name: c_d, dtype: int64
Backend TkAgg is interactive backend. Turning interactive mode on.
0 0
1 1
2 0
3 1
4 0
regards,
Lin
By doing s = sample['c_d'], if you make a change to the value of s then your original Dataframe sample also changes. That's why you got the warning.
You can do s = sample['c_d'].copy() instead, so that changing the value of s doesn't change the value of the c_d column of the DataFrame sample.
I suggest you use apply function instead:
# apply() returns a new Series instead of editing s in place, so the result
# has to be assigned back for the change to be visible.
s = s.apply(lambda x: x + 1 if x > 0 else x - 1)
I notice that this is an issue on GitHub already. Does anyone have any code that converts a Pandas DataFrame to an Orange Table?
Explicitly, I have the following table.
user hotel star_rating user home_continent gender
0 1 39 4.0 1 2 female
1 1 44 3.0 1 2 female
2 2 63 4.5 2 3 female
3 2 2 2.0 2 3 female
4 3 26 4.0 3 1 male
5 3 37 5.0 3 1 male
6 3 63 4.5 3 1 male
The documentation of the Orange package doesn't cover all the details. Table.__init__(Domain, numpy.ndarray) works only for int and float according to lib_kernel.cpp.
They really should provide an C-level interface for pandas.DataFrames, or at least numpy.dtype("str") support.
Update: Adding table2df, df2table performance improved greatly by utilizing numpy for int and float.
Keep this piece of script in your orange python script collections, now you are equipped with pandas in your orange environment.
Usage: a_pandas_dataframe = table2df( a_orange_table ) , a_orange_table = df2table( a_pandas_dataframe )
Note: This script works only in Python 2.x, refer to #DustinTang 's answer for Python 3.x compatible script.
import pandas as pd
import numpy as np
import Orange
#### For those who are familiar with pandas
#### Correspondence:
#### value <-> Orange.data.Value
#### NaN <-> ["?", "~", "."] # Don't know, Don't care, Other
#### dtype <-> Orange.feature.Descriptor
#### category, int <-> Orange.feature.Discrete # category: > pandas 0.15
#### int, float <-> Orange.feature.Continuous # Continuous = core.FloatVariable
#### # refer to feature/__init__.py
#### str <-> Orange.feature.String
#### object <-> Orange.feature.Python
#### DataFrame.dtypes <-> Orange.data.Domain
#### DataFrame.DataFrame <-> Orange.data.Table = Orange.orange.ExampleTable
#### # You will need this if you are reading sources
def series2descriptor(d, discrete=False):
    """Return the Orange feature descriptor for a pandas Series.

    Parameters
    ----------
    d : pandas.Series
        Column to describe; its name becomes the descriptor name.
    discrete : bool
        If True, a non-numeric column is always described as Discrete.

    Returns
    -------
    Orange.feature.Continuous for float and int columns (int with zero
    decimals); otherwise Orange.feature.Discrete when ``discrete`` is set or
    fewer than half of the values are distinct, else Orange.feature.String.
    """
    # Compare dtypes by equality: identity (`is`) only holds when NumPy hands
    # back the very same dtype object.
    if d.dtype == np.dtype("float"):
        return Orange.feature.Continuous(str(d.name))
    if d.dtype == np.dtype("int"):
        return Orange.feature.Continuous(str(d.name), number_of_decimals=0)
    t = d.unique()
    if discrete or len(t) < len(d) / 2:
        # Sorted distinct values, as strings, are the allowed values.
        t.sort()
        return Orange.feature.Discrete(str(d.name), values=list(t.astype("str")))
    return Orange.feature.String(str(d.name))
def df2domain(df):
    """Build an Orange.data.Domain with one descriptor per DataFrame column."""
    # Columns are taken by position with iloc, which replaces the deprecated
    # DataFrame.icol accessor; range works on both Python 2 and Python 3.
    featurelist = [series2descriptor(df.iloc[:, col]) for col in range(len(df.columns))]
    return Orange.data.Domain(featurelist)
def df2table(df):
    """Convert a pandas DataFrame into an Orange.data.Table.

    Each column is converted to its own single-column table, and the list of
    those tables is handed to the Orange.data.Table constructor.
    """
    # It seems they are using native python object/lists internally for Orange.data types (?)
    # And I didn't find a constructor suitable for pandas.DataFrame since it may carry
    # multiple dtypes
    #  --> the best approximate is Orange.data.Table.__init__(domain, numpy.ndarray),
    #  --> but the dtype of numpy array can only be "int" and "float"
    #  -->  * refer to src/orange/lib_kernel.cpp 3059:
    #  -->  *    if (((*vi)->varType != TValue::INTVAR) && ((*vi)->varType != TValue::FLOATVAR))
    #  --> Documents never mentioned >_<
    # So we use numpy constructor for those int/float columns, python list constructor for other
    tdomain = df2domain(df)
    # Columns are taken by position with iloc, which replaces the deprecated
    # DataFrame.icol accessor; range works on both Python 2 and Python 3.
    ttables = [series2table(df.iloc[:, i], tdomain[i]) for i in range(len(df.columns))]
    return Orange.data.Table(ttables)
# For performance concerns, here are my results
# dtndarray = np.random.rand(100000, 100)
# dtlist = list(dtndarray)
# tdomain = Orange.data.Domain([Orange.feature.Continuous("var" + str(i)) for i in xrange(100)])
# tinsts = [Orange.data.Instance(tdomain, list(dtlist[i]) )for i in xrange(len(dtlist))]
# t = Orange.data.Table(tdomain, tinsts)
#
# timeit list(dtndarray) # 45.6ms
# timeit [Orange.data.Instance(tdomain, list(dtlist[i])) for i in xrange(len(dtlist))] # 3.28s
# timeit Orange.data.Table(tdomain, tinsts) # 280ms
# timeit Orange.data.Table(tdomain, dtndarray) # 380ms
#
# As illustrated above, utilizing constructor with ndarray can greatly improve performance
# So one may conceive better converter based on these results
def series2table(series, variable):
    """Convert one pandas Series into a single-column Orange.data.Table.

    Parameters
    ----------
    series : pandas.Series
        Column values.
    variable : Orange feature descriptor
        Descriptor used as the only variable of the table's domain.
    """
    # Compare dtypes by equality: identity (`is`) only holds when NumPy hands
    # back the very same dtype object.
    if series.dtype == np.dtype("int") or series.dtype == np.dtype("float"):
        # Use numpy
        # Table.__init__(Domain, numpy.ndarray)
        return Orange.data.Table(Orange.data.Domain(variable), series.values[:, np.newaxis])
    # Build instance list
    # Table.__init__(Domain, list_of_instances)
    tdomain = Orange.data.Domain(variable)
    tinsts = [Orange.data.Instance(tdomain, [i]) for i in series]
    return Orange.data.Table(tdomain, tinsts)
# 5x performance
def column2df(col):
    """Convert a single-column Orange table into a ``(name, pandas.Series)`` pair."""
    variable = col.domain[0]
    if type(variable) is not Orange.feature.Continuous:
        # Non-continuous column: go through a Python list, flatten it, and
        # keep str() of the first item of each element.
        flattened = pd.Series(np.array(list(col)).flatten())
        as_text = flattened.apply(lambda x: str(x[0]))
        return (variable.name, as_text)
    # Continuous column: take the first array returned by to_numpy().
    return (variable.name, pd.Series(col.to_numpy()[0].flatten()))
def table2df(tab):
    """Convert an Orange.data.Table into a pandas DataFrame.

    The frame is built one column at a time and keeps the order of the
    variables in ``tab.domain``.
    """
    # Orange.data.Table().to_numpy() cannot handle strings
    # So we must build the array column by column,
    # When it comes to strings, python list is used
    series = [column2df(tab.select(i)) for i in range(len(tab.domain))]
    series_name = [name for name, _ in series]  # To keep the order of variables unchanged
    series_data = dict(series)
    # The leftover debug print of series_data was dropped: it dumped every
    # column on each call and was a Python 2-only print statement.
    return pd.DataFrame(series_data, columns=series_name)
Answer below from a closed issue on github
from Orange.data.pandas_compat import table_from_frame
out_data = table_from_frame(df)
Where df is your dataFrame. So far I've only noticed a need to manually define a domain to handle dates if the data source wasn't 100% clean and to the required ISO standard.
I realize this is an old question and a lot changed from when it was first asked - but this question comes up top on google search results on the topic.
from Orange.data.pandas_compat import table_from_frame,table_to_frame
df= table_to_frame(in_data)
#here you go
out_data = table_from_frame(df)
based on answer of Creo
In order to convert pandas DataFrame to Orange Table you need to construct a domain, which specifies the column types.
For continuous variables, you only need to provide the name of the variable, but for Discrete variables, you also need to provide a list of all possible values.
The following code will construct a domain for your DataFrame and convert it to an Orange Table:
import numpy as np
from Orange.feature import Discrete, Continuous
from Orange.data import Domain, Table
# One variable per DataFrame column, in column order. Discrete variables list
# every possible value as a string; Continuous ones only need a name.
# NOTE(review): the trailing False presumably tells Orange not to treat the
# last variable as the class variable -- confirm against the Domain docs.
domain = Domain([
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('hotel', values=[str(v) for v in np.unique(df.hotel)]),
    Continuous('star_rating'),
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('home_continent', values=[str(v) for v in np.unique(df.home_continent)]),
    Discrete('gender', values=['male', 'female'])], False)
# Every cell is passed as a string. df.values replaces DataFrame.as_matrix(),
# which was removed in pandas 1.0, and the inner list comprehension yields
# real lists on Python 3, where map() would be lazy.
table = Table(domain, [[str(v) for v in row] for row in df.values])
The map(str, row) step is needed so Orange know that the data contains values of discrete features (and not the indices of values in the values list).
This code is revised from #TurtleIzzy for Python3.
import numpy as np
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
def series2descriptor(d):
    """Return the Orange variable describing a pandas Series.

    Float and int columns become a ContinuousVariable; every other column
    becomes a DiscreteVariable whose values are the sorted distinct values of
    the column, converted to strings.
    """
    # Compare dtypes by equality: identity (`is`) only holds when NumPy hands
    # back the very same dtype object.
    if d.dtype == np.dtype("float") or d.dtype == np.dtype("int"):
        return ContinuousVariable(str(d.name))
    t = d.unique()
    t.sort()
    return DiscreteVariable(str(d.name), list(t.astype("str")))
def df2domain(df):
    """Build an Orange Domain with one variable per DataFrame column."""
    variables = []
    # Walk the columns by position so the domain follows the frame's order.
    for position in range(len(df.columns)):
        variables.append(series2descriptor(df.iloc[:, position]))
    return Domain(variables)
def df2table(df):
    """Convert a pandas DataFrame into an Orange Table."""
    tdomain = df2domain(df)
    n_columns = len(df.columns)
    # Convert each column on its own, paired with its domain variable.
    per_column = []
    for i in range(n_columns):
        per_column.append(series2table(df.iloc[:, i], tdomain[i]))
    # One row per column after the reshape; transposing gives one row per record.
    stacked = np.array(per_column).reshape((n_columns, -1))
    return Table(tdomain, stacked.transpose())
def series2table(series, variable):
    """Convert one pandas Series into a single-column Orange Table.

    Float and int columns are passed through as they are; every other column
    is replaced by its category codes. ``variable`` is accepted for symmetry
    with the caller but is not used.
    """
    # Compare dtypes by equality: identity (`is`) only holds when NumPy hands
    # back the very same dtype object.
    if series.dtype == np.dtype("int") or series.dtype == np.dtype("float"):
        series = series.values[:, np.newaxis]
        return Table(series)
    # .cat.codes is a pandas Series, which no longer has reshape(); reshape
    # the underlying NumPy array into a single column instead.
    series = series.astype('category').cat.codes.values.reshape((-1, 1))
    return Table(series)
Something like this?
table = Orange.data.Table(df.as_matrix())
The columns in Orange will get generic names (a1, a2...). If you want to copy the names and the types from the data frame, construct Orange.data.Domain object (http://docs.orange.biolab.si/reference/rst/Orange.data.domain.html#Orange.data.Domain.init) from the data frame and pass it as the first argument above.
See the constructors in http://docs.orange.biolab.si/reference/rst/Orange.data.table.html.
table_from_frame, which is available in Python 3, doesn't allow the definition of a class column and therefore, the generated table cannot be used directly to train a classification model. I tweaked the table_from_frame function so it'll allow the definition of a class column. Notice that the class name should be given as an additional parameter.
"""Pandas DataFrame↔Table conversion helpers"""
import numpy as np
import pandas as pd
from pandas.api.types import (
is_categorical_dtype, is_object_dtype,
is_datetime64_any_dtype, is_numeric_dtype,
)
from Orange.data import (
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
ContinuousVariable,
)
__all__ = ['table_from_frame', 'table_to_frame']
def table_from_frame(df, class_name=None, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Parameters
    ----------
    df : pandas.DataFrame
    class_name : str, optional
        Name of the column to use as the class variable. That column is
        encoded as a DiscreteVariable. When omitted, or when no column has
        this name, the resulting domain has no class variable.
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        # Categorical columns, and object columns that are either forced
        # nominal or have few distinct values relative to their size.
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        try:
            if is_object_dtype(s):
                # Object columns count as datetime when they parse cleanly.
                pd.to_datetime(s, infer_datetime_format=True)
                return True
        except Exception:  # pylint: disable=broad-except
            # Deliberate best effort: anything unparseable is not a datetime.
            pass
        return False

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas, class_vars = [], [], []
    # Attribute, class and meta values are collected separately so each array
    # lines up with its part of the domain, wherever the class column sits
    # in the frame.
    X, Y, M = [], [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if name == class_name:
            discrete = s.astype('category').cat
            class_vars.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
            # Code -1 marks a missing category; store it as NaN.
            Y.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            X.append(s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    # Pass the class values explicitly. NOTE(review): with Y=None,
    # Table.from_numpy appears to take the class values from the trailing
    # columns of X, which is only right when the class column is the last
    # attribute-like column -- confirm for your Orange version.
    return Table.from_numpy(Domain(attrs, class_vars, metas),
                            np.column_stack(X) if X else np.empty((df.shape[0], 0)),
                            np.column_stack(Y) if Y else None,
                            np.column_stack(M) if M else None)
This works well
from Orange.data.pandas_compat import table_from_frame,table_to_frame
import pandas as pd
# read the input data into pandas data-frame
df= table_to_frame(in_data)
# perform all data operations / wrangling
# for example only few columns are required in output
df = df[['Col1', 'Col2']]
# Final output
out_data = table_from_frame(df)