Losing variables in Jupyter Notebook - python

In a jupyter notebook, I declare one variable from file:
with fits.open('mind_dataset/matrix_CEREBELLUM_large.fits') as data:
matrix_cerebellum = pd.DataFrame(data[0].data.byteswap().newbyteorder())
In the cells below, I have two methods:
neuronal_web_pixel = 0.32  # microns per pixel (1 micron = 1e-6 meters)


def pixels_to_scale(df, mind=False, cosmos=False):
    """Scale the non-zero entries of ``df`` by ``neuronal_web_pixel``.

    Entries equal to 0.0 are left untouched; every other entry is multiplied
    by the pixel-to-micron factor.

    The assignment writes into ``df`` itself and that same object is
    returned, so no new DataFrame is created.

    ``mind`` and ``cosmos`` are accepted but not used.
    """
    brain_mask = df != 0.0
    # In-place update: only the cells selected by the mask are rewritten.
    df[brain_mask] *= neuronal_web_pixel
    return df
def binarize_matrix(df, mind=False, cosmos=False):
    """Binarize ``df`` against the 16-micron linking length.

    Non-zero entries that are <= 16.0 become 1.0, entries above 16.0 become
    0.0, and entries that are already 0 stay 0.

    The assignment writes into ``df`` itself and that same object is
    returned, so no new DataFrame is created.

    ``mind`` and ``cosmos`` are accepted but not used.
    """
    brain_Llink = 16.0  # microns
    # Both masks are taken from the untouched data so that the first
    # assignment cannot influence the second one.
    linked_mask = (df != 0) & (df <= brain_Llink)
    unlinked_mask = df > brain_Llink
    df[linked_mask] = 1.0
    df[unlinked_mask] = 0.0
    return df
Then I pass my variables to methods, to obtain scaled and binary dataframes:
matrix_cerebellum_scaled = pixels_to_scale(matrix_cerebellum, mind=True)
matrix_cerebellum_binary = binarize_matrix(matrix_cerebellum_scaled, mind=True)
However, if I call 'matrix_cerebellum_scaled', now it points to 'matrix_cerebellum_binary' and I lose 'matrix_cerebellum_scaled' dataframe.
Why? what am I missing?

A note on naming: those are functions, not methods. As for the problem: when you modify a DataFrame inside a function, the changes are applied to the caller's DataFrame as well, because the function receives a reference to the same object rather than a copy. If you want a new DataFrame, create it as a copy of the one being passed in.
At the very least at the top of binarize_matrix() do: new_df = df.copy(). More detail about why that's necessary in this SO answer and comments: https://stackoverflow.com/a/39628860/42346


Pandas complex math in groupby+aggregation

I want to run some complex math while aggregating. I wrote the aggregation function:
import math as mt
# convert calc cols to float from object
cols = dfg_dom_smry.columns
cols = cols[2:]
for col in cols:
df[col] = df[col].astype(float)
# groupby fields
index = ['PA']
df = dfdom.groupby(index).agg({'newcol1': (mt.sqrt(sum('savings'*'rp')**2))/sum('savings')})
I got an error: TypeError: can't multiply sequence by non-int of type 'str'
This is an extract of my data. The full data has many set of savings and rp columns. So ideally I want to run a for loop for each set of savings and rp columns
PA domain savings rp
M M-RET-COM 383,895.36 0.14
P P-RET-AG 14,302,804.19 0.16
P P-RET-COM 56,074,119.28 0.33
P P-RET-IND 46,677,610.00 0.27
P P-SBD/NC-AG 1,411,905.00 -
P P-SBD/NC-COM 4,255,891.25 0.36
P P-SBD/NC-IND 295,365.00 -
S S-RET-AG 2,391,504.33 0.72
S S-RET-COM 19,195,073.84 0.18
S S-RET-IND 17,677,708.38 0.13
S S-SBD/NC-COM 6,116,407.07 0.05
D D-RET-COM 11,944,490.39 0.15
D D-RET-IND 1,213,117.63 -
D D-SBD/NC-COM 2,708,153.57 0.69
For the above data this would be the final result:
PA newcol1
M 0.143027374757981
P 0.18601700701305
S 0.0979541706738756
D 0.166192684106493
thanks for your help
What about
# Per group: sqrt(sum((savings * rp)**2)) / sum(savings).
# `or 1` replaces a zero savings total with 1 so the division cannot fail.
o = dfdom.groupby(index).apply(
    lambda s: pow(pow(s.savings*s.rp, 2).sum(), .5)/(s.savings.sum() or 1)
)
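Here s is the sub-DataFrame that groupby passes to the lambda for each group, so s.savings and s.rp are that group's columns (each a pandas.Series).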
Also, note that o is an instance of pandas.Series, which means that you will have to convert it into a pandas.DataFrame, at least to justify the name you give it, i.e. df. You can do so by doing:
df = o.to_frame('the column name you want')
Put differently/parametrically
def rollup(df, index, svgs, rp, col_name):
    """Return sqrt(sum((svgs * rp)**2)) / sum(svgs) for every ``index`` group.

    Parameters
    ----------
    df : pandas.DataFrame holding the grouping, savings and rp columns.
    index : column label(s) to group by.
    svgs, rp : names of the savings column and of its matching rp column.
    col_name : accepted for the caller's bookkeeping; not used here.

    Returns
    -------
    pandas.Series indexed by group.  A group whose savings sum to zero is
    divided by 1 instead, so it yields 0 rather than a division error.
    """
    # Hand only the two needed columns to apply(), so the function never
    # operates on the grouping columns themselves.
    needed = [svgs] if svgs == rp else [svgs, rp]

    def _ratio(group):
        total = group[svgs].sum()
        return pow(pow(group[svgs] * group[rp], 2).sum(), .5) / (total or 1)

    return df.groupby(index)[needed].apply(_ratio)
# rollup(dfdom, index, 'savings', 'rp', 'svgs_rp')
update: I replaced the code below with that in the accepted answer.
This is what I finally did. I created a function to step through each of the calculations and call the function for each set of savings and rp cols.
# the function
def rollup(df, index, svgs, rp):
df['svgs_rp'] = (df[svgs]*df[rp])**2
df2 = df.groupby(index).agg({'svgs_rp':'sum',
df2['temp'] = np.where((df2[svgs] == 0), '', ((df2['svgs_rp']**(1/2))/df2[svgs]))
df2 = df2['temp']
return df2
#calling the function
index = ['PA']
# the next part is within a for loop to go through all the savings and rp column sets. For simplicity I've removed the loop.
svgs = 'savings1'
rp = 'rp1'
dftemp = rollup(dfdom, index, svgs, rp)
dftemp.rename({'temp': newcol1}, axis=1, inplace=True)
df = pd.merge(df, dftemp, on=index, how = 'left') # this last step was not put in my original question. I've provided so the r-code below makes sense.
annoying that I have to first do the math in new columns and then aggregate. This is the equivalent code in R:
# the function
roll_up <- function(savings,rp){
# calling the function
df <- df[dfdom[,.(newcol1=roll_up(savings1,rp1),
I'm relatively new to Python programming, and this is the best I could come up with. If there is a better way to do this, please share. Thanks.
Your groupby should have () and then the [] like this:
df = dfdom.groupby([index]).agg({'newcol1': (mt.sqrt(sum('savings'*'rp')^2))/sum('savings')})

Python Pandas rolling mean DataFrame Constructor not properly called

I am trying to create a simple time-series, of different rolling types. One specific example, is a rolling mean of N periods using the Panda python package.
I get the following error : ValueError: DataFrame constructor not properly called!
Below is my code :
def py_TA_MA(v, n, AscendType):
df = pd.DataFrame(v, columns=['Close'])
df = df.sort_index(ascending=AscendType) # ascending/descending flag
M = pd.Series(df['Close'].rolling(n), name = 'MovingAverage_' + str(n))
df = df.join(M)
df = df.sort_index(ascending=True) #need to double-check this
return df
Would anyone be able to advise?
Kind regards
Found the fix! After correcting the rolling call, a new error appeared, which I resolved by explicitly declaring n as an integer. The code below works:
#xw.arg('n', numbers = int, doc = 'this is the rolling window')
def py_TA_MA(v, n, AscendType):
    """Return ``v`` as a 'Close' column plus its ``n``-period rolling mean.

    Parameters
    ----------
    v : data accepted by ``pd.DataFrame(v, columns=['Close'])``.
    n : rolling window length.  Integral floats such as ``2.0`` are
        accepted and converted, because ``rolling`` requires an integer.
    AscendType : direction in which the index is sorted before the rolling
        mean is computed (True = ascending, False = descending).

    Returns
    -------
    pandas.DataFrame with columns 'Close' and 'Moving Average', always
    sorted by ascending index.

    Raises
    ------
    ValueError if ``n`` is not an integral number.
    """
    window = int(n)
    if window != n:
        raise ValueError("n must be an integral number of periods")

    df = pd.DataFrame(v, columns=['Close'])
    df = df.sort_index(ascending=AscendType)  # ascending/descending flag
    moving_average = (
        df['Close'].rolling(window=window).mean().rename('Moving Average')
    )
    # join() aligns on the index, so the row order chosen above is kept.
    df = df.join(moving_average)
    return df.sort_index(ascending=True)

Pandas apply a change which affects 2 columns at the same time

I have the dataframe below. bet_pl and co_pl keep track of the daily changes in the 2 balances. I have updated co_balance based on co_pl and the cumsum.
init_balance = D('100.0')
co_thresh = D('1.05') * init_balance
def get_pl_co(row):
if row['eod_bet_balance'] > co_thresh:
diff = row['eod_bet_balance']- co_thresh
return Decimal('0.0')
df_odds_winloss['eod_bet_balance'] = df_odds_winloss['bet_pl'].cumsum()+initial_balance
df_odds_winloss['sod_bet_balance']= df_odds_winloss['eod_bet_balance'].shift(1).fillna(init_balance)
df_odds_winloss['co_pl'] = df_odds_winloss.apply(get_pl_co, axis=1)
df_odds_winloss['co_balance'] = df_odds_winloss['co_pl'].cumsum()
# trying this
df_odds_winloss['eod_bet_balance'] = df_odds_winloss['eod_bet_balance'] - df_odds_winloss['co_pl']
Now I want the eod_bet_balance to update with negative co_pl as it is a transfer between the 2 balances, but am not getting the right eod (end of day) balances.
Can anyone give a hint?
UPDATED: The eod_balances reflect the change in bet_pl but not the subsequent change in co_pl.
initial_balance = D('100.0')
df = pd.DataFrame({ 'SP': res_df['SP'], 'winloss': bin_seq_l}, columns=['SP', 'winloss'])
df['bet_pl'] = df.apply(get_pl_lvl, axis=1)
df['interim_balance'] = df_odds_winloss['bet_pl'].cumsum()+initial_balance
df['co_pl'] = (df['interim_balance'] - co_thresh).clip_lower(0)
df['co_balance'] = df_odds_winloss['co_pl'].cumsum()
df['post_co_balance'] = df['interim_balance'] - df['co_pl']
bf_r = D('0.05')
df['post_co_deduct_balance'] = df['post_co_balance'] - (df['post_co_balance']* bf_r)
df['sod_bet_balance'] = df['post_co_deduct_balance'].shift(1).fillna(init_balance)
First, you don't need to apply a custom function to get co_pl, it could be done like so:
# clip(lower=0) floors negative differences at zero; it replaces
# Series.clip_lower, which has been removed from pandas.
df['co_pl'] = (df['eod_bet_balance'] - co_thresh).clip(lower=0)
As for updating the other column, if I understand correctly you want something like this:
# clip(upper=...) caps the balance at the threshold; it replaces
# Series.clip_upper, which has been removed from pandas.
df['eod_bet_balance'] = df['eod_bet_balance'].clip(upper=co_thresh)
or, equivalently...
df['eod_bet_balance'] -= df['co_pl']

Python Pandas Merge Causing Memory Overflow

I'm new to Pandas and am trying to merge a few subsets of data. I'm giving a specific case where this happens, but the question is general: How/why is it happening and how can I work around it?
The data I load is around 85 Megs or so but I often watch my python session run up close to 10 gigs of memory usage then give a memory error.
I have no idea why this happens, but it's killing me as I can't even get started looking at the data the way I want to.
Here's what I've done:
Importing the Main data
import requests, zipfile, StringIO
import numpy as np
import pandas as pd
STAR2013fileName = 'ca2013_all_csv_v3.txt'
r = requests.get(STAR2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
Importing some Cross-Referencing Tables
STARentityList2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/ca2013entities_csv.zip"
STARentityList2013fileName = "ca2013entities_csv.txt"
r = requests.get(STARentityList2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STARlookUpTestID2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/tests.zip"
STARlookUpTestID2013fileName = "Tests.txt"
r = requests.get(STARlookUpTestID2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
STARlookUpSubgroupID2013url = "http://www3.cde.ca.gov/starresearchfiles/2013/p3/subgroups.zip"
STARlookUpSubgroupID2013fileName = "Subgroups.txt"
r = requests.get(STARlookUpSubgroupID2013url)
z = zipfile.ZipFile(StringIO.StringIO(r.content))
Renaming a Column ID to Allow for Merge
STARlookUpSubgroupID2013 = STARlookUpSubgroupID2013.rename(columns={'001':'Subgroup ID'})
Successful Merge
merged = pd.merge(STAR2013,STARlookUpSubgroupID2013, on='Subgroup ID')
Try a second merge. This is where the Memory Overflow Happens
merged=pd.merge(merged, STARentityList2013, on='School Code')
I did all of this in ipython notebook, but don't think that changes anything.
Although this is an old question, I recently came across the same problem.
In my instance, duplicate keys are required in both dataframes, and I needed a method which could tell if a merge will fit into memory ahead of computation, and if not, change the computation method.
The method I came up with is as follows:
Calculate merge size:
def merge_size(left_frame, right_frame, group_by, how='inner'):
    """Return the number of rows a merge on ``group_by`` would produce.

    The merge itself is not performed; only per-key row counts are combined.

    Parameters
    ----------
    left_frame, right_frame : pandas.DataFrame
    group_by : a single column label present in both frames (a list of
        labels is not supported).
    how : 'inner', 'left' or 'right'; any other value is counted as an
        outer merge.

    Returns
    -------
    int : the row count of the merged frame.
    """
    left_groups = left_frame.groupby(group_by).size()
    right_groups = right_frame.groupby(group_by).size()

    # groupby() drops missing keys, so rows with a missing key are counted
    # separately.  A merge pairs missing keys with each other.
    left_nan = int(left_frame[group_by].isna().sum())
    right_nan = int(right_frame[group_by].isna().sum())

    # Keys present on both sides contribute left_count * right_count rows.
    # Index alignment yields NaN for keys found on one side only, and sum()
    # skips those.
    matched = int((left_groups * right_groups).sum()) + left_nan * right_nan

    # Rows whose key has no partner on the other side survive only in the
    # merge types that keep that side.
    left_only = int(
        left_groups[~left_groups.index.isin(right_groups.index)].sum()
    )
    right_only = int(
        right_groups[~right_groups.index.isin(left_groups.index)].sum()
    )
    if right_nan == 0:
        left_only += left_nan
    if left_nan == 0:
        right_only += right_nan

    if how == 'inner':
        return matched
    if how == 'left':
        return matched + left_only
    if how == 'right':
        return matched + right_only
    return matched + left_only + right_only
At present with this method, the key can only be a label, not a list. Using a list for group_by currently returns a sum of merge sizes for each label in the list. This will result in a merge size far larger than the actual merge size.
If you are using a list of labels for the group_by, the final row size is:
min([merge_size(df1, df2, label, how) for label in group_by])
Check if this fits in memory
The merge_size function defined here returns the number of rows which will be created by merging two dataframes together.
By multiplying this with the count of columns from both dataframes, then multiplying by the size of np.float[32/64], you can get a rough idea of how large the resulting dataframe will be in memory. This can then be compared against psutil.virtual_memory().available to see if your system can calculate the full merge.
def mem_fit(df1, df2, key, how='inner'):
    """Return True when the estimated merge result fits in available memory.

    The estimate is rows * columns * 8 bytes: the row count comes from
    merge_size(), every cell is costed as a float64, and one column is
    subtracted from the total. It is compared against
    ``psutil.virtual_memory().available``.
    """
    n_rows = merge_size(df1, df2, key, how)
    # One column is subtracted; presumably the key column, which appears in
    # both frames but only once in the merged result.
    n_cols = len(df1.columns) + (len(df2.columns) - 1)
    bytes_per_cell = np.dtype(np.float64).itemsize
    estimated_bytes = (n_rows * n_cols) * bytes_per_cell
    return estimated_bytes <= psutil.virtual_memory().available
The merge_size method has been proposed as an extension of pandas in this issue. https://github.com/pandas-dev/pandas/issues/15068.

Converting Pandas DataFrame to Orange Table

I notice that this is an issue on GitHub already. Does anyone have any code that converts a Pandas DataFrame to an Orange Table?
Explicitly, I have the following table.
user hotel star_rating user home_continent gender
0 1 39 4.0 1 2 female
1 1 44 3.0 1 2 female
2 2 63 4.5 2 3 female
3 2 2 2.0 2 3 female
4 3 26 4.0 3 1 male
5 3 37 5.0 3 1 male
6 3 63 4.5 3 1 male
The documentation of the Orange package didn't cover all the details. Table.__init__(Domain, numpy.ndarray) works only for int and float according to lib_kernel.cpp.
They really should provide a C-level interface for pandas.DataFrames, or at least numpy.dtype("str") support.
Update: Adding table2df, df2table performance improved greatly by utilizing numpy for int and float.
Keep this piece of script in your orange python script collections, now you are equipped with pandas in your orange environment.
Usage: a_pandas_dataframe = table2df( a_orange_table ) , a_orange_table = df2table( a_pandas_dataframe )
Note: This script works only in Python 2.x; refer to @DustinTang's answer for a Python 3.x compatible script.
import pandas as pd
import numpy as np
import Orange
#### For those who are familiar with pandas
#### Correspondence:
#### value <-> Orange.data.Value
#### NaN <-> ["?", "~", "."] # Don't know, Don't care, Other
#### dtype <-> Orange.feature.Descriptor
#### category, int <-> Orange.feature.Discrete # category: > pandas 0.15
#### int, float <-> Orange.feature.Continuous # Continuous = core.FloatVariable
#### # refer to feature/__init__.py
#### str <-> Orange.feature.String
#### object <-> Orange.feature.Python
#### DataFrame.dtypes <-> Orange.data.Domain
#### DataFrame.DataFrame <-> Orange.data.Table = Orange.orange.ExampleTable
#### # You will need this if you are reading sources
def series2descriptor(d, discrete=False):
    """Choose an Orange feature descriptor for the pandas Series ``d``.

    float columns become Continuous, int columns become Continuous with
    ``number_of_decimals=0``.  Any other column becomes Discrete when
    ``discrete`` is set or when it has fewer distinct values than half its
    length, and String otherwise.
    """
    # dtypes are compared with ==: equal dtype objects are not guaranteed
    # to be the same object, so an identity test can miss a match.
    if d.dtype == np.dtype("float"):
        return Orange.feature.Continuous(str(d.name))
    if d.dtype == np.dtype("int"):
        return Orange.feature.Continuous(str(d.name), number_of_decimals=0)
    t = d.unique()
    if discrete or len(t) < len(d) / 2:
        return Orange.feature.Discrete(str(d.name), values=list(t.astype("str")))
    return Orange.feature.String(str(d.name))
def df2domain(df):
    """Build an Orange.data.Domain with one descriptor per DataFrame column."""
    # Columns are selected by position with iloc (DataFrame.icol has been
    # removed from pandas); positional access also copes with duplicate
    # column names.
    featurelist = [series2descriptor(df.iloc[:, col]) for col in xrange(len(df.columns))]
    return Orange.data.Domain(featurelist)
def df2table(df):
    """Convert a pandas DataFrame into an Orange table, column by column.

    Orange.data.Table.__init__(domain, numpy.ndarray) is the closest match
    for a DataFrame, but it only accepts int and float data: see
    src/orange/lib_kernel.cpp, line 3059, which rejects every variable whose
    varType is neither TValue::INTVAR nor TValue::FLOATVAR (the Orange
    documentation does not mention this).  A DataFrame may carry several
    dtypes, so each column is converted on its own by series2table() --
    numpy constructor for int/float columns, Python list constructor for
    the rest -- and the resulting list of one-column tables is passed to
    Orange.data.Table.
    """
    tdomain = df2domain(df)
    # iloc selects by position; DataFrame.icol has been removed from pandas.
    ttables = [series2table(df.iloc[:, i], tdomain[i]) for i in xrange(len(df.columns))]
    return Orange.data.Table(ttables)
# For performance concerns, here are my results
# dtndarray = np.random.rand(100000, 100)
# dtlist = list(dtndarray)
# tdomain = Orange.data.Domain([Orange.feature.Continuous("var" + str(i)) for i in xrange(100)])
# tinsts = [Orange.data.Instance(tdomain, list(dtlist[i]) )for i in xrange(len(dtlist))]
# t = Orange.data.Table(tdomain, tinsts)
# timeit list(dtndarray) # 45.6ms
# timeit [Orange.data.Instance(tdomain, list(dtlist[i])) for i in xrange(len(dtlist))] # 3.28s
# timeit Orange.data.Table(tdomain, tinsts) # 280ms
# timeit Orange.data.Table(tdomain, dtndarray) # 380ms
# As illustrated above, utilizing constructor with ndarray can greatly improve performance
# So one may conceive better converter based on these results
def series2table(series, variable):
    """Convert one pandas Series into a one-column Orange table.

    ``variable`` is the Orange descriptor for the column.  int and float
    columns go through the numpy constructor; every other column is
    converted value by value.
    """
    # dtypes are compared with ==, not identity.
    if series.dtype == np.dtype("int") or series.dtype == np.dtype("float"):
        # Use numpy: Table.__init__(Domain, numpy.ndarray).
        # np.newaxis turns the 1-D values into an (n, 1) column.
        return Orange.data.Table(Orange.data.Domain(variable), series.values[:, np.newaxis])
    # Build an instance list: Table.__init__(Domain, list_of_instances)
    tdomain = Orange.data.Domain(variable)
    tinsts = [Orange.data.Instance(tdomain, [i]) for i in series]
    return Orange.data.Table(tdomain, tinsts)
# 5x performance
def column2df(col):
    """Turn a one-column Orange table into a ``(name, pandas.Series)`` pair.

    Continuous columns are read through ``to_numpy()``; all other columns
    are read value by value and converted to ``str``.
    """
    name = col.domain[0].name
    if type(col.domain[0]) is Orange.feature.Continuous:
        numeric = col.to_numpy()[0].flatten()
        return (name, pd.Series(numeric))
    # Each element is indexed with [0] before being stringified.
    raw = pd.Series(np.array(list(col)).flatten())
    as_text = raw.apply(lambda x: str(x[0]))
    return (name, as_text)
def table2df(tab):
    """Convert an Orange table into a pandas DataFrame.

    Orange.data.Table().to_numpy() cannot handle strings, so the frame is
    built column by column through column2df(), which falls back to Python
    lists for non-continuous columns.
    """
    series = [column2df(tab.select(i)) for i in xrange(len(tab.domain))]
    series_name = [i[0] for i in series]  # To keep the order of variables unchanged
    series_data = dict(series)
    return pd.DataFrame(series_data, columns=series_name)
Answer below from a closed issue on github
from Orange.data.pandas_compat import table_from_frame
out_data = table_from_frame(df)
Where df is your dataFrame. So far I've only noticed a need to manually define a domain to handle dates if the data source wasn't 100% clean and to the required ISO standard.
I realize this is an old question and a lot changed from when it was first asked - but this question comes up top on google search results on the topic.
from Orange.data.pandas_compat import table_from_frame,table_to_frame
df= table_to_frame(in_data)
#here you go
out_data = table_from_frame(df)
based on answer of Creo
In order to convert pandas DataFrame to Orange Table you need to construct a domain, which specifies the column types.
For continuous variables, you only need to provide the name of the variable, but for Discrete variables, you also need to provide a list of all possible values.
The following code will construct a domain for your DataFrame and convert it to an Orange Table:
import numpy as np
from Orange.feature import Discrete, Continuous
from Orange.data import Domain, Table
# One descriptor per DataFrame column, in column order.  The final False
# tells Domain that the last variable is not a class variable.
domain = Domain([
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('hotel', values=[str(v) for v in np.unique(df.hotel)]),
    # NOTE(review): star_rating is the only continuous column in the sample
    # table and had no descriptor -- confirm against the original answer.
    Continuous('star_rating'),
    Discrete('user', values=[str(v) for v in np.unique(df.user)]),
    Discrete('home_continent', values=[str(v) for v in np.unique(df.home_continent)]),
    Discrete('gender', values=['male', 'female'])], False)
# df.values replaces DataFrame.as_matrix(), which has been removed from pandas.
table = Table(domain, [map(str, row) for row in df.values])
The map(str, row) step is needed so that Orange knows the data contains values of discrete features (and not the indices of values in the values list).
This code is revised from @TurtleIzzy's answer for Python 3.
import numpy as np
from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
def series2descriptor(d):
    """Choose an Orange variable for the pandas Series ``d``.

    float and int columns become a ContinuousVariable; every other column
    becomes a DiscreteVariable whose values are the column's distinct
    entries converted to ``str``.
    """
    # dtypes are compared with ==: equal dtype objects are not guaranteed
    # to be the same object, so an identity test can miss a match.
    if d.dtype == np.dtype("float") or d.dtype == np.dtype("int"):
        return ContinuousVariable(str(d.name))
    t = d.unique()
    return DiscreteVariable(str(d.name), list(t.astype("str")))
def df2domain(df):
    """Build an Orange Domain with one variable per DataFrame column."""
    n_columns = len(df.columns)
    # Positional selection, one column at a time, in column order.
    variables = []
    for position in range(n_columns):
        variables.append(series2descriptor(df.iloc[:, position]))
    return Domain(variables)
def df2table(df):
    """Convert a pandas DataFrame into an Orange Table.

    Every column is converted by series2table(); the per-column results are
    stacked into an array, reshaped to one row per column, transposed, and
    passed to Table together with the domain from df2domain().
    """
    tdomain = df2domain(df)
    n_columns = len(df.columns)
    per_column = [
        series2table(df.iloc[:, position], tdomain[position])
        for position in range(n_columns)
    ]
    stacked = np.array(per_column).reshape((n_columns, -1))
    # After the transpose there is one row per DataFrame row.
    ttables = stacked.transpose()
    return Table(tdomain, ttables)
def series2table(series, variable):
    """Convert one pandas Series into a one-column Orange Table.

    int and float columns are passed through as an (n, 1) array; any other
    column is replaced by its integer category codes first.

    ``variable`` is accepted but not used.
    """
    # dtypes are compared with ==, not identity.
    if series.dtype == np.dtype("int") or series.dtype == np.dtype("float"):
        values = series.values[:, np.newaxis]
        return Table(values)
    # cat.codes is a Series, and Series has no reshape(); reshape the
    # underlying numpy array instead.
    codes = series.astype('category').cat.codes.values.reshape((-1, 1))
    return Table(codes)
Something like this?
table = Orange.data.Table(df.as_matrix())
The columns in Orange will get generic names (a1, a2...). If you want to copy the names and the types from the data frame, construct Orange.data.Domain object (http://docs.orange.biolab.si/reference/rst/Orange.data.domain.html#Orange.data.Domain.init) from the data frame and pass it as the first argument above.
See the constructors in http://docs.orange.biolab.si/reference/rst/Orange.data.table.html.
table_from_frame, which is available in Python 3, doesn't allow the definition of a class column and therefore, the generated table cannot be used directly to train a classification model. I tweaked the table_from_frame function so it'll allow the definition of a class column. Notice that the class name should be given as an additional parameter.
"""Pandas DataFrame↔Table conversion helpers"""
import numpy as np
import pandas as pd
from pandas.api.types import (
is_categorical_dtype, is_object_dtype,
is_datetime64_any_dtype, is_numeric_dtype,
from Orange.data import (
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
__all__ = ['table_from_frame', 'table_to_frame']
def table_from_frame(df,class_name, *, force_nominal=False):
Convert pandas.DataFrame to Orange.data.Table
df : pandas.DataFrame
force_nominal : boolean
If True, interpret ALL string columns as nominal (DiscreteVariable).
def _is_discrete(s):
return (is_categorical_dtype(s) or
is_object_dtype(s) and (force_nominal or
s.nunique() < s.size**.666))
def _is_datetime(s):
if is_datetime64_any_dtype(s):
return True
if is_object_dtype(s):
pd.to_datetime(s, infer_datetime_format=True)
return True
except Exception: # pylint: disable=broad-except
return False
# If df index is not a simple RangeIndex (or similar), put it into data
if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
df = df.reset_index()
attrs, metas,calss_vars = [], [],[]
X, M = [], []
# Iter over columns
for name, s in df.items():
name = str(name)
if name == class_name:
discrete = s.astype('category').cat
calss_vars.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
X.append(discrete.codes.replace(-1, np.nan).values)
elif _is_discrete(s):
discrete = s.astype('category').cat
attrs.append(DiscreteVariable(name, discrete.categories.astype(str).tolist()))
X.append(discrete.codes.replace(-1, np.nan).values)
elif _is_datetime(s):
tvar = TimeVariable(name)
s = pd.to_datetime(s, infer_datetime_format=True)
X.append(s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
elif is_numeric_dtype(s):
return Table.from_numpy(Domain(attrs, calss_vars, metas),
np.column_stack(X) if X else np.empty((df.shape[0], 0)),
np.column_stack(M) if M else None)
This works well
from Orange.data.pandas_compat import table_from_frame,table_to_frame
import pandas as pd
# read the input data into pandas data-frame
df= table_to_frame(in_data)
# perform all data operations / wrangling
# for example only few columns are required in output
df = df[['Col1', 'Col2']]
# Final output
out_data = table_from_frame(df)

