I have a scikit-learn pipeline which includes a FeatureUnion, like so:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('Vec', Doc2vec()),
        ('Counter', I_counter()),
    ])),
    ('clf', LogisticRegression())  # classifier
])
Each of the two steps in the feature union is a class I've written myself. The first of these is a self-written vectorizer based on the Gensim Doc2Vec model. Full code is in the edit below.
If I understand the FeatureUnion documentation correctly, it runs each step in parallel and concatenates the output vectors into a single feature matrix that is passed to the next step (the classifier in this case).
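For example, my understanding is that a feature union of two toy transformers, each returning a 2-D NumPy array, would simply stack their outputs column-wise. A minimal sketch of that understanding (the toy classes are placeholders, not my actual code):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

class ToyVec(BaseEstimator, TransformerMixin):
    # stands in for Doc2vec: one 3-dimensional vector per sample
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.zeros((len(X), 3))

class ToyCounter(BaseEstimator, TransformerMixin):
    # stands in for I_counter: one count per sample
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.ones((len(X), 1))

union = FeatureUnion([('vec', ToyVec()), ('counter', ToyCounter())])
print(union.fit_transform(['first text', 'second text']).shape)  # (2, 4): outputs concatenated horizontally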
I wrote each class to return a single NumPy array; however, the above code triggers the following error:
TypeError: All estimators should implement fit and transform. 'Pipeline(memory=None, steps=[('vec', Doc2vec())])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't
If I understand the error correctly, is it stating that the Doc2vec class is not outputting a suitable feature?
The Doc2vec class outputs a single NumPy array containing a 100-dimensional vector for each input text sequence. I naively assumed this would simply be concatenated with the I_counter output and everything would work happily.
Might someone be able to highlight where my logic is wrong?
--
EDIT, more code
class Doc2vec(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def vec(data):
        print('starting')
        SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
        alldocs = []
        for line_no, line in data.iterrows():
            #tokens = gensim.utils.to_unicode(line).split()
            words = gensim.utils.simple_preprocess(line['post'])
            tags = [line_no]  # 'tags = [tokens[0]]' would also work at extra memory cost
            split = ['train', 'test'][line_no // 1200]  # 25k train, 25k test, 25k extra
            if gensim.utils.simple_preprocess(line['type']) == ['depression']:
                sentiment = (1.0)
            else:
                sentiment = (0.0)
            alldocs.append(SentimentDocument(words, tags, split, sentiment))
        train_docs = [doc for doc in alldocs if doc.split == 'train']
        test_docs = [doc for doc in alldocs if doc.split == 'test']
        #print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))

        from random import shuffle
        doc_list = alldocs[:]
        shuffle(doc_list)

        cores = multiprocessing.cpu_count()
        assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

        simple_models = [
            # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
            Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
                    epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05')
        ]
        for model in simple_models:
            model.build_vocab(train_docs)
            #print("%s vocabulary scanned & state initialized" % model)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        model.train(train_docs, total_examples=len(train_docs), epochs=model.epochs)

        train_targets, train_regressors = zip(*[(doc.words, doc.sentiment) for doc in train_docs])
        import numpy as np
        X = []
        for i in range(len(train_targets)):
            X.append(model.infer_vector(train_targets[i]))
        train_x = np.asarray(X)
        print(type(train_x))
        return train_x
class I_counter(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        def i_count(name):
            tokens = nltk.word_tokenize(name)
            count = tokens.count("I")
            count2 = tokens.count("i")
            return count + count2
        vecfunc = np.vectorize(i_count)
        data = np.transpose(np.matrix(data['post']))
        result = vecfunc(data)
        return result
Related
I am trying to train a word embedding on a list of repeated sentences where only the subject changes. I expected that the generated vectors corresponding to the subjects would show a strong correlation after training, as is expected from a word embedding. However, the angle (really the cosine similarity computed below) between the vectors of the subjects is not always larger than that between a subject and a random word.
Man is going to write a very long novel that no one can read.
Woman is going to write a very long novel that no one can read.
Boy is going to write a very long novel that no one can read.
The code is based on the PyTorch tutorial:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class EmbedTrainer(nn.Module):
    def __init__(self, d_vocab, d_embed, d_context):
        super(EmbedTrainer, self).__init__()
        self.embed = nn.Embedding(d_vocab, d_embed)
        self.fc_1 = nn.Linear(d_embed * d_context, 128)
        self.fc_2 = nn.Linear(128, d_vocab)

    def forward(self, x):
        x = self.embed(x).view((1, -1))  # flatten after embedding
        x = self.fc_2(F.relu(self.fc_1(x)))
        x = F.log_softmax(x, dim=1)
        return x
text = " ".join(["{} is going to write a very long novel that no one can read.".format(x) for x in ["Man", "Woman", "Boy"]])
text_split = text.split()
trigrams = [([text_split[i], text_split[i+1]], text_split[i+2]) for i in range(len(text_split)-2)]
dic = list(set(text.split()))
tok_to_ids = {w:i for i, w in enumerate(dic)}
tokens_text = text.split(" ")
d_vocab, d_embed, d_context = len(dic), 10, 2
""" Train """
loss_func = nn.NLLLoss()
model = EmbedTrainer(d_vocab, d_embed, d_context)
print(model)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
losses = []
epochs = 10
for epoch in range(epochs):
total_loss = 0
for input, target in trigrams:
tok_ids = torch.tensor([tok_to_ids[tok] for tok in input], dtype=torch.long)
target_id = torch.tensor([tok_to_ids[target]], dtype=torch.long)
model.zero_grad()
log_prob = model(tok_ids)
#if total_loss == 0: print("train ", log_prob, target_id)
loss = loss_func(log_prob, target_id)
total_loss += loss.item()
loss.backward()
optimizer.step()
print(total_loss)
losses.append(total_loss)
embed_map = {}
for word in ["Man", "Woman", "Boy", "novel"]:
embed_map[word] = model.embed.weight[tok_to_ids[word]]
print(word, embed_map[word])
def angle(a, b):
from numpy.linalg import norm
a, b = a.detach().numpy(), b.detach().numpy()
return np.dot(a, b) / norm(a) / norm(b)
print("man.woman", angle(embed_map["Man"], embed_map["Woman"]))
print("man.novel", angle(embed_map["Man"], embed_map["novel"]))
I expected that the generated vectors corresponding to the subjects provide a strong correlation after training, as is expected from a word embedding
I don't really think you'll achieve that kind of result with only 3 sentences and around 40 iterations in 10 epochs (plus, most of the data in those 40 iterations is repeated).
Maybe try downloading a couple of the free datasets out there, or try your own data with a proven model like a Gensim model.
I'll give you the code for training a gensim model, so you can test your dataset on another model and see if the problem comes from your data or from your model.
I've tested similar Gensim models on datasets with millions of sentences and they worked like a charm; for smaller datasets you might want to change the parameters.
from gensim.models import Word2Vec
from multiprocessing import cpu_count

corpus_path = 'eachLineASentence.txt'
vecSize = 300
winSize = 5
numWorkers = cpu_count() - 1
epochs = 20
minCount = 5
skipGram = False
modelName = f'mymodel.model'

model = Word2Vec(corpus_file=corpus_path,
                 size=vecSize,
                 window=winSize,
                 min_count=minCount,
                 workers=numWorkers,
                 iter=epochs,
                 sg=skipGram)
model.save(modelName)
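Once the model is saved you can load it back and sanity-check the embeddings. A minimal usage sketch (the word choices are just placeholders; this assumes the pre-4.0 Gensim API used above and that the words occur at least minCount times in your corpus):

from gensim.models import Word2Vec

model = Word2Vec.load('mymodel.model')
print(model.wv.similarity('man', 'woman'))   # cosine similarity between two in-vocabulary words
print(model.wv.most_similar('man', topn=5))  # nearest neighbours in the embedding space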
P.S. I don't think it's a good idea to use the built-in name input as a variable in your code.
It's most probably the training size. Training a 128d embedding is definitely overkill. Rule of thumb from the Google Developers blog:
Why is the embedding vector size 3 in our example? Well, the following "formula" provides a general rule of thumb about the number of embedding dimensions:
embedding_dimensions = number_of_categories**0.25
That is, the embedding vector dimension should be the 4th root of the number of categories. Since our vocabulary size in this example is 81, the recommended number of dimensions is 3:
3 = 81**0.25
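Applied to the toy corpus in the question (roughly 16 unique tokens across the three sentences), the same rule of thumb suggests an embedding size of about 2. A quick check:

text = " ".join(["{} is going to write a very long novel that no one can read.".format(x)
                 for x in ["Man", "Woman", "Boy"]])
vocab_size = len(set(text.split()))           # 16 unique tokens with this simple split
print(vocab_size, round(vocab_size ** 0.25))  # 16 2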
I have a complicated FeatureUnion consisting of several pipelines with custom transformers and standard transformers.
I am trying to pickle a fitted FeatureUnion for subsequent use, but I'm getting errors.
I fit and pickle my FeatureUnion as follows:
# Pickle fit pipeline
feature_union = feature_union.fit(X_train)
pickle.dump(feature_union, open("feature_union.p","wb"))
Elsewhere, I load the pickled FeatureUnion and try to transform new data like this:
# Open fit pipeline and transform new data
feature_union = pickle.load(open("feature_union.p","rb"))
X_validation_enc = feature_union.transform(X_validation)
I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-15-7b78df603a5a> in <module>
1 # Open fit pipeline
2
----> 3 feature_union = pickle.load(open("feature_union.p","rb"))
4
5 X_validation_enc = feature_union.transform(X_validation)
AttributeError: Can't get attribute 'column_selector' on <module '__main__'>
The pickle works when I have the entire FeatureUnion, pipeline, and transformer code in the 'new' (destination) script. Does that mean the only thing I can pickle is the fitted FeatureUnion object? The error suggests I need all the code in the new script, and since all I'm loading is a fitted FeatureUnion object, the only 'saving' is that I don't need to fit the FeatureUnion on training data. Is this correct? Is there some way to pickle so that I can remove all the FeatureUnion/pipeline/transformer code from the new script?
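If I understand it correctly, pickle only stores a reference to each custom class by its module path, so the class definitions would have to be importable wherever the object is unpickled. Something like this hypothetical layout is what I mean (my_transformers.py is a made-up module name):

# my_transformers.py -- hypothetical shared module holding every custom transformer
#   class column_selector(BaseEstimator, TransformerMixin): ...
#   class continuous_transformer(BaseEstimator, TransformerMixin): ...
#   class DFStandardScaler(BaseEstimator, TransformerMixin): ...

# training script: build the FeatureUnion from the imported classes, fit it, pickle it
import pickle
from my_transformers import column_selector, continuous_transformer, DFStandardScaler
# ... build feature_union, fit it, then:
# pickle.dump(feature_union, open("feature_union.p", "wb"))

# destination script: my_transformers only needs to be importable here, because
# pickle looks the classes up by module path when loading the object
import pickle
import my_transformers  # noqa: F401
feature_union = pickle.load(open("feature_union.p", "rb"))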
My FeatureUnion consists of numerous custom and 'library-based' transformers and actions. In some instances I pass outside lists and variables into the class (transformer). All of these lists and variables are present in the new code.
At a loss.
If it helps, the structure of my featureunion, pipelines, and some code for the transformers is shown below.
Guidance appreciated.
The structure looks like this:
feature_union = FeatureUnion([
    ('cat_binary', pipeline_categorical_binary),
    ('cat_ordinal_string', pipeline_categorical_ordinal_string),
    ('cont', pipeline_continuous)
])
One of the pipelines has this structure:
pipeline_continuous = Pipeline(steps=[
    ('column_selector', column_selector(numeric_features)),
    ('numerical_impute', numerical_imputer(numerical_impute_approach)),
    ('continuous_transform', continuous_transformer(continuous_transform_dict, do_transform)),
    ('scaler', DFStandardScaler(perform_scaling))
])
Within the pipeline I have both custom and packaged transformers. For example, the 'continuous_transform' custom transformer referenced in the above pipeline log-transforms data and looks like this:
# 3 Transform continuous features
class continuous_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, type_transform, do_transform='No'):
        self.do_transform = do_transform
        self.type_transform = type_transform

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.do_transform == 'Yes':
            for key, value in self.type_transform.items():
                if value == 'log_transform':
                    X[key] = X[key].apply(lambda x: np.log(x + 1.0))
                    X.rename(columns={key: 'log_' + key}, inplace=True)
            X_continuous_transformed_df = X
            return X_continuous_transformed_df
        else:
            return X
And the 'scaler' transformer uses the StandardScaler module and looks like this:
# 3 Standardize continuous features
class DFStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, perform_scaling):
        self.ss = None
        self.perform = perform_scaling

    def fit(self, X, y=None):
        self.ss = StandardScaler().fit(X)
        return self

    def transform(self, X):
        if self.perform == 'Yes':
            Xss = self.ss.transform(X)
            X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
            return X_continuous_scaled_df
        else:
            X_continuous_scaled_df = X
            return X_continuous_scaled_df
The above hierarchy is well defined in my code.
I am trying to do some hyper-parameter tuning in my pipeline and have the following setup:
model = KerasClassifier(build_fn=create_model, epochs=5)

pipeline = Pipeline(steps=[('Tokenizepadder', TokenizePadding()),
                           ('NN', model)])
I have a variable 'maxlen' in both the Tokenizepadder and my neural network (for the neural network it is called max_length; I was afraid naming them the same would cause errors later in the code). When I try to perform a grid search, I am struggling to keep these values in sync. If I grid search over these values separately, they won't match and there will be a problem with the input data not matching the neural network.
In short I would like to do something like:
pipeline = Pipeline(steps=[('Tokenizepadder', TokenizePadding()),
                           ('NN', KerasClassifier(build_fn=create_model, epochs=5,
                                                  max_length=pipeline.get_params()['Tokenizepadder__maxlen']))])
So that when I am performing a grid search for the parameter 'Tokenizepadder__maxlen', it will change the value 'NN__max_length' to the same value.
Maybe you can change your classifier and tokenizer to pass the max_len parameter around. Then, only grid search over the tokenizer's max_len parameter.
Not the cleanest way, but might do.
from sklearn.base import BaseEstimator, TransformerMixin

class TokenizePadding(BaseEstimator, TransformerMixin):
    def __init__(self, max_len, ...):
        self.max_len = max_len
        ...

    def fit(self, X, y=None):
        ...
        return self

    def transform(self, X, y=None):
        data = ...  # your stuff
        return {"array": data, "max_len": self.max_len}

class KerasClassifier(...):
    ...
    def fit(self, data, y):
        self.max_len = data["max_len"]
        self.build_model()
        X = data["array"]
        ...  # your stuff
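With that in place, the grid search only needs to vary the tokenizer's parameter, and the classifier picks max_len up at fit time. A rough usage sketch (it assumes the two classes above are filled in; the GridSearchCV settings and training-data names are placeholders):

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('Tokenizepadder', TokenizePadding(max_len=100)),
                           ('NN', KerasClassifier())])

param_grid = {'Tokenizepadder__max_len': [50, 100, 200]}
grid = GridSearchCV(pipeline, param_grid, cv=3)
grid.fit(X_train, y_train)  # X_train / y_train stand in for your data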
Generally speaking: after successfully training a text RNN model with PyTorch, using torchtext to handle data loading from the original source, I would like to test it with other data sets (a sort of blink test) that come from different sources but have the same text format.
First I defined a class to handle the data loading.
class Dataset(object):
    def __init__(self, config):
        # init what I need

    def load_data(self, df: pd.DataFrame, *args):
        # implementation below
        # Data format like `(LABEL, TEXT)`

    def load_data_but_error(self, df: pd.DataFrame):
        # implementation below
        # Data format like `(TEXT)`
Here is the detail of load_data, with which I load the data that trained successfully:
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
LABEL = data.Field(sequential=False, use_vocab=False)
datafields = [(label_col, LABEL), (data_col, TEXT)]
# split my data to train/test
train_df, test_df = train_test_split(df, test_size=0.33, random_state=random_state)
train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
train_data = data.Dataset(train_examples, datafields)
# split train to train/val
train_data, val_data = train_data.split(split_ratio=0.8)
# build vocab
TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
self.word_embeddings = TEXT.vocab.vectors
self.vocab = TEXT.vocab
test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
test_data = data.Dataset(test_examples, datafields)
self.train_iterator = data.BucketIterator(
    (train_data),
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.title),
    repeat=False,
    shuffle=True)

self.val_iterator, self.test_iterator = data.BucketIterator.splits(
    (val_data, test_data),
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.title),
    repeat=False,
    shuffle=False)
Next is my code (load_data_but_error) for loading the other source, which causes the error:
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, fix_length=self.config.max_sen_len)
datafields = [('title', TEXT)]
examples = [data.Example.fromlist(i, datafields) for i in df.values.tolist()]
blink_test = data.Dataset(examples, datafields)
self.blink_test = data.BucketIterator(
    (blink_test),
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.title),
    repeat=False,
    shuffle=True)
When executing this code, I get the error AttributeError: 'Field' object has no attribute 'vocab'. There is an existing question about this error, but it doesn't match my situation: here I already have a vocab built in load_data and I want to use it for the blink tests.
My question is: what is the correct way to load and feed new data to a trained PyTorch model for testing?
What I needed was:
to keep TEXT from load_data and reuse it in load_data_but_error by assigning it to a class variable, and
to add train=True to the data.BucketIterator in the load_data_but_error function (see the sketch below).
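A minimal sketch of the change (only the relevant lines; the rest of load_data stays as above):

# in load_data: keep the fitted Field on the instance so its vocab can be reused
self.TEXT = TEXT

# in load_data_but_error: reuse the already-built Field instead of creating a new one
datafields = [('title', self.TEXT)]
examples = [data.Example.fromlist(i, datafields) for i in df.values.tolist()]
blink_test = data.Dataset(examples, datafields)
self.blink_test = data.BucketIterator(
    blink_test,
    batch_size=self.config.batch_size,
    sort_key=lambda x: len(x.title),
    repeat=False,
    train=True,
    shuffle=True)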
Not really sure, but considering you have re-defined TEXT, you will have to explicitly create the vocab for your Field TEXT again. This can be done as follows:
TEXT.build_vocab(examples, min_freq = 2)
This particular statement adds a word from your data to the vocab only if it occurs at least twice in your dataset examples; you can change this as per your requirements.
You can read about build_vocab method at https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Field.build_vocab.
I'm building a Keras model and I have a dataset in msgpack (.msg) format with over 10 million instances and 40 features, all of which are categorical. For the moment I'm using just a sample of it, since reading the whole dataset and encoding it doesn't fit into memory. Here is part of the code I'm using:
import pandas as pd
from category_encoders import BinaryEncoder as be
from sklearn.preprocessing import StandardScaler

def model():
    model = Sequential()
    model.add(Dense(120, input_dim=233, kernel_initializer='uniform', activation='selu'))
    model.add(Dense(12, kernel_initializer='uniform', activation='sigmoid'))
    model.compile(SGD(lr=0.008), loss='mean_squared_error', metrics=['accuracy'])
    return model

def addrDataLoading():
    data = pd.read_msgpack('datum.msg')
    data = data.dropna(subset=['s_address', 'd_address'])
    data = data.sample(300000)  # taking a sample of all the dataset to make the encoding possible
    y = data[['s_address', 'd_address']]
    x = data.drop(['s_address', 'd_address'], 1)
    encX = be().fit(x, y)
    numeric_X = encX.transform(x)
    encY = be().fit(y, y)
    numeric_Y = encY.transform(y)
    scaler = StandardScaler()
    X_all = scaler.fit_transform(numeric_X)
    x_train = X_all[0:250000, :]
    y_train = numeric_Y.iloc[0:250000, :]
    x_val = X_all[250000:, :]
    y_val = numeric_Y.iloc[250000:, :]
    return x_train, y_train, x_val, y_val

x_train, y_train, x_val, y_val = addrDataLoading()
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=20, batch_size=200)
So my question is: how can I use a custom data generator function to read and process all the data I have (not just a sample), and then use the fit_generator() function to train my model?
EDIT
This is a sample of the data:
[netData: screenshot of a sample of the data]
I think that taking different samples from the data results in different encoding dimensions.
For this sample there are 16 different categories: 4 addresses (3 bits), 4 hostnames (3 bits), 1 subnet mask (1 bit), 5 infrastructures (3 bits) and 1 access zone (1 bit), so the binary encoding gives us 11 bits and the new dimension of the data is 11, where previously it was 5. So let's say that for another sample the address column has 8 different categories; that gives 4 bits in binary, and if we keep the same number of categories in the other columns, the overall encoding results in 12 dimensions. I believe that is what's causing the problem.
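In other words, the encoded width of each column depends on how many distinct categories the sample happens to contain, roughly the number of bits needed to write that count in binary. A quick illustration of why the dimensions shift between samples (plain arithmetic, not the actual encoder):

def binary_width(n_categories):
    # bits needed to write n_categories in binary, e.g. 4 -> 3 ('100'), 8 -> 4 ('1000')
    return n_categories.bit_length()

print(binary_width(4))  # 3 bits for a sample with 4 distinct addresses
print(binary_width(8))  # 4 bits for a sample with 8 distinct addresses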
Slightly slow solution (repeating the same actions)
Edit: fit the BinaryEncoder before creating the generators.
Drop NAs first and work with the clean data from there on, to avoid reassigning the data frame.
data = pd.read_msgpack('datum.msg')
data.dropna(subset=['s_address','d_address']).to_msgpack('datum_clean.msg')
In this solution, data_generator can process the same data multiple times. If that's not critical, you can use this solution.
Define a function which reads the data and splits the index into train and test. It won't consume a lot of memory.
import pandas as pd
from category_encoders import BinaryEncoder as be
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

def model():
    #some code defining the model

def train_test_index_split():
    # if there's enough memory to add one more column
    data = pd.read_msgpack('datum_clean.msg')
    train_idx, test_idx = train_test_split(data.index)
    return data, train_idx, test_idx

data, train_idx, test_idx = train_test_index_split()
Define and initialize data generator, both for train and validation
def data_generator(data, encX, encY, batch_size, n_steps, index):
    # EDIT: As the data was cleaned, you don't need dropna
    # data = data.dropna(subset=['s_address','d_address'])
    for i in range(n_steps):
        batch_idx = np.random.choice(index, batch_size)
        sample = data.loc[batch_idx]
        y = sample[['s_address', 'd_address']]
        x = sample.drop(['s_address', 'd_address'], 1)
        numeric_X = encX.transform(x)
        numeric_Y = encY.transform(y)
        scaler = StandardScaler()
        X_all = scaler.fit_transform(numeric_X)
        yield X_all, numeric_Y
Edited part: now train the binary encoders. You should sub-sample your data to create a representative training set for the encoders. I guess the error with the shape of the data was caused by an incorrectly trained BinaryEncoder (Error when checking input: expected dense_9_input to have shape (233,) but got array with shape (234,)):
def get_minimal_unique_frame(df):
    return (pd.Series([df[column].unique() for column in df], index=df.columns)
            .apply(pd.Series)         # transform list of unique values to pd.Series
            .T                        # transpose frame: columns are columns again
            .fillna(method='ffill'))  # fill NaNs with last value
x = get_minimal_unique_frame(data.drop(['s_address', 'd_address'], 1))
y = get_minimal_unique_frame(data[['s_address', 'd_address']])
NB: I have never used category_encoders and have an incompatible system configuration, so I can't install and check it. So the code above may cause problems. In that case, I guess, you should compare the lengths of the x and y data frames, make them the same, and probably change the index of the data frames.
encX = be().fit(x, y)
encY = be().fit(y, y)
batch_size = 200
train_steps = 100000
val_steps = 5000
train_gen = data_generator(data, encX, encY, batch_size, train_steps, train_idx)
test_gen = data_generator(data, encX, encY, batch_size, val_steps, test_idx)
Edit: please provide an example of x_sample. Run train_gen, save the output, and post x_samples and y_samples:
x_samples = []
y_samples = []
for i in range(10):
    x_sample, y_sample = next(train_gen)
    x_samples.append(x_sample)
    y_samples.append(y_sample)
Note: the data generator won't stop itself, but it will be stopped after train_steps by the fit_generator method.
Fit model with generators:
model.fit_generator(generator=train_gen, steps_per_epoch=train_steps, epochs=1,
                    validation_data=test_gen, validation_steps=val_steps)
As far as I know, Python does not copy pandas data frames unless you do it explicitly with copy() or similar. Because of this, both generators use the same object. But if you use a Jupyter Notebook, data leaks/uncollected garbage may occur, and memory troubles come with them.
More efficient solution - sketch
Clean your data
data = pd.read_msgpack('datum.msg')
data.dropna(subset=['s_address','d_address']).to_msgpack('datum_clean.msg')
Create a train/test split, preprocess it, and store it as NumPy arrays, if you have enough disk space.
data, train_idx, test_idx = train_test_index_split()

def data_preprocessor(data, path, index):
    # data = data.dropna(subset=['s_address','d_address'])
    sample = data.loc[index]
    y = sample[['s_address', 'd_address']]
    x = sample.drop(['s_address', 'd_address'], 1)
    encX = be().fit(x, y)
    numeric_X = encX.transform(x)
    encY = be().fit(y, y)
    numeric_Y = encY.transform(y)
    scaler = StandardScaler()
    X_all = scaler.fit_transform(numeric_X)
    np.save(path + '_X', X_all)
    np.save(path + '_y', numeric_Y)

data_preprocessor(data, 'train', train_idx)
data_preprocessor(data, 'test', test_idx)
Delete unnecessary data:
del data
Load your files and use following generator:
train_X = np.load('train_X.npy')
train_y = np.load('train_y.npy')
test_X = np.load('test_X.npy')
test_y = np.load('test_y.npy')
def data_generator(X, y, batch_size, n_steps):
    idxs = np.arange(len(X))
    np.random.shuffle(idxs)
    ptr = 0
    for _ in range(n_steps):
        batch_idx = idxs[ptr:ptr+batch_size]
        x_sample = X[batch_idx]
        y_sample = y[batch_idx]
        ptr += batch_size
        if ptr > len(X):
            ptr = 0
        yield x_sample, y_sample
Prepare generators:
train_gen = data_generator(train_X, train_y, batch_size, train_steps)
test_gen = data_generator(test_X, test_y, batch_size, val_steps)
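Finally, fit the model with these generators. A minimal sketch mirroring the earlier fit_generator call (it reuses batch_size, train_steps and val_steps from above, and builds the network from the model() helper to avoid shadowing the name):

keras_model = model()  # build the compiled Keras network defined earlier
keras_model.fit_generator(generator=train_gen, steps_per_epoch=train_steps, epochs=1,
                          validation_data=test_gen, validation_steps=val_steps)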
Hope one of these solutions helps, at least if Python passes arrays and data frames by reference, not by value. See the Stack Overflow answer about it.