Hello, I'm working on text classification.
I have a dataset with two columns: one with the text and the other with the label.
Since I'm a beginner, I'm following a Word2Vec tutorial step by step to see whether it can work for my use case, but I keep getting this error.
This is my code:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

def fit(self, X, y):
    return self

def transform(self, X):
    return np.array([
        np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                or [np.zeros(self.dim)], axis=0)
        for words in X
    ])
train_df['clean_text_tok']=[nltk.word_tokenize(i) for i in train_df['clean_text']]
model = Word2Vec(train_df['clean_text_tok'],min_count=1)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))
modelw = MeanEmbeddingVectorizer(w2v)
# converting text to numerical data using Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_test_tok)
The error I'm getting is:
Dimension: 100
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-127-289141692350> in <module>
4 modelw = MeanEmbeddingVectorizer(w2v)
5 # converting text to numerical data using Word2Vec
----> 6 X_train_vectors_w2v = modelw.transform(X_train_tok)
7 X_val_vectors_w2v = modelw.transform(X_test_tok)
AttributeError: 'MeanEmbeddingVectorizer' object has no attribute 'transform'
If your MeanEmbeddingVectorizer is defined in your code exactly as it's shown here, the failure to indent the .fit() and .transform() functions means they're not part of the class, as you likely intended them to be.
Indenting each of them an extra 4 spaces (as was likely the intent of whatever source you copied this code from!) puts them "inside" the MeanEmbeddingVectorizer class, as class methods. Then objects of that class won't give the same "no attribute" error.
For example:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
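For a quick check that the methods are now attached to the class, a tiny smoke test along these lines (the toy vocabulary is made up purely for illustration) should run without the AttributeError:

import numpy as np

# toy vocabulary mapping words to 3-dimensional vectors, purely for illustration
w2v_demo = {'cat': np.array([1.0, 0.0, 0.0]),
            'dog': np.array([0.0, 1.0, 0.0])}

modelw = MeanEmbeddingVectorizer(w2v_demo)
vectors = modelw.transform([['cat', 'dog'], ['cat'], ['something', 'unknown']])
print(vectors.shape)  # (3, 3); the last row is all zeros because no token was found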
Related
I am trying to create and store a gensim Word2Vec model using the fit function, then turn it into an sklearn pipeline and pickle it, to later use it with transform on new data.
I created the wrapper, but the self.w2v object seems not to have been fitted and does not recognize any word. It is as if self.w2v had never seen any word.
Any ideas about how to address this?
from sklearn.base import TransformerMixin, BaseEstimator
from gensim.models import Word2Vec

class SentenceVectorizer(TransformerMixin, BaseEstimator):
    def __init__(self, vector_size=50):
        self.vector_size = vector_size

    def sent_vectorizer(self, sentence, vectorizer):
        '''
        Applies the fitted W2V model for each token of each sentence and returns their vector representation.
        '''
        sent_vec = []
        numw = 0
        for word in sentence:
            try:
                if numw == 0:
                    sent_vec = vectorizer.wv[word]
                else:
                    sent_vec = np.add(sent_vec, vectorizer.wv[word])
                numw += 1
            except:  # if word not present
                if numw == 0:
                    sent_vec = np.zeros(self.vector_size)
                else:
                    sent_vec = np.add(sent_vec, np.zeros(self.vector_size))
        if numw > 0:
            return np.asarray(sent_vec) / numw
        else:
            return np.zeros(self.vector_size)

    def fit(self, X):
        self.w2v = Word2Vec(X, vector_size=self.vector_size)
        return self

    def transform(self, X):
        X_vec = []
        for sentence in X:
            X_vec.append(self.sent_vectorizer(sentence, self.w2v))
        return X_vec
This code currently does well in training but returns zeroed vectors on inference (because no word has been recognized).
Most likely problem: fit method is not properly storing self.w2v, although when transform is called it seems to exist.
Turns out I had an outdated gensim version which required vectorizer[word] instead of vectorizer.wv[word]. I'll leave the question here as it might be useful to someone.
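In case it helps anyone with a similar mix of versions, a small defensive lookup (just a sketch) that works whether the vectors live on model.wv or on the model object itself could look like this:

import numpy as np

def word_vector(model, word, size):
    # use .wv when the model exposes it (newer gensim); otherwise index the model directly
    kv = getattr(model, 'wv', model)
    try:
        return kv[word]
    except KeyError:
        return np.zeros(size)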
I tried to run a neural network to learn more about categorical embeddings (the explanation of the neural network code is here: https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/), but Spyder gives an AttributeError when I try to run the training loop at the end.
Traceback (most recent call last):
File "", line 1, in
File "C:\Workspace\Python_Runtime\Python\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "C:\Workspace\Python_Runtime\Python\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TabularDataset' on <module '__main__' (built-in)>
My understanding is that this comes from Spyder having issues with multiprocessing functionality.
I have tried, as some answers suggested, to wrap everything that is not in a class or def in
if __name__ == '__main__':
but that did not seem to help, the error still comes up.
I also tried importing the multiprocess package instead of multiprocessing, but that did not help either. I guess I would need to go and change the line in the spawn.py file, but I'm not sure how exactly.
The issue is that on my current PC I only have Spyder. I ran the same code on another dataset on my personal PC at home with PyCharm and it worked fine, with no errors at all.
Does anyone know how can I resolve the issue in Spyder?
The code for the neural network that I used is here:
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
    def __init__(self, data, cat_cols=None, output_col=None):
        """
        Characterizes a Dataset for PyTorch

        Parameters
        ----------
        data: pandas data frame
            The data frame object for the input data. It must
            contain all the continuous, categorical and the
            output columns to be used.
        cat_cols: List of strings
            The names of the categorical columns in the data.
            These columns will be passed through the embedding
            layers in the model. These columns must be
            label encoded beforehand.
        output_col: string
            The name of the output variable column in the data
            provided.
        """
        self.n = data.shape[0]

        if output_col:
            self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
        else:
            self.y = np.zeros((self.n, 1))

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = [col for col in data.columns
                          if col not in self.cat_cols + [output_col]]

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            self.cont_X = np.zeros((self.n, 1))

        if self.cat_cols:
            self.cat_X = data[cat_cols].astype(np.int64).values
        else:
            self.cat_X = np.zeros((self.n, 1))

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data.
        """
        return [self.y[idx], self.cont_X[idx], self.cat_X[idx]]
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForwardNN(nn.Module):
    def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
                 output_size, emb_dropout, lin_layer_dropouts):
        """
        Parameters
        ----------
        emb_dims: List of two element tuples
            This list will contain a two element tuple for each
            categorical feature. The first element of a tuple will
            denote the number of unique values of the categorical
            feature. The second element will denote the embedding
            dimension to be used for that feature.
        no_of_cont: Integer
            The number of continuous features in the data.
        lin_layer_sizes: List of integers.
            The size of each linear layer. The length will be equal
            to the total number of linear layers in the network.
        output_size: Integer
            The size of the final output.
        emb_dropout: Float
            The dropout to be used after the embedding layers.
        lin_layer_dropouts: List of floats
            The dropouts to be used after each linear layer.
        """
        super().__init__()

        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                         for x, y in emb_dims])

        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont

        # Linear Layers
        first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
                                    lin_layer_sizes[0])

        self.lin_layers = nn.ModuleList(
            [first_lin_layer] +
            [nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
             for i in range(len(lin_layer_sizes) - 1)])

        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1],
                                      output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

        # Batch Norm Layers
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
                                        for size in lin_layer_sizes])

        # Dropout Layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(size)
                                            for size in lin_layer_dropouts])

    def forward(self, cont_data, cat_data):
        if self.no_of_embs != 0:
            x = [emb_layer(cat_data[:, i])
                 for i, emb_layer in enumerate(self.emb_layers)]
            x = torch.cat(x, 1)
            x = self.emb_dropout_layer(x)

        if self.no_of_cont != 0:
            normalized_cont_data = self.first_bn_layer(cont_data)

            if self.no_of_embs != 0:
                x = torch.cat([x, normalized_cont_data], 1)
            else:
                x = normalized_cont_data

        for lin_layer, dropout_layer, bn_layer in \
                zip(self.lin_layers, self.droput_layers, self.bn_layers):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = self.output_layer(x)

        return x
categorical_features = ["cat1", "cat2", "cat3"]
output_feature = ["output"]
data = data[output_feature + categorical_features + ["cont1", "cont2"]].copy().dropna()

from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for cat_col in categorical_features:
    label_encoders[cat_col] = LabelEncoder()
    data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])

dataset = TabularDataset(data=data, cat_cols=categorical_features, output_col=output_feature)

batchsize = 256
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=1)

cat_dims = [int(data[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FeedForwardNN(emb_dims, no_of_cont=2, lin_layer_sizes=[50, 100],
                      output_size=1, emb_dropout=0.04,
                      lin_layer_dropouts=[0.001, 0.01]).to(device)

import tqdm
no_of_epochs = 5
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in tqdm.tqdm(range(no_of_epochs)):
    for y, cont_x, cat_x in dataloader:
        cat_x = cat_x.to(device)
        cont_x = cont_x.to(device)
        y = y.to(device)

        # Forward Pass
        preds = model(cont_x, cat_x)
        loss = criterion(preds, y)

        # Backward Pass and Optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
You could try running the code using the console's namespace instead of an empty one (to try to preserve the TabularDataset definition). For that you need to check the option Run in Console's namespace instead of an empty one in the preferences dialog: menu Tools > Preferences (or the 🔧 button to show the dialog), then Run > General settings > Run in Console's namespace instead of an empty one.
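If that setting alone doesn't help, two other things that commonly work around the spawn/pickling problem are moving TabularDataset into its own importable module, or disabling worker processes for the DataLoader. A rough sketch (the module name below is hypothetical, and this is untested in Spyder specifically):

# Option 1: define the dataset class in e.g. tabular_dataset.py (made-up name)
# and import it, so spawned worker processes can find it by module path
# rather than looking it up on __main__:
# from tabular_dataset import TabularDataset

# Option 2: keep data loading in the main process, so nothing has to be
# pickled and sent to child worker processes:
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=0)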
I have a complicated FeatureUnion consisting of several pipelines with custom transformers and standard transformers.
I am trying to pickle a fitted FeatureUnion for subsequent use, but I'm getting errors.
I fit and pickle my FeatureUnion as follows:
# Pickle fit pipeline
feature_union = feature_union.fit(X_train)
pickle.dump(feature_union, open("feature_union.p","wb"))
Elsewhere, I load the pickled FeatureUnion and try to transform new data like this:
# Open fit pipeline and transform new data
feature_union = pickle.load(open("feature_union.p","rb"))
X_validation_enc = feature_union.transform(X_validation)
I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-15-7b78df603a5a> in <module>
1 # Open fit pipeline
2
----> 3 feature_union = pickle.load(open("feature_union.p","rb"))
4
5 X_validation_enc = feature_union.transform(X_validation)
AttributeError: Can't get attribute 'column_selector' on <module '__main__'>
The pickle works when I have the entire FeatureUnion, pipeline and transformer code in the 'new' (destination) script. Does that mean the only thing I can pickle is the fitted FeatureUnion object? The error suggests I need all the code in the new script, and all I'm loading is a fitted FeatureUnion object, so the only 'savings' is that I don't need to fit the FeatureUnion on training data. Is this correct? Is there some way to pickle so that I can remove all the FeatureUnion/pipeline/transformer code from the new script?
My FeatureUnion consists of numerous custom and 'library-based' transformers and actions. In some instances I pass outside lists and variables into the class (transformer). All of these lists and variables are present in the new code.
At a loss.
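(For what it's worth, the kind of refactor I'm wondering about is defining the custom transformers in their own module and importing them in both the fitting script and the scoring script, so pickle can resolve them by module path rather than on __main__. A hypothetical sketch, with custom_transformers being a made-up module name:)

# custom_transformers.py (hypothetical module) would hold column_selector,
# numerical_imputer, continuous_transformer, DFStandardScaler, ...

# fitting script
import pickle
from custom_transformers import column_selector, DFStandardScaler

feature_union = feature_union.fit(X_train)
pickle.dump(feature_union, open("feature_union.p", "wb"))

# scoring script: the module just needs to be importable here; the class
# definitions themselves don't have to be copied into this script
import pickle
from custom_transformers import column_selector, DFStandardScaler

feature_union = pickle.load(open("feature_union.p", "rb"))
X_validation_enc = feature_union.transform(X_validation)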
If it helps, the structure of my FeatureUnion and pipelines, and some code for the transformers, is shown below.
Guidance appreciated.
The structure looks like this:
feature_union = FeatureUnion([
    ('cat_binary', pipeline_categorical_binary),
    ('cat_ordinal_string', pipeline_categorical_ordinal_string),
    ('cont', pipeline_continuous)
])
One of the pipelines has this structure:
pipeline_continuous = Pipeline(steps=[
    ('column_selector', column_selector(numeric_features)),
    ('numerical_impute', numerical_imputer(numerical_impute_approach)),
    ('continuous_transform', continuous_transformer(continuous_transform_dict, do_transform)),
    ('scaler', DFStandardScaler(perform_scaling))
])
Within the pipeline, I have custom and packaged transformers. For example, the 'continuous_transform' custom transformer referenced in the above pipeline log-transforms data and looks like this:
# 3 Transform continuous features
class continuous_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, type_transform, do_transform='No'):
        self.do_transform = do_transform
        self.type_transform = type_transform

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.do_transform == 'Yes':
            for key, value in self.type_transform.items():
                if value == 'log_transform':
                    X[key] = X[key].apply(lambda x: np.log(x + 1.0))
                    X.rename(columns={key: 'log_' + key}, inplace=True)
            X_continuous_transformed_df = X
            return X_continuous_transformed_df
        else:
            return X
And the 'scaler' transformer uses the StandardScaler module and looks like this:
# 3 Standardize continuous features
class DFStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self, perform_scaling):
        self.ss = None
        self.perform = perform_scaling

    def fit(self, X, y=None):
        self.ss = StandardScaler().fit(X)
        return self

    def transform(self, X):
        if self.perform == 'Yes':
            Xss = self.ss.transform(X)
            X_continuous_scaled_df = pd.DataFrame(Xss, index=X.index, columns=X.columns)
            return X_continuous_scaled_df
        else:
            X_continuous_scaled_df = X
            return X_continuous_scaled_df
The above hierarchy is well defined in my code.
I have 2 DataFrames: df1 contains examples of cats and df2 contains examples of dogs.
I have to do some preprocessing on these DataFrames, which at the moment I do by calling different functions. I would like to use scikit-learn pipelines instead.
One of these functions is a special encoder function that looks at a column in the DataFrame and returns a special value. I rewrote that function as a class, like the ones I saw being used with scikit-learn:
class Encoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.values = []
        super().__init__()

    def fit(self, X, y=None):
        return self

    def encode(self, row):
        result = []
        for base in row:
            result.append(bases[base])
        self.values.append(result)

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        X["seq_new"].apply(self.encode)
        return self.values
So now I would have 2 lists as a result:
encode = Encoder()
X1 = encode.transform(df1)
X2 = encode.transform(df2)
The next step would be:
features = np.concatenate((X1, X2), axis=0)
The next step is to build the labels:
Y_dog = [[1]] * len(X1)
Y_cat = [[0]] * len(X2)
labels = np.concatenate((Y_dog, Y_cat), axis=0)
Then there are some other manipulations, and finally I'll do a model_selection.train_test_split() to split the data into train and test.
How would I call all these functions in a scikit-learn pipeline? The examples that I found start from the point where the train/test split has already been done.
The thing about an sklearn.pipeline.Pipeline is that every step needs to implement fit and transform. So, for instance, if you know for a fact that you will ALWAYS need to perform the concatenation step, and you really are dying to put it into a Pipeline (which I wouldn't, but that's just my humble opinion), you need to create a Concatenator class with the appropriate fit and transform methods.
Something like this:
class Encoder(object):
    def fit(self, X, *args, **kwargs):
        return self

    def transform(self, X):
        return X * 2

class Concatenator(object):
    def fit(self, X, *args, **kwargs):
        return self

    def transform(self, Xs):
        return np.concatenate(Xs, axis=0)

class MultiEncoder(Encoder):
    def transform(self, Xs):
        return list(map(super().transform, Xs))

pipe = sklearn.pipeline.Pipeline((
    ("encoder", MultiEncoder()),
    ("concatenator", Concatenator())
))

pipe.fit_transform((
    pd.DataFrame([[1, 2], [3, 4]]),
    pd.DataFrame([[5, 6], [7, 8]])
))
I have a scikit-learn pipeline which includes a feature union, like so:
from sklearn.pipeline import Pipeline, FeatureUnion
pipeline = Pipeline([
    ('feats', FeatureUnion([
        #
        ('Vec', Doc2vec()),
        ('Counter', I_counter()),
    ])),
    ('clf', LogisticRegression())  # classifier
])
Each of the two steps in the feature union is a class I've written myself. The first of these is a self-written vectorizer based on the gensim Doc2Vec model. Full code here
If I understand the FeatureUnion documentation correctly, it runs each step in parallel and concatenates the output vectors into a single vector that is passed to the next step (the clf classifier in this case).
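(As a sanity check of my mental model, here is a toy example with two throwaway transformers, nothing to do with my real classes, showing the per-row concatenation I expect:)

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
import numpy as np

X = np.array([[1.0], [2.0]])
union = FeatureUnion([
    ('identity', FunctionTransformer(lambda a: a)),
    ('squared', FunctionTransformer(lambda a: a ** 2)),
])
# each row of the result is [x, x**2]: the two outputs are stacked side by side
print(union.fit_transform(X))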
I wrote each class to return a single numpy array; however, the above code triggers an error.
TypeError: All estimators should implement fit and transform. 'Pipeline(memory=None, steps=[('vec', Doc2vec())])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't
If I understand the error correctly (?), it's stating that the Doc2vec class is not outputting a suitable feature?
The Doc2vec class outputs a single numpy array containing a 100-dimensional vector for each input text sequence. I naively assumed it would simply be concatenated with the I_counter output and all would work happily.
Might someone be able to highlight where my logic is wrong?
--
EDIT, more code
class Doc2vec(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def vec(data):
        print('starting')
        SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

        alldocs = []
        for line_no, line in data.iterrows():
            #tokens = gensim.utils.to_unicode(line).split()
            words = gensim.utils.simple_preprocess(line['post'])
            tags = [line_no]  # 'tags = [tokens[0]]' would also work at extra memory cost
            split = ['train', 'test'][line_no//1200]  # 25k train, 25k test, 25k extra
            if gensim.utils.simple_preprocess(line['type']) == ['depression']:
                sentiment = (1.0)
            else:
                sentiment = (0.0)
            alldocs.append(SentimentDocument(words, tags, split, sentiment))

        train_docs = [doc for doc in alldocs if doc.split == 'train']
        test_docs = [doc for doc in alldocs if doc.split == 'test']
        #print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))

        from random import shuffle
        doc_list = alldocs[:]
        shuffle(doc_list)

        cores = multiprocessing.cpu_count()
        assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

        simple_models = [
            # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
            Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
                    epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05')
        ]

        for model in simple_models:
            model.build_vocab(train_docs)
            #print("%s vocabulary scanned & state initialized" % model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)

        model.train(train_docs, total_examples=len(train_docs), epochs=model.epochs)

        train_targets, train_regressors = zip(*[(doc.words, doc.sentiment) for doc in train_docs])

        import numpy as np
        X = []
        for i in range(len(train_targets)):
            X.append(model.infer_vector(train_targets[i]))
        train_x = np.asarray(X)
        print(type(train_x))
        return(train_x)
class I_counter(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        def i_count(name):
            tokens = nltk.word_tokenize(name)
            count = tokens.count("I")
            count2 = tokens.count("i")
            return(count + count2)

        vecfunc = np.vectorize(i_count)
        data = np.transpose(np.matrix(data['post']))
        result = vecfunc(data)
        return result