I created my own custom pipeline for text processing. Inside the .transform() method, I want to remove the corresponding target row when a document ends up with no tokens.
import spacy
from sklearn.base import BaseEstimator, TransformerMixin


class SpacyVectorizer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        alpha_only: bool = True,
        lemmatize: bool = True,
        remove_stopwords: bool = True,
        case_fold: bool = True,
    ):
        self.alpha_only = alpha_only
        self.lemmatize = lemmatize
        self.remove_stopwords = remove_stopwords
        self.case_fold = case_fold
        self.nlp = spacy.load(
            name='en_core_web_sm',
            disable=["parser", "ner"]
        )

    def fit(self, X, y=None):
        return self

    def transform(self, X, y):
        # Bag-of-Words matrix
        bow_matrix = []
        # Iterate over documents in SpaCy pipeline
        for i, doc in enumerate(self.nlp.pipe(X)):
            # Words array
            words = []
            # Tokenize document
            for token in doc:
                # Remove non-alphanumeric tokens
                if self.alpha_only and not token.is_alpha:
                    continue
                # Stopword removal
                if self.remove_stopwords and token.is_stop:
                    continue
                # Lemmatization
                if self.lemmatize:
                    token = token.lemma_
                # Case folding
                if self.case_fold:
                    token = str(token).casefold()
                # Append token to words array
                words.append(token)
            # Update the BoW representation
            if words:
                # Preprocessed document
                new_doc = ' '.join(words)
                # L2-normalized vector of preprocessed document
                word_vec = self.nlp(new_doc).vector
                # Update the BoW matrix
                bow_matrix.append(word_vec)
            else:
                # Remove target label
                y.drop(y.index[i], inplace=True)
        # Return BoW matrix
        return bow_matrix
Unfortunately, because I cannot pass the y vector to the .transform() method, it does not work.
How can I force the pipeline to pass both X and y parameters?
Is there any other workaround for doing this?
I don't want to pass y via .fit_transform(), because test data shouldn't be fitted.
def transform(self, X, y=None):
Here you have written y=None, which means that if no y value is passed, y simply takes the default value None.
To force the caller to pass a y value, you should write
def transform(self, X, y):
    pass
If you do this, a y value has to be passed; otherwise a TypeError is raised.
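As a minimal sketch of that difference (using a hypothetical DemoTransformer, not the asker's class):
from sklearn.base import BaseEstimator, TransformerMixin

class DemoTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # y silently falls back to None when the caller passes only X
        print("y is", y)
        return X

class StrictTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y):
        # y is now a required argument
        return X

DemoTransformer().transform([[1], [2]])    # prints "y is None"
StrictTransformer().transform([[1], [2]])  # TypeError: transform() missing 1 required positional argument: 'y'
Note, however, that inside a scikit-learn Pipeline, transform is still called with X only, which is why the asker runs into the original problem in the first place.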
The error you are getting might also be caused by an indentation ("space") problem: if transform ends up defined outside the class, or is called on the class instead of an instance, then self receives the X value and the X parameter receives y.
I am working on an NLP project using Seq2Seq. I created a DataFrame from my dataset and then built a batch iterator using DataLoader; see the following code:
# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
data = pd.DataFrame(original_word_pairs, columns=["src", "trg"])

# convert the data to tensors and pass them to the DataLoader
# to create a batch iterator
class MyData(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        # TODO: convert this into torch code if possible
        self.length = [np.sum(1 - np.equal(x, 0)) for x in X]

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x, y, x_len

    def __len__(self):
        return len(self.data)

train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                           drop_last=True,
                           shuffle=True)
test_dataset = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                          drop_last=True,
                          shuffle=True)
That is part of my code; the thing is, I want to use the iterators like this:
for i, batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg
But I got an error "AttributeError: 'list' object has no attribute 'src'"
How can I use the iterator and access a specific column?
You can redefine __getitem__ in your Dataset to return a dictionary:
def __getitem__(self, index):
    x = self.data[index]
    y = self.target[index]
    x_len = self.length[index]
    return {"src": x, "trg": y, "x_len": x_len}
The default collate_fn of DataLoader will then provide a dictionary of batched values instead of single observations, but you need to convert x_len to a tensor inside __getitem__ for this to work (or pass a custom collate_fn).
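For illustration, a hedged usage sketch, assuming __getitem__ returns the dictionary above and that input_tensor_train / target_tensor_train are the tensors from the question (train_loader is just a renamed variable to avoid reusing train_dataset):
from torch.utils.data import DataLoader

BATCH_SIZE = 64  # assumed value

train_dataset = MyData(input_tensor_train, target_tensor_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True, shuffle=True)

for i, batch in enumerate(train_loader):
    src = batch["src"]      # batch of source sequences
    trg = batch["trg"]      # batch of target sequences
    x_len = batch["x_len"]  # per the note above, convert x_len to a tensor in __getitem__ (or use a custom collate_fn)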
The following training curve is generated using the same Tensorflow + Keras script written in Python:
RED line uses five features.
GREEN line uses seven features.
BLUE line uses nine features.
Can anyone tell me the probable cause of the oscillation of the GREEN line so that I can troubleshoot my script?
Source code:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use both gpus for training.

import sys, random
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from lxml import etree, objectify

# <editor-fold desc="GPU">
# resolve GPU related issues.
try:
    physical_devices = tf.config.list_physical_devices('GPU')
    for gpu_instance in physical_devices:
        tf.config.experimental.set_memory_growth(gpu_instance, True)
except Exception as e:
    pass
# END of try
# </editor-fold>

# <editor-fold desc="Lxml helper">
class LxmlHelper:
    @classmethod
    def objectify_xml(cls, input_path_dir):
        file_dom = etree.parse(input_path_dir)  # parse xml and convert it into DOM
        file_xml_bin = etree.tostring(file_dom, pretty_print=False, encoding="ascii")  # encode DOM into ASCII object
        file_xml_text = file_xml_bin.decode()  # convert binary ASCII object into ASCII text
        objectified_xml = objectify.fromstring(file_xml_text)  # convert text into a Doxygen object
        return objectified_xml
# </editor-fold>

# <editor-fold desc="def encode(letter)">
def encode(letter: str):
    if letter == 'H':
        return [1.0, 0.0, 0.0]
    elif letter == 'E':
        return [0.0, 1.0, 0.0]
    elif letter == 'C':
        return [0.0, 0.0, 1.0]
    elif letter == '-':
        return [0.0, 0.0, 0.0]
# END of function

def encode_string_1(pattern_str: str):
    # Iterate over the string
    one_hot_binary_str = []
    for ch in pattern_str:
        try:
            one_hot_binary_str = one_hot_binary_str + encode(ch)
        except Exception as e:
            print(pattern_str, one_hot_binary_str, ch)
    # END of for loop
    return one_hot_binary_str
# END of function

def encode_string_2(pattern_str: str):
    # Iterate over the string
    one_hot_binary_str = []
    for ch in pattern_str:
        temp_encoded_vect = [encode(ch)]
        one_hot_binary_str = one_hot_binary_str + temp_encoded_vect
    # END of for loop
    return one_hot_binary_str
# END of function
# </editor-fold>
# <editor-fold desc="def load_data()">
def load_data_k(fname: str, class_index: int, feature_start_index: int, **selection):
    """Loads data for training and validation

    :param fname: (``string``) - name of the file with the data
    :param selection: (``kwargs``) - see below
    :return: four tensorflow tensors: training input, training output, validation input and validation output

    :Keyword Arguments:
        * *top_n_lines* (``number``) --
          take top N lines of the input and disregard the rest
        * *random_n_lines* (``number``) --
          take random N lines of the input and disregard the rest
        * *validation_part* (``float``) --
          separate N_lines * given_fraction of the input lines from the training set and use
          them for validation. When the given_fraction = 1.0, then the same input set of
          N_lines is used both for training and validation (this is the default)
    """
    i = 0
    file = open(fname)
    if "top_n_lines" in selection:
        lines = [next(file) for _ in range(int(selection["top_n_lines"]))]
    elif "random_n_lines" in selection:
        tmp_lines = file.readlines()
        lines = random.sample(tmp_lines, int(selection["random_n_lines"]))
    else:
        lines = file.readlines()

    data_x, data_y, data_z = [], [], []
    for l in lines:
        row = l.strip().split()  # return a list of words from the line.
        x = [float(ix) for ix in row[feature_start_index:]]  # convert 3rd to 20th word into a vector of float numbers.
        y = encode(row[class_index])  # convert the 3rd word into binary.
        z = encode_string_1(row[class_index + 1])
        data_x.append(x)  # append the vector into 'data_x'
        data_y.append(y)  # append the vector into 'data_y'
        data_z.append(z)  # append the vector into 'data_z'
    # END for l in lines

    num_rows = len(data_x)
    given_fraction = selection.get("validation_part", 1.0)
    if given_fraction > 0.9999:
        valid_x, valid_y, valid_z = data_x, data_y, data_z
    else:
        n = int(num_rows * given_fraction)
        data_x, data_y, data_z = data_x[n:], data_y[n:], data_z[n:]
        valid_x, valid_y, valid_z = data_x[:n], data_y[:n], data_z[:n]
    # END of if-else block

    tx = tf.convert_to_tensor(data_x, np.float32)
    ty = tf.convert_to_tensor(data_y, np.float32)
    tz = tf.convert_to_tensor(data_z, np.float32)
    vx = tf.convert_to_tensor(valid_x, np.float32)
    vy = tf.convert_to_tensor(valid_y, np.float32)
    vz = tf.convert_to_tensor(valid_z, np.float32)

    return tx, ty, tz, vx, vy, vz
# END of the function
# </editor-fold>
# <editor-fold desc="def create_model()">
def create_model(n_hidden_1, n_hidden_2, num_classes, num_features):
    # create the model
    model = Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(num_features,)))
    model.add(tf.keras.layers.Dense(n_hidden_1, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(n_hidden_2, activation='sigmoid'))
    ### model.add(tf.keras.layers.Dense(n_hidden_3, activation='sigmoid'))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    # instantiate the optimizer
    opt = keras.optimizers.SGD(learning_rate=LEARNING_RATE)

    # compile the model
    model.compile(
        optimizer=opt,
        loss="categorical_crossentropy",
        metrics="categorical_accuracy"
    )

    # return model
    return model
# </editor-fold>
if __name__ == "__main__":
    # <editor-fold desc="(input/output parameters)">
    my_project_routine = LxmlHelper.objectify_xml("my_project_evaluate.xml")

    # input data
    INPUT_DATA_FILE = str(my_project_routine.input.input_data_file)
    INPUT_PATH = str(my_project_routine.input.input_path)
    CLASS_INDEX = int(my_project_routine.input.class_index)
    FEATURE_INDEX = int(my_project_routine.input.feature_index)

    # output data
    OUTPUT_PATH = str(my_project_routine.output.output_path)
    MODEL_FILE = str(my_project_routine.output.model_file)
    TRAINING_PROGRESS_FILE = str(my_project_routine.output.training_progress_file)

    # Learning parameters
    LEARNING_RATE = float(my_project_routine.training_params.learning_rate)
    EPOCH_SIZE = int(my_project_routine.training_params.epoch_size)
    BATCH_SIZE = int(my_project_routine.training_params.batch_size)
    INPUT_LINES_COUNT = int(my_project_routine.input.input_lines_count)
    VALIDATION_PART = float(my_project_routine.training_params.validation_part)
    SAVE_PERIOD = str(my_project_routine.output.save_period)

    # NN parameters
    HIDDEN_LAYER_1_NEURON_COUNT = int(my_project_routine.hidden_layers.one)
    HIDDEN_LAYER_2_NEURON_COUNT = int(my_project_routine.hidden_layers.two)
    ### HIDDEN_LAYER_3_NEURON_COUNT = int(my_project_routine.hidden_layers.three)
    CLASS_COUNT = int(my_project_routine.class_count)
    FEATURES_COUNT = int(my_project_routine.features_count)

    input_file_path_str = os.path.join(INPUT_PATH, INPUT_DATA_FILE)
    training_progress_file_path_str = os.path.join(OUTPUT_PATH, TRAINING_PROGRESS_FILE)
    model_file_path = os.path.join(OUTPUT_PATH, MODEL_FILE)

    # command-line arg processing
    input_file_name_str = None
    if len(sys.argv) > 1:
        input_file_name_str = sys.argv[1]
    else:
        input_file_name_str = input_file_path_str
    # END of if-else
    # </editor-fold>
    # <editor-fold desc="(load data from file)">
    # load training data from the disk
    train_x, train_y, _, validate_x, validate_y, _ = \
        load_data_k(
            fname=input_file_name_str,
            class_index=CLASS_INDEX,
            feature_start_index=FEATURE_INDEX,
            random_n_lines=INPUT_LINES_COUNT,
            validation_part=VALIDATION_PART
        )

    print("training data size : ", len(train_x))
    print("validation data size : ", len(validate_x))
    # </editor-fold>

    ### STEPS_PER_EPOCH = len(train_x) // BATCH_SIZE
    ### VALIDATION_STEPS = len(validate_x) // BATCH_SIZE

    # <editor-fold desc="(model creation)">
    # load previously saved NN model
    model = None
    try:
        model = keras.models.load_model(model_file_path)
        print("Loading NN model from file.")
        model.summary()
    except Exception as ex:
        print("No NN model found for loading.")
    # END of try-except
    # </editor-fold>

    # <editor-fold desc="(model run)">
    # if there is no model loaded, create a new model
    if model is None:
        csv_logger = keras.callbacks.CSVLogger(training_progress_file_path_str)
        checkpoint = ModelCheckpoint(
            model_file_path,
            monitor='loss',
            verbose=1,
            save_best_only=True,
            mode='auto',
            save_freq='epoch'
        )
        callbacks_vector = [
            csv_logger,
            checkpoint
        ]

        # Set mirror strategy
        # strategy = tf.distribute.MirroredStrategy(devices=["/device:GPU:0", "/device:GPU:1"])
        # with strategy.scope():
        print("New NN model created.")

        # create sequential NN model
        model = create_model(
            n_hidden_1=HIDDEN_LAYER_1_NEURON_COUNT,
            n_hidden_2=HIDDEN_LAYER_2_NEURON_COUNT,
            ## n_hidden_3=HIDDEN_LAYER_3_NEURON_COUNT,
            num_classes=CLASS_COUNT,
            num_features=FEATURES_COUNT
        )

        # Train the model with the new callback
        history = model.fit(
            train_x, train_y,
            validation_data=(validate_x, validate_y),
            batch_size=BATCH_SIZE,
            epochs=EPOCH_SIZE,
            callbacks=[callbacks_vector],
            shuffle=True,
            verbose=2
        )

        print(history.history.keys())
        # END of ... with
    # END of ... if
    # </editor-fold>
Plotting Script
import os
from argparse import ArgumentParser
import random
from typing import List
import matplotlib.pyplot as plt
import numpy as np
import math
import sys
import datetime

class Quad:
    def __init__(self, x_vector, y_vector, color_char, label_str):
        self.__x_vector = x_vector
        self.__y_vector = y_vector
        self.__color_char = color_char
        self.__label_str = label_str

    def get_x_vector(self):
        return self.__x_vector

    def get_y_vector(self):
        return self.__y_vector

    def get_color_char(self):
        return self.__color_char

    def get_label_str(self):
        return self.__label_str
class HecaPlotClass:
    def __init__(self):
        self.__x_label_str: str = None
        self.__y_label_str: str = None
        self.__title_str: str = None
        self.__trio_vector: List[Quad] = []
        self.__plotter = plt

    @property
    def x_label_str(self):
        return self.__x_label_str

    @x_label_str.setter
    def x_label_str(self, t):
        self.__x_label_str = t

    @property
    def y_label_str(self):
        return self.__y_label_str

    @y_label_str.setter
    def y_label_str(self, t):
        self.__y_label_str = t

    @property
    def title_str(self):
        return self.__title_str

    @title_str.setter
    def title_str(self, t):
        self.__title_str = t

    def add_y_axes(self, trio_obj: Quad):
        self.__trio_vector.append(trio_obj)

    def generate_plot(self):
        for obj in self.__trio_vector:
            x_vector = obj.get_x_vector()
            y_vector = obj.get_y_vector()
            label_str = obj.get_label_str()
            # print(label_str)
            # print(len(x_vector))
            # print(len(y_vector))
            self.__plotter.plot(
                x_vector,
                y_vector,
                color=obj.get_color_char(),
                label=label_str
            )
        # END of ... for loop

        # Naming the x-axis, y_1_vector-axis and the whole graph
        self.__plotter.xlabel(self.__x_label_str)
        self.__plotter.ylabel(self.__y_label_str)
        self.__plotter.title(self.__title_str)
        # Adding legend, which helps us recognize the curve according to its color
        self.__plotter.legend()
        # To load the display window
        # self.__plotter.show()

    def save_png(self, output_directory_str):
        output_file_str = os.path.join(output_directory_str, self.__title_str + '.png')
        self.__plotter.savefig(output_file_str)

    def save_pdf(self, output_directory_str):
        output_file_str = os.path.join(output_directory_str, self.__title_str + '.pdf')
        self.__plotter.savefig(output_file_str)
class MainClass(object):
    __colors_vector = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow', 'orange', 'lightgreen', 'crimson']
    __working_dir = r"."
    __file_names_vector = ["training_progress-32.txt", "training_progress-64.txt", "training_progress-128.txt"]
    __input_files_vector = []
    __output_directory = None
    __column_no_int = 0
    __split_percentage_at_tail_int = 100
    __is_pdf_output = False
    __is_png_output = False

    # <editor-fold desc="def load_data()">
    @classmethod
    def __load_data(cls, fname: str, percentage_int: int, column_no_int: int):
        np_array = np.loadtxt(
            fname,
            # usecols=range(1, 11),
            dtype=np.float32,
            skiprows=1,
            delimiter=","
        )
        size_vector = np_array.shape
        array_len_int = size_vector[0]
        rows_count_int = int(percentage_int * array_len_int / 100)
        np_array = np_array[-rows_count_int:]
        x = np_array[:, 0]
        y = np_array[:, column_no_int]
        return x, y
    # END of the function
    # </editor-fold>

    # <editor-fold desc="(__parse_args())">
    @classmethod
    def __parse_args(cls):
        # initialize argument parser
        my_parser = ArgumentParser()
        my_parser.add_argument("-c", help="column no.", type=int)
        my_parser.add_argument('-i', nargs='+', help='a list of input files', required=True)
        my_parser.add_argument("-o", help="output directory", type=str)
        my_parser.add_argument("-n", help="percentage of data to split from tail", type=float)
        my_parser.add_argument("--pdf", help="PDF output", action='store_true')
        my_parser.add_argument("--png", help="PNG output", action='store_true')

        # parse the argument
        args = my_parser.parse_args()
        cls.__input_files_vector = args.i
        cls.__output_directory = args.o
        cls.__split_percentage_at_tail_int = args.n
        cls.__column_no_int = args.c
        cls.__is_pdf_output = args.pdf
        cls.__is_png_output = args.png
    # </editor-fold>
    @classmethod
    def main(cls):
        cls.__parse_args()
        if cls.__input_files_vector is None:
            cls.__input_files_vector = cls.__file_names_vector
        if cls.__output_directory is None:
            cls.__output_directory = cls.__working_dir
        if cls.__split_percentage_at_tail_int is None:
            cls.__split_percentage_at_tail_int = 100
        if cls.__column_no_int is None:
            cls.__column_no_int = 1

        my_project_plot_obj = HecaPlotClass()
        i = 0
        for file_path_str in cls.__input_files_vector:
            print(file_path_str)
            x_vector, y_vector = cls.__load_data(os.path.join(cls.__working_dir, file_path_str), cls.__split_percentage_at_tail_int, cls.__column_no_int)
            my_project_plot_obj.x_label_str = "Epoch"
            my_project_plot_obj.y_label_str = "Accuracy"
            my_project_plot_obj.title_str = "training_plot-{date:%Y-%m-%d_%H:%M:%S}".format(date=datetime.datetime.now())
            my_project_plot_obj.x_axis_vector = x_vector
            if i == 0:
                random_int = 0
            else:
                random_int = i % (len(cls.__colors_vector) - 1)
            # END of ... if
            print("random_int : ", random_int)
            my_project_plot_obj.add_y_axes(Quad(x_vector, y_vector, cls.__colors_vector[random_int], file_path_str))
            i = i + 1
        # END of ... for loop

        my_project_plot_obj.generate_plot()
        my_project_plot_obj.save_png(cls.__output_directory)
        my_project_plot_obj.save_pdf(cls.__output_directory)


if __name__ == "__main__":
    MainClass.main()
The primary cause could be an improper (non-random, i.e. ordered) distribution of the data.
If you look at the accuracy beyond epoch 180, there is an orderly switching between roughly 0.43 and 0.33, and occasionally roughly 0.23. More importantly, the accuracy is not improving (there is no gain in validation accuracy) as the number of epochs increases.
The accuracy can increase in such cases if you (1) reduce the batch size, or (2) use a better optimizer such as Adam; also check the learning rate.
These changes can help reduce the shift and the oscillation as well.
Additionally, a running average of the accuracy can be plotted to smooth out the oscillation (see the sketch after this answer). This is again a mitigation rather than a correction: it blurs the effect of the ordering (the partitioning of the data) by mixing nearby values.
Lastly, I would also reshuffle the data and add normalization after each layer, and see if that helps.
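A rough sketch of the running-average idea, assuming the accuracy sits in column 1 of the CSVLogger output file (the window size of 10 is an arbitrary choice):
import numpy as np
import matplotlib.pyplot as plt

def running_average(values, window=10):
    # simple moving average over the per-epoch accuracy values
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode="valid")

log = np.loadtxt("training_progress-64.txt", skiprows=1, delimiter=",")  # CSVLogger output
acc = log[:, 1]  # assumed: column 1 holds the accuracy

plt.plot(acc, color="lightgreen", label="raw accuracy")
plt.plot(running_average(acc), color="green", label="running average (10 epochs)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Swapping the optimizer in create_model() would be a one-line change, e.g.:
# opt = keras.optimizers.Adam(learning_rate=LEARNING_RATE)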
Generally, sharp jumps and flat lines in the accuracy usually mean that a group of examples is classified as a given class at the same time. If your dataset contains, say, 50 examples with the same combination of 7 features, then they will all move into the same class at the same time. This is what probably causes the sharp jumps: identical or similar examples clustered together.
So, for example, if you have 50 men aged 64 and the decision boundary that classifies them as more prone to an illness shifts from >65 to >63, then the accuracy changes rapidly because all of them change classification at the same time.
Regarding the oscillation of the curve: because of the effect above, the oscillation is amplified by small changes during learning. Your network learns by minimizing cross-entropy, i.e. the difference between the targets and your predictions. This means it operates on the difference between a probability and the target (say, 0.3 vs. class 0) rather than between a hard class and the target, as accuracy does (0 vs. 0 for the same example). Cross-entropy is therefore much smoother, as it is not affected by the issue outlined above; the small numeric sketch below illustrates this.
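A small numeric sketch of that last point (hypothetical probabilities, two-class case):
import numpy as np

def cross_entropy(p_true_class):
    # cross-entropy contribution of one example, given the probability assigned to its true class
    return -np.log(p_true_class)

# True class is 1; the predicted probability for class 1 nudges from 0.49 to 0.51.
for p in (0.49, 0.51):
    predicted_class = int(p > 0.5)
    print(f"p={p:.2f}  predicted class={predicted_class}  cross-entropy={cross_entropy(p):.3f}")

# Accuracy on this example jumps from 0 to 1, while the cross-entropy only moves from
# about 0.713 to 0.673. With many near-identical examples flipping together, those
# accuracy jumps show up as the sharp oscillation in the curve.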
I have a Pipeline built as follows:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('text',
                                                  Pipeline(steps=[('CV',
                                                                   CountVectorizer())]),
                                                  'Tweet'),
                                                 ('category',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Tweet_ID']),
                                                 ('numeric',
                                                  Pipeline(steps=[('knnImputer',
                                                                   KNNImputer(n_neighbors=2)),
                                                                  ('scaler',
                                                                   MinMaxScale...
                                                  'CS',
                                                  'UC',
                                                  'CL',
                                                  'S',
                                                  'SS',
                                                  'UW',
                                                  ...])])),
                ('classifier', LogisticRegression())])
I am trying to get feature names:
feature_names = lr['preprocessor'].transformers_[0][1].get_feature_names()
coefs = lr.named_steps["classifier"].coef_.flatten()
zipped = zip(feature_names, coefs)
features_df = pd.DataFrame(zipped, columns=["feature", "value"])
features_df["ABS"] = features_df["value"].apply(lambda x: abs(x))
features_df["colors"] = features_df["value"].apply(lambda x: "green" if x > 0 else "red")
features_df = features_df.sort_values("ABS", ascending=False)
features_df
However I am getting an error:
----> 6 feature_names = lr['preprocessor'].transformers_[0][1].get_feature_names()
7 coefs = lr.named_steps["classifier"].coef_.flatten()
8
AttributeError: 'Pipeline' object has no attribute 'get_feature_names
I already went through the following answers:
'OneHotEncoder' object has no attribute 'get_feature_names'
'Pipeline' object has no attribute 'get_feature_names' in scikit-learn
but unfortunately they were not as helpful as I had expected.
Does anyone know how to fix it?
Happy to provide more info, if needed.
An example of pipeline is the following:
lr = Pipeline(steps=[('preprocessor', preprocessing),
                     ('classifier', LogisticRegression(C=5, tol=0.01, solver='lbfgs', max_iter=10000))])
where preprocessing is
preprocessing = ColumnTransformer(
    transformers=[
        ('text', text_preprocessing, 'Tweet'),
        ('category', categorical_preprocessing, c_feat),
        ('numeric', numeric_preprocessing, n_feat)
    ], remainder='passthrough')
Before splitting into train and test sets, I separate the different types of features:
text_columns=['Tweet']
target=['Label']
c_feat=['Tweet_ID']
num_features=['CS','UC','CL','S','SS','UW']
Following David's answer and link, I have tried as follows:
For numerical:
class NumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Numerical features to pass down the numerical pipeline
        X = X[num_features]
        X = X.replace([np.inf, -np.inf], np.nan)
        return X.values

# Defining the steps in the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('num_transformer', NumericalTransformer()),
    ('imputer', KNNImputer(n_neighbors=2)),
    ('minmax', MinMaxScaler())])
For categorical:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        # Categorical features to pass down the categorical pipeline
        return X[c_feat].values

# Defining the steps in the categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('cat_transformer', CategoricalTransformer()),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])
and for the text feature:
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        # Text feature to pass down the text pipeline (1-D, as CountVectorizer expects)
        return X['Tweet'].values

# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer())])
Then I combine the numerical, text and categorical pipelines into one big pipeline using FeatureUnion:
# using FeatureUnion
union_pipeline = FeatureUnion(transformer_list=[
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', numerical_pipeline),
    ('text_pipeline', text_pipeline)])
and finally:
# Combining the custom imputer with the categorical, text and numerical pipeline
preprocess_pipeline = Pipeline(steps=[('custom_imputer', CustomImputer()),
                                      ('full_pipeline', union_pipeline)])
What is still not clear to me is how to get the feature names.
Because you are using custom transformers, you need to implement a dedicated get_feature_names method on them.
Please refer to this question for details, where you can find a code example; a minimal sketch of the idea follows below.
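A minimal sketch, not a drop-in fix: give the custom transformers a get_feature_names method and then walk the fitted FeatureUnion. It assumes the num_features list and union_pipeline from the question, a union that has already been fitted, and an older scikit-learn where CountVectorizer and OneHotEncoder still expose get_feature_names (newer versions rename it to get_feature_names_out).
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class NumericalTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[num_features].replace([np.inf, -np.inf], np.nan).values

    def get_feature_names(self):
        # the numeric branch keeps the original column names
        return num_features

def union_feature_names(fitted_union):
    """Collect feature names branch by branch from a fitted FeatureUnion."""
    names = []
    for _, branch_pipeline in fitted_union.transformer_list:
        # take the names from the last step that can provide them
        # (CountVectorizer, OneHotEncoder) and fall back to the custom transformer
        for _, step in reversed(branch_pipeline.steps):
            if hasattr(step, "get_feature_names"):
                names.extend(step.get_feature_names())
                break
    return names

feature_names = union_feature_names(union_pipeline)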
I am a beginner in AI and sentiment analysis. I'm doing sentiment analysis between two documents. This code works perfectly fine when I pass only one source document, rather than a list of multiple source documents, to compare against multiple target documents.
Can someone please tell me what I need to change to make it work with a list of multiple source documents?
# Loading pre-trained word2vec model
import numpy as np
from gensim.models.keyedvectors import KeyedVectors

# You need to download the Google pre-trained model using the link below
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# Change the path according to your directory
model_path = r'E:\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# Setting parameters for the model
class DocSim(object):
    def __init__(self, w2v_model, stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords

    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = doc.lower()
        words = [w for w in doc.split(" ") if w not in self.stopwords]
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass
        # Assuming that document vector is the mean of all the word vectors
        vector = np.mean(word_vecs, axis=0)
        return vector

    def _cosine_sim(self, vecA, vecB):
        """Find the cosine similarity distance between two vectors."""
        csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, source_doc, target_docs=[], threshold=0):
        """Calculates & returns similarity scores between given source document & all
        the target documents."""
        if isinstance(target_docs, str):
            target_docs = [target_docs]

        source_vec = self.vectorize(source_doc)
        results = []
        for doc in target_docs:
            target_vec = self.vectorize(doc)
            sim_score = self._cosine_sim(source_vec, target_vec)
            if sim_score > threshold:
                results.append({
                    'score': sim_score,
                    'doc': doc
                })
        # Sort results by score in desc order
        results.sort(key=lambda k: k['score'], reverse=True)
        return results

ds = DocSim(w2v_model)

# Calculate the similarity score between a source rule & a target rule.
source_rule = ['2.1.1 Context', '2.2.3 Value']
target_rule = ['2.1.1 Context', '2.1.2.4 Assist Failed Train']

# This will return one target rule's text with a similarity score
sim_scores = ds.calculate_similarity(source_rule, target_rule)
print(sim_scores)
This is the error I am getting right now.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-041084a3f599> in <module>
6 # This will return one target rules text with similarity score
7
----> 8 sim_scores = ds.calculate_similarity(source_rule, target_rule)
9
10 print(sim_scores)
<ipython-input-20-055f5d25808f> in calculate_similarity(self, source_doc, target_docs, threshold)
41 source_doc=[source_doc]
42
---> 43 source_vec = self.vectorize(source_doc)
44 results = []
45 for doc in target_docs:
<ipython-input-20-055f5d25808f> in vectorize(self, doc)
8 def vectorize(self, doc):
9 """Identify the vector values for each word in the given document"""
---> 10 doc = doc.lower()
11 words = [w for w in doc.split(" ") if w not in self.stopwords]
12 word_vecs = []
AttributeError: 'list' object has no attribute 'lower'
Rather than sending the whole list to the function, make sure source_rule is a list, then iterate over it and call calculate_similarity() for each source document, keeping the DocSim class from the question unchanged:
ds = DocSim(w2v_model)

# Calculate the similarity score between a source rule & a target rule.
source_rule = ['2.1.1 Context', '2.2.3 Value']
target_rule = ['2.1.1 Context', '2.1.2.4 Assist Failed Train']

if isinstance(source_rule, str):
    source_rule = [source_rule]

# This will return each target rule's text with a similarity score
for rule in source_rule:
    sim_scores = ds.calculate_similarity(rule, target_rule)
    print("Similarity with {} is {}".format(rule, sim_scores))
I am using Pipelines from Pyspark's ML library to preprocess text and calculate the TF-IDF values for all tokens. I also created a custom Transformer that returns for each text snippet the 5 tokens with the highest TF-IDF values. The main code looks like this:
%pyspark
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")
count_vectorizer = CountVectorizer(inputCol="filtered", outputCol="count", vocabSize=pow(2,10))
idf = IDF(inputCol="count", outputCol="TF-IDF")
normalizer = Normalizer(inputCol="TF-IDF", outputCol="normalized", p=2.0)
top_token_extractor = TopTokenExtractor(inputCol="normalized", outputCol="topTokens", vocabulary=model.stages[2].vocabulary) # !!! does not work
pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer, top_token_extractor])
model = pipeline.fit(df)
And here is the implementation of TopTokenExtractor:
%pyspark
from pyspark import keyword_only
from pyspark.ml.pipeline import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

class TopTokenExtractor(Transformer, HasInputCol, HasOutputCol):
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, vocabulary=None):
        super(TopTokenExtractor, self).__init__()
        self.vocabulary = Param(self, "vocabulary", "")
        self._setDefault(vocabulary=set())
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, vocabulary=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setVocabulary(self, value):
        self._paramMap[self.vocabulary] = value
        return self

    def getVocabulary(self):
        return self.getOrDefault(self.vocabulary)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        vocabulary = self.getVocabulary()

        def f(s):
            token_tuples = sorted(list(zip(s.indices, s.values)), key=lambda x: x[1], reverse=True)
            top_tokens = list()
            for i in range(0, min(5, len(token_tuples))):
                top_tokens.append(vocabulary[token_tuples[i][0]])
            return top_tokens

        t = ArrayType(StringType())
        return dataset.withColumn(out_col, udf(f, t)(in_col))
The problem is that in order to return a list of tokens rather than indices, I need to pass the vocabulary from the CountVectorizer as a parameter to TopTokenExtractor. After calling pipeline.fit(df) the vocabulary could be accessed by model.stages[2].vocabulary, but I could not figure out how to pass it as a parameter in the course of a pipeline. Is this possible at all?
As a workaround, I might split up the pipeline into two parts (a rough sketch of that is included below), but I would really prefer to have a single pipeline if possible.
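In case it helps, a hedged sketch of that two-part fallback (not the single-pipeline solution being asked for), reusing the stage names from the question:
# Fit everything up to the Normalizer first, read the vocabulary off the fitted
# CountVectorizerModel, then build and apply the extractor separately.
base_pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer])
base_model = base_pipeline.fit(df)

vocabulary = base_model.stages[2].vocabulary  # index 2 = fitted CountVectorizerModel

top_token_extractor = TopTokenExtractor(
    inputCol="normalized",
    outputCol="topTokens",
    vocabulary=vocabulary,
)

result = top_token_extractor.transform(base_model.transform(df))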