Training a BERT and Running out of memory - Google Colab - python

I keep running out of memory even after I bought Google Colab Pro, which has 25 GB of RAM. I have no idea why this is happening. I tried every kernel possible (Google Colab, Google Colab Pro, Kaggle kernels, Amazon SageMaker, Google Cloud Platform). I reduced my batch size to 8, with no success whatsoever.
My goal is to train BERT in DeepPavlov (with the Russian text classification extension) to predict the emotion of a tweet. It is a multiclass classification problem with 5 classes.
Here is my whole code:
!pip3 install deeppavlov

import pandas as pd

train_df = pd.read_csv('train_pikabu.csv')
test_df = pd.read_csv('test_pikabu.csv')
val_df = pd.read_csv('validation_pikabu.csv')

from deeppavlov.dataset_readers.basic_classification_reader import BasicClassificationDatasetReader

# read data from particular columns of the `.csv` files
data = BasicClassificationDatasetReader().read(
    data_path='./',
    train='train_pikabu.csv',
    valid="validation_pikabu_a.csv",
    test="test_pikabu.csv",
    x='content',
    y='emotions'
)

from deeppavlov.dataset_iterators.basic_classification_iterator import BasicClassificationDatasetIterator

# initializing an iterator
iterator = BasicClassificationDatasetIterator(data, seed=42, shuffle=True)

!python -m deeppavlov install squad_bert

from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor

bert_preprocessor = BertPreprocessor(vocab_file="./bert/vocab.txt",
                                     do_lower_case=False,
                                     max_seq_length=256)

from deeppavlov.core.data.simple_vocab import SimpleVocabulary

vocab = SimpleVocabulary(save_path="./binary_classes.dict")
iterator.get_instances(data_type="train")
vocab.fit(iterator.get_instances(data_type="train")[1])

from deeppavlov.models.preprocessors.one_hotter import OneHotter

one_hotter = OneHotter(depth=vocab.len,
                       single_vector=True)  # means we want to have one vector per sample

from deeppavlov.models.classifiers.proba2labels import Proba2Labels

prob2labels = Proba2Labels(max_proba=True)

from deeppavlov.models.bert.bert_classifier import BertClassifierModel
from deeppavlov.metrics.accuracy import sets_accuracy

bert_classifier = BertClassifierModel(
    n_classes=vocab.len,
    return_probas=True,
    one_hot_labels=True,
    bert_config_file="./bert/bert_config.json",
    pretrained_bert="./bert/bert_model.ckpt",
    save_path="sst_bert_model/model",
    load_path="sst_bert_model/model",
    keep_prob=0.5,
    learning_rate=1e-05,
    learning_rate_drop_patience=5,
    learning_rate_drop_div=2.0
)

# Method `get_instances` returns all the samples of a particular data field
x_valid, y_valid = iterator.get_instances(data_type="valid")

# Save the model only when the validation score is higher than the previous one.
# This variable will contain the highest accuracy score
best_score = 0.
patience = 2
impatience = 0

# let's train for 3 epochs
for ep in range(3):
    nbatches = 0
    for x, y in iterator.gen_batches(batch_size=8,
                                     data_type="train", shuffle=True):
        x_feat = bert_preprocessor(x)
        y_onehot = one_hotter(vocab(y))
        bert_classifier.train_on_batch(x_feat, y_onehot)
        print("Batch done\n")
        nbatches += 1

        if nbatches % 1 == 0:
            # validating every batch
            y_valid_pred = bert_classifier(bert_preprocessor(x_valid))
            score = sets_accuracy(y_valid, vocab(prob2labels(y_valid_pred)))
            print("Batches done: {}. Valid Accuracy: {}".format(nbatches, score))

    y_valid_pred = bert_classifier(bert_preprocessor(x_valid))
    score = sets_accuracy(y_valid, vocab(prob2labels(y_valid_pred)))
    print("Epochs done: {}. Valid Accuracy: {}".format(ep + 1, score))

    if score > best_score:
        bert_classifier.save()
        print("New best score. Saving model.")
        best_score = score
        impatience = 0
    else:
        impatience += 1
        if impatience == patience:
            print("Out of patience. Stop training.")
            break
It runs for one batch and then crashes.

Related

How to use TPU to accelerate sentiment analysis using FinBert in Kaggle

I am trying to analyze the sentiment of earnings calls using FinBERT. Since I am analysing more than 40,000 earnings calls, computing the sentiment scores takes more than a week. Because of that, I want to use the TPU provided by Kaggle to accelerate this process.
But all the tutorials/guides I could find were only dealing with training the model; I just want to use one of the pre-trained versions and use the TPU to accelerate the sentiment analysis of the earnings calls.
import tensorflow as tf
import nltk
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
# This is the loop I am using to calculate and score
for i in range(40001, len(clean_data) - 1):
    # for i in range(0, 10):
    print(i)
    # Get Q&A text
    temp = test_data.iloc[i, 3]
    sentences = nltk.sent_tokenize(temp)
    results = nlp(sentences)
    filename = clean_data.iloc[i, 0]
    j = 0
    positive = 0
    neutral = 0
    negative = 0
    for j in range(0, len(results)):
        label = results[j]["label"]
        if label == "Positive":
            positive = positive + 1
        elif label == "Neutral":
            neutral = neutral + 1
        else:
            negative = negative + 1

    per_pos_qanda = positive / len(results)
    per_neg_qanda = negative / len(results)
    net_score_qanda = per_pos_qanda - per_neg_qanda
    finbert_results.iloc[i, 0] = filename
    finbert_results.iloc[i, 7] = per_pos_qanda
    finbert_results.iloc[i, 8] = per_neg_qanda
    finbert_results.iloc[i, 9] = net_score_qanda
Do I now need to incorporate the TPU into the for-loop code where I am calling the model? That is, in this line?
results = nlp(sentences)
If I understand correctly, you're referring to inference and not training. In that case you can also benefit from TPUs, for example by using a distributed strategy. Please refer to this guide: https://www.tensorflow.org/api_docs/python/tf/distribute/Strategy
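For illustration only (a sketch rather than a verified drop-in replacement), one way to use the detected strategy for inference is to load a TensorFlow version of the model inside strategy.scope() and classify sentences in batches instead of one pipeline call per transcript. The from_pt=True flag, the max_length/batch_size values, and the label order below are assumptions that should be checked against the model's config:

import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Build the TF model under the strategy detected above so its variables live
# on the accelerator. from_pt=True converts the PyTorch weights in case no TF
# checkpoint is available; drop it if the TF weights load directly.
with strategy.scope():
    tf_model = TFBertForSequenceClassification.from_pretrained(
        "yiyanghkust/finbert-tone", num_labels=3, from_pt=True)

tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
# Label order assumed from the model's id2label config; double-check it.
id2label = ["Neutral", "Positive", "Negative"]

def score_sentences(sentences, batch_size=128):
    """Classify a list of sentences in batches instead of one call per item."""
    enc = tokenizer(sentences, padding=True, truncation=True,
                    max_length=128, return_tensors="tf")
    ds = tf.data.Dataset.from_tensor_slices(dict(enc)).batch(batch_size)
    outputs = tf_model.predict(ds, verbose=0)
    logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
    preds = logits.argmax(axis=-1)
    return [id2label[p] for p in preds]

Inside the loop, results = score_sentences(sentences) would then replace results = nlp(sentences), with the counting logic left unchanged.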

Python LSTM Bitcoin prediction flatlines

I'm currently trying to build a "simple" LSTM model that takes historical Bitcoin data, learns from it, and then tries to predict X future steps in advance.
I've built it on the idea that A + B + C = D, so B + C + D should be E. (I think that's a very simple idea behind an LSTM model. I might be wrong, however; I'm pretty new to this.)
I managed to build the basics in Python (I'm fairly new to Python), but something seems off with the predictions. For some reason many of the predictions I test/make end up flatlining. I have a theory on why, but no idea whether it's correct, and even less idea of how to solve it.
My theory is that within a sequence the model learns to put more importance/weight on the last value in the sequence, because with Bitcoin prices the future price (in 1 minute) is probably pretty close to the price now. That's why the predicted values keep getting closer to the real value, eventually becoming equal and thus flatlining in a graph. (I don't know if that makes sense, but that's what I thought anyway; see the quick baseline check after the code below.)
I've also added a screenshot of my graph from a few days ago. Almost all predictions end up similar to this graph; this is just a more extreme example as a demonstration.
Here is my code. Can someone please explain why it flatlines and what I did wrong?
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import yfinance as yf
from math import sqrt
from sklearn.metrics import mean_squared_error

# Create output sets X + Y from a given input set
# with inputset : a 1-dimensional list of floats
# with n : the number of lookback values to use for X
# with gap : the number of points skipped between X and Y
# Y: is equal to the input (although the first n are missing)
# X: for each y of Y a corresponding set of size n is created,
#    composed of the n values preceding y.
def create_lookback(inputset, n=1, gap=0):
    print("create_lookback with n=%d gap=%d" % (n, gap))
    print(" - length of inputset = %d" % len(inputset))
    dataX, dataY = [], []
    for i in range(len(inputset) - (n + gap)):
        a = inputset[i:(i + n), 0]
        dataX.append(a)
        dataY.append(inputset[i + n + gap, 0])
    print(" - length of dataY = %d" % len(dataY))
    data_x = np.array(dataX)
    xret = data_x.reshape(data_x.shape[0], 1, data_x.shape[1])
    return xret, np.array(dataY)

# Train model based on given training set + test set
def create_model(trainX, trainY, testX, testY):
    model = Sequential()
    model.add(LSTM(units=100, input_shape=(trainX.shape[1], trainX.shape[2])))
    model.add(Dropout(0.2))
    # model.add(LSTM(30, return_sequences=True))
    # model.add(Dropout(0.1))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    history = model.fit(trainX, trainY, epochs=100, batch_size=5,
                        validation_data=(testX, testY), verbose=1, shuffle=False)
    return model

# Evaluate given X / Y set.
# - Calculate RMSE
# - Generate visual line plot to screen
def show_result(scaler, yhat, setY, txt):
    print("Show %s result" % txt)
    yhat_inverse = scaler.inverse_transform(yhat.reshape(-1, 1))
    testY_inverse = scaler.inverse_transform(setY.reshape(-1, 1))
    if len(testY_inverse) == len(yhat_inverse):
        rmse = sqrt(mean_squared_error(testY_inverse, yhat_inverse))
        print(' RMSE %s : %.3f' % (txt, rmse))
    pyplot.plot(yhat_inverse, label='predict ' + txt)
    pyplot.plot(testY_inverse, label='actual ' + txt, alpha=0.5)
    pyplot.legend()
    pyplot.show()

# "Extrapoleer" is Dutch for "extrapolate"
def extrapoleer(i, model, tup, toekomst):
    if i == 0:
        return
    setX = np.array([[tup]])
    y = model.predict(setX)
    y_float = y[0][0]
    tup_new = np.append(tup[1:], y_float)
    toekomst.append(y_float)
    extrapoleer(i - 1, model, tup_new, toekomst)

# --- end of defined functions

# --- start of main flow
data_grid_1 = yf.download('BTC-USD', start="2021-04-14", end="2021-04-15", interval="1m")
data_grid_2 = yf.download('BTC-USD', period="12h", interval="1m")
dataset_1 = data_grid_1.iloc[:, 1:2].values
dataset_2 = data_grid_2.iloc[:, 1:2].values

scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(dataset_1)

# 70% of dataset_1 is used to train; 30% to test
train_size = int(len(scaled) * 0.7)
test_size = len(scaled) - train_size
train, test = scaled[0:train_size, :], scaled[train_size:len(scaled), :]
print("train: %d test: %d" % (len(train), len(test)))

scaled_2 = scaler.fit_transform(dataset_2)

look_back_n = 3
look_back_gap = 0
trainX, trainY = create_lookback(train, look_back_n, look_back_gap)
testX, testY = create_lookback(test, look_back_n, look_back_gap)
testX_2, testY_2 = create_lookback(scaled_2, look_back_n, look_back_gap)

model = create_model(trainX, trainY, testX, testY)

yhat_1 = model.predict(testX)
yhat_2 = model.predict(testX_2)
show_result(scaler, yhat_1, testY, "test")
show_result(scaler, yhat_2, testY_2, "test2")

last_n = testY_2[-look_back_n:]
# "toekomst" = "future" in Dutch
toekomst = []
# "aantal" = "amount" in Dutch; this is the number of future steps to predict
aantal = 30
extrapoleer(aantal, model, last_n, toekomst)
print("Resultaat van %d voorspelde punten in de toekomst: " % aantal)  # "Result of %d predicted points in the future"
print(toekomst)
yhat_2_plus = np.append(yhat_2, toekomst)
show_result(scaler, yhat_2_plus, testY_2, "test2-plus")
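As a quick, hedged check of the theory above (reusing scaler, testY_2, sqrt and mean_squared_error from the script), a naive persistence baseline that simply repeats the previous observation can be compared against the LSTM's RMSE; if the two are close, the network has mostly learned to echo its last input, which shows up as a lagging or flat prediction:

# Naive persistence baseline: predict y[t] as y[t-1] on the same series.
naive_pred = testY_2[:-1]
naive_true = testY_2[1:]
naive_rmse = sqrt(mean_squared_error(
    scaler.inverse_transform(naive_true.reshape(-1, 1)),
    scaler.inverse_transform(naive_pred.reshape(-1, 1))))
print("Persistence baseline RMSE: %.3f" % naive_rmse)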

Bert Text Classification Loss is Nan

I'm trying to make a model that classifies text into 3 categories (Negative, Neutral, Positive).
I have a CSV file that contains comments on different apps together with their ratings.
First I import all the necessary libraries
!pip install transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%tensorflow_version 2.x
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer,DistilBertTokenizer,glue_convert_examples_to_features, InputExample,BertConfig,InputFeatures
from sklearn.model_selection import train_test_split
from tqdm import tqdm
%matplotlib inline
Then I'll get my CSV file:
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
df = pd.read_csv("reviews.csv")
print(df[['content','score']].head())
content score
0 Update: After getting a response from the deve... 1
1 Used it for a fair amount of time without any ... 1
2 Your app sucks now!!!!! Used to be good but no... 1
3 It seems OK, but very basic. Recurring tasks n... 1
4 Absolutely worthless. This app runs a prohibit... 1
Converting scores to sentiment
def to_sentiment(rating):
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2

df['sentiment'] = df.score.apply(to_sentiment)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased',do_lower_case = True)
Creating helper methods to fit the data into the model:
def convert_example_to_feature(review):
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=160,  # truncates if len(s) > max_length
        return_token_type_ids=True,
        return_attention_mask=True,
        pad_to_max_length=True,  # pads to the right by default
    )

def map_example_to_dict(input_ids, attention_mask, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }, label

def encode_examples(ds):
    # prepare lists, so that we can build up the final TensorFlow dataset from slices
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for index, row in tqdm(ds.iterrows()):
        bert_input = convert_example_to_feature(row['content'])
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([row['sentiment']])
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_example_to_dict)
df_train, df_test = train_test_split(df,test_size=0.1)
Creating Model
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy()
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss,metrics=metric)
history = model.fit(ds_train_encoded,epochs=1)
14/443 [..............................] - ETA: 3:58 - loss: nan - accuracy: 0.3438
If I change the number of sentiment classes and make it just positive and negative, then it works.
But with 3 or more labels this problem appears.
The label class indices should start from 0, not 1. TFBertForSequenceClassification requires labels in the range [0, ..., config.num_labels - 1]:

    labels (tf.Tensor of shape (batch_size,), optional, defaults to None) –
    Labels for computing the sequence classification/regression loss.
    Indices should be in [0, ..., config.num_labels - 1]. If
    config.num_labels == 1 a regression loss is computed (Mean-Square
    loss); if config.num_labels > 1 a classification loss is computed
    (Cross-Entropy).

Source: https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification
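As a sketch of how the label range and the model configuration need to line up for the three sentiment classes in the question (an illustration, not a verbatim fix): create the model with num_labels=3 so that the ids 0, 1 and 2 all fall inside [0, config.num_labels - 1]. Note as well that TFBertForSequenceClassification outputs raw logits, so the loss should be constructed with from_logits=True:

import tensorflow as tf
from transformers import TFBertForSequenceClassification

# num_labels=3 so that sentiment ids 0, 1, 2 are all valid class indices.
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)

# The model returns unnormalised logits, so tell the loss that explicitly.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])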

How can I tune my neural network to avoid overfitting the mnist data set?

TL;DR at the bottom!
In an attempt to learn the ins and outs of ML, I have been implementing a neural network optimizer in C++ and wrapped it with SWIG as a Python module. Of course, the first problem I tackled was XOR via the following snippet of code: 2 inputs, 2 hidden units, 1 output.
from MikeLearn import NeuralNetwork
from MikeLearn import ClassificationOptimizer
import time

#=======================================================
# Training Set
#=======================================================
X = [[0,1],[1,0],[1,1],[0,0]]
Y = [[1],[1],[0],[0]]
nIn = len(X[0])
nOut = len(Y[0])

#=======================================================
# Model
#=======================================================
verbosity = 0

# Initialize neural network
# NeuralNetwork([nInputs, nHidden1, nHidden2, ..., nOutputs], ['Activation1', 'Activation2', ...])
N = NeuralNetwork([nIn, 2, nOut], ['sigmoid', 'sigmoid'])
N.setLoggerVerbosity(verbosity)

# Initialize the classification optimizer
# ClassificationOptimizer(NeuralNetwork, Xtrain, Ytrain)
Opt = ClassificationOptimizer(N, X, Y)
Opt.setLoggerVerbosity(verbosity)

start_time = time.time()
# fit data
# fit(nEpoch, LearningRate)
E = Opt.fit(10000, 0.1)
print("--- %s seconds ---" % (time.time() - start_time))

# Make a prediction
print(Opt.predict(X))
This snippet of code yields the following output (the correct answer would be [1, 1, 0, 0]):
--- 0.10273098945617676 seconds ---
((0.9398755431175232,), (0.9397522211074829,), (0.0612373948097229,), (0.04882470518350601,))
>>>
Looks great!
Now for the issue. The following snippet of code tries to learn from the MNIST dataset, but suffers very obviously from overfitting: ~750 inputs (28x28 pixels), 50 hidden, 10 outputs.
from MikeLearn import NeuralNetwork
from MikeLearn import ClassificationOptimizer
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

#=======================================================
# Data Set
#=======================================================
# load the data dictionary
modeldata = pickle.load(open("mnist_data.p", "rb"))
X = modeldata['X']
Y = modeldata['Y']

# normalize data
X = np.array(X)
X = X / 255
X = X.tolist()

# training set
X1 = X[0:49999]
Y1 = Y[0:49999]

# validation set
X2 = X[50000:59999]
Y2 = Y[50000:59999]

# number of inputs/outputs
nIn = len(X[0])   # ~750
nOut = len(Y[0])  # = 10

#=======================================================
# Model
#=======================================================
verbosity = 1

# Initialize neural network
# NeuralNetwork([nInputs, nHidden1, nHidden2, ..., nOutputs], ['Activation1', 'Activation2', ...])
N = NeuralNetwork([nIn, 50, nOut], ['sigmoid', 'sigmoid'])
N.setLoggerVerbosity(verbosity)

# Initialize optimizer
# ClassificationOptimizer(NeuralNetwork, Xtrain, Ytrain)
Opt = ClassificationOptimizer(N, X1, Y1)
Opt.setLoggerVerbosity(verbosity)

start_time = time.time()
# fit data
# fit(nEpoch, LearningRate)
E = Opt.fit(10, 0.1)
print("--- %s seconds ---" % (time.time() - start_time))

#================================
# Final accuracy on training set
#================================
XL = Opt.predict(X1)
correct = 0
for i, x in enumerate(XL):
    if XL[i].index(max(XL[i])) == Y[i].index(max(Y1[i])):
        correct = correct + 1
print("Training set Correct = " + str(correct))
Accuracy = correct / len(XL) * 100
print("Accuracy = " + str(Accuracy) + '%')

#================================
# Final accuracy on validation set
#================================
XL = Opt.predict(X2)
correct = 0
for i, x in enumerate(XL):
    if XL[i].index(max(XL[i])) == Y[i].index(max(Y2[i])):
        correct = correct + 1
print("Testing set Correct = " + str(correct))
Accuracy = correct / len(XL) * 100
print("Accuracy = " + str(Accuracy) + '%')
That snippet of code yields the following output, which shows the training accuracy and validation accuracy:
-------------------------
Epoch
9
-------------------------
E= 0.00696964
E= 0.350509
E= 3.49568e-05
E= 4.09073e-06
E= 1.38491e-06
E= 0.229873
E= 3.60186e-05
E= 0.000115187
E= 2.29978e-06
E= 2.69165e-06
--- 27.400235176086426 seconds ---
Training set Correct = 48435
Accuracy = 96.87193743874877%
Testing set Correct = 982
Accuracy = 9.820982098209821%
The training set accuracy is great, but the testing set accuracy is no better than a random guess. Any idea what could be causing this?
TL;DR
Solved XOR with a model of 2 inputs, 2 hidden units, 1 output, and sigmoid activation functions. Good results.
Tried to solve the MNIST dataset with a model of 750 inputs (28x28 pixels), 50 hidden units, 10 outputs, and sigmoid activation functions. Severe overfitting issue: 95% accuracy on the training set, 10% accuracy on the validation set.
Any idea what is causing this?
The cause of overfitting is a combination of the data and the model (the network in this case). During training the network was 'lazy' and found aspects of the data that worked well on the training data but do not generalize well.
It is difficult/impossible to point out exactly where in the trained network the nodes/weights responsible for the overfitting are located.
But we can avoid overfitting with several tricks (a short Keras sketch of the first two follows the links below):
Regularisation
Drop-out (easier to implement)
Change network architecture (fewer layers / fewer nodes / more dimension reduction)
https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/
To get an idea of regularisation, try the TensorFlow playground:
https://playground.tensorflow.org/
A visualisation of dropout
https://yusugomori.com/projects/deep-learning/dropout-relu
Besides trying out regularisation techniques, also experiment with different NN architectures.
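As a rough illustration of the first two tricks (written with Keras rather than the asker's MikeLearn module, and with arbitrarily chosen hyperparameters), L2 weight regularisation and dropout on the same 784-50-10 shape look like this:

import tensorflow as tf

# Same shape as the question's network (~784 inputs, 50 hidden, 10 outputs),
# with L2 weight regularisation and dropout added on the hidden layer.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation="sigmoid", input_shape=(784,),
                          kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    tf.keras.layers.Dropout(0.3),  # randomly zero 30% of hidden activations during training
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
# model.fit(np.array(X1), np.array(Y1),
#           validation_data=(np.array(X2), np.array(Y2)),
#           epochs=10, batch_size=32)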

Converting mnist data to lmdb with python results very large database

I am currently playing with the LeNet model provided by Caffe.
The example (in path/to/caffe/examples/mnist/convert_mnist_data.cpp) provides a C++ program to convert the MNIST data to LMDB.
I wrote a Python program to do the same thing, but the size of the LMDB (480 MB) is much larger than the one produced by the C++ program (60 MB).
The test accuracy is almost the same (98%).
I want to know why the sizes differ so much.
Here is the program. I use the mnist module (https://pypi.python.org/pypi/python-mnist/) to help load the binary MNIST data.
from mnist import MNIST
import numpy as np
import cv2
import lmdb
import caffe

mndata = MNIST('./data')
images, labels = mndata.load_training()
labels = np.array(labels)
images = np.array(images).reshape(len(labels), 28, 28).astype(np.uint8)
print type(images[0][0][0])

count = 0
env = lmdb.open('mnist_lmdb', map_size=1000*1000*1000)
txn = env.begin(write=True)
for i in xrange(len(labels)):
    print i
    datum = caffe.proto.caffe_pb2.Datum()
    datum.channels = 1
    datum.height = 28
    datum.width = 28
    datum.data = images[i].tobytes()
    datum.label = labels[i]
    str_id = '{:08}'.format(i)
    txn.put(str_id, datum.SerializeToString())

    count = count + 1
    if count % 1000 == 0:
        txn.commit()
        txn = env.begin(write=True)

if count % 1000 != 0:
    txn.commit()
env.close()
Thank you.
env = lmdb.open('mnist_lmdb', map_size=1000*1000*1000)
The DB size mainly depends on the map_size, so you can reduce the map_size.
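For example (the exact figure is only an illustration and must still be large enough to hold all the records), the environment could be opened with a smaller map_size:

import lmdb

# map_size is the maximum size the database may grow to; on some platforms
# the file is pre-allocated to this size, so a smaller cap gives a smaller file.
env = lmdb.open('mnist_lmdb', map_size=100 * 1000 * 1000)  # ~100 MB instead of ~1 GB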
