Convert image to numpy dataset for Tesseract OCR training - Python

I am trying to create a dataset for Tesseract but am unable to do so. The following code should output a CSV file containing the image path and image label columns, plus one .npz file per image, but it does not append any rows to the CSV.
import numpy as np
import os
from tensorflow.keras.preprocessing.image import img_to_array, load_img
import pandas as pd

image_dataset_dir = "datasets/images"
new_dataset_folder = "datasets/new"

dataset = {
    "image": [],
    "label": []
}
for label in os.listdir(image_dataset_dir):
    images_dir = image_dataset_dir + "/" + label
    if not os.path.isdir(images_dir):
        continue
    for image_file in os.listdir(images_dir):
        # if not image_file.endswith((".jpg", ".png", ".tiff")):
        #     continue
        img = load_img(os.path.join(image_dataset_dir, label, image_file))
        x = img_to_array(img)

        rel_path = label + "/" + os.path.splitext(image_file)[0] + '.npz'
        os.makedirs(new_dataset_folder + "/" + label, exist_ok=True)
        npz_file = os.path.join(new_dataset_folder, rel_path)
        np.savez(npz_file, x)
        # print(rel_path)
        dataset["image"].append(rel_path)
        dataset["label"].append(label)

df = pd.DataFrame(dataset)
df.to_csv(os.path.join(new_dataset_folder, "train.csv"), index=False)

print('Dataset converted to npz and saved here at %s ' % new_dataset_folder)

df.head()

Your objective is to create the files and save both the output arrays and their label values. The .npz format is not a common public image format, so make sure whatever consumes it can read it back; try it with images that have different backgrounds and matching patterns.
Sample: using Pandas (a DataFrame, as your requirements specify) and TensorFlow.
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Variables
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
BATCH_SIZE = 1
IMG_SIZE = (32, 32)
new_dataset_folder = "F:\\temp\\Python\\excel"
PATH = 'F:\\datasets\\downloads\\cats_name'
train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')
train_dataset = tf.keras.utils.image_dataset_from_directory(train_dir, shuffle=True,
batch_size=BATCH_SIZE, image_size=IMG_SIZE)
class_names = train_dataset.class_names
print( 'class_names: ' + str( class_names ) )
print( train_dataset )
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Dataset
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
dataset = {
"image" :[],
"label" : []
}
file_order = 0
for data in train_dataset :
file_path = new_dataset_folder + "\\" + str(int(data[1][0])) + ".npz"
dataset["image"].append(file_path)
dataset["label"].append(str(int(data[1][0])))
# Save
encoding = "utf-8"
with open( new_dataset_folder + "\\" + str(file_order), "wb" ) as f:
f.write(str(data[0]).encode(encoding))
file_order = file_order + 1
df = pd.DataFrame(dataset)
df.to_csv(os.path.join(new_dataset_folder, "train.csv"), index=False)
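A quick verification sketch (not part of the answer above): reading one of the saved .npz files back through the generated train.csv. It assumes the np.savez call used above, which stores an unnamed array under the key 'arr_0', and the new_dataset_folder path from the answer.
import numpy as np
import pandas as pd

# Hypothetical check: reload the first saved array via the CSV written above.
df = pd.read_csv("F:\\temp\\Python\\excel\\train.csv")
first_row = df.iloc[0]

with np.load(first_row["image"]) as npz:   # np.savez stores an unnamed array under 'arr_0'
    image_batch = npz["arr_0"]

print(first_row["label"], image_batch.shape)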

Related

How to set a path for directory?

I'm getting an error:
Not a directory: '/root/.keras/datasets/mask_detection_dataset/train_set/without_mask'
Here is my code
_URL = 'https://docs.google.com/uc?export=download&id=1xdjxPV9hT-p9pFZJllpv-3Yjnl3k3JIz'
zip_dir = tf.keras.utils.get_file('mask_detection_dataset', origin = _URL, extract =True)
base_dir = os.path.join(os.path.dirname(zip_dir),'mask_detection_dataset')
train_dir = os.path.join(base_dir,'train_set')
validation_dir = os.path.join(base_dir,'validation_set')
train_withoutmask_dir = os.path.join(train_dir,'without_mask')
train_withmask_dir = os.path.join(train_dir,'with_mask')
train_impropermask_dir = os.path.join(train_dir,'improper_mask')
validation_withoutmask_dir = os.path.join(validation_dir,'without_mask')
validation_withmask_dir = os.path.join(validation_dir,'with_mask')
validation_impropermask_dir = os.path.join(validation_dir,'improper_mask')
num_withoutmask_tr = len(os.listdir(train_withoutmask_dir))
num_withmask_tr = len(os.listdir(train_withmask_dir))
num_impropermask_tr = len(os.listdir(train_impropermask_dir))
total_tr_imgs = num_withoutmask_tr + num_withmask_tr + num_impropermask_tr
print('Total training without mask images : ', num_withoutmask_tr )
print('Total training with mask images : ', num_withmask_tr )
print('Total training with improper mask images :', num_impropermask_tr )
print('Total training images : ', total_tr_imgs)
Error: NotADirectoryError: Not a directory: '/root/.keras/datasets/mask_detection_dataset/train_set/without_mask'
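A hedged debugging sketch (an assumption, not from the question): with extract=True, the path returned by get_file can be the downloaded archive itself, and the extracted folder ends up next to it, possibly under a different name (or nothing is extracted at all if the Google Drive link returned an HTML page instead of the zip). Listing the download directory shows what actually exists before hard-coding sub-paths:
import os
import tensorflow as tf

_URL = 'https://docs.google.com/uc?export=download&id=1xdjxPV9hT-p9pFZJllpv-3Yjnl3k3JIz'
zip_dir = tf.keras.utils.get_file('mask_detection_dataset', origin=_URL, extract=True)

download_root = os.path.dirname(zip_dir)
print(zip_dir)                      # path returned by get_file
print(os.listdir(download_root))    # what was really downloaded / extracted

# Build base_dir only from a folder name that appears in the listing above.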

OpenCV - How to deal with an unbalanced dataset when developing an image classifier using a Bag of Visual Words Model?

My dataset is represented by a CSV file with two attributes: an image path and its label. I have dozens of different labels, but the label '51' represents around 34% of the dataset and the label '13' around 41%, so these two labels alone make up roughly 3/4 of the entire dataset, and my classifier ends up classifying everything as '13' (I don't think I've ever seen it classify anything as '51'; could this be the problem?). How can I deal with this?
I'll leave the code that I currently have here:
from cv2 import cv2 as cv
import numpy as np
import sys
sys.path.extend(['../../'])

from src import utils

if __name__ == '__main__':

    DICTIONARY_SIZE = 50
    TRAIN_SIZE = 100
    TEST_SIZE = 100

    DETECTOR = cv.KAZE_create()
    MATCHER = cv.FlannBasedMatcher()
    EXTRACTOR = cv.BOWImgDescriptorExtractor(DETECTOR, MATCHER)
    TRAINER = cv.BOWKMeansTrainer(DICTIONARY_SIZE)

    SVM = cv.ml.SVM_create()
    SVM.setType(cv.ml.SVM_C_SVC)
    SVM.setKernel(cv.ml.SVM_LINEAR)
    SVM.setTermCriteria((cv.TERM_CRITERIA_MAX_ITER, 100, 1e-6))

    print("Generating Training and Test Sets...")
    train, test = utils.getTrainingAndTestSets('multiclass.csv', TRAIN_SIZE, TEST_SIZE)

    print("Generating Dictionary...")
    for train_entry in train:
        img_path = train_entry[0]
        img = cv.imread(img_path)
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        keypoint, descriptors = DETECTOR.detectAndCompute(img, None)
        if descriptors is not None:
            TRAINER.add(descriptors)
    EXTRACTOR.setVocabulary(TRAINER.cluster())

    print("Preparing Training Data...")
    train_desc = []
    train_labels = []
    for train_entry in train:
        img_path = train_entry[0]
        img_label = int(train_entry[1])
        img = cv.imread(img_path)
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        descriptor = EXTRACTOR.compute(img, DETECTOR.detect(img))
        if descriptor is not None:
            train_desc.extend(descriptor)
            train_labels.append(img_label)

    print("Training...")
    SVM.train(np.array(train_desc), cv.ml.ROW_SAMPLE, np.array(train_labels))

    correct_predictions = 0
    samples_tested = len(test)

    print("Testing...")
    for test_entry in test:
        img_path = test_entry[0]
        real_attribute_id = int(test_entry[1])
        img = cv.imread(img_path)
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        feature = EXTRACTOR.compute(img, DETECTOR.detect(img))
        try:
            _, prediction = SVM.predict(feature)
            predicted_attribute_id = int(prediction[0][0])
            if predicted_attribute_id == real_attribute_id:
                print("CORRECT PREDICTION! :)")
                correct_predictions += 1
            else:
                print("INCORRECT PREDICTION... :(")
                print("Predicted Label: " + utils.getLabelFromAttributeID(predicted_attribute_id) + "(" + str(predicted_attribute_id) + ")")
                print("Real Label: " + utils.getLabelFromAttributeID(real_attribute_id) + "(" + str(real_attribute_id) + ")")
        except Exception:
            samples_tested -= 1

    correct_percentage = (correct_predictions / samples_tested) * 100
    print("Test Results: " + "{:.2f}".format(correct_percentage) + "% Correct Predictions.")
Feel free to tell me if my current approach has any error. Thanks.
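A hedged sketch (not from the original post) of one common remedy: rebalance the training descriptors by oversampling the minority labels before calling SVM.train. Everything below is an assumption layered on the question's train_desc / train_labels arrays; per-class weights (OpenCV's setClassWeights for C-SVC) or collecting more data for the rare labels are alternatives.
import numpy as np

def balance_by_oversampling(descriptors, labels, rng_seed=42):
    """Oversample minority classes so every label appears as often as the largest one.
    Purely illustrative; descriptors is an (N, D) array, labels an (N,) array."""
    descriptors = np.asarray(descriptors)
    labels = np.asarray(labels)
    rng = np.random.default_rng(rng_seed)

    classes, counts = np.unique(labels, return_counts=True)
    target = counts.max()

    idx_parts = []
    for cls, count in zip(classes, counts):
        cls_idx = np.where(labels == cls)[0]
        extra = rng.choice(cls_idx, size=target - count, replace=True)  # duplicate samples of small classes
        idx_parts.append(np.concatenate([cls_idx, extra]))

    idx = rng.permutation(np.concatenate(idx_parts))
    return descriptors[idx], labels[idx]

# Hypothetical usage, right before SVM.train:
# bal_desc, bal_labels = balance_by_oversampling(np.array(train_desc), np.array(train_labels))
# SVM.train(bal_desc.astype(np.float32), cv.ml.ROW_SAMPLE, bal_labels.astype(np.int32))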

I'm trying to load BERT "tfbert-large-uncased" but I got an error "Can't load config.json file"

I'm trying to load the pre-trained BERT model, but I'm getting an error while loading the tokenizer: it says config.json is not found.
If anyone knows how to solve this issue, please help me.
Model and path configuration
model_name = 'bert_v13'
data_dir = Path('../input/commonlitreadabilityprize/')
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
build_dir = Path('./build/')
output_dir = build_dir / model_name
trn_encode_file = output_dir / 'trn.enc.joblib'
val_predict_file = output_dir / f'{model_name}.val.txt'
submission_file = 'submission.csv'
pretrained_dir = '../tmp/input/tfbert-large-uncased'
id_col = 'id'
target_col = 'target'
text_col = 'excerpt'
max_len = 205
n_fold = 5
n_est = 2
n_stop = 2
batch_size = 8
seed = 42
Load Tokenizer and Model
# Tokenization using "Transformers"
# load tokenizer
def load_tokenizer():
    if not os.path.exists(pretrained_dir + '/vocab.txt'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
        tokenizer.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_dir)
    model_config = BertConfig.from_pretrained(pretrained_dir)
    model_config.output_hidden_states = True
    return tokenizer, model_config

# load bert model
def load_bert(config):
    if not os.path.exists(pretrained_dir + '/tf_model.h5'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        bert_model = TFBertModel.from_pretrained("bert-large-uncased", config=config)
        bert_model.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained model')
        bert_model = TFBertModel.from_pretrained(pretrained_dir, config=config)
    return bert_model
Loading the encoder
def bert_encode(texts, tokenizer, max_len=max_len):
    input_ids = []
    token_type_ids = []
    attention_mask = []

    for text in texts:
        token = tokenizer(text, max_length=max_len, truncation=True,
                          padding='max_length', add_special_tokens=True)
        input_ids.append(token['input_ids'])
        token_type_ids.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])

    return np.array(input_ids), np.array(token_type_ids), np.array(attention_mask)
This code then gives the error shown below:
tokenizer, bert_config = load_tokenizer()
X = bert_encode(trn[text_col].values, tokenizer,
                max_len=max_len)
X_tst = bert_encode(tst[text_col].values, tokenizer,
                    max_len=max_len)
y = trn[target_col].values
print(X[0].shape, X_tst[0].shape, y.shape)
Error
file ../tmp/input/tfbert-large-uncased/config.json not found
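A hedged guess at the cause (not stated in the post): on the first run only the tokenizer files are saved into pretrained_dir, so BertConfig.from_pretrained(pretrained_dir) finds no config.json there. One possible fix, sketched under that assumption, is to save the config alongside the tokenizer:
from pathlib import Path
import os
from transformers import BertTokenizerFast, BertConfig

def load_tokenizer():
    if not os.path.exists(pretrained_dir + '/vocab.txt'):
        Path(pretrained_dir).mkdir(parents=True, exist_ok=True)
        tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased")
        tokenizer.save_pretrained(pretrained_dir)
        # Also fetch and save the config so config.json exists locally afterwards
        model_config = BertConfig.from_pretrained("bert-large-uncased")
        model_config.save_pretrained(pretrained_dir)
    else:
        print('loading the saved pretrained tokenizer')
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_dir)
    model_config = BertConfig.from_pretrained(pretrained_dir)
    model_config.output_hidden_states = True
    return tokenizer, model_config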

How to load external dataset

How can I load an external dataset instead of MNIST?
# underscore to omit the label arrays
(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')
train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]
BUFFER_SIZE = 60000
BATCH_SIZE = 256
# Batch and shuffle the data
train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
I've tried flow_from_directory with "(train_images, _) = train_generator", but I cannot extract the image values the way the code above does.
train_image_generator = ImageDataGenerator(
    rescale=1./255)
    # rotation_range=10,
    # zoom_range=0.10,
    # width_shift_range=0.1,
    # height_shift_range=0.1

train_generator = train_image_generator.flow_from_directory(
    batch_size=256,
    color_mode="grayscale",
    directory='../input/main-dataset110/Train',
    shuffle=True,
    target_size=(28, 28),
    class_mode='sparse')

(train_images, _) = train_generator
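A hedged sketch (an assumption, not from the post) of how image arrays can be pulled out of the generator: a DirectoryIterator is not unpackable like the mnist tuple, but each call to next() yields one (images, labels) batch, and iterating over all batches builds the full arrays:
import numpy as np

# Hypothetical: collect every batch the generator yields into one array pair.
images_list, labels_list = [], []
for _ in range(len(train_generator)):          # len() == number of batches per epoch
    batch_images, batch_labels = next(train_generator)
    images_list.append(batch_images)
    labels_list.append(batch_labels)

train_images = np.concatenate(images_list)     # shape: (num_samples, 28, 28, 1)
train_labels = np.concatenate(labels_list)

# The arrays can then feed the same pipeline as the mnist example:
# train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)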
I found code that helped:
try:
    import tensorflow as tf
    import cv2
    import os
    import pickle
    import numpy as np
    print("Library Loaded Successfully ..........")
except:
    print("Library not Found ! ")


class MasterImage(object):

    def __init__(self, PATH='', IMAGE_SIZE=28):
        self.PATH = PATH
        self.IMAGE_SIZE = IMAGE_SIZE

        self.image_data = []
        self.x_data = []
        self.y_data = []
        self.CATEGORIES = []

        # This will get List of categories
        self.list_categories = []

    def get_categories(self):
        for path in os.listdir(self.PATH):
            if '.DS_Store' in path:
                pass
            else:
                self.list_categories.append(path)
        print("Found Categories ", self.list_categories, '\n')
        return self.list_categories

    def Process_Image(self):
        try:
            """
            Return Numpy array of image
            :return: X_Data, Y_Data
            """
            self.CATEGORIES = self.get_categories()
            for categories in self.CATEGORIES:                              # Iterate over categories
                train_folder_path = os.path.join(self.PATH, categories)     # Folder Path
                class_index = self.CATEGORIES.index(categories)             # this will get index for classification

                for img in os.listdir(train_folder_path):                   # This will iterate in the Folder
                    new_path = os.path.join(train_folder_path, img)         # image Path
                    try:                                                    # if any image is corrupted
                        image_data_temp = cv2.imread(new_path, cv2.IMREAD_GRAYSCALE)  # Read Image as numbers
                        image_temp_resize = cv2.resize(image_data_temp, (self.IMAGE_SIZE, self.IMAGE_SIZE))
                        self.image_data.append([image_temp_resize, class_index])
                    except:
                        pass

            data = np.asanyarray(self.image_data)

            # Iterate over the Data
            for x in data:
                self.x_data.append(x[0])    # Get the X_Data
                self.y_data.append(x[1])    # get the label

            X_Data = np.asarray(self.x_data) / (255.0)   # Normalize Data
            Y_Data = np.asarray(self.y_data)

            # reshape x_Data
            X_Data = X_Data.reshape(-1, self.IMAGE_SIZE, self.IMAGE_SIZE, 1)

            return X_Data, Y_Data
        except:
            print("Failed to run Function Process Image ")

    def pickle_image(self):
        """
        :return: None Creates a Pickle Object of DataSet
        """
        # Call the Function and Get the Data
        X_Data, Y_Data = self.Process_Image()

        # Write the Entire Data into a Pickle File
        pickle_out = open('X_Data', 'wb')
        pickle.dump(X_Data, pickle_out)
        pickle_out.close()

        # Write the Y Label Data
        pickle_out = open('Y_Data', 'wb')
        pickle.dump(Y_Data, pickle_out)
        pickle_out.close()

        print("Pickled Image Successfully ")
        return X_Data, Y_Data

    def load_dataset(self):
        try:
            # Read the Data from Pickle Object
            X_Temp = open('X_Data', 'rb')
            X_Data = pickle.load(X_Temp)

            Y_Temp = open('Y_Data', 'rb')
            Y_Data = pickle.load(Y_Temp)

            print('Reading Dataset from Pickle Object')
            return X_Data, Y_Data
        except:
            print('Could not Find Pickle File ')
            print('Loading File and Dataset ..........')

            X_Data, Y_Data = self.pickle_image()
            return X_Data, Y_Data


if __name__ == "__main__":
    path = '../input/main-dataset110/Test'
    a = MasterImage(PATH=path, IMAGE_SIZE=28)
    X_Data, Y_Data = a.load_dataset()
    print(X_Data.shape)
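To connect this back to the original mnist snippet, a minimal sketch (an assumption, not part of the found code) that feeds the arrays produced by MasterImage into the same tf.data pipeline; the Train path below is hypothetical:
import tensorflow as tf

X_Data, Y_Data = MasterImage(PATH='../input/main-dataset110/Train', IMAGE_SIZE=28).load_dataset()

# MasterImage already scales to [0, 1]; shift into [-1, 1] like the mnist example, then shuffle and batch.
train_images = (X_Data.astype('float32') - 0.5) / 0.5
BUFFER_SIZE = len(train_images)
BATCH_SIZE = 256

train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)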

Image not loading (Jupyter notebook)

I'm trying to load images in a Jupyter notebook to use them in TensorFlow. I'm using the code below:
import numpy as np
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from PIL import Image
import os
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout

data = []
labels = []
classes = 43
cur_path = os.getcwd()

for i in range(classes):
    path = os.path.join(cur_path, 'Dataset\Train', str(i))
    images = os.listdir(path)
    for a in images:
        try:
            image = Image.open(path + '\\' + a)
            image.resize(30,30)
            image.show()
            image = np.array(image)
            data.append(image)
            labels.append(i)
        except:
            print("error loading image")

data = np.array(data)
labels = np.array(labels)
Unfortunately, I'm getting this error message: error loading image
Does anyone have any idea?
Update: I removed the try and except:
for a in images:
    image = Image.open(path + '\\' + a)
    image.resize(30,30)
    display(image)
    image = np.array(image)

data = np.array(data)
labels = np.array(labels)
Error:
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
      2
      3 image = Image.open(path + '\\' + a)
----> 4 image.resize(30,30)
      5 display(image)
      6 image = np.array(image)

~\anaconda3\lib\site-packages\PIL\Image.py in resize(self, size, resample, box, reducing_gap)
   1883                 )
   1884             ]
-> 1885             raise ValueError(
   1886                 message + " Use " + ", ".join(filters[:-1]) + " or " + filters[-1]
   1887             )

ValueError: Unknown resampling filter (30). Use Image.NEAREST (0), Image.LANCZOS (1),
Image.BILINEAR (2), Image.BICUBIC (3), Image.BOX (4) or Image.HAMMING (5)
Solved the value error after a few changes:
from IPython.display import display

for a in images:
    try:
        image = Image.open(path + '\\' + a)
        image = image.resize((30,30))
        display(image)
        image = np.array(image)
        data.append(image)
        labels.append(i)
    except:
        print("error loading image")

data = np.array(data)
labels = np.array(labels)
Thank you, basic mojo!
