BERT tokenizer & model download - python

I`m beginner.. I'm working with Bert. However, due to the security of the company network, the following code does not receive the bert model directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
So I think I have to download these files and enter the location manually.
But I'm new to this, and I'm wondering if it's simple to download a format like .py from github and put it in a location.
I'm currently using the bert model implemented by hugging face's pytorch, and the address of the source file I found is:
https://github.com/huggingface/transformers
Please let me know if the method I thought is correct, and if so, what file to get.
Thanks in advance for the comment.

As described here, what you need to do are download pre_train and configs, then putting them in the same folder. Every model has a pair of links, you might want to take a look at lib code.
For instance
import torch
from transformers import *
model = BertModel.from_pretrained('/Users/yourname/workplace/berts/')
with /Users/yourname/workplace/berts/ refer to your folder
Below are what I found
at src/transformers/configuration_bert.py there are a list of models' configs
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
}
and at src/transformers/modeling_bert.py there are links to pre_trains
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
}

Related

Receiving parse error from SageMaker Multi Model Endpoint using TensorFlow

We are currently moving our models from single model endpoints to multi model endpoints within AWS SageMaker. After deploying the Multi Model Endpoint using prebuilt TensorFlow containers I receive the following error when calling the predict() method:
{"error": "JSON Parse error: The document root must not be followed by other value at offset: 17"}
I invoke the endpoint like this:
data = np.random.rand(n_samples, n_features)
predictor = Predictor(endpoint_name=endpoint_name)
prediction = predictor.predict(data=serializer.serialize(data), target_model=model_name)
My function for processing the input is the following:
def _process_input(data, context):
data = data.read().decode('utf-8')
data = [float(x) for x in data.split(',')]
return json.dumps({'instances': [data]})
For the training I configured my container as follows:
tensorflow_container = TensorFlow(
entry_point=path_script,
framework_version='2.4',
py_version='py37',
instance_type='ml.m4.2xlarge',
instance_count=1,
role=EXECUTION_ROLE,
sagemaker_session=sagemaker_session,
hyperparameters=hyperparameters)
tensorflow_container.fit()
For deploying the endpoint I first initializing a Model from a given Estimator and then a MultiDataModel:
model = estimator.create_model(
role=EXECUTION_ROLE,
image_uri=estimator.training_image_uri(),
entry_point=path_serving)
mdm = MultiDataModel(
name=endpoint_name,
model_data_prefix=dir_model_data,
model=model,
sagemaker_session=sagemaker.Session())
mdm.deploy(
initial_instance_count=1,
instance_type=instance_type,
endpoint_name=endpoint_name)
Afterwards the single models are added using:
mdm.add_model(
model_data_source=source_path,
model_data_path=model_name)
Thank you for any hints and help.
This issue usually occurs in case you either have damaged or malformed JSON data. Recommend you running it past JSON validator https://jsonlint.com/
I work at AWS and my opinions are my own - Thanks,Raghu

How can I retrive the model.pkl in the experiment in Databricks

I want to retrieve the pickle off my trained model, which I know is in the run file inside my experiments in Databricks.
It seems that the mlflow.pyfunc.load_model can only do the predict method.
There is an option to directly access the pickle?
I also tried to use the path in the run using the pickle.load(path) (example of path: dbfs:/databricks/mlflow-tracking/20526156406/92f3ec23bf614c9d934dd0195/artifacts/model/model.pkl).
Use the frmwk's native load_model() method (e.g. sklearn.load_model()) or download_artifacts()
I recently found the solution which can be done by the following two approaches:
Use the customized predict function at the moment of saving the model (check databricks documentation for more details).
example give by Databricks
class AddN(mlflow.pyfunc.PythonModel):
def __init__(self, n):
self.n = n
def predict(self, context, model_input):
return model_input.apply(lambda column: column + self.n)
# Construct and save the model
model_path = "add_n_model"
add5_model = AddN(n=5)
mlflow.pyfunc.save_model(path=model_path, python_model=add5_model)
# Load the model in `python_function` format
loaded_model = mlflow.pyfunc.load_model(model_path)
Load the model artefacts as we are downloading the artefact:
from mlflow.tracking import MlflowClient
client = MlflowClient()
tmp_path = client.download_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path='model/model.pkl')
f = open(tmp_path,'rb')
model = pickle.load(f)
f.close()
client.list_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path="")
client.list_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path="model")

How to propagate mlpipeline-metrics from custom Python function TFX component?

Note: this is a copy of a GitHub issue I reported.
It is re-posted in hope to get more attention, I will update any solutions on either site.
Question
I want to export mlpipeline-metrics from my custom Python function TFX component so that it is displayed in the KubeFlow UI.
This is a minimal example of what I am trying to do:
import json
from tfx.dsl.component.experimental.annotations import OutputArtifact
from tfx.dsl.component.experimental.decorators import component
from tfx.types.standard_artifacts import Artifact
class Metric(Artifact):
TYPE_NAME = 'Metric'
#component
def ShowMetric(MLPipeline_Metrics: OutputArtifact[Metric]):
rmse_eval = 333.33
metrics = {
'metrics':[
{
'name': 'RMSE-validation',
'numberValue': rmse_eval,
'format': 'RAW'
}
]
}
path = '/tmp/mlpipeline-metrics.json'
with open(path, 'w') as _file:
json.dump(metrics, _file)
MLPipeline_Metrics.uri = path
In the KubeFlow UI, the "Run output" tab says "No metrics found for this run." However, the output artefact shows up in the ML MetaData (see screenshot). Any help on how to accomplish this would be greatly appreciated. Thanks!

batch predictions google automl via python

I'm pretty new using stackoverflow as well as using the google cloud platform, so apologies if am not asking this question in the right format. I am currently facing an issue with getting the predictions from my model.
I've trained a multilabel automl model on the google cloud platform and and now i want to use that model to score out new data entries.
Since the platform only allows one entry at the same time i want to make use of python to do batch predictions.
I've stored my data entries in seperate .txt files on the google cloud bucket and created a .txt file where i'm listing the gs:// references to those files (like they recommend in the documentation).
I've exported a .json file with my credentials from the service account and specified the id's and paths in my code:
# import API credentials and specify model / path references
path = 'xxx.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
model_name = 'xxx'
model_id = 'TCN1234567890'
project_id = '1234567890'
model_full_id = f"https://eu-automl.googleapis.com/v1/projects/{project_id}/locations/eu/models/{model_id}"
input_uri = f"gs://bucket_name/{model_name}/file_list.txt"
output_uri = f"gs://bucket_name/{model_name}/outputs/"
prediction_client = automl.PredictionServiceClient()
And then i'm running the following code to get the predictions:
# score batch of file_list
gcs_source = automl.GcsSource(input_uris=[input_uri])
input_config = automl.BatchPredictInputConfig(gcs_source=gcs_source)
gcs_destination = automl.GcsDestination(output_uri_prefix=output_uri)
output_config = automl.BatchPredictOutputConfig(
gcs_destination=gcs_destination
)
response = prediction_client.batch_predict(
name=model_full_id,
input_config=input_config,
output_config=output_config
)
print("Waiting for operation to complete...")
print(
f"Batch Prediction results saved to Cloud Storage bucket. {response.result()}"
)
However, i'm getting the following error: InvalidArgument: 400 Request contains an invalid argument.
Would anyone have a hince what is causing this issue?
Any input would be appreciated! Thanks!
Found the issue!
I needed to set the client to the 'eu' environment first:
options = ClientOptions(api_endpoint='eu-automl.googleapis.com')
prediction_client = automl.PredictionServiceClient(client_options=options)

How can I download a specific part of Coco Dataset?

I am developing an object detection model to detect ships using YOLO. I want to use the COCO dataset. Is there a way to download only the images that have ships with the annotations?
To download images from a specific category, you can use the COCO API. Here's a demo notebook going through this and other usages. The overall process is as follows:
Install pycocotools
Download one of the annotations jsons from the COCO dataset
Now here's an example on how we could download a subset of the images containing a person and saving it in a local file:
from pycocotools.coco import COCO
import requests
# instantiate COCO specifying the annotations json path
coco = COCO('...path_to_annotations/instances_train2014.json')
# Specify a list of category names of interest
catIds = coco.getCatIds(catNms=['person'])
# Get the corresponding image ids and images using loadImgs
imgIds = coco.getImgIds(catIds=catIds)
images = coco.loadImgs(imgIds)
Which returns a list of dictionaries with basic information on the images and its url. We can now use requests to GET the images and write them into a local folder:
# Save the images into a local folder
for im in images:
img_data = requests.get(im['coco_url']).content
with open('...path_saved_ims/coco_person/' + im['file_name'], 'wb') as handler:
handler.write(img_data)
Note that this will save all images from the specified category. So you might want to slice the images list to the first n.
From what I personally know, if you're talking about the COCO dataset only, I don't think they have a category for "ships". The closest category they have is "boat". Here's the link to check the available categories: http://cocodataset.org/#overview
BTW, there are ships inside the boat category too.
If you want to just select images of a specific COCO category, you might want to do something like this (taken and edited from COCO's official demos):
# display COCO categories
cats = coco.loadCats(coco.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))
# get all images containing given categories (I'm selecting the "bird")
catIds = coco.getCatIds(catNms=['bird']);
imgIds = coco.getImgIds(catIds=catIds);
Nowadays there is a package called fiftyone with which you could download the MS COCO dataset and get the annotations for specific classes only. More information about installation can be found at https://github.com/voxel51/fiftyone#installation.
Once you have the package installed, simply run the following to get say the "person" and "car" classes:
import fiftyone.zoo as foz
# To download the COCO dataset for only the "person" and "car" classes
dataset = foz.load_zoo_dataset(
"coco-2017",
split="train",
label_types=["detections", "segmentations"],
classes=["person", "car"],
# max_samples=50,
)
If desired, you can comment out the last option to set a maximum samples size. Moreover, you can change the "train" split to "validation" in order to obtain the validation split instead.
To visualize the dataset downloaded, simply run the following:
# Visualize the dataset in the FiftyOne App
import fiftyone as fo
session = fo.launch_app(dataset)
If you would like to download the splits "train", "validation", and "test" in the same function call of the data to be loaded, you could do the following:
dataset = foz.load_zoo_dataset(
"coco-2017",
splits=["train", "validation", "test"],
label_types=["detections", "segmentations"],
classes=["person"],
# max_samples=50,
)
I tried the code that #yatu and #Tim had shared here, but I got lots of requests.exceptions.ConnectionError: HTTPSConnectionPool.
So after carefully reading this answer to Max retries exceeded with URL in requests, I rewrote the code like this one and now it runs smoothly:
from pycocotools.coco import COCO
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import requests
from tqdm.notebook import tqdm
# instantiate COCO specifying the annotations json path
coco = COCO('annotations/instances_train2017.json')
# Specify a list of category names of interest
catIds = coco.getCatIds(catNms=['person'])
# Get the corresponding image ids and images using loadImgs
imgIds = coco.getImgIds(catIds=catIds)
images = coco.loadImgs(imgIds)
# handle annotations
ANNOTATIONS = {"info": {
"description": "my-project-name"
}
}
def cocoJson(images: list) -> dict:
arrayIds = np.array([k["id"] for k in images])
annIds = coco.getAnnIds(imgIds=arrayIds, catIds=catIds, iscrowd=None)
anns = coco.loadAnns(annIds)
for k in anns:
k["category_id"] = catIds.index(k["category_id"])+1
catS = [{'id': int(value), 'name': key}
for key, value in categories.items()]
ANNOTATIONS["images"] = images
ANNOTATIONS["annotations"] = anns
ANNOTATIONS["categories"] = catS
return ANNOTATIONS
def createJson(JsonFile: json, label='train') -> None:
name = label
Path("data/labels").mkdir(parents=True, exist_ok=True)
with open(f"data/labels/{name}.json", "w") as outfile:
json.dump(JsonFile, outfile)
def downloadImages(images: list) -> None:
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
for im in tqdm(images):
if not isfile(f"data/images/{im['file_name']}"):
img_data = session.get(im['coco_url']).content
with open('data/images/' + im['file_name'], 'wb') as handler:
handler.write(img_data)
trainSet = cocoJson(images)
createJson(trainSet)
downloadImages(images)
On my side I had recent difficulties installing fiftyone with Apple Silicon Mac (M1), so I created a script based on pycocotools that allows me to quickly download a subset of the coco 2017 dataset (images and annotations).
It is very simple to use, details are available here: https://github.com/tikitong/minicoco , hope this helps.

Categories

Resources