BERT tokenizer & model download - python
I'm a beginner working with BERT. Due to my company network's security restrictions, the following code cannot download the BERT model directly:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
So I think I have to download these files and point the code at their location manually.
But I'm new to this, and I'm wondering whether it is as simple as downloading the .py files from GitHub and putting them in a folder.
I'm currently using Hugging Face's PyTorch implementation of BERT, and the source repository I found is:
https://github.com/huggingface/transformers
Please let me know whether the method I have in mind is correct, and if so, which files I need to get.
Thanks in advance for any comments.
As described here, what you need to do is download the pretrained weights and the config, then put them in the same folder. Every model has a pair of links; you might want to take a look at the library code.
For instance
import torch
from transformers import BertModel
model = BertModel.from_pretrained('/Users/yourname/workplace/berts/')
where /Users/yourname/workplace/berts/ refers to the folder containing the downloaded files.
Below is what I found.
At src/transformers/configuration_bert.py there is a list of the models' config URLs:
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
}
and at src/transformers/modeling_bert.py there are links to the pretrained weights:
BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
}
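Putting it together for the model in the question: below is a minimal sketch that downloads the files into a local folder and then loads everything from there. Run the download part on a machine that can reach S3 (or fetch the files in a browser and copy them over). Note the tokenizer also needs a vocab file; its URL is an assumption here, based on the same naming pattern, and should be verified against src/transformers/tokenization_bert.py.

import os

import requests
from transformers import BertTokenizer, BertForSequenceClassification

base = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased"
local_dir = "/Users/yourname/workplace/berts/"
os.makedirs(local_dir, exist_ok=True)

# from_pretrained looks for these exact filenames inside the folder
files = {
    "config.json": base + "-config.json",
    "pytorch_model.bin": base + "-pytorch_model.bin",
    # assumption: the vocab follows the same naming pattern; check
    # src/transformers/tokenization_bert.py for the exact link
    "vocab.txt": base + "-vocab.txt",
}
for filename, url in files.items():
    with open(os.path.join(local_dir, filename), "wb") as f:
        f.write(requests.get(url).content)

# now both objects load from the local folder instead of the model name
tokenizer = BertTokenizer.from_pretrained(local_dir, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(local_dir, num_labels=2)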
Related
Receiving parse error from SageMaker Multi Model Endpoint using TensorFlow
We are currently moving our models from single-model endpoints to multi-model endpoints within AWS SageMaker. After deploying the multi-model endpoint using prebuilt TensorFlow containers, I receive the following error when calling the predict() method:

{"error": "JSON Parse error: The document root must not be followed by other value at offset: 17"}

I invoke the endpoint like this:

data = np.random.rand(n_samples, n_features)
predictor = Predictor(endpoint_name=endpoint_name)
prediction = predictor.predict(data=serializer.serialize(data), target_model=model_name)

My function for processing the input is the following:

def _process_input(data, context):
    data = data.read().decode('utf-8')
    data = [float(x) for x in data.split(',')]
    return json.dumps({'instances': [data]})

For training I configured my container as follows:

tensorflow_container = TensorFlow(
    entry_point=path_script,
    framework_version='2.4',
    py_version='py37',
    instance_type='ml.m4.2xlarge',
    instance_count=1,
    role=EXECUTION_ROLE,
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparameters)
tensorflow_container.fit()

For deploying the endpoint I first initialize a Model from the given Estimator and then a MultiDataModel:

model = estimator.create_model(
    role=EXECUTION_ROLE,
    image_uri=estimator.training_image_uri(),
    entry_point=path_serving)

mdm = MultiDataModel(
    name=endpoint_name,
    model_data_prefix=dir_model_data,
    model=model,
    sagemaker_session=sagemaker.Session())

mdm.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name)

Afterwards, the individual models are added using:

mdm.add_model(
    model_data_source=source_path,
    model_data_path=model_name)

Thank you for any hints and help.
This issue usually occurs when you have damaged or malformed JSON data. I recommend running your payload through a JSON validator such as https://jsonlint.com/. I work at AWS and my opinions are my own. Thanks, Raghu
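One way to rule out a malformed payload is to let the SDK build the JSON for you. A minimal sketch, assuming SageMaker Python SDK v2 and a TF-Serving container that expects an {"instances": ...} document root (endpoint_name, model_name, n_samples, and n_features as in the question):

import numpy as np
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# let the SDK produce a single, valid JSON document as the request body
predictor = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

# TF-Serving expects {"instances": [...]} as the JSON root
payload = {"instances": np.random.rand(n_samples, n_features).tolist()}
prediction = predictor.predict(payload, target_model=model_name)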
How can I retrieve the model.pkl in the experiment in Databricks
I want to retrieve the pickle of my trained model, which I know is in the run's artifacts inside my experiments in Databricks. It seems that mlflow.pyfunc.load_model only exposes the predict method. Is there a way to directly access the pickle? I also tried using the run's path with pickle.load(path) (example path: dbfs:/databricks/mlflow-tracking/20526156406/92f3ec23bf614c9d934dd0195/artifacts/model/model.pkl).
Use the framework's native load_model() method (e.g. mlflow.sklearn.load_model()) or download_artifacts().
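For the first route, a minimal sketch, assuming an sklearn model logged under the artifact path 'model' (the run ID below is a placeholder):

import mlflow.sklearn

# rebuilds the native sklearn estimator, not just the pyfunc wrapper,
# so the full sklearn API is available again (and you can re-pickle it)
model = mlflow.sklearn.load_model("runs:/<run_id>/model")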
I recently found the solution, which can be done by the following two approaches:

1. Use a customized predict function at the moment of saving the model (check the Databricks documentation for more details). Example given by Databricks:

import mlflow.pyfunc

class AddN(mlflow.pyfunc.PythonModel):
    def __init__(self, n):
        self.n = n

    def predict(self, context, model_input):
        return model_input.apply(lambda column: column + self.n)

# Construct and save the model
model_path = "add_n_model"
add5_model = AddN(n=5)
mlflow.pyfunc.save_model(path=model_path, python_model=add5_model)

# Load the model in `python_function` format
loaded_model = mlflow.pyfunc.load_model(model_path)

2. Download the model artifact and load the pickle directly:

import pickle

from mlflow.tracking import MlflowClient

client = MlflowClient()
tmp_path = client.download_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path='model/model.pkl')
with open(tmp_path, 'rb') as f:
    model = pickle.load(f)

# list a run's artifacts to find the right path
client.list_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path="")
client.list_artifacts(run_id="0c7946c81fb64952bc8ccb3c7c66bca3", path="model")
How to propagate mlpipeline-metrics from custom Python function TFX component?
Note: this is a copy of a GitHub issue I reported. It is re-posted in the hope of getting more attention; I will update any solutions on either site.

Question: I want to export mlpipeline-metrics from my custom Python function TFX component so that it is displayed in the KubeFlow UI. This is a minimal example of what I am trying to do:

import json

from tfx.dsl.component.experimental.annotations import OutputArtifact
from tfx.dsl.component.experimental.decorators import component
from tfx.types.standard_artifacts import Artifact


class Metric(Artifact):
    TYPE_NAME = 'Metric'


@component
def ShowMetric(MLPipeline_Metrics: OutputArtifact[Metric]):
    rmse_eval = 333.33
    metrics = {
        'metrics': [
            {
                'name': 'RMSE-validation',
                'numberValue': rmse_eval,
                'format': 'RAW'
            }
        ]
    }
    path = '/tmp/mlpipeline-metrics.json'
    with open(path, 'w') as _file:
        json.dump(metrics, _file)
    MLPipeline_Metrics.uri = path

In the KubeFlow UI, the "Run output" tab says "No metrics found for this run." However, the output artifact shows up in the ML Metadata (see screenshot). Any help on how to accomplish this would be greatly appreciated. Thanks!
batch predictions google automl via python
I'm pretty new to using Stack Overflow as well as the Google Cloud Platform, so apologies if I am not asking this question in the right format. I am currently facing an issue with getting the predictions from my model. I've trained a multilabel AutoML model on the Google Cloud Platform and now I want to use that model to score new data entries. Since the platform only allows one entry at a time, I want to use Python to do batch predictions. I've stored my data entries in separate .txt files in the Google Cloud bucket and created a .txt file listing the gs:// references to those files (as recommended in the documentation). I've exported a .json file with my credentials from the service account and specified the IDs and paths in my code:

# import API credentials and specify model / path references
path = 'xxx.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path

model_name = 'xxx'
model_id = 'TCN1234567890'
project_id = '1234567890'

model_full_id = f"https://eu-automl.googleapis.com/v1/projects/{project_id}/locations/eu/models/{model_id}"
input_uri = f"gs://bucket_name/{model_name}/file_list.txt"
output_uri = f"gs://bucket_name/{model_name}/outputs/"

prediction_client = automl.PredictionServiceClient()

And then I'm running the following code to get the predictions:

# score batch of file_list
gcs_source = automl.GcsSource(input_uris=[input_uri])
input_config = automl.BatchPredictInputConfig(gcs_source=gcs_source)
gcs_destination = automl.GcsDestination(output_uri_prefix=output_uri)
output_config = automl.BatchPredictOutputConfig(gcs_destination=gcs_destination)

response = prediction_client.batch_predict(
    name=model_full_id,
    input_config=input_config,
    output_config=output_config
)

print("Waiting for operation to complete...")
print(f"Batch Prediction results saved to Cloud Storage bucket. {response.result()}")

However, I'm getting the following error:

InvalidArgument: 400 Request contains an invalid argument.

Would anyone have a hint what is causing this issue? Any input would be appreciated! Thanks!
Found the issue! I needed to point the client to the 'eu' endpoint first:

from google.api_core.client_options import ClientOptions

options = ClientOptions(api_endpoint='eu-automl.googleapis.com')
prediction_client = automl.PredictionServiceClient(client_options=options)
How can I download a specific part of Coco Dataset?
I am developing an object detection model to detect ships using YOLO. I want to use the COCO dataset. Is there a way to download only the images that contain ships, along with their annotations?
To download images from a specific category, you can use the COCO API. Here's a demo notebook going through this and other usages. The overall process is as follows:

1. Install pycocotools
2. Download one of the annotation JSONs from the COCO dataset

Now here's an example of how we could download a subset of the images containing a person and save them in a local folder:

from pycocotools.coco import COCO
import requests

# instantiate COCO specifying the annotations json path
coco = COCO('...path_to_annotations/instances_train2014.json')

# Specify a list of category names of interest
catIds = coco.getCatIds(catNms=['person'])

# Get the corresponding image ids and images using loadImgs
imgIds = coco.getImgIds(catIds=catIds)
images = coco.loadImgs(imgIds)

This returns a list of dictionaries with basic information on the images and their URLs. We can now use requests to GET the images and write them into a local folder:

# Save the images into a local folder
for im in images:
    img_data = requests.get(im['coco_url']).content
    with open('...path_saved_ims/coco_person/' + im['file_name'], 'wb') as handler:
        handler.write(img_data)

Note that this will save all images from the specified category, so you might want to slice the images list to the first n, as in the one-liner below.
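For instance, a one-line sketch that caps the download at n images before the loop above (the value 500 is arbitrary):

images = images[:500]  # keep only the first 500 matching images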
From what I personally know, if you're talking about the COCO dataset only, I don't think they have a category for "ships". The closest category they have is "boat". Here's the link to check the available categories: http://cocodataset.org/#overview By the way, there are ships inside the boat category too. If you want to select images of a specific COCO category, you might want to do something like this (taken and edited from COCO's official demos):

# display COCO categories
cats = coco.loadCats(coco.getCatIds())
nms = [cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))

# get all images containing given categories (here, "bird")
catIds = coco.getCatIds(catNms=['bird'])
imgIds = coco.getImgIds(catIds=catIds)
Nowadays there is a package called fiftyone with which you can download the MS COCO dataset and get the annotations for specific classes only. Installation instructions are available at https://github.com/voxel51/fiftyone#installation. Once you have the package installed, simply run the following to get, say, the "person" and "car" classes:

import fiftyone.zoo as foz

# To download the COCO dataset for only the "person" and "car" classes
dataset = foz.load_zoo_dataset(
    "coco-2017",
    split="train",
    label_types=["detections", "segmentations"],
    classes=["person", "car"],
    # max_samples=50,
)

If desired, you can uncomment the last option to cap the number of samples. Moreover, you can change the "train" split to "validation" in order to obtain the validation split instead.

To visualize the downloaded dataset, simply run:

# Visualize the dataset in the FiftyOne App
import fiftyone as fo

session = fo.launch_app(dataset)

If you would like to download the "train", "validation", and "test" splits in the same call, you can do the following:

dataset = foz.load_zoo_dataset(
    "coco-2017",
    splits=["train", "validation", "test"],
    label_types=["detections", "segmentations"],
    classes=["person"],
    # max_samples=50,
)
I tried the code that @yatu and @Tim shared here, but I got lots of requests.exceptions.ConnectionError: HTTPSConnectionPool errors. So, after carefully reading this answer to "Max retries exceeded with URL in requests", I rewrote the code like this and now it runs smoothly:

import json
from os.path import isfile
from pathlib import Path

import numpy as np
import requests
from pycocotools.coco import COCO
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.notebook import tqdm

# instantiate COCO specifying the annotations json path
coco = COCO('annotations/instances_train2017.json')

# Specify a list of category names of interest
catIds = coco.getCatIds(catNms=['person'])

# Get the corresponding image ids and images using loadImgs
imgIds = coco.getImgIds(catIds=catIds)
images = coco.loadImgs(imgIds)

# map each selected category name to its remapped (1-based) id
categories = {cat['name']: catIds.index(cat['id']) + 1 for cat in coco.loadCats(catIds)}

# handle annotations
ANNOTATIONS = {"info": {"description": "my-project-name"}}

def cocoJson(images: list) -> dict:
    arrayIds = np.array([k["id"] for k in images])
    annIds = coco.getAnnIds(imgIds=arrayIds, catIds=catIds, iscrowd=None)
    anns = coco.loadAnns(annIds)
    for k in anns:
        k["category_id"] = catIds.index(k["category_id"]) + 1
    catS = [{'id': int(value), 'name': key} for key, value in categories.items()]
    ANNOTATIONS["images"] = images
    ANNOTATIONS["annotations"] = anns
    ANNOTATIONS["categories"] = catS
    return ANNOTATIONS

def createJson(JsonFile: dict, label='train') -> None:
    name = label
    Path("data/labels").mkdir(parents=True, exist_ok=True)
    with open(f"data/labels/{name}.json", "w") as outfile:
        json.dump(JsonFile, outfile)

def downloadImages(images: list) -> None:
    # retry transient connection errors instead of failing the whole download
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    Path("data/images").mkdir(parents=True, exist_ok=True)
    for im in tqdm(images):
        if not isfile(f"data/images/{im['file_name']}"):
            img_data = session.get(im['coco_url']).content
            with open('data/images/' + im['file_name'], 'wb') as handler:
                handler.write(img_data)

trainSet = cocoJson(images)
createJson(trainSet)
downloadImages(images)
On my side, I recently had difficulties installing fiftyone on an Apple Silicon Mac (M1), so I created a script based on pycocotools that allows me to quickly download a subset of the COCO 2017 dataset (images and annotations). It is very simple to use; details are available at https://github.com/tikitong/minicoco. Hope this helps.