Python Kubernetes API error when trying to mount a volume

I am using Python with python-kubernetes against a Minikube cluster running locally, i.e. there are no cloud issues.
I am trying to create a Job and provide it with data to run on. I would like to mount a directory from my local machine into the Job's container.
I am following this example and trying to add a volume mount.
This is my code after adding the volume_mounts keyword (I tried multiple places and multiple keywords, and nothing works):
from os import path

import yaml
from kubernetes import client, config

JOB_NAME = "pi"


def create_job_object():
    # Configure the Pod template container
    container = client.V1Container(
        name="pi",
        image="perl",
        volume_mounts=["/home/user/data"],
        command=["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"])
    # Create and configure a spec section
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"app": "pi"}),
        spec=client.V1PodSpec(restart_policy="Never",
                              containers=[container]))
    # Create the specification of the job
    spec = client.V1JobSpec(
        template=template,
        backoff_limit=0)
    # Instantiate the job object
    job = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(name=JOB_NAME),
        spec=spec)
    return job


def create_job(api_instance, job):
    # Create job
    api_response = api_instance.create_namespaced_job(
        body=job,
        namespace="default")
    print("Job created. status='%s'" % str(api_response.status))


def update_job(api_instance, job):
    # Update container image
    job.spec.template.spec.containers[0].image = "perl"
    # Update the job
    api_response = api_instance.patch_namespaced_job(
        name=JOB_NAME,
        namespace="default",
        body=job)
    print("Job updated. status='%s'" % str(api_response.status))


def delete_job(api_instance):
    # Delete job
    api_response = api_instance.delete_namespaced_job(
        name=JOB_NAME,
        namespace="default",
        body=client.V1DeleteOptions(
            propagation_policy='Foreground',
            grace_period_seconds=5))
    print("Job deleted. status='%s'" % str(api_response.status))


def main():
    # Configs can be set in the Configuration class directly or using a helper
    # utility. If no argument is provided, the config is loaded from the
    # default location.
    config.load_kube_config()
    batch_v1 = client.BatchV1Api()
    # Create a job object with the client-python API. The job we
    # create is the same as `pi-job.yaml` in the /examples folder.
    job = create_job_object()
    create_job(batch_v1, job)
    update_job(batch_v1, job)
    delete_job(batch_v1)


if __name__ == '__main__':
    main()
I get this error:
HTTP response body:
{"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"Job
in version \"v1\" cannot be handled as a Job: v1.Job.Spec:
v1.JobSpec.Template: v1.PodTemplateSpec.Spec: v1.PodSpec.Containers:
[]v1.Container: v1.Container.VolumeMounts: []v1.VolumeMount:
readObjectStart: expect { or n, but found \", error found in #10 byte
of ...|ounts\": [\"/home/user|..., bigger context ...| \"image\":
\"perl\", \"name\": \"pi\", \"volumeMounts\": [\"/home/user/data\"]}],
\"restartPolicy\": \"Never\"}}}}|...","reason":"BadRequest","code":400
What am I missing here?
Is there another way to expose data to the job?
Edit: trying to use client.V1VolumeMount.
I am trying to add this code, and pass the mount object to different init functions, e.g.:
mount = client.V1VolumeMount(mount_path="/data", name="shai")
client.V1Container
client.V1PodTemplateSpec
client.V1JobSpec
client.V1Job
under multiple keywords, and it all results in errors. Is this the correct object to use? How should I use it, if at all?
Edit: trying to pass volume_mounts as a list with the following code, as suggested in the answers:
def create_job_object():
    # Configure the Pod template container
    container = client.V1Container(
        name="pi",
        image="perl",
        volume_mounts=["/home/user/data"],
        command=["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"])
    # Create and configure a spec section
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"app": "pi"}),
        spec=client.V1PodSpec(restart_policy="Never",
                              containers=[container]))
    # Create the specification of the job
    spec = client.V1JobSpec(
        template=template,
        backoff_limit=0)
    # Instantiate the job object
    job = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(name=JOB_NAME),
        spec=spec)
    return job
And I am still getting a similar error:
kubernetes.client.rest.ApiException: (422) Reason: Unprocessable
Entity HTTP response headers: HTTPHeaderDict({'Content-Type':
'application/json', 'Date': 'Tue, 06 Aug 2019 06:19:13 GMT',
'Content-Length': '401'}) HTTP response body:
{"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"Job.batch
\"pi\" is invalid:
spec.template.spec.containers[0].volumeMounts[0].name: Not found:
\"d\"","reason":"Invalid","details":{"name":"pi","group":"batch","kind":"Job","causes":[{"reason":"FieldValueNotFound","message":"Not
found:
\"d\"","field":"spec.template.spec.containers[0].volumeMounts[0].name"}]},"code":422}

The V1Container call expects a list of V1VolumeMount objects for the volume_mounts parameter, but you passed in a list of strings:
Code:
def create_job_object():
    volume_mount = client.V1VolumeMount(
        mount_path="/home/user/data"
        # other optional arguments, see the volume mount doc link below
    )
    # Configure the Pod template container
    container = client.V1Container(
        name="pi",
        image="perl",
        volume_mounts=[volume_mount],
        command=["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"])
    # Create and configure a spec section
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"app": "pi"}),
        spec=client.V1PodSpec(restart_policy="Never",
                              containers=[container]))
    ....
references:
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1VolumeMount.md
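For completeness: a V1VolumeMount also needs a name that matches a V1Volume declared on the pod spec, which is what the 422 "volumeMounts[0].name: Not found" error in the question is about. Below is a minimal sketch of the question's create_job_object with a hostPath volume; the "data-volume" name and the /data mount path are illustrative assumptions, not part of the original answer:
from kubernetes import client

JOB_NAME = "pi"


def create_job_object():
    # The volume declared on the pod and the mount inside the container
    # are linked by the shared name "data-volume".
    volume = client.V1Volume(
        name="data-volume",
        host_path=client.V1HostPathVolumeSource(path="/home/user/data"))
    volume_mount = client.V1VolumeMount(
        name="data-volume",
        mount_path="/data")
    container = client.V1Container(
        name="pi",
        image="perl",
        volume_mounts=[volume_mount],
        command=["perl", "-Mbignum=bpi", "-wle", "print bpi(2000)"])
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels={"app": "pi"}),
        spec=client.V1PodSpec(restart_policy="Never",
                              containers=[container],
                              volumes=[volume]))
    spec = client.V1JobSpec(template=template, backoff_limit=0)
    return client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(name=JOB_NAME),
        spec=spec)
Note that with Minikube a hostPath refers to the Minikube node's filesystem, so a local directory may first need to be mounted into the VM (for example with minikube mount).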

Related

wandb: artifact.add_reference() option to add specific (not current) versionId or ETag to stop the need for re-upload to s3?

I feel like this should be possible, but I looked through the wandb SDK code and I can't find an easy or logical way to do it. It might be possible to hack it by modifying the manifest entries at some point before the artifact is logged to wandb (after that, the manifest and its entries are presumably locked)? I saw things like this in the SDK code:
version = manifest_entry.extra.get("versionID")
etag = manifest_entry.extra.get("etag")
So, I figure we can probably edit those?
UPDATE
So, I tried to hack it together with something like this and it works but it feels wrong:
import os
import wandb
import boto3
from wandb.util import md5_file

ENTITY = os.environ.get("WANDB_ENTITY")
PROJECT = os.environ.get("WANDB_PROJECT")
API_KEY = os.environ.get("WANDB_API_KEY")

api = wandb.Api(overrides={"entity": ENTITY, "project": PROJECT})
run = wandb.init(entity=ENTITY, project=PROJECT, job_type="test upload")
file = "admin2Codes.txt"  # "admin1CodesASCII.txt" (both already on s3 with a couple of versions)
artifact = wandb.Artifact("test_data", type="dataset")

# modify one of the local files so it has a new md5 hash etc.
with open(file, "a") as f:
    f.write("new_line_1\n")

# upload local file to s3
local_file_path = file
s3_url = f"s3://bucket/prefix/{file}"
s3_url_arr = s3_url.replace("s3://", "").split("/")
s3_bucket = s3_url_arr[0]
key = "/".join(s3_url_arr[1:])
s3_client = boto3.client("s3")
file_digest = md5_file(local_file_path)
s3_client.upload_file(
    local_file_path,
    s3_bucket,
    key,
    # save the md5_digest in metadata,
    # can be used later to only upload new files to s3,
    # as AWS doesn't digest the file consistently in the E-tag
    ExtraArgs={"Metadata": {"md5_digest": file_digest}},
)
head_response = s3_client.head_object(Bucket=s3_bucket, Key=key)
version_id: str = head_response["VersionId"]
print(version_id)

# upload a link/ref to this s3 object in wandb (referencing the whole prefix
# adds every object under it to the artifact):
s3_dir = "s3://bucket/prefix"
artifact.add_reference(s3_dir)
# at this point we might be able to modify the artifact._manifest.entries and each entry.extra.get("etag") etc.?
print([(name, entry.extra) for name, entry in artifact._manifest.entries.items()])

# set these to an older version on s3 that we know we want (rather than latest) - do this via the wandb public API:
dataset_v2 = api.artifact(f"{ENTITY}/{PROJECT}/test_data:v2", type="dataset")
# artifact._manifest.add_entry(dataset_v2.manifest.entries["admin1CodesASCII.txt"])
artifact._manifest.entries["admin1CodesASCII.txt"] = dataset_v2.manifest.entries[
    "admin1CodesASCII.txt"
]
# verify that it did change:
print([(name, entry.extra) for name, entry in artifact._manifest.entries.items()])

run.log_artifact(artifact)  # at this point the manifest is locked, I believe?
artifact.wait()  # wait for the upload to finish (blocking - but should be very quick given it is just an s3 link)
print(artifact.name)

run_id = run.id
run.finish()
curr_run = api.run(f"{ENTITY}/{PROJECT}/{run_id}")
used_artifacts = curr_run.used_artifacts()
logged_artifacts = curr_run.logged_artifacts()
Am I on the right track here? I guess the other workaround is to make a copy on S3 (so that the older version is the latest again), but I wanted to avoid this: the one file I want to pin to an old version is a large NLP model, while the only files I actually want to change are small config.json files, so re-uploading everything seems very wasteful.
I was also wondering: when I copy an old version of an object back onto the same key in the bucket, does that create a real copy or just a pointer to the same underlying object? Neither the boto3 nor the AWS documentation makes that clear, although it seems like it is a proper copy.
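As an aside, that copy-back workaround could be scripted with boto3's copy_object, promoting an old version to "latest" by copying it onto the same key (a sketch with placeholder bucket/key/version values; as far as I know S3 stores the result as a full new version rather than a pointer, which matches the observation above):
import boto3

s3_client = boto3.client("s3")

# Copy an old version of the object onto the same key so it becomes
# the latest version again (requires a versioned bucket).
s3_client.copy_object(
    Bucket="bucket",
    Key="prefix/model.bin",
    CopySource={
        "Bucket": "bucket",
        "Key": "prefix/model.bin",
        "VersionId": "OLD_VERSION_ID",  # placeholder: the version to restore
    },
)
For objects over 5 GB, copy_object won't work and the higher-level s3_client.copy (a managed multipart copy) would be needed.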
I think I found the correct way to do it now:
import os
import re

import wandb
import boto3
from wandb.util import md5_file

ENTITY = os.environ.get("WANDB_ENTITY")
PROJECT = os.environ.get("WANDB_PROJECT")


def wandb_update_only_some_files_in_artifact(
    existing_artifact_name: str,
    new_s3_file_urls: list[str],
    entity: str = ENTITY,
    project: str = PROJECT,
) -> wandb.Artifact:
    """If you want to just update a config.json file, for example,
    but the rest of the artifact can remain the same, then you can
    use this function like so:

    wandb_update_only_some_files_in_artifact(
        "old_artifact:v3",
        ["s3://bucket/prefix/config.json"],
    )

    and then all the other files like model.bin will be the same as in v3,
    even if there was a v4 or v5 in between (as the v3 VersionIds are used).

    Args:
        existing_artifact_name (str): name with version like "old_artifact:v3"
        new_s3_file_urls (list[str]): files that should be updated
        entity (str, optional): wandb entity. Defaults to ENTITY.
        project (str, optional): wandb project. Defaults to PROJECT.

    Returns:
        Artifact: the new artifact object
    """
    api = wandb.Api(overrides={"entity": entity, "project": project})
    old_artifact = api.artifact(existing_artifact_name)
    old_artifact_name = re.sub(r":v\d+$", "", old_artifact.name)
    with wandb.init(entity=entity, project=project) as run:
        new_artifact = wandb.Artifact(old_artifact_name, type=old_artifact.type)
        s3_file_names = [s3_url.split("/")[-1] for s3_url in new_s3_file_urls]
        # add the new ones:
        for s3_url, filename in zip(new_s3_file_urls, s3_file_names):
            new_artifact.add_reference(s3_url, filename)
        # add the old ones:
        for filename, entry in old_artifact.manifest.entries.items():
            if filename in s3_file_names:
                continue
            new_artifact.add_reference(entry, filename)
            # this also works but feels hackier:
            # new_artifact._manifest.entries[filename] = entry
        run.log_artifact(new_artifact)
        new_artifact.wait()  # wait for the upload to finish (blocking - but should be very quick given it is just an s3 link)
        print(new_artifact.name)
        print(run.id)
        return new_artifact


# usage:
local_file_path = "config.json"  # modified file
s3_url = "s3://bucket/prefix/config.json"
s3_url_arr = s3_url.replace("s3://", "").split("/")
s3_bucket = s3_url_arr[0]
key = "/".join(s3_url_arr[1:])
s3_client = boto3.client("s3")
file_digest = md5_file(local_file_path)
s3_client.upload_file(
    local_file_path,
    s3_bucket,
    key,
    # save the md5_digest in metadata,
    # can be used later to only upload new files to s3,
    # as AWS doesn't digest the file consistently in the E-tag
    ExtraArgs={"Metadata": {"md5_digest": file_digest}},
)

wandb_update_only_some_files_in_artifact(
    "old_artifact:v3",
    ["s3://bucket/prefix/config.json"],
)

How to convert Ansible-Playbooks to be used with Ansible-API

I'm using Ansible to set up servers.
Sometimes I'm using the command ansible-playbook <my-playbook.yml> -i <inventory.txt> and sometimes I'm using the Ansible Python API (https://docs.ansible.com/ansible/latest/dev_guide/developing_api.html).
Unfortunately, I don't know how to run my already existing playbooks through the API.
Here is the API example code I'm working from:
#!/usr/bin/env python

import json
import shutil

from ansible.module_utils.common.collections import ImmutableDict
from ansible.parsing.dataloader import DataLoader
from ansible.vars.manager import VariableManager
from ansible.inventory.manager import InventoryManager
from ansible.playbook.play import Play
from ansible.executor.task_queue_manager import TaskQueueManager
from ansible.plugins.callback import CallbackBase
from ansible import context
import ansible.constants as C


class ResultCallback(CallbackBase):
    """A sample callback plugin used for performing an action as results come in.

    If you want to collect all results into a single object for processing at
    the end of the execution, look into utilizing the ``json`` callback plugin
    or writing your own custom callback plugin.
    """

    def v2_runner_on_ok(self, result, **kwargs):
        """Print a json representation of the result.

        This method could store the result in an instance attribute for retrieval later.
        """
        host = result._host
        print(json.dumps({host.name: result._result}, indent=4))


# since the API is constructed for CLI it expects certain options to always be set in the context object
context.CLIARGS = ImmutableDict(connection='local', module_path=['/to/mymodules'], forks=10, become=None,
                                become_method=None, become_user=None, check=False, diff=False)

# initialize needed objects
loader = DataLoader()  # Takes care of finding and reading yaml, json and ini files
passwords = dict(vault_pass='secret')

# Instantiate our ResultCallback for handling results as they come in. Ansible expects this to be one of its main display outlets
results_callback = ResultCallback()

# create inventory, use path to host config file as source or hosts in a comma separated string
inventory = InventoryManager(loader=loader, sources='localhost,')

# variable manager takes care of merging all the different sources to give you a unified view of variables available in each context
variable_manager = VariableManager(loader=loader, inventory=inventory)

# create data structure that represents our play, including tasks, this is basically what our YAML loader does internally.
play_source = dict(
    name="Ansible Play",
    hosts='localhost',
    gather_facts='no',
    tasks=[
        dict(action=dict(module='shell', args='ls'), register='shell_out'),
        dict(action=dict(module='debug', args=dict(msg='{{shell_out.stdout}}')))
    ]
)

# Create play object, playbook objects use .load instead of init or new methods,
# this will also automatically create the task objects from the info provided in play_source
play = Play().load(play_source, variable_manager=variable_manager, loader=loader)

# Run it - instantiate task queue manager, which takes care of forking and setting up all objects to iterate over host list and tasks
tqm = None
try:
    tqm = TaskQueueManager(
        inventory=inventory,
        variable_manager=variable_manager,
        loader=loader,
        passwords=passwords,
        stdout_callback=results_callback,  # Use our custom callback instead of the ``default`` callback plugin, which prints to stdout
    )
    result = tqm.run(play)  # most interesting data for a play is actually sent to the callback's methods
finally:
    # we always need to cleanup child procs and the structures we use to communicate with them
    if tqm is not None:
        tqm.cleanup()
    # Remove ansible tmpdir
    shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)
I already tried to convert the playbooks from YAML to a Python dictionary using
import yaml

with open('ansible-files/main.yml') as f:
    data = yaml.safe_load(f)
and then passing data in as play_source.
This will work for simple playbooks, but not for more complex ones, where different roles are involved. Is there a way to pass an existing playbook with roles, templates and other stuff directly to the API?
If not: Is there another way of using existing playbooks, when using the Python API without the need to rewrite everything by hand?
Thank you!
Use PlaybookExecutor to execute a playbook that includes complex roles:
executor = PlaybookExecutor(
    playbooks=[playbooks],
    inventory=self.inventory,
    variable_manager=self.variable_manager,
    loader=self.loader,
    passwords=self.passwords
)
executor.run()
Here playbooks is the path to your playbook YAML file.
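For a more self-contained picture, here is a rough sketch of wiring PlaybookExecutor up from scratch (assuming Ansible >= 2.8, where CLI options live in context.CLIARGS; the playbook and inventory paths are placeholders, and the exact set of CLIARGS keys may need adjusting for your Ansible version):
from ansible import context
from ansible.module_utils.common.collections import ImmutableDict
from ansible.parsing.dataloader import DataLoader
from ansible.inventory.manager import InventoryManager
from ansible.vars.manager import VariableManager
from ansible.executor.playbook_executor import PlaybookExecutor

# The executor reads its "CLI" options from context.CLIARGS.
context.CLIARGS = ImmutableDict(
    connection='smart', module_path=None, forks=10, timeout=10,
    become=None, become_method=None, become_user=None,
    check=False, diff=False, verbosity=0,
    listhosts=False, listtasks=False, listtags=False,
    syntax=False, start_at_task=None)

loader = DataLoader()
inventory = InventoryManager(loader=loader, sources=['inventory.txt'])
variable_manager = VariableManager(loader=loader, inventory=inventory)

executor = PlaybookExecutor(
    playbooks=['ansible-files/main.yml'],  # the same file you pass to ansible-playbook
    inventory=inventory,
    variable_manager=variable_manager,
    loader=loader,
    passwords={},
)
rc = executor.run()  # returns an ansible-playbook style exit code
print("playbook finished with rc =", rc)
Because this loads the same playbook file that ansible-playbook would, roles, templates, includes and so on are resolved the usual way.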

With Python Kubernetes client, how to replicate `kubectl create -f` generally?

My Bash script using kubectl create/apply -f ... to deploy lots of Kubernetes resources has grown too large for Bash. I'm converting it to Python using the PyPI kubernetes package.
Is there a generic way to create resources given the YAML manifest? Otherwise, the only way I can see to do it would be to create and maintain a mapping from Kind to API method create_namespaced_<kind>. That seems tedious and error prone to me.
Update: I'm deploying many (10-20) resources to many (10+) GKE clusters.
Update in 2020, for anyone still interested in this (since the docs for the Python library are mostly empty):
At the end of 2018 this pull request was merged, so it's now possible to do:
from kubernetes import client, config
from kubernetes import utils
config.load_kube_config()
api = client.ApiClient()
file_path = ... # A path to a deployment file
namespace = 'default'
utils.create_from_yaml(api, file_path, namespace=namespace)
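Since the question mentions pushing many resources to many GKE clusters, the same helper can simply be looped over kubeconfig contexts and manifest files; a sketch with hypothetical context and file names:
from kubernetes import config, utils

# Hypothetical kubeconfig contexts (one per GKE cluster) and manifest files.
contexts = ["gke_project_zone_cluster-a", "gke_project_zone_cluster-b"]
manifests = ["deployment.yaml", "service.yaml", "configmap.yaml"]

for ctx in contexts:
    # Build an ApiClient bound to one specific cluster/context.
    api = config.new_client_from_config(context=ctx)
    for path in manifests:
        utils.create_from_yaml(api, path, namespace="default")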
EDIT: from a request in a comment, here is a snippet for skipping the Python error if the deployment already exists:
from kubernetes import client, config
from kubernetes import utils

config.load_kube_config()
api = client.ApiClient()


def skip_if_already_exists(e):
    import json
    # found in https://github.com/kubernetes-client/python/blob/master/kubernetes/utils/create_from_yaml.py#L165
    info = json.loads(e.api_exceptions[0].body)
    if info.get('reason').lower() == 'alreadyexists':
        pass
    else:
        raise e


file_path = ...  # A path to a deployment file
namespace = 'default'

try:
    utils.create_from_yaml(api, file_path, namespace=namespace)
except utils.FailToCreateError as e:
    skip_if_already_exists(e)
I have written the following piece of code to achieve the functionality of creating k8s resources from a json/yaml file:
import re

import yaml
from kubernetes import client


def create_from_yaml(yaml_file):
    """
    :param yaml_file: path to a yaml/json manifest containing a single resource
    :return: the API client instance that was used to create the resource
    """
    with open(yaml_file) as f:
        yaml_object = yaml.safe_load(f)
    group, _, version = yaml_object["apiVersion"].partition("/")
    if version == "":
        version = group
        group = "core"
    group = "".join(group.rsplit(".k8s.io", 1))
    func_to_call = "{0}{1}Api".format(group.capitalize(), version.capitalize())
    k8s_api = getattr(client, func_to_call)()
    kind = yaml_object["kind"]
    kind = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', kind)
    kind = re.sub('([a-z0-9])([A-Z])', r'\1_\2', kind).lower()
    if "namespace" in yaml_object["metadata"]:
        namespace = yaml_object["metadata"]["namespace"]
    else:
        namespace = "default"
    try:
        if hasattr(k8s_api, "create_namespaced_{0}".format(kind)):
            resp = getattr(k8s_api, "create_namespaced_{0}".format(kind))(
                body=yaml_object, namespace=namespace)
        else:
            resp = getattr(k8s_api, "create_{0}".format(kind))(
                body=yaml_object)
    except Exception as e:
        raise e
    print("{0} created. status='{1}'".format(kind, str(resp.status)))
    return k8s_api
In the above function, if you provide any object's yaml/json file, it will automatically pick up the API type and object type and create the object (StatefulSet, Deployment, Service, etc.).
PS: The above code doesn't handle multiple Kubernetes resources in one file, so you should have only one object per yaml file.
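If you do need several resources in one manifest, one workaround is to split the file on YAML document boundaries and feed each document to the helper above one at a time; a sketch (create_all_from_yaml is a hypothetical wrapper, not part of the original answer):
import tempfile

import yaml


def create_all_from_yaml(yaml_file):
    # yaml.safe_load_all yields one dict per '---'-separated document,
    # so a multi-resource manifest can be applied object by object.
    with open(yaml_file) as f:
        docs = [d for d in yaml.safe_load_all(f) if d]
    for doc in docs:
        with tempfile.NamedTemporaryFile("w", suffix=".yaml") as tmp:
            yaml.safe_dump(doc, tmp)
            tmp.flush()
            create_from_yaml(tmp.name)  # the single-object helper above
Alternatively, newer releases of the official utils.create_from_yaml appear to iterate over multi-document files themselves, which may make this unnecessary.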
I see what you are looking for. This is possible with other k8s clients available in other languages; here is an example in Java. Unfortunately, the Python client library does not support that functionality yet. I opened a new feature request asking for the same, which you can either track or contribute to yourself :). Here is the link for the issue on GitHub.
The other way to still do what you are trying to do is to use java/golang client and put your code in a docker container.

How to avoid re-downloading media to S3 in Scrapy?

I previously asked a similar question (How does Scrapy avoid re-downloading media that was downloaded recently?), but since I did not receive a definite answer I'll ask it again.
I've downloaded a large number of files to an AWS S3 bucket using Scrapy's Files Pipeline. According to the documentation (https://doc.scrapy.org/en/latest/topics/media-pipeline.html#downloading-and-processing-files-and-images), this pipeline avoids "re-downloading media that was downloaded recently", but it does not say how long ago "recent" is or how to set this parameter.
Looking at the implementation of the FilesPipeline class at https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py, it would appear that this is obtained from the FILES_EXPIRES setting, for which the default is 90 days:
class FilesPipeline(MediaPipeline):
    """Abstract pipeline that implement the file downloading

    This pipeline tries to minimize network transfers and file processing,
    doing stat of the files and determining if file is new, uptodate or
    expired.

    `new` files are those that pipeline never processed and needs to be
        downloaded from supplier site the first time.

    `uptodate` files are the ones that the pipeline processed and are still
        valid files.

    `expired` files are those that pipeline already processed but the last
        modification was made long time ago, so a reprocessing is recommended to
        refresh it in case of change.
    """

    MEDIA_NAME = "file"
    EXPIRES = 90
    STORE_SCHEMES = {
        '': FSFilesStore,
        'file': FSFilesStore,
        's3': S3FilesStore,
    }
    DEFAULT_FILES_URLS_FIELD = 'file_urls'
    DEFAULT_FILES_RESULT_FIELD = 'files'

    def __init__(self, store_uri, download_func=None, settings=None):
        if not store_uri:
            raise NotConfigured

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        cls_name = "FilesPipeline"
        self.store = self._get_store(store_uri)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name=cls_name,
                                    settings=settings)
        self.expires = settings.getint(
            resolve('FILES_EXPIRES'), self.EXPIRES
        )
        if not hasattr(self, "FILES_URLS_FIELD"):
            self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
        if not hasattr(self, "FILES_RESULT_FIELD"):
            self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
        self.files_urls_field = settings.get(
            resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
        )
        self.files_result_field = settings.get(
            resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
        )

        super(FilesPipeline, self).__init__(download_func=download_func, settings=settings)

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.POLICY = settings['FILES_STORE_S3_ACL']

        store_uri = settings['FILES_STORE']
        return cls(store_uri, settings=settings)

    def _get_store(self, uri):
        if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
            scheme = 'file'
        else:
            scheme = urlparse(uri).scheme
        store_cls = self.STORE_SCHEMES[scheme]
        return store_cls(uri)

    def media_to_download(self, request, info):
        def _onsuccess(result):
            if not result:
                return  # returning None force download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None force download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.expires:
                return  # returning None force download
Do I understand this correctly? Also, I do not see a similar Boolean statement with age_days in the S3FilesStore class; is the checking of age also implemented for files on S3? (I was also unable to find any tests testing this age-checking feature for S3).
FILES_EXPIRES is indeed the setting that tells the FilesPipeline how "old" a file can be before it is downloaded again.
The key section of the code is in media_to_download:
the _onsuccess callback checks the result of the pipeline's self.store.stat_file call and, for your question, looks specifically at the "last_modified" info. If the last modification is older than "expires days", the download is triggered.
You can check how the S3store gets the "last modified" information: it depends on whether botocore is available or not.
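For example, to change the expiry window you only need to set FILES_EXPIRES in the project's settings.py; a minimal sketch (the bucket name and the 30-day value are arbitrary placeholders):
# settings.py (sketch)
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "s3://my-bucket-name/files/"  # placeholder bucket
FILES_EXPIRES = 30  # treat files older than 30 days as expired and re-download them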
A one-line answer to this would be: class FilesPipeline(MediaPipeline) is the only class responsible for managing, validating and downloading files to your local paths. class S3FilesStore(object) just takes the files from those local paths and uploads them to S3.
class FSFilesStore is the one that manages all your local paths, and FilesPipeline uses them to store your files locally.
Links:
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L264
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L397
https://github.com/scrapy/scrapy/blob/master/scrapy/pipelines/files.py#L299

BigQuery load_table_from_storage won't recognize uri

I'm running into the following error when trying to load a table from google cloud storage:
BadRequest: 400 Load configuration must specify at least one source URI (POST https://www.googleapis.com/bigquery/v2/projects/fansidata/jobs)
Meanwhile, my URI is valid (i.e. I can see it in the GCS web app):
uris = ['gs://my-bucket-name/datastore_backup_analytics_2016_12_21_2_User/1569751766512529035929A5AA9742/output-0']
job_name = 'Load_User'
destinationTable = dataset.table('Transfer')
job = bigquery_client.load_table_from_storage(job_name, destinationTable, uris)
job.begin()
I could be wrong, but it looks like load_table_from_storage in the Python API expects a single string for the third argument as opposed to a list. If you want to match multiple files, you can use a * at the end. For example:
uri = 'gs://my-bucket-name/datastore_backup_analytics_2016_12_21_2_User/1569751766512529035929A5AA9742/output-*'
job_name = 'Load_User'
destinationTable = dataset.table('Transfer')
job = bigquery_client.load_table_from_storage(job_name, destinationTable, uri)
job.begin()
Client.load_table_from_storage takes one or more source URIs (that is what *source_uris means in Python). E.g.:
job = client.load_table_from_storage(
    'load-job-123', my_table_object,
    'gs://my-bucket-name/table_one',
    'gs://my-bucket-name/table_two')
