httplib2.socks.HTTPError: (403, b'Forbidden') python apache-beam dataflow

I work in a Google Cloud environment where I don't have internet access, and I use a proxy to reach the internet. I'm trying to launch a Dataflow job.
When I run a simple wordcount.py with Dataflow, I get this error:
WARNING:apache_beam.utils.retry:Retry with exponential backoff: waiting for 4.750968074377858 seconds before retrying _uncached_gcs_file_copy because we caught exception: httplib2.socks.HTTPError: (403, b'Forbidden')
Traceback for above exception (most recent call last):
File "/opt/py38/lib64/python3.8/site-packages/apache_beam/utils/retry.py", line 275, in wrapper
return fun(*args, **kwargs)
File "/opt/py38/lib64/python3.8/site-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 631, in _uncached_gcs_file_copy
self.stage_file(to_folder, to_name, f, total_size=total_size)
File "/opt/py38/lib64/python3.8/site-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 735, in stage_file
response = self._storage_client.objects.Insert(request, upload=upload)
File "/opt/py38/lib64/python3.8/site-packages/apache_beam/io/gcp/internal/clients/storage/storage_v1_client.py", line 1152, in Insert
return self._RunMethod(
File "/opt/py38/lib64/python3.8/site-packages/apitools/base/py/base_api.py", line 728, in _RunMethod
http_response = http_wrapper.MakeRequest(
File "/opt/py38/lib64/python3.8/site-packages/apitools/base/py/http_wrapper.py", line 359, in MakeRequest
retry_func(ExceptionRetryArgs(http, http_request, e, retry,
File "/opt/py38/lib64/python3.8/site-packages/apache_beam/io/gcp/gcsio_overrides.py", line 45, in retry_func
return http_wrapper.HandleExceptionsAndRebuildHttpConnections(retry_args)
File "/opt/py38/lib64/python3.8/site-packages/apitools/base/py/http_wrapper.py", line 304, in HandleExceptionsAndRebuildHttpConnections
raise retry_args.exc
File "/opt/py38/lib64/python3.8/site-packages/apitools/base/py/http_wrapper.py", line 348, in MakeRequest
return _MakeRequestNoRetry(
File "/opt/py38/lib64/python3.8/site-packages/apitools/base/py/http_wrapper.py", line 397, in _MakeRequestNoRetry
info, content = http.request(
File "/opt/py38/lib64/python3.8/site-packages/google_auth_httplib2.py", line 209, in request
self.credentials.before_request(self._request, method, uri, request_headers)
File "/opt/py38/lib64/python3.8/site-packages/google/auth/credentials.py", line 134, in before_request
self.refresh(request)
File "/opt/py38/lib64/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 111, in refresh
self._retrieve_info(request)
File "/opt/py38/lib64/python3.8/site-packages/google/auth/compute_engine/credentials.py", line 87, in _retrieve_info
info = _metadata.get_service_account_info(
File "/opt/py38/lib64/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 234, in get_service_account_info
return get(request, path, params={"recursive": "true"})
File "/opt/py38/lib64/python3.8/site-packages/google/auth/compute_engine/_metadata.py", line 150, in get
response = request(url=url, method="GET", headers=_METADATA_HEADERS)
File "/opt/py38/lib64/python3.8/site-packages/google_auth_httplib2.py", line 119, in __call__
response, data = self.http.request(
File "/opt/py38/lib64/python3.8/site-packages/httplib2/__init__.py", line 1701, in request
(response, content) = self._request(
File "/opt/py38/lib64/python3.8/site-packages/httplib2/__init__.py", line 1421, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/opt/py38/lib64/python3.8/site-packages/httplib2/__init__.py", line 1343, in _conn_request
conn.connect()
File "/opt/py38/lib64/python3.8/site-packages/httplib2/__init__.py", line 1026, in connect
self.sock.connect((self.host, self.port) + sa[2:])
File "/opt/py38/lib64/python3.8/site-packages/httplib2/socks.py", line 504, in connect
self.__negotiatehttp(destpair[0], destpair[1])
File "/opt/py38/lib64/python3.8/site-packages/httplib2/socks.py", line 465, in __negotiatehttp
raise HTTPError((statuscode, statusline[2]))
My service account has these roles:
BigQuery Data Editor
BigQuery User
Dataflow Developer
Dataflow Worker
Service Account User
Storage Admin
The instance has the Cloud API access scope: Allow full access to all Cloud APIs.
What is the problem?

Based on the comment by @luca, the above error is solved by using an internal proxy that allows access to the internet. Add --no_use_public_ips to the command and set no_proxy="metadata.google.internal,www.googleapis.com,dataflow.googleapis.com,bigquery.googleapis.com".
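For reference, a minimal sketch of that setup before launching the pipeline (the proxy host and port are placeholders for your environment):
import os

# Hypothetical internal proxy; replace host and port with your own.
os.environ["http_proxy"] = "http://internal-proxy:3128"
os.environ["https_proxy"] = "http://internal-proxy:3128"
# Bypass the proxy for the metadata server and the Google APIs, which are
# reachable directly from inside the VPC.
os.environ["no_proxy"] = (
    "metadata.google.internal,www.googleapis.com,"
    "dataflow.googleapis.com,bigquery.googleapis.com"
)

# Then launch, e.g.:
#   python -m apache_beam.examples.wordcount \
#     --runner DataflowRunner --project <project> --region <region> \
#     --temp_location gs://<bucket>/tmp --output gs://<bucket>/output \
#     --no_use_public_ips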

Related

google.auth.exceptions.RefreshError: ('invalid_grant: Token has been expired or revoked.')

In one of my application files, I am running a query on BigQuery public tables.
import os

from google.cloud import bigquery
import pyarrow  # used by to_dataframe()

os.environ.setdefault("GCLOUD_PROJECT", "project-name")  # set before creating the client
client = bigquery.Client(project="project-name")

# metros is a list of metro-area names defined earlier in the file
sql = """
SELECT *
FROM `bigquery-public-data.geo_us_boundaries.cbsa`
WHERE name IN UNNEST(%s)
""" % (metros)
df_geom = client.query(sql).to_dataframe()
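As a side note on the query: BigQuery can take the metro names as an array query parameter instead of interpolating a Python list into the SQL text; a sketch reusing the same client and metros list:
sql = """
SELECT *
FROM `bigquery-public-data.geo_us_boundaries.cbsa`
WHERE name IN UNNEST(@metros)
"""
job_config = bigquery.QueryJobConfig(
    query_parameters=[bigquery.ArrayQueryParameter("metros", "STRING", metros)]
)
df_geom = client.query(sql, job_config=job_config).to_dataframe()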
When I run the application, I get a google.auth exception out of nowhere.
File "/Users/...tab1.py", line 72, in <module>
df_geom = client.query(sql).to_dataframe()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 3391, in query
future = do_query()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 3368, in do_query
query_job._begin(retry=retry, timeout=timeout)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/job/query.py", line 1249, in _begin
super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/job/base.py", line 509, in _begin
api_response = client._call_api(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 782, in _call_api
return call()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/api_core/retry.py", line 283, in retry_wrapped_func
return retry_target(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/api_core/retry.py", line 190, in retry_target
return target()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 469, in api_request
response = self._make_request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 333, in _make_request
return self._do_request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 371, in _do_request
return self.http.request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/auth/transport/requests.py", line 476, in request
self.credentials.before_request(auth_request, method, url, request_headers)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/auth/credentials.py", line 133, in before_request
self.refresh(request)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/credentials.py", line 302, in refresh
) = reauth.refresh_grant(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/reauth.py", line 347, in refresh_grant
_client._handle_error_response(response_data)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/_client.py", line 60, in _handle_error_response
raise exceptions.RefreshError(error_details, response_data)
google.auth.exceptions.RefreshError: ('invalid_grant: Token has been expired or revoked.', {'error': 'invalid_grant', 'error_description': 'Token has been expired or revoked.'})
I had no issues whatsoever with this for months. My application bundle includes a project-name-xxxx.json file from the Google Cloud Console. Every time I attempt to run the application, it fails.
I have looked through some of the other answers, which suggest that tokens expire after 7 days when the publishing status is Testing. The publishing status is Production in our case.
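One detail worth checking: the traceback goes through google/oauth2/credentials.py and reauth.refresh_grant, which is the end-user OAuth credential path, so the client may be picking up cached user credentials rather than the bundled JSON key. A minimal sketch, assuming project-name-xxxx.json is a service-account key, that loads it explicitly:
from google.cloud import bigquery
from google.oauth2 import service_account

# Assumption: project-name-xxxx.json is a service-account key file.
credentials = service_account.Credentials.from_service_account_file(
    "project-name-xxxx.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
client = bigquery.Client(project="project-name", credentials=credentials)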

Using PyDrive with proxy for authenticating from a server

I've been trying to authenticate from a server using PyDrive, going through a proxy, but I keep getting a 403 Forbidden error. I'm not sure whether my proxy configuration is correct, or whether this is even possible.
The error:
Traceback (most recent call last):
File "/Users/user/Desktop/Python/files/test_post.py", line 67, in <module>
file1.Upload(param={'supportsAllDrives': True, "http": gauth.http})
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pydrive/files.py", line 285, in Upload
self._FilesInsert(param=param)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pydrive/auth.py", line 75, in _decorated
return decoratee(self, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pydrive/files.py", line 368, in _FilesInsert
metadata = self.auth.service.files().insert(**param).execute(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 131, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/googleapiclient/http.py", line 901, in execute
_, body = self.next_chunk(http=http, num_retries=num_retries)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/googleapiclient/_helpers.py", line 131, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/googleapiclient/http.py", line 1006, in next_chunk
resp, content = _retry_request(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/googleapiclient/http.py", line 190, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/oauth2client/transport.py", line 173, in new_request
resp, content = request(orig_request_method, uri, method, body,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/oauth2client/transport.py", line 280, in request
return http_callable(uri, method=method, body=body, headers=headers,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/__init__.py", line 1701, in request
(response, content) = self._request(
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/__init__.py", line 1421, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/__init__.py", line 1343, in _conn_request
conn.connect()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/__init__.py", line 1133, in connect
sock.connect((self.host, self.port))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/socks.py", line 512, in connect
self.__negotiatehttp(destpair[0], destpair[1])
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/httplib2/socks.py", line 465, in __negotiatehttp
raise HTTPError((statuscode, statusline[2]))
httplib2.socks.HTTPError: (403, b'Forbidden')
I should note: the script works fine until it reaches the PyDrive file-upload call. I've tried passing http into the params and without it; neither works.
Here is the code for setting up the proxy:
import httplib2
from pydrive.auth import GoogleAuth

proxy_info = httplib2.ProxyInfo(proxy_type=httplib2.socks.PROXY_TYPE_HTTP_NO_TUNNEL,
                                proxy_host='myproxyhost',
                                proxy_port=8080)
print("Proxy info variable set")
gauth = GoogleAuth()
gauth.http = httplib2.Http(proxy_info=proxy_info)
print("gauth.http is set")
# Try to load saved client credentials
gauth.LoadCredentialsFile("mycreds.txt")
if gauth.credentials is None:
    # Authenticate if they're not there
    print("Gauth credentials is none conditional")
    gauth.GetFlow()
    gauth.flow.params.update({'access_type': 'offline'})
    gauth.flow.params.update({'approval_prompt': 'force'})
    gauth.CommandLineAuth()
elif gauth.access_token_expired:
    print("Gauth access token expired conditional.")
    # Refresh them if expired
    gauth.Refresh()
else:
    print("Gauth authorized conditional")
    # Initialize the saved creds
    gauth.CommandLineAuth()
    print("CommandLineAuth allowed!")
gauth.Authorize()
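Note that the traceback ends in __negotiatehttp, i.e. the proxy itself rejected the CONNECT request with 403, so the proxy's ACL or authentication is a likely suspect. httplib2 can pass proxy credentials, and a tunneling proxy type may fit better for HTTPS traffic; a sketch with hypothetical values:
import httplib2

# Hypothetical host and credentials; PROXY_TYPE_HTTP negotiates a CONNECT
# tunnel, which proxies normally require for HTTPS traffic.
proxy_info = httplib2.ProxyInfo(
    proxy_type=httplib2.socks.PROXY_TYPE_HTTP,
    proxy_host='myproxyhost',
    proxy_port=8080,
    proxy_user='user',      # omit both if the proxy is unauthenticated
    proxy_pass='password',
)
http = httplib2.Http(proxy_info=proxy_info)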

Google Cloud API: EOFError: Compressed file ended before the end-of-stream marker was reached

I have some really simple Python code that I'm using to do a GCP Google Vault export. However, about 4 out of every 5 runs raises an EOFError. The stack trace is:
Traceback (most recent call last):
File "src/initiate_job.py", line 57, in <module>
results = service.matters().list(pageSize=10).execute()
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/googleapiclient/http.py", line 901, in execute
headers=self.headers,
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/googleapiclient/http.py", line 177, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google_auth_httplib2.py", line 190, in request
self._request, method, uri, request_headers)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google/auth/credentials.py", line 133, in before_request
self.refresh(request)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google/oauth2/service_account.py", line 359, in refresh
access_token, expiry, _ = _client.jwt_grant(request, self._token_uri, assertion)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google/oauth2/_client.py", line 153, in jwt_grant
response_data = _token_endpoint_request(request, token_uri, body)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google/oauth2/_client.py", line 105, in _token_endpoint_request
response = request(method="POST", url=token_uri, headers=headers, body=body)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/google_auth_httplib2.py", line 117, in __call__
url, method=method, body=body, headers=headers, **kwargs)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/httplib2/__init__.py", line 1994, in request
cachekey,
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/httplib2/__init__.py", line 1651, in _request
conn, request_uri, method, body, headers
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/httplib2/__init__.py", line 1621, in _conn_request
content = _decompressContent(response, content)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/site-packages/httplib2/__init__.py", line 460, in _decompressContent
content = gzip.GzipFile(fileobj=io.BytesIO(new_content)).read()
File "/Users/ipq500/opt/miniconda3/lib/python3.7/gzip.py", line 276, in read
return self._buffer.read(size)
File "/Users/ipq500/opt/miniconda3/lib/python3.7/gzip.py", line 482, in read
raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
The code that throws this is just:
service = build('vault', 'v1', credentials=delegated_credentials)
results = service.matters().list(pageSize=10).execute()
I tried to add some defensive programming, simply retrying anything that throws this error, but then I started hitting rate limits. So I really have to debug why I'm getting this error.
Any and all help would be appreciated!
Based on this question and this conversation, it seems that the response being read is damaged, so the end-of-file error is expected. In this case I suggest verifying that the credentials exist (take the sample as a reference) and double-checking them before performing any task. Likewise, double-check the file itself, since it is possible that writing it has not finished, or that it is still being compressed or decompressed.
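If retries remain necessary, exponential backoff keeps them under the rate limit; a minimal sketch around the same call (execute_with_backoff is a hypothetical helper, not part of the client library):
import random
import time

def execute_with_backoff(request, max_attempts=5):
    # Retry on EOFError, sleeping 1s, 2s, 4s, ... plus jitter, instead of
    # retrying immediately and burning through the quota.
    for attempt in range(max_attempts):
        try:
            return request.execute()
        except EOFError:
            if attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt + random.random())

results = execute_with_backoff(service.matters().list(pageSize=10))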

GCS broken pipe error when trying to upload large files

I'm trying to upload a .csv.gz file to GCS after extracting it to .csv; the file size grows from 500 MB to around 5 GB. I can extract the .csv.gz file to a temporary path, but the upload of that file to GCS fails with the following error:
[2019-11-11 13:59:58,180] {models.py:1796} ERROR - [Errno 32] Broken pipe
Traceback (most recent call last):
File "/usr/local/lib/airflow/airflow/models.py", line 1664, in _run_raw_task
result = task_copy.execute(context=context)
File "/home/airflow/gcs/dags/operators/s3_to_gcs_transform_operator.py", line 220, in execute
gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, target_file, gzip=True)
File "/home/airflow/gcs/dags/hooks/gcs_hook_conn.py", line 208, in upload
.insert(bucket=bucket, name=object, media_body=media)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 835, in execute
method=str(self.method), body=self.body, headers=self.headers)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 179, in _retry_request
raise exception
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 162, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/opt/python3.6/lib/python3.6/site-packages/google_auth_httplib2.py", line 198, in request
uri, method, body=body, headers=request_headers, **kwargs)
File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 155, in new_request
redirections, connection_type)
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1924, in request
cachekey)
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1595, in _request
conn, request_uri, method, body, headers)
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1502, in _conn_request
conn.request(method, request_uri, body, headers)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1065, in _send_output
self.send(chunk)
File "/opt/python3.6/lib/python3.6/http/client.py", line 986, in send
self.sock.sendall(data)
File "/opt/python3.6/lib/python3.6/ssl.py", line 975, in sendall
v = self.send(byte_view[count:])
File "/opt/python3.6/lib/python3.6/ssl.py", line 944, in send
return self._sslobj.write(data)
File "/opt/python3.6/lib/python3.6/ssl.py", line 642, in write
return self._sslobj.write(data)
BrokenPipeError: [Errno 32] Broken pipe
From what I understand, the error could be due to the following:
Your server process has received a SIGPIPE writing to a socket. This
usually happens when you write to a socket fully closed on the other
(client) side. This might be happening when a client program doesn't
wait till all the data from the server is received and simply closes a
socket (using close function).
But I have no idea whether this is the issue or how I can fix this. Can someone help?
You should try to upload big files in chunks.
from google.cloud import storage

# Upload in 128 MB chunks; the chunk size must be a multiple of 256 KB.
CHUNK_SIZE = 128 * 1024 * 1024

client = storage.Client()
bucket = client.bucket('destination')
blob = bucket.blob('really-big-blob', chunk_size=CHUNK_SIZE)
blob.upload_from_filename('/path/to/really-big-file')
You can also check Parallel Composite Uploads.
A similar SO question: link.

Python dataflow job fails to submit

We have a Kubernetes cron job on GCP that submits several copies of the same Python Dataflow job, each in its own container. Whenever we need a new copy of the job, we just add it to the spec->jobTemplate->spec->template->spec->containers part of the cron job YAML and adjust the Dataflow job parameters as needed. This usually works fine, but the latest copy we tried to add does not. The existing copies all still work as expected. The job seems to fail on submission to GCP, and the error message is not very helpful:
Traceback (most recent call last):
File "/app/job.py", line 117, in <module>
newness.pipeline.run_dataflow(sys.argv)
File "/app/newness/pipeline.py", line 480, in run_dataflow
result = pipe.run()
File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 403, in run
self.to_runner_api(), self.runner, self._options).run(False)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/pipeline.py", line 416, in run
return self.runner.run_pipeline(self)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 389, in run_pipeline
self.dataflow_client.create_job(self.job), self)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/utils/retry.py", line 184, in wrapper
return fun(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 504, in create_job
return self.submit_job_description(job)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/utils/retry.py", line 184, in wrapper
return fun(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/apiclient.py", line 551, in submit_job_description
response = self._client.projects_locations_jobs.Create(request)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/runners/dataflow/internal/clients/dataflow/dataflow_v1b3_client.py", line 578, in Create
config, request, global_params=global_params)
File "/usr/local/lib/python2.7/dist-packages/apitools/base/py/base_api.py", line 731, in _RunMethod
return self.ProcessHttpResponse(method_config, http_response, request)
File "/usr/local/lib/python2.7/dist-packages/apitools/base/py/base_api.py", line 737, in ProcessHttpResponse
self.__ProcessHttpResponse(method_config, http_response, request))
File "/usr/local/lib/python2.7/dist-packages/apitools/base/py/base_api.py", line 604, in __ProcessHttpResponse
http_response, method_config=method_config, request=request)
apitools.base.py.exceptions.HttpError: <exception str() failed>
The job does not appear in the dataflow console at all.
The previous lines of the container logs look like:
2019-10-13T03:57:47.725542287Z Successfully downloaded apache-beam
2019-10-13T03:58:17.125601280Z INFO:root:Staging SDK sources from PyPI to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/dataflow_python_sdk.tar
2019-10-13T03:58:17.324843623Z INFO:root:Starting GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/dataflow_python_sdk.tar...
2019-10-13T03:58:24.825657227Z INFO:root:Completed GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/dataflow_python_sdk.tar
2019-10-13T03:58:25.225646529Z INFO:root:Downloading binary distribtution of the SDK from PyPi
2019-10-13T03:58:25.225716554Z INFO:root:Executing command: ['/usr/bin/python', '-m', 'pip', 'download', '--dest', '/tmp/tmpk5TfMS', 'apache-beam==2.8.0', '--no-deps', '--only-binary', ':all:', '--python-version', '27', '--implementation', 'cp', '--abi', 'cp27mu', '--platform', 'manylinux1_x86_64']
2019-10-13T03:59:33.926186906Z Collecting apache-beam==2.8.0
2019-10-13T03:59:52.125678183Z Using cached https://files.pythonhosted.org/packages/0f/63/ea5453ba656d060936acf41d2ec057f23aafd69649e2129ac66fdda67d48/apache_beam-2.8.0-cp27-cp27mu-manylinux1_x86_64.whl
2019-10-13T04:00:11.525435891Z Saved /tmp/tmpk5TfMS/apache_beam-2.8.0-cp27-cp27mu-manylinux1_x86_64.whl
2019-10-13T04:00:12.025054706Z Successfully downloaded apache-beam
2019-10-13T04:00:26.726190542Z INFO:root:Staging binary distribution of the SDK from PyPI to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/apache_beam-2.8.0-cp27-cp27mu-manylinux1_x86_64.whl
2019-10-13T04:00:26.825618945Z INFO:root:Starting GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/apache_beam-2.8.0-cp27-cp27mu-manylinux1_x86_64.whl...
2019-10-13T04:00:33.725522899Z INFO:root:Completed GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1570936519.827087/apache_beam-2.8.0-cp27-cp27mu-manylinux1_x86_64.whl
2019-10-13T04:06:14.525017097Z Traceback (most recent call last):
...
Why is this job failing to submit? Are there any other logs we can look at to see the cause of this failure?
(Most of our dataflow jobs are written in Java, where the error messages are usually more helpful.)
UPDATE: Running the job locally (on Windows) with apache-beam 2.16 has the same issue but more logging detail:
...
INFO:root:Starting GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1571606418.971000/apache_beam-2.16.0-cp27-cp27mu-manylinux1_x86_64.whl...
INFO:root:Completed GCS upload to gs://gcs-bucket-name/staging/newness-boosting-c2898.1571606418.971000/apache_beam-2.16.0-cp27-cp27mu-manylinux1_x86_64.whl in 3 seconds.
WARNING:root:Discarding unparseable args: ['job.py', '--days_history=30']
WARNING:root:Discarding unparseable args: ['job.py', '--days_history=30']
WARNING:root:Retry with exponential backoff: waiting for 2.64795143823 seconds before retrying submit_job_description because we caught exception: error: [Errno 10053] An established connection was aborted by the software in your host machine
Traceback for above exception (most recent call last):
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 206, in wrapper
return fun(*args, **kwargs)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 593, in submit_job_description
response = self._client.projects_locations_jobs.Create(request)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\clients\dataflow\dataflow_v1b3_client.py", line 657, in Create
config, request, global_params=global_params)
File "C:\Python27\lib\site-packages\apitools\base\py\base_api.py", line 729, in _RunMethod
http, http_request, **opts)
File "C:\Python27\lib\site-packages\apitools\base\py\http_wrapper.py", line 346, in MakeRequest
check_response_func=check_response_func)
File "C:\Python27\lib\site-packages\apitools\base\py\http_wrapper.py", line 396, in _MakeRequestNoRetry
redirections=redirections, connection_type=connection_type)
File "C:\Python27\lib\site-packages\oauth2client\transport.py", line 169, in new_request
redirections, connection_type)
File "C:\Python27\lib\site-packages\oauth2client\transport.py", line 169, in new_request
redirections, connection_type)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1694, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1434, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1390, in _conn_request
response = conn.getresponse()
File "C:\Python27\lib\httplib.py", line 1121, in getresponse
response.begin()
File "C:\Python27\lib\httplib.py", line 438, in begin
version, status, reason = self._read_status()
File "C:\Python27\lib\httplib.py", line 394, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "C:\Python27\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
File "C:\Python27\lib\ssl.py", line 754, in recv
return self.read(buflen)
File "C:\Python27\lib\ssl.py", line 641, in read
v = self._sslobj.read(len)
... retries 4 times total ...
Traceback (most recent call last):
File "job.py", line 117, in <module>
newness.pipeline.run_dataflow(sys.argv)
File "C:\Users\LeeW\Desktop\newness\newness\pipeline.py", line 480, in run_dataflow
result = pipe.run()
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 407, in run
self._options).run(False)
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 420, in run
return self.runner.run_pipeline(self, self._options)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 485, in run_pipeline
self.dataflow_client.create_job(self.job), self)
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 206, in wrapper
return fun(*args, **kwargs)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 546, in create_job
return self.submit_job_description(job)
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 219, in wrapper
raise_with_traceback(exn, exn_traceback)
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 206, in wrapper
return fun(*args, **kwargs)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 593, in submit_job_description
response = self._client.projects_locations_jobs.Create(request)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\clients\dataflow\dataflow_v1b3_client.py", line 657, in Create
config, request, global_params=global_params)
File "C:\Python27\lib\site-packages\apitools\base\py\base_api.py", line 729, in _RunMethod
http, http_request, **opts)
File "C:\Python27\lib\site-packages\apitools\base\py\http_wrapper.py", line 346, in MakeRequest
check_response_func=check_response_func)
File "C:\Python27\lib\site-packages\apitools\base\py\http_wrapper.py", line 396, in _MakeRequestNoRetry
redirections=redirections, connection_type=connection_type)
File "C:\Python27\lib\site-packages\oauth2client\transport.py", line 169, in new_request
redirections, connection_type)
File "C:\Python27\lib\site-packages\oauth2client\transport.py", line 169, in new_request
redirections, connection_type)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1694, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1434, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1390, in _conn_request
response = conn.getresponse()
File "C:\Python27\lib\httplib.py", line 1121, in getresponse
response.begin()
File "C:\Python27\lib\httplib.py", line 438, in begin
version, status, reason = self._read_status()
File "C:\Python27\lib\httplib.py", line 394, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "C:\Python27\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
File "C:\Python27\lib\ssl.py", line 754, in recv
return self.read(buflen)
File "C:\Python27\lib\ssl.py", line 641, in read
v = self._sslobj.read(len)
socket.error: [Errno 10053] An established connection was aborted by the software in your host machine
Which version of the Beam Python SDK are you using?
