I have a script that takes a Postgres backup at regular intervals and uploads it to S3. The script was written more than two years ago and had been working perfectly on a Linux server (Ubuntu 14.04 LTS, GNU/Linux 3.13.0-24-generic x86_64). For the last few days, however, the backup has not been uploaded to S3 and the script has been failing, so I looked at the log and found the error below:
uploading test_db.backup.tar.gz to Amazon S3...............
Traceback (most recent call last):
File "/user/projects/test_folder/test/manage.py", line 9, in <module>
execute_from_command_line(sys.argv)
File "/user/projects_envs/testlocal/lib/python2.7/site-packages/django/core/management/__init__.py", line 399, in execute_from_command_line
utility.execute()
File "/user/projects_envs/testlocal/lib/python2.7/site-packages/django/core/management/__init__.py", line 392, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/user/projects_envs/testlocal/lib/python2.7/site-packages/django/core/management/base.py", line 242, in run_from_argv
self.execute(*args, **options.__dict__)
File "/user/projects_envs/testlocal/lib/python2.7/site-packages/django/core/management/base.py", line 285, in execute
output = self.handle(*args, **options)
File "/user/projects_envs/testlocal/lib/python2.7/site-packages/django/core/management/base.py", line 415, in handle
return self.handle_noargs(**options)
File "/user/projects/test_folder/test/core/management/commands/db_backup.py", line 41, in handle_noargs
self.upload_to_s3(file_name, file_path)
File "/user/projects/test_folder/test/core/management/commands/db_backup.py", line 64, in upload_to_s3
response = conn.put(settings.BACKUP_BUCKET_NAME, file_name, S3.S3Object(tardata))
File "/user/projects/test_folder/test/storage/S3.py", line 192, in put
object.metadata))
File "/user/projects/test_folder/test/storage/S3.py", line 276, in _make_request
connection.request(method, path, data, final_headers)
File "/usr/lib/python2.7/httplib.py", line 973, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1007, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 969, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 829, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 805, in send
self.sock.sendall(data)
File "/usr/lib/python2.7/ssl.py", line 329, in sendall
v = self.send(data[count:])
File "/usr/lib/python2.7/ssl.py", line 298, in send
v = self._sslobj.write(data)
socket.error: [Errno 104] Connection reset by peer
Code
from django.conf import settings

from storage import S3

def upload_to_s3(self, file_name, file_path):
    print "uploading " + file_name + '.tar.gz' + " to Amazon S3..............."
    conn = S3.AWSAuthConnection(settings.AWS_ACCESS_KEY_ID, settings.AWS_SECRET_ACCESS_KEY)
    # get all buckets from Amazon S3
    response = conn.list_all_my_buckets()
    buckets = response.entries
    # check whether the specified bucket already exists
    flag = False
    for bucket in buckets:
        if bucket.name == settings.BACKUP_BUCKET_NAME:
            flag = True
    # if there is no bucket with that name
    if not flag:
        print "There is no bucket named " + settings.BACKUP_BUCKET_NAME + " in your Amazon S3 account"
        print "Error: please enter an appropriate bucket name and re-run the script"
        return
    # upload the file to Amazon S3
    tardata = open(file_path + '.tar.gz', "rb").read()
    response = conn.put(settings.BACKUP_BUCKET_NAME, file_name, S3.S3Object(tardata))
...............
...............
So what is wrong with the S3 package? Why did it suddenly stop working? Is this really related to S3, or to something about the Linux packages on the server?
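A "Connection reset by peer" during the TLS write often points to the endpoint refusing the TLS versions or request signatures the old client sends, rather than anything on the Linux side; one way to isolate the aging bundled storage/S3.py module is to try the same upload with boto3, which handles signing and TLS itself. A minimal sketch, assuming boto3 can be installed in the Python 2.7 virtualenv and reusing the settings names from the snippet above:

# Hypothetical replacement for upload_to_s3() using boto3 instead of storage/S3.py.
# Assumes: pip install boto3, and the same Django settings as in the original snippet.
import boto3
from botocore.exceptions import ClientError
from django.conf import settings

def upload_to_s3(file_name, file_path):
    s3 = boto3.client(
        's3',
        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
    )
    try:
        # upload_file streams the archive from disk instead of reading it fully into memory
        s3.upload_file(file_path + '.tar.gz', settings.BACKUP_BUCKET_NAME, file_name)
    except ClientError as exc:
        print("Upload of %s failed: %s" % (file_name, exc))

If this plain boto3 upload succeeds from the same server, the problem is in the old bundled client rather than in the network or in S3 itself.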
Related
My goal is to read a file from HDFS in Airflow and do further manipulations.
After researching, I found that the URL I need to use is as follows:
df = pd.read_parquet('http://localhost:9870/webhdfs/v1/hadoop_files/sample_2022_01.parquet?op=OPEN'),
where localhost / 172.20.80.1 / computer-name.mshome.net can be used interchangeably,
9870 - namenode port,
hadoop_files/sample_2022_01.parquet - my folder and file created in the root.
I can access and read the file locally in PyCharm, but I am unable to get the same result inside Airflow in Docker. I tried using both a local HDFS and an HDFS hosted in Docker, and changing the host to host.docker.internal, but I keep getting the same error.
Stack trace:
[2022-06-12, 17:52:45 UTC] {taskinstance.py:1889} ERROR - Task failed with exception
Traceback (most recent call last):
File "/usr/local/lib/python3.7/urllib/request.py", line 1350, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/usr/local/lib/python3.7/http/client.py", line 1281, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1327, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1276, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1036, in _send_output
self.send(msg)
File "/usr/local/lib/python3.7/http/client.py", line 976, in send
self.connect()
File "/usr/local/lib/python3.7/http/client.py", line 948, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/local/lib/python3.7/socket.py", line 728, in create_connection
raise err
File "/usr/local/lib/python3.7/socket.py", line 716, in create_connection
sock.connect(sa)
OSError: [Errno 113] No route to host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/operators/python.py", line 207, in execute
branch = super().execute(context)
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/operators/python.py", line 171, in execute
return_value = self.execute_callable()
File "/home/airflow/.local/lib/python3.7/site-packages/airflow/operators/python.py", line 189, in execute_callable
return self.python_callable(*self.op_args, **self.op_kwargs)
File "/opt/airflow/dags/includes/parquet_dag/main.py", line 15, in main
df_parquet = read('hdfs://localhost:9000/hadoop_files/sample_2022_01.parquet')
File "/opt/airflow/dags/includes/parquet_dag/utils.py", line 29, in read
df = pd.read_parquet('http://172.20.80.1:9870/webhdfs/v1/hadoop_files/sample_2022_01.parquet?op=OPEN')
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/parquet.py", line 500, in read_parquet
**kwargs,
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/parquet.py", line 236, in read
mode="rb",
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/parquet.py", line 102, in _get_path_or_handle
path_or_handle, mode, is_text=False, storage_options=storage_options
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/common.py", line 614, in get_handle
storage_options=storage_options,
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/common.py", line 312, in _get_filepath_or_buffer
with urlopen(req_info) as req:
File "/home/airflow/.local/lib/python3.7/site-packages/pandas/io/common.py", line 212, in urlopen
return urllib.request.urlopen(*args, **kwargs)
File "/usr/local/lib/python3.7/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/lib/python3.7/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/usr/local/lib/python3.7/urllib/request.py", line 543, in _open
'_open', req)
File "/usr/local/lib/python3.7/urllib/request.py", line 503, in _call_chain
result = func(*args)
File "/usr/local/lib/python3.7/urllib/request.py", line 1378, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/local/lib/python3.7/urllib/request.py", line 1352, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 113] No route to host>
With host.docker.internal:
urllib.error.URLError: <urlopen error [Errno 99] Cannot assign requested address>
You need to use an address that is routable from inside the Airflow Docker container.
If Hadoop is inside a Docker container as well, check its IP address using docker inspect CONTAINER (doc). If Hadoop is on localhost, you can set network_mode: "host" (doc).
Also, there is an important caveat if you are on macOS with the Docker Desktop app, which is basically a virtual machine; in that case you need some extra settings, check this, for example.
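For illustration, if HDFS really does run directly on the host, the whole routing problem can be sidestepped on Linux by putting the Airflow containers on the host network; a hypothetical docker-compose fragment (the service name airflow-worker is a placeholder, not something from the question):

# docker-compose.override.yml (sketch) - Linux only
services:
  airflow-worker:
    network_mode: "host"   # the container shares the host's network stack, so localhost:9870 resolves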
where localhost / 172.20.80.1 / computer-name.mshome.net can be used interchangeably,
They shouldn't be interchangeable inside the Docker network.
From Airflow, you could use Docker service names, not IP addresses, and ensure the containers are in the same bridge network (not host mode, which only works on Linux). host.docker.internal isn't correct either, since you're trying to reach another container, not your host.
https://docs.docker.com/network/bridge/
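For example, if the namenode is defined as a compose service called namenode on the same bridge network as the Airflow containers (both the service name and the network setup are assumptions, not taken from the question), the read could look like this:

# Sketch: Airflow and Hadoop containers attached to the same user-defined bridge network.
# "namenode" is a hypothetical compose service name for the HDFS namenode.
import pandas as pd

WEBHDFS_URL = (
    "http://namenode:9870/webhdfs/v1/hadoop_files/sample_2022_01.parquet?op=OPEN"
)

# Docker's embedded DNS resolves the service name. Note that WebHDFS redirects the
# actual read to a datanode, so the datanode's hostname must be resolvable too.
df = pd.read_parquet(WEBHDFS_URL)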
I'd also recommend using the Airflow Spark operators to actually read Parquet from HDFS, using Spark rather than Pandas or WebHDFS. You can convert Spark dataframes to Pandas if needed.
In one of my application files, I am running a query on BigQuery public tables.
import os

from google.cloud import bigquery
import pyarrow

# set the project before the client is created so the setting actually takes effect
os.environ.setdefault("GCLOUD_PROJECT", "project-name")

client = bigquery.Client(project="project-name")

sql = """
SELECT *
FROM `bigquery-public-data.geo_us_boundaries.cbsa`
WHERE name IN UNNEST(%s)
""" % (metros)

df_geom = client.query(sql).to_dataframe()
When I run the application, I am getting a google.auth exception out of nowhere.
File "/Users/...tab1.py", line 72, in <module>
df_geom = client.query(sql).to_dataframe()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 3391, in query
future = do_query()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 3368, in do_query
query_job._begin(retry=retry, timeout=timeout)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/job/query.py", line 1249, in _begin
super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/job/base.py", line 509, in _begin
api_response = client._call_api(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/bigquery/client.py", line 782, in _call_api
return call()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/api_core/retry.py", line 283, in retry_wrapped_func
return retry_target(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/api_core/retry.py", line 190, in retry_target
return target()
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 469, in api_request
response = self._make_request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 333, in _make_request
return self._do_request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/cloud/_http/__init__.py", line 371, in _do_request
return self.http.request(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/auth/transport/requests.py", line 476, in request
self.credentials.before_request(auth_request, method, url, request_headers)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/auth/credentials.py", line 133, in before_request
self.refresh(request)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/credentials.py", line 302, in refresh
) = reauth.refresh_grant(
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/reauth.py", line 347, in refresh_grant
_client._handle_error_response(response_data)
File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/google/oauth2/_client.py", line 60, in _handle_error_response
raise exceptions.RefreshError(error_details, response_data)
google.auth.exceptions.RefreshError: ('invalid_grant: Token has been expired or revoked.', {'error': 'invalid_grant', 'error_description': 'Token has been expired or revoked.'})
I had no issues whatsoever with this for months. My application bundle includes a project-name-xxxx.json file from Google Cloud Console. Every time I attempt to run the application, it fails.
I have looked through some of the other answers, which suggest that tokens expire after 7 days when the publishing status is Testing. The publishing status is Production in our case.
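If the bundled project-name-xxxx.json is a service account key, one way to take the expiring user token out of the equation is to build the client from that key explicitly rather than relying on cached gcloud user credentials. A sketch, with the key path as a placeholder (if the file is actually an OAuth client secret rather than a service account key, you would instead need to re-run the authorization flow):

from google.cloud import bigquery

# Hypothetical path to the service account key bundled with the application.
KEY_PATH = "/path/to/project-name-xxxx.json"

# Credentials come from the key file, not from the cached gcloud user token,
# so an expired or revoked refresh token no longer breaks the query.
client = bigquery.Client.from_service_account_json(KEY_PATH, project="project-name")

df_geom = client.query(sql).to_dataframe()  # `sql` as built in the snippet above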
I have the bucket URL, the bucket name, and a list of object keys, and I need to download a file.
What I have tried:
import boto3
import botocore

BUCKET_NAME = 'my-bucket'
KEY = 'my_file'

s3 = boto3.resource('s3')

try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_file')
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        raise
And the error that was caught:
Traceback (most recent call last):
File "D:\me\work\Mackpaw Data Engeneering test tsk\main.py", line 10, in <module>
s3.Bucket(BUCKET_NAME).download_file(KEY, 'data.json')
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\boto3\s3\inject.py", line 244, in bucket_download_file
return self.meta.client.download_file(
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\boto3\s3\inject.py", line 170, in download_file
return transfer.download_file(
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\boto3\s3\transfer.py", line 307, in download_file
future.result()
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\s3transfer\futures.py", line 106, in result
return self._coordinator.result()
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\s3transfer\futures.py", line 265, in result
raise self._exception
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\s3transfer\tasks.py", line 255, in _main
self._submit(transfer_future=transfer_future, **kwargs)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\s3transfer\download.py", line 340, in _submit
response = client.head_object(
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\client.py", line 357, in _api_call
return self._make_api_call(operation_name, kwargs)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\client.py", line 662, in _make_api_call
http, parsed_response = self._make_request(
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\client.py", line 682, in _make_request
return self._endpoint.make_request(operation_model, request_dict)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\endpoint.py", line 102, in make_request
return self._send_request(request_dict, operation_model)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\endpoint.py", line 132, in _send_request
request = self.create_request(request_dict, operation_model)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\endpoint.py", line 115, in create_request
self._event_emitter.emit(event_name, request=request,
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\hooks.py", line 356, in emit
return self._emitter.emit(aliased_event_name, **kwargs)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\hooks.py", line 228, in emit
return self._emit(event_name, kwargs)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\hooks.py", line 211, in _emit
response = handler(**kwargs)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\signers.py", line 90, in handler
return self.sign(operation_name, request)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\signers.py", line 162, in sign
auth.add_auth(request)
File "C:\Users\Katherine\AppData\Local\Programs\Python\Python39\lib\site-packages\botocore\auth.py", line 373, in add_auth
raise NoCredentialsError()
botocore.exceptions.NoCredentialsError: Unable to locate credentials
It seems I'm trying to download the file as if the bucket were private, so it expects credentials to be configured. But this bucket is public, and I haven't found information on how to access the file in that situation.
If the bucket and its objects are public, you can just use the object's public URL to retrieve the required object. The public URL has the following format:
https://<bucket_name>.s3.<region>.amazonaws.com/<object_key>
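For example (the region and names below are placeholders; use whichever region the bucket actually lives in):

import urllib.request

# Public object URL following the pattern above.
url = "https://my-bucket.s3.us-east-1.amazonaws.com/my_file"

# A plain HTTP GET - no AWS credentials are involved for a public object.
urllib.request.urlretrieve(url, "my_local_file")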
I guess you have not set up the AWS credentials (secret and access key). You can follow the link below to do so:
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
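Alternatively, if the bucket really is public and you want to keep the boto3 code from the question without configuring credentials at all, you can send unsigned requests; a sketch using botocore's UNSIGNED signature setting:

import boto3
from botocore import UNSIGNED
from botocore.config import Config

BUCKET_NAME = 'my-bucket'
KEY = 'my_file'

# Unsigned requests skip the credential lookup entirely, which is fine for public buckets.
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_file')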
I'm trying to upload a .csv.gz file to GCS after extracting it to .csv; the file size grows from 500 MB to around 5 GB. I'm able to extract the .csv.gz file to a temporary path, but it fails when I try to upload that file to GCS. I get the following error:
[2019-11-11 13:59:58,180] {models.py:1796} ERROR - [Errno 32] Broken pipe
Traceback (most recent call last):
File "/usr/local/lib/airflow/airflow/models.py", line 1664, in _run_raw_task
result = task_copy.execute(context=context)
File "/home/airflow/gcs/dags/operators/s3_to_gcs_transform_operator.py", line 220, in execute
gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, target_file, gzip=True)
File "/home/airflow/gcs/dags/hooks/gcs_hook_conn.py", line 208, in upload
.insert(bucket=bucket, name=object, media_body=media)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 835, in execute
method=str(self.method), body=self.body, headers=self.headers)
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 179, in _retry_request
raise exception
File "/opt/python3.6/lib/python3.6/site-packages/googleapiclient/http.py", line 162, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/opt/python3.6/lib/python3.6/site-packages/google_auth_httplib2.py", line 198, in request
uri, method, body=body, headers=request_headers, **kwargs)
File "/usr/local/lib/airflow/airflow/contrib/hooks/gcp_api_base_hook.py", line 155, in new_request
redirections, connection_type)
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1924, in request
cachekey,
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1595, in _request
conn, request_uri, method, body, headers
File "/opt/python3.6/lib/python3.6/site-packages/httplib2/__init__.py", line 1502, in _conn_request
conn.request(method, request_uri, body, headers)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/opt/python3.6/lib/python3.6/http/client.py", line 1065, in _send_output
self.send(chunk)
File "/opt/python3.6/lib/python3.6/http/client.py", line 986, in send
self.sock.sendall(data)
File "/opt/python3.6/lib/python3.6/ssl.py", line 975, in sendall
v = self.send(byte_view[count:])
File "/opt/python3.6/lib/python3.6/ssl.py", line 944, in send
return self._sslobj.write(data)
File "/opt/python3.6/lib/python3.6/ssl.py", line 642, in write
return self._sslobj.write(data)
BrokenPipeError: [Errno 32] Broken pipe
From what I understood, the error could be due to the following:
Your server process has received a SIGPIPE writing to a socket. This
usually happens when you write to a socket fully closed on the other
(client) side. This might be happening when a client program doesn't
wait till all the data from the server is received and simply closes a
socket (using close function).
But I have no idea whether this is the issue or how I can fix this. Can someone help?
You should try to upload big files in chunks.
from google.cloud import storage
CHUNK_SIZE = 128 * 1024 * 1024
client = storage.Client()
bucket = client.bucket('destination')
blob = bucket.blob('really-big-blob', chunk_size=CHUNK_SIZE)
blob.upload_from_filename('/path/to/really-big-file')
Also, you can check Parallel Composite Uploads.
Similar SO question link.
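The parallel composite uploads mentioned above are a gsutil feature rather than part of the Python client; as a rough example, they can be enabled per command with a size threshold of your choosing (the 150M threshold and paths here are arbitrary):

gsutil -o "GSUtil:parallel_composite_upload_threshold=150M" cp /path/to/really-big-file gs://destination/really-big-blob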
Currently I am using a dynamic backend instance for my site, but I still get a timeout issue. The background of my site is:
GAE is used as the frontend, which handles user requests.
Requests are sent to an EC2 REST server (using Bottle) for processing.
Below is the error information from the GAE log. It seems like GAE stopped my backend thread after 60 seconds. I appreciate any suggestions.
ERROR 2014-02-20 14:42:18,365 module.py:660] Request to '/_ah/background' failed
Traceback (most recent call last):
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\module.py", line 637, in _handle_request
environ, wrapped_start_response)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\request_rewriter.py", line 311, in _rewriter_middleware
response_body = iter(application(environ, wrapped_start_response))
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\module.py", line 1524, in _handle_script_request
request_type)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\module.py", line 1476, in _handle_instance_request
request_id, request_type)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\instance.py", line 382, in handle
request_type))
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\tools\devappserver2\http_runtime.py", line 244, in handle
response = connection.getresponse()
File "C:\Python27\Lib\httplib.py", line 1045, in getresponse
response.begin()
File "C:\Python27\Lib\httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "C:\Python27\Lib\httplib.py", line 365, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "C:\Python27\Lib\socket.py", line 476, in readline
data = self._sock.recv(self._rbufsize)
error: [Errno 10054] An existing connection was forcibly closed by the remote host
app.yaml
- url: /backend.html
  script: przm_batchmodel.py
backends.yaml
backends:
- name: mybackend
  class: B1
  instances: 1
  options: dynamic
Output.py
def loop_html(thefile):
    ####do some thing###
    response = urlfetch.fetch(url=my_REST_server_URL, payload=data, method=urlfetch.POST, headers=http_headers, deadline=6000)
    return response

class przmBatchOutputPage(webapp.RequestHandler):
    def post(self):
        form = cgi.FieldStorage()
        thefile = form['file-0']
        html = background_thread.BackgroundThread(target=loop_html, args=[thefile, generate_batch_jid()])
        html.start()
        self.response.out.write(html)

app = webapp.WSGIApplication([('/.*', przmBatchOutputPage)], debug=True)