PUT Data on HDFS via HTTP WEB API - python

I'm trying to implement a PUT request on HDFS via the HDFS Web API.
So I looked up the Documentation on how to do that : https://hadoop.apache.org/docs/r1.0.4/webhdfs.html#CREATE
First do a PUT without redirect, to get a 307, catch the new URL and then PUT on that URL with DATA.
When I do my first PUT, I do get the 307, but the URL is the same as the first one. So I'm not sure if I'm already on the "good" datanode or not.
Anyway, I take this URL and try to PUT the data to it, but I get an error. From what I understand, it is a connection error: the host is cutting the connection.
class HttpFS:
    """Small HttpFS/WebHDFS client that authenticates with Kerberos."""

    def __init__(self, url=settings.HTTPFS_URL):
        """Remember the base endpoint and build the Kerberos auth handler.

        Args:
            url: Base URL that HDFS paths are appended to.
        """
        self.httpfs = url
        self.auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)

    def put(self, local_file, hdfs_dir):
        """Upload ``local_file`` to ``hdfs_dir`` with the two-step CREATE flow.

        The first PUT carries no body and does not follow redirects; it is
        only used to read the ``Location`` header. The second PUT streams the
        file content to that location.

        Args:
            local_file: Path of the local file to upload.
            hdfs_dir: Destination path, appended verbatim to the base URL.

        Raises:
            KeyError: If the first response has no ``Location`` header.
            requests.exceptions.RequestException: On any transport failure.
        """
        url = "{}{}".format(self.httpfs, hdfs_dir)
        params = {"op": "CREATE", "overwrite": True}
        print(url)
        # Step 1: ask where to send the data; the redirect must not be followed.
        r = requests.put(url, auth=self.auth, params=params, stream=True, verify=settings.CA_ROOT_PATH, allow_redirects=False)
        # Step 2: stream the file to the returned URL. The context manager
        # guarantees the handle is closed even when the upload fails.
        # NOTE(review): HttpFS reportedly expects data uploads to carry
        # "Content-Type: application/octet-stream" -- confirm against the
        # server's documentation if the connection is dropped here.
        with open(local_file, 'rb') as payload:
            r = requests.put(r.headers['Location'], auth=self.auth, data=payload, params=params, stream=True, verify=settings.CA_ROOT_PATH)
Here is the error given:
r = requests.put(r.headers['Location'], auth=self.auth, data=open(local_file, 'rb'), params=params, stream=True, verify=settings.CA_ROOT_PATH)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/api.py", line 134, in put
return request('put', url, data=data, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BrokenPipeError(32, 'Broken pipe'))
Edit 1:
I also tried the https://github.com/pywebhdfs/pywebhdfs library, since it is supposed to do exactly what I'm looking for. But I still get this Broken Pipe error.
from requests_kerberos import OPTIONAL, HTTPKerberosAuth
from pywebhdfs.webhdfs import PyWebHdfsClient
from utils import settings
auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
url = f"https://{settings.HDFS_HTTPFS_HOST}:{settings.HDFS_HTTPFS_PORT}/webhdfs/v1/"
hdfs_client = PyWebHdfsClient(base_uri_pattern=url, request_extra_opts={'auth':auth, 'verify': settings.CA_ROOT_PATH})
with open(data_dir + file_name, 'rb') as file_data:
hdfs_client.create_file(hdfs_path + file_name, file_data=file_data, overwrite=True)
Same error:
hdfs_client.create_file(hdfs_path + file_name, file_data=file_data, overwrite=True)
File "/home/cdsw/.local/lib/python3.6/site-packages/pywebhdfs/webhdfs.py", line 115, in create_file
**self.request_extra_opts)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/api.py", line 134, in put
return request('put', url, data=data, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/home/cdsw/.local/lib/python3.6/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BrokenPipeError(32, 'Broken pipe'))
Edit 2:
I found out I was sending too much data at once. So now I create a file on HDFS and then append to it in chunks. But it is slow... and I can still randomly get the same error as above. The bigger the chunks are, the more likely I am to get a "Connection aborted". My files are in the range of 200 MB, so it takes ages compared to the Hadoop binary "hdfs dfs -put".

Related

Python Requests OS Error 104 Connection Broken Error

Hi, I am trying to hit an API using the requests module of Python. The API has to be hit 20000 times, as there are around 20000 pages. Each hit returns around 10 MB of data. By the end of the process it creates a JSON file of around 100 GB. Here is the code I have written:
with open('file.json','wb',buffering=100*1048567) as f:
while(next_page_cursor != ""):
with request.get(url,headers=headers) as response:
json_response = json.loads(response.content.decode('utf-8'))
"""
json response looks something like this
{
content:[{},{},{}........50 dictionaries]
next_page_cursor : "abcd"
}
"""
next_page_cursor = json_response['next_page_cursor']
for data in json_response['content']:
f.write((json.dumps(data) + "\n").encode())
But after running successfully for few pages the code fails giving the below error:
Traceback (most recent call last):
File "<command-1206920060120926>", line 65, in <module>
with requests.get(data_url, headers = headers) as response:
File "/databricks/python/lib/python3.7/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/databricks/python/lib/python3.7/site-packages/requests/sessions.py", line 686, in send
r.content
File "/databricks/python/lib/python3.7/site-packages/requests/models.py", line 828, in content
self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
File "/databricks/python/lib/python3.7/site-packages/requests/models.py", line 753, in generate
raise ChunkedEncodingError(e)
requests.exceptions.ChunkedEncodingError: ('Connection broken: OSError("(104, \'ECONNRESET\')")', OSError("(104, 'ECONNRESET')"))
You need to pass stream=True to the request and read the body in chunks with response.iter_content:
https://2.python-requests.org/en/master/api/#requests.Response.iter_content

Python requests: using proxy but get connect 'Connection aborted'

My vpn works WELL, and can visit google(from China).
SOCKS5 port of my VPN: 1080
But when I run the following code, I get error.
import requests
headers = {'user-agent': ''}
proxies = {"http": "socks5://127.0.0.1:1080",'https': 'socks5://127.0.0.1:1080'}
# url = 'https://www.baidu.com/'
url = 'https://www.google.com/search?q=python' #
res = requests.get(url, headers=headers, proxies=proxies)
print("res.status_code:\n",res.status_code)
If I remove `, proxies=proxies` and change the URL to Baidu, it works.
...
url = 'https://www.baidu.com/'
# url = 'https://www.google.com/search?q=python'
res = requests.get(url, headers=headers)
print("res.status_code:\n",res.status_code)
the error in 3:
Traceback (most recent call last):
File "Try.py", line 17, in <module>
res = requests.get(url, headers=headers, proxies=proxies)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/requests-2.22.0-py3.7.egg/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', OSError(0, 'Error'))
from 1 to 4, 1 is contradictory to 4. I don't really know where the problem is. I'd be extremely grateful If someone can help.
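Solved by substituting the socks5:// scheme with socks5h:// in the proxy URLs (with socks5h, DNS resolution is done by the proxy server instead of locally).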

Python: Requests patch method doesn't work

I have the code below which works fine and brings back what I need
import requests
from requests.auth import HTTPBasicAuth
response = requests.get('https://example/answers/331', auth=HTTPBasicAuth('username', 'password'),json={"solution": "12345"})
print response.content
However when I change it to a patch method, which is accepted by the server, I get the following errors. Any idea on why?
Traceback (most recent call last):
File "auth.py", line 8, in <module>
response = requests.patch('https://example/answers/331', auth=HTTPBasicAuth('username', 'password'),json={"solution": "12345"})
File "C:\Python27\lib\site-packages\requests-2.12.0-py2.7.egg\requests\api.py", line 138, in patch
return request('patch', url, data=data, **kwargs)
File "C:\Python27\lib\site-packages\requests-2.12.0-py2.7.egg\requests\api.py", line 56, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests-2.12.0-py2.7.egg\requests\sessions.py", line 488, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests-2.12.0-py2.7.egg\requests\sessions.py", line 609, in send
r = adapter.send(request, **kwargs)
File "C:\Python27\lib\site-packages\requests-2.12.0-py2.7.egg\requests\adapters.py", line 473, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
Thanks
Try using a POST request with the following header: X-HTTP-Method-Override: PATCH
This is unique to the Oracle Service Cloud REST API implementation and is documented.
In cases where the browser or client application does not support PATCH requests, or network intermediaries block PATCH requests, HTTP tunneling can be used with a POST request by supplying an X-HTTP-Method-Override header.
Example:
import requests
restURL = <Your REST URL>
params = {'field': 'val'}
headers = {'X-HTTP-Method-Override':'PATCH'}
try:
resp = requests.post(restURL, json=params, auth=('<uname>', '<pwd>'), headers=headers)
print resp
except requests.exceptions.RequestException as err:
errMsg = "Error: %s" % err
print errMsg

How to move on if the error occur in response on python in beautiful Soup

I have made a web crawler that takes thousands of Urls from a text file and then crawls the data on that webpage.
Since there are so many URLs, some of them are broken.
So it gives me the error:
Traceback (most recent call last):
File "C:/Users/khize_000/PycharmProjects/untitled3/new.py", line 57, in <module>
crawl_data("http://www.foasdasdasdasdodily.com/r/126e7649cc-sweetssssie-pies-mac-and-cheese-recipe-by-the-dr-oz-show")
File "C:/Users/khize_000/PycharmProjects/untitled3/new.py", line 18, in crawl_data
data = requests.get(url)
File "C:\Python27\lib\site-packages\requests\api.py", line 67, in get
return request('get', url, params=params, **kwargs)
File "C:\Python27\lib\site-packages\requests\api.py", line 53, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 468, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "C:\Python27\lib\site-packages\requests\adapters.py", line 437, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.foasdasdasdasdodily.com', port=80): Max retries exceeded with url: /r/126e7649cc-sweetssssie-pies-mac-and-cheese-recipe-by-the-dr-oz-show (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x0310FCB0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))
Here's my code:
def crawl_data(url):
global connectString
data = requests.get(url)
response = str( data )
if response != "<Response [200]>":
return
soup = BeautifulSoup(data.text,"lxml")
titledb = soup.h1.string
But it still gives me the same exception or error.
I simply want it to ignore those URLs from which there is no response
and move on to the next Url.
You need to learn about exception handling. The easiest way to ignore these errors is to surround the code that processes a single URL with a try-except construct, making your code read something like:
try:
<process a single URL>
except requests.exceptions.ConnectionError:
pass
This will mean that if the specified exception occurs, your program will just execute the pass (do nothing) statement and move on to the next URL.
Use try-except:
def crawl_data(url):
global connectString
try:
data = requests.get(url)
except requests.exceptions.ConnectionError:
return
response = str( data )
soup = BeautifulSoup(data.text,"lxml")
titledb = soup.h1.string

How to post data using requests module to a flask application?

I am trying to develop a small application using Flask and the requests module. I tried to post some data to a Flask web application, but I am stuck at this error.
flask.py
@app.route('/add/', methods=['POST'])
def add_paths():
    """Store every posted path in the ``entries`` table with a default tag.

    Expects a JSON request body of the form ``{"paths": [...]}``, which is
    what the command-line client sends.

    Returns:
        A JSON response whose ``result`` field confirms the insert.
    """
    # The client sends application/json, which never populates request.form;
    # the body has to be parsed as JSON to get the list back.
    paths = request.get_json()['paths']
    tags = 'others'
    for path in paths:
        g.db.execute('insert into entries (path, tags) values(?, ?)',
                     [path, tags])
    # Commit once, after all rows have been queued.
    g.db.commit()
    message = 'New paths are posted'
    return jsonify(result = message)
command line file for posting the data
import json
import glob2
import requests
list = glob2.glob('/home/sheeshmohsin/*/**')
post_data = {'paths' : list }
headers = {'content-type': 'application/json'}
post_response = requests.post(url='http://localhost:9696/add/', data=json.dumps(post_data), headers=headers)
print post_response
print post_response.text
And the error i am getting is:-
File "commandline.py", line 8, in <module>
post_response = requests.post(url='http://127.0.0.1:9696/add/', data=json.dumps(post_data), headers=headers)
File "/usr/lib/python2.7/site-packages/requests/api.py", line 88, in post
return request('post', url, data=data, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/api.py", line 44, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/sessions.py", line 335, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python2.7/site-packages/requests/sessions.py", line 438, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/adapters.py", line 327, in send
raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=9696): Max retries exceeded with url: /add/ (Caused by <class 'socket.error'>: [Errno 111] Connection refused)
The connection refused error message indicates it could be your firewall, or loopback:
Check that firewall isn't blocking port 9696.
I've had the loopback address (127.0.0.1) cause similar issues, so replace localhost with the machine's actual IP address so that Python doesn't use the loopback interface (in requests.post(url='http://localhost:9696/add/', ...)). You might have to do the same with Flask (app.run(host='192.168...', port=9696)).

Categories

Resources