I have an HDFS cluster that I want to read from and write to using a Python script.
import requests
import json
import os
import kerberos
import sys
# NameNode base URLs, read as a comma-separated list from the ``namenode``
# environment variable.
node = os.environ.get("namenode").split(",")
print(node)

# Positional command-line arguments: local path, HDFS path, then the mode
# string that is later compared against "read" / "write".
local_file_path, remote_file_path, read_or_write = (
    sys.argv[1],
    sys.argv[2],
    sys.argv[3],
)
print(local_file_path, remote_file_path)
def check_node_status(node):
    """Query each NameNode's JMX status endpoint, stopping at the first active one.

    Args:
        node: Iterable of NameNode base URLs (for example
            ``https://host:50470``).

    Returns:
        A ``(status, name, nnaddress)`` tuple. ``status`` and ``name`` are the
        reported state and base URL of the last NameNode queried.
        ``nnaddress`` is the host part of the active NameNode's
        ``HostAndPort`` value, or ``None`` when no queried NameNode reported
        ``"active"``. All three are ``None`` when ``node`` is empty.
    """
    # Pre-bind the results so that an empty list, or a list without an active
    # NameNode, returns cleanly instead of raising UnboundLocalError.
    status = name = nnaddress = None
    for name in node:
        print(name)
        # NOTE(review): verify=False turns off TLS certificate verification.
        response = requests.get(
            "%s/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" % name,
            verify=False,
        ).json()
        bean = response["beans"][0]
        status = bean["State"]
        if status == "active":
            # Keep only the part of HostAndPort before the first ":".
            nnaddress = bean["HostAndPort"].split(":")[0]
            print(nnaddress)
            break
    return status, name, nnaddress
def kerberos_auth(nnaddress):
    """Build HTTP headers carrying a Kerberos ``Negotiate`` token.

    Args:
        nnaddress: Host name of the NameNode the token is requested for.

    Returns:
        A dict with an ``Authorization: Negotiate <token>`` header and a
        ``Content-Type`` of ``application/binary``.
    """
    # The GSSAPI host-based service name is written "service@host".
    __, krb_context = kerberos.authGSSClientInit("HTTP@%s" % nnaddress)
    kerberos.authGSSClientStep(krb_context, "")
    negotiate_details = kerberos.authGSSClientResponse(krb_context)
    return {
        "Authorization": "Negotiate " + negotiate_details,
        "Content-Type": "application/binary",
    }
def kerberos_hdfs_upload(status, name, headers):
    """Upload the local file to HDFS with the WebHDFS ``CREATE`` operation.

    Reads the module-level ``local_file_path`` and ``remote_file_path``.
    Nothing is sent unless ``status`` is ``"active"``.

    Args:
        status: NameNode state string; only ``"active"`` triggers the upload.
        name: Base URL of the NameNode to send the request to.
        headers: HTTP headers to send with the request.
    """
    print("running upload function")
    if status != "active":
        return
    print("if function")
    # Use a context manager so the file handle is closed as soon as the
    # contents have been read, even if reading fails.
    with open(local_file_path, "rb") as local_file:
        data = local_file.read()
    write_req = requests.put(
        "%s/webhdfs/v1%s?op=CREATE&overwrite=true" % (name, remote_file_path),
        headers=headers,
        verify=False,
        allow_redirects=True,
        data=data,
    )
    print(write_req.text)
def kerberos_hdfs_read(status, name, headers):
    """Download an HDFS file with the WebHDFS ``OPEN`` operation.

    Reads the module-level ``remote_file_path`` and ``local_file_path``.
    On HTTP 200 the response body is written to ``local_file_path``; on any
    other status code the response body is printed instead. Nothing is
    requested unless ``status`` is ``"active"``.

    Args:
        status: NameNode state string; only ``"active"`` triggers the read.
        name: Base URL of the NameNode to send the request to.
        headers: HTTP headers to send with the request.
    """
    if status != "active":
        return
    read = requests.get(
        "%s/webhdfs/v1%s?op=OPEN" % (name, remote_file_path),
        headers=headers,
        verify=False,
        allow_redirects=True,
    )
    if read.status_code == 200:
        # The context manager closes the file even when the write raises.
        with open(local_file_path, "wb") as local_file:
            local_file.write(read.content)
    else:
        print(read.content)
# Locate the active NameNode, then build the Kerberos headers for its host.
status, name, nnaddress = check_node_status(node)
headers = kerberos_auth(nnaddress)

# Run the transfer selected by the mode argument; any other value performs
# no transfer.
if read_or_write == "read":
    print("fun")
    kerberos_hdfs_read(status, name, headers)
elif read_or_write == "write":
    kerberos_hdfs_upload(status, name, headers)
The code works on my own machine, which is not behind any proxy. But when I run it on the office machine, which is behind a proxy, it gives the following proxy error:
$ python3 python_hdfs.py ./1.png /user/testuser/2018-02-07_1.png write
['https://<servername>:50470', 'https:// <servername>:50470']
./1.png /user/testuser/2018-02-07_1.png
https://<servername>:50470
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 555, in urlopen
self._prepare_proxy(conn)
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 753, in _prepare_proxy
conn.connect()
File "/usr/lib/python3/dist-packages/urllib3/connection.py", line 230, in connect
self._tunnel()
File "/usr/lib/python3.5/http/client.py", line 832, in _tunnel
message.strip()))
OSError: Tunnel connection failed: 504 Unknown Host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 376, in send
timeout=timeout
File "/usr/lib/python3/dist-packages/urllib3/connectionpool.py", line 610, in urlopen
_stacktrace=sys.exc_info()[2])
File "/usr/lib/python3/dist-packages/urllib3/util/retry.py", line 273, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
requests.packages.urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='<servername>', port=50470): Max retries exceeded with url: /jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 504 Unknown Host',)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "python_hdfs.py", line 68, in <module>
status, name, nnaddress= check_node_status(node)
File "python_hdfs.py", line 23, in check_node_status
verify=False).json()
File "/usr/lib/python3/dist-packages/requests/api.py", line 67, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 53, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 468, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 437, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='<server_name>', port=50470): Max retries exceeded with url: /jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 504 Unknown Host',)))
I tried giving proxy info in the code, like so:
# Proxy settings passed to requests. Credentials are separated from the
# proxy host by "@".
# NOTE: requests expects each proxy URL to begin with a scheme such as
# "http://"; these values have none.
proxies = {
    "http": "<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
    "https": "<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>",
}
# NameNode base URLs, read as a comma-separated list from the ``namenode``
# environment variable.
node = os.getenv("namenode").split(",")
print(node)
# Positional command-line arguments: local path, HDFS path, then the mode.
# Each value is read and printed once.
local_file_path = sys.argv[1]
remote_file_path = sys.argv[2]
read_or_write = sys.argv[3]
print(local_file_path, remote_file_path)
def check_node_status(node):
    """Query each NameNode's JMX status endpoint through the configured proxies.

    Iteration stops at the first NameNode that reports ``"active"``.

    Args:
        node: Iterable of NameNode base URLs (for example
            ``https://host:50470``).

    Returns:
        A ``(status, name, nnaddress)`` tuple. ``status`` and ``name`` are the
        reported state and base URL of the last NameNode queried.
        ``nnaddress`` is the host part of the active NameNode's
        ``HostAndPort`` value, or ``None`` when no queried NameNode reported
        ``"active"``. All three are ``None`` when ``node`` is empty.
    """
    # Pre-bind the results so that an empty list, or a list without an active
    # NameNode, returns cleanly instead of raising UnboundLocalError.
    status = name = nnaddress = None
    for name in node:
        print(name)
        # NOTE(review): verify=False turns off TLS certificate verification.
        response = requests.get(
            "%s/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" % name,
            proxies=proxies,
            verify=False,
        ).json()
        bean = response["beans"][0]
        status = bean["State"]
        if status == "active":
            # Keep only the part of HostAndPort before the first ":".
            nnaddress = bean["HostAndPort"].split(":")[0]
            print(nnaddress)
            break
    return status, name, nnaddress
### Rest of the code is the same
Now it is giving the following error:
$ python3 python_hdfs.py ./1.png /user/testuser/2018-02-07_1.png write
['https://<servername>:50470', 'https:// <servername>:50470']
./1.png /user/testuser/2018-02-07_1.png
https://<servername>:50470
Traceback (most recent call last):
File "python_hdfs.py", line 73, in <module>
status, name, nnaddress= check_node_status(node)
File "python_hdfs.py", line 28, in check_node_status
verify=False).json()
File "/usr/lib/python3/dist-packages/requests/api.py", line 67, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3/dist-packages/requests/api.py", line 53, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 468, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3/dist-packages/requests/sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 343, in send
conn = self.get_connection(request.url, proxies)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 254, in get_connection
proxy_manager = self.proxy_manager_for(proxy)
File "/usr/lib/python3/dist-packages/requests/adapters.py", line 160, in proxy_manager_for
**proxy_kwargs)
File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 281, in proxy_from_url
return ProxyManager(proxy_url=url, **kw)
File "/usr/lib/python3/dist-packages/urllib3/poolmanager.py", line 232, in __init__
raise ProxySchemeUnknown(proxy.scheme)
requests.packages.urllib3.exceptions.ProxySchemeUnknown: Not supported proxy scheme <proxy_username>
So, my question is: do I need to configure the proxy in Kerberos for this to work? If so, how? I am not very familiar with Kerberos. I run kinit before running the Python code in order to authenticate to the Kerberos realm; that runs fine and connects to the appropriate HDFS servers without the proxy. So I don't know why this error occurs when reading from or writing to the same HDFS servers. Any help is appreciated.
I also have the proxy set up in /etc/apt/apt.conf like so:
Acquire::http::proxy "http://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>/";
Acquire::https::proxy "https://<proxy_username>:<proxy_password>@<proxy_IP>:<proxy_port>/";
I have also tried the following:
$ export http_proxy="http://<user>:<pass>@<proxy>:<port>"
$ export HTTP_PROXY="http://<user>:<pass>@<proxy>:<port>"
$ export https_proxy="http://<user>:<pass>@<proxy>:<port>"
$ export HTTPS_PROXY="http://<user>:<pass>@<proxy>:<port>"
import os
# Proxy URL in the form scheme://user:password@host:port; the "@" separates
# the credentials from the proxy host.
proxy = 'http://<user>:<pass>@<proxy>:<port>'
# Set both the lower-case and upper-case spellings of each variable.
for variable in ('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY'):
    os.environ[variable] = proxy
#rest of the code is same
But the error persists.
UPDATE: I have also tried the following.
Somebody pointed out that we already have a proxy set up in /etc/apt/apt.conf for connecting to the web, but that we may not need a proxy to connect to HDFS. They suggested commenting out the proxies in /etc/apt/apt.conf and running the Python script again. I did that.
$ env | grep proxy
http_proxy=http://hfli:Test6969@192.168.44.217:8080
https_proxy=https://hfli:Test6969@192.168.44.217:8080
$ unset http_proxy
$ unset https_proxy
$ env | grep proxy
$
And ran the python script again - (i) without defining proxies in the python script, and also (ii) with the proxies defined in the python script. I got the same original proxy error in both cases.
I found the following Java program that supposedly gives access to run Java programs on the HDFS:
import com.sun.security.auth.callback.TextCallbackHandler;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import javax.security.auth.Subject;
import javax.security.auth.login.LoginContext;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
/**
 * Logs in to Kerberos through JAAS, writes the string "Hello!" to an HDFS
 * file, then reads the file back and prints it line by line.
 */
public class HDFS_RW_Secure
{
    /**
     * @param args args[0] is the HDFS path (appended to "hdfs://edhcluster")
     *             of the file to write and read back
     * @throws Exception if the login or any HDFS operation fails
     */
    public static void main(String[] args) throws Exception
    {
        // Fail early with a usage message rather than with an
        // ArrayIndexOutOfBoundsException after the interactive login.
        if (args.length < 1)
        {
            System.err.println("Usage: HDFS_RW_Secure <hdfs-path>");
            System.exit(1);
        }

        System.setProperty("java.security.auth.login.config", "/tmp/sc3_temp/hadoop_kdc.txt");
        System.setProperty("java.security.krb5.conf", "/tmp/sc3_temp/hadoop_krb.txt");
        Configuration hadoopConf = new Configuration();

        // This example uses password login; it can be changed to keytab login.
        LoginContext lc = new LoginContext("JaasSample", new TextCallbackHandler());
        lc.login();
        System.out.println("login");
        Subject subject = lc.getSubject();

        UserGroupInformation.setConfiguration(hadoopConf);
        UserGroupInformation ugi = UserGroupInformation.getUGIFromSubject(subject);
        UserGroupInformation.setLoginUser(ugi);

        Path pt = new Path("hdfs://edhcluster" + args[0]);

        // try-with-resources closes the file system and both streams even
        // when a write or read throws.
        try (FileSystem fs = FileSystem.get(hadoopConf))
        {
            // write
            try (FSDataOutputStream out = fs.create(pt))
            {
                out.writeUTF("Hello!");
            }

            // read back and print every line
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt))))
            {
                String line;
                while ((line = br.readLine()) != null)
                {
                    System.out.println(line);
                }
            }
        }
        System.out.println("This is the end.");
    }
}
We need to take its jar file, HDFS.jar, and run the following shell script to enable Java programs to be run on the HDFS.
nano run.sh
# contents of the run.sh file:
# Quote the argument so an HDFS path containing spaces or glob characters is
# passed to the program as a single, unmodified word.
/tmp/sc3_temp/jre1.8.0_161/bin/java -Djavax.net.ssl.trustStore=/tmp/sc3_temp/cacerts -Djavax.net.ssl.trustStorePassword=changeit -jar /tmp/sc3_temp/HDFS.jar "$1"
So, I can run this shell script with /user/testuser as the argument to give it access to run Java programs in the HDFS:
./run.sh /user/testuser/test2
which gives the following output:
Debug is true storeKey false useTicketCache false useKeyTab false doNotPrompt false ticketCache is null isInitiator true KeyTab is null refreshKrb5Config is false principal is null tryFirstPass is false useFirstPass is false storePass is false clearPass is false
Kerberos username [testuser]: testuser
Kerberos password for testuser:
[Krb5LoginModule] user entered username: testuser
principal is testuser@KRB.REALM
Commit Succeeded
login
2018-02-08 14:09:30,020 WARN [main] util.NativeCodeLoader (NativeCodeLoader.java:<clinit>(62)) - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Hello!
This is the end.
So, that's working, I suppose. But how do I write an equivalent shell script to run Python code?
I found the solution. It turns out, I was looking in the wrong place. It seems the user account was set up wrongly. I tried to do something simpler, like downloading a webpage into the server. And I noticed that it was downloading the page, but had no permissions to fix it. So I explored a bit more and found that when the user account was created, it was not assigned proper ownership. So, once I assigned the proper owner to the user account, the proxy error was gone. (Sigh, so much time wasted.)
I have written about it in more detail here.
I would like to authenticate to a server from my client using a certificate generated by the server. I have a server-ca.crt, and the curl command below works. How can I send a similar request using the Python requests module?
$ curl -X GET -u sat_username:sat_password \
-H "Accept:application/json" --cacert katello-server-ca.crt \
https://satellite6.example.com/katello/api/organizations
I have tried the following, but it raises an exception. Can someone help me resolve this issue?
python requestsCert.py
Traceback (most recent call last):
File "requestsCert.py", line 2, in <module>
res=requests.get('https://satellite6.example.com/katello/api/organizations', cert='/certificateTests/katello-server-ca.crt', verify=True)
File "/usr/lib/python2.7/site-packages/requests/api.py", line 68, in get
return request('get', url, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/api.py", line 50, in request
response = session.request(method=method, url=url, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/sessions.py", line 464, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python2.7/site-packages/requests/sessions.py", line 576, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python2.7/site-packages/requests/adapters.py", line 431, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: [SSL] PEM lib (_ssl.c:2554)
res=requests.get('https://...', cert='/certificateTests/katello-server-ca.crt', verify=True)
The cert argument in requests.get is used to specify the client certificate and key which should be used for mutual authentication. It is not used to specify the trusted CA as the --cacert argument in curl does. Instead you should use the verify argument:
res=requests.get('https://...', verify='/certificateTests/katello-server-ca.crt')
For more information see SSL Cert Verification and Client Side Certificates in the documentation for requests.
I am calling the Instagram API but I get the error below. How can I solve it?
this is my code:
# Query-string parameters for the user-search request.
params = {
"access_token" :"xxxxxxxxxxxxxxx",
"count": 1,
"q": "xxxxxxxxxxxxxxxx"
}
api_uri = 'https://api.instagram.com/v1/users/search'
# Send the search request with the parameters above.
req = requests.get(api_uri, params = params)
# Decode the response body as JSON and log the decoded object.
searched_user_data = json.loads(req.text)
logging.info(searched_user_data)
This is error code:
INFO 2017-11-14 05:08:10,743 connectionpool.py:788] Starting new HTTPS
connection (1): api.instagram.com
ERROR 2017-11-14 05:08:11,734 spiderlogic.py:110] ('Connection aborted.',
error(13, 'Permission denied'))
ERROR 2017-11-14 05:08:11,736 baselogic.py:48] Traceback (most recent call
last):
File "D:\Python\GitHub\tagzy\api\logic\spiderlogic.py", line 88, in
__save_instagram_account_info__
req = requests.get(api_uri, params = params)
File "D:\Python\GitHub\tagzy\lib\requests\api.py", line 71, in get
return request('get', url, params=params, **kwargs)
File "D:\Python\GitHub\tagzy\lib\requests\api.py", line 57, in request
return session.request(method=method, url=url, **kwargs)
File "D:\Python\GitHub\tagzy\lib\requests\sessions.py", line 475, in request
resp = self.send(prep, **send_kwargs)
File "D:\Python\GitHub\tagzy\lib\requests\sessions.py", line 585, in send
r = adapter.send(request, **kwargs)
File "D:\Python\GitHub\tagzy\lib\requests\adapters.py", line 453, in send
raise ConnectionError(err, request=request)
ConnectionError: ('Connection aborted.', error(13, 'Permission denied'))
My sdk version
Google Cloud SDK 177.0.0
app-engine-python 1.9.62 bq 2.0.27 core 2017.10.20 gsutil 4.28
Solution 1:
Downgrade requests module to version 2.1.0 using following command.
pip install requests==2.1.0
Solution 2:
As per official docs you can monkey patch requests to play nicely with Google App Engine.
install requests-toolbelt:
pip install -t lib requests-toolbelt
Then in your main.py file (or equivalent):
import requests_toolbelt.adapters.appengine
# Monkey patch requests so that it works on Google App Engine.
requests_toolbelt.adapters.appengine.monkeypatch()
I am trying to access the IBM Hyperledger Blockchain from Python. Unfortunately, I get an SSL protocol error while trying to connect. I have already searched the internet, and especially Stack Overflow, for a solution. Here is what I did:
Setup an IBM Hyperledger Blockchain service and get the URL to work with.
I tried the CURL call to access the API
curl -X GET --header "Accept: application/json" "https://SOMETHING_vp0.us.blockchain.ibm.com:443/network/peers"
This works fine for me (wonder why I don't need a passwd but it works).
I tried to access the same API from python, but got an error.
Python2.7 => 2.7.12, requests => 2.10.0 on mac
import requests
# Request the peers endpoint and report the HTTP status code.
url = "https://SOMETHING_vp0.us.blockchain.ibm.com:443/network/peers"
response = requests.get(url)
# The parenthesized form prints the same value on Python 2 and Python 3.
print(response.status_code)
Accessing https://www.google.com works without a problem, but blockchain returns:
Traceback (most recent call last):
File "/Users/ansi/development/hyperledger/mcp.py", line 9, in <module>
response = requests.get(url)
File "/Users/ansi/development/virtualenv/general/lib/python2.7/site-packages/requests/api.py", line 71, in get
return request('get', url, params=params, **kwargs)
File "/Users/ansi/development/virtualenv/general/lib/python2.7/site-packages/requests/api.py", line 57, in request
return session.request(method=method, url=url, **kwargs)
File "/Users/ansi/development/virtualenv/general/lib/python2.7/site-packages/requests/sessions.py", line 475, in request
resp = self.send(prep, **send_kwargs)
File "/Users/ansi/development/virtualenv/general/lib/python2.7/site-packages/requests/sessions.py", line 585, in send
r = adapter.send(request, **kwargs)
File "/Users/ansi/development/virtualenv/general/lib/python2.7/site-packages/requests/adapters.py", line 477, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: EOF occurred in violation of protocol (_ssl.c:590)
What can I do to solve the problem?
Thanks a lot
I tried to do the first command in the Quickstart for requests:
>>> import requests
>>> r = requests.get('https://github.com/timeline.json')
But I get the following error message:
Traceback (most recent call last):
File "./main.py", line 16, in <module>
requests.get('https://github.com/timeline.json')
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 55, in get
return request('get', url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 44, in request
return session.request(method=method, url=url, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 383, in request
resp = self.send(prep, **send_kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 486, in send
r = adapter.send(request, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 385, in send
raise SSLError(e)
requests.exceptions.SSLError: [Errno 1] _ssl.c:499: error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
I am totally new to SSL certificates, but I suspect it has something to do with Python looking in the wrong place. I downloaded Python 2.7 and am using it as my default Python (I am running Mac OSX 10.6 (Snow Leopard), which came with Python 2.6). I had a lot of trouble with my Mac looking in the wrong place for Python stuff until I fixed the paths and made symbolic links, but I wonder if there is something else that has to do with the upgrade that is causing this SSL error? Or it could be something that doesn't have anything to do with that.
I have tried searching for similar questions and read some people's suggestions just to add the argument verify=False in requests.get(), but I don't want to do that, since I think that just avoids the real problem. Thanks for helping out a complete newbie.
You can try this.
Verify the path to the cert:
>>> requests.get('https://whatever.com', verify='/path/to/certfile')
Or
>>> requests.get('https://whatever.com', cert=('/path/server.crt', '/path/key'))
http://docs.python-requests.org/en/latest/user/advanced/