Intermittent DownloadError Application Error 2 on Google App Engine - python

We have two applications that are both running on Google App Engine. App1 makes requests to app2 as an authenticated user. The authentication works by requesting an authentication token from Google ClientLogin that is exchanged for a cookie. The cookie is then used for subsequent requests (as described here). App1 runs the following code:
import cookielib
import urllib
import urllib2

from google.appengine.api import memcache

class AuthConnection:
    def __init__(self):
        self.cookie_jar = cookielib.CookieJar()
        self.opener = urllib2.OpenerDirector()
        self.opener.add_handler(urllib2.ProxyHandler())
        self.opener.add_handler(urllib2.UnknownHandler())
        self.opener.add_handler(urllib2.HTTPHandler())
        self.opener.add_handler(urllib2.HTTPRedirectHandler())
        self.opener.add_handler(urllib2.HTTPDefaultErrorHandler())
        self.opener.add_handler(urllib2.HTTPSHandler())
        self.opener.add_handler(urllib2.HTTPErrorProcessor())
        self.opener.add_handler(urllib2.HTTPCookieProcessor(self.cookie_jar))
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; '
                                      'Windows NT 6.1; en-US; rv:1.9.1.2) '
                                      'Gecko/20090729 Firefox/3.5.2 '
                                      '(.NET CLR 3.5.30729)'}

    def fetch(self, url, method, payload=None):
        self.__updateJar(url)
        request = urllib2.Request(url)
        request.get_method = lambda: method
        for key, value in self.headers.iteritems():
            request.add_header(key, value)
        response = self.opener.open(request)
        return response.read()

    def __updateJar(self, url):
        cache = memcache.Client()
        cookie = cache.get('auth_cookie')
        if cookie:
            self.cookie_jar.set_cookie(cookie)
        else:
            cookie = self.__getCookie(url=url)
            cache.set('auth_cookie', cookie, 5000)

    def __getCookie(self, url):
        auth_url = 'https://www.google.com/accounts/ClientLogin'
        auth_data = urllib.urlencode({'Email': USER_NAME,
                                      'Passwd': PASSPHRASE,
                                      'service': 'ah',
                                      'source': 'app1',
                                      'accountType': 'HOSTED_OR_GOOGLE'})
        auth_request = urllib2.Request(auth_url, data=auth_data)
        auth_response_body = self.opener.open(auth_request).read()
        auth_response_dict = dict(x.split('=')
                                  for x in auth_response_body.split('\n') if x)
        cookie_args = {}
        cookie_args['continue'] = url
        cookie_args['auth'] = auth_response_dict['Auth']
        cookie_url = 'https://%s/_ah/login?%s' % \
            ('app2.appspot.com', urllib.urlencode(cookie_args))
        cookie_request = urllib2.Request(cookie_url)
        for key, value in self.headers.iteritems():
            cookie_request.add_header(key, value)
        try:
            self.opener.open(cookie_request)
        except:
            # The /_ah/login redirect may raise; by now the cookie
            # processor has already stored the auth cookie in the jar.
            pass
        for cookie in self.cookie_jar:
            if cookie.domain == 'app2domain':
                return cookie
For 10-30% of the requests a DownloadError is raised:
Error fetching https://app2/Resource
Traceback (most recent call last):
File "/base/data/home/apps/app1/5.344034030246386521/source/main/connection/authenticate.py", line 112, in fetch
response = self.opener.open(request)
File "/base/python_runtime/python_dist/lib/python2.5/urllib2.py", line 381, in open
response = self._open(req, data)
File "/base/python_runtime/python_dist/lib/python2.5/urllib2.py", line 399, in _open
'_open', req)
File "/base/python_runtime/python_dist/lib/python2.5/urllib2.py", line 360, in _call_chain
result = func(*args)
File "/base/python_runtime/python_dist/lib/python2.5/urllib2.py", line 1115, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/base/python_runtime/python_dist/lib/python2.5/urllib2.py", line 1080, in do_open
r = h.getresponse()
File "/base/python_runtime/python_dist/lib/python2.5/httplib.py", line 197, in getresponse
self._allow_truncated, self._follow_redirects)
File "/base/data/home/apps/app1/5.344034030246386521/source/main/connection/monkeypatch_urlfetch_deadline.py", line 18, in new_fetch
follow_redirects, deadline, *args, **kwargs)
File "/base/python_runtime/python_lib/versions/1/google/appengine/api/urlfetch.py", line 241, in fetch
return rpc.get_result()
File "/base/python_runtime/python_lib/versions/1/google/appengine/api/apiproxy_stub_map.py", line 501, in get_result
return self.__get_result_hook(self)
File "/base/python_runtime/python_lib/versions/1/google/appengine/api/urlfetch.py", line 325, in _get_fetch_result
raise DownloadError(str(err))
DownloadError: ApplicationError: 2
The request logs for app2 (the "server") seem fine, as expected (according to the docs DownloadError is only raised if there was no valid HTTP response).
Why is the exception raised?

See this:
http://bitbucket.org/guilin/gae-rproxy/src/tip/gae_rproxy/niceurllib.py
By default, urllib and urllib2 handle the HTTP 302 status themselves and automatically redirect to wherever the server points. But the redirected request does not carry the cookie that the server set in the 302 response.
For example:
1. urllib2 requests //server/login.
2. The server responds with 302, Location: //server/profile, and Set-Cookie: session-id=xxxx.
3. urllib2 automatically requests //server/profile, but without the session-id cookie.
4. The server responds with a not-logged-in error or a 500 error because no session-id is found.
5. urllib2 raises an error.
So you never get a chance to set the cookie.
self.opener.add_handler(urllib2.HTTPRedirectHandler())
I think you should remove this line and add your own HTTPRedirectHandler that neither throws an error nor automatically redirects, but simply returns the HTTP status code and headers, so you get a chance to set the cookie.
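A minimal sketch of such a handler, assuming Python 2's urllib2 as used in the question (the class name is illustrative): it hands the 3xx response back to the caller instead of following it, and because the cookie processor runs before error handling, any Set-Cookie value is already in the jar by then.
import urllib
import urllib2

class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        # Wrap the raw 302 response so it is returned like a normal
        # response instead of being followed; the cookie jar has
        # already captured any Set-Cookie headers at this point.
        resp = urllib.addinfourl(fp, headers, req.get_full_url())
        resp.code = code
        resp.msg = msg
        return resp

    http_error_301 = http_error_303 = http_error_307 = http_error_302
Registering NoRedirectHandler() in place of the default urllib2.HTTPRedirectHandler() lets fetch() inspect resp.code and re-request the Location target itself, once the cookie is set.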

Related

Error while accessing Workday RAAS report through Python

I am trying to access a Workday report through Python. I am able to access it through the browser with a user ID and password, but when I run it through Python I get the error below.
import os
import platform
import ssl
import urllib.request
import urllib.parse

ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
ssl_context.verify_mode = ssl.CERT_REQUIRED
ssl_context.check_hostname = True
ssl_context.load_default_certs()

if platform.system().lower() == 'windows':
    import certifi
    print(os.path.relpath(certifi.where()))
    ssl_context.load_verify_locations(
        #cafile=os.path.relpath(certifi.where()),
        cafile="C:\\abc_Tools\\TDX_INT166\\Lib\\site-packages\\certifi\\workday.pem",
        capath=None,
        cadata=None)

print(platform.system().lower())

url = 'https://wd5-impl-services1.workday.com/ccx/service/customreport2/xxx/ISU_INT167/CR_-_FIN_Report'
username = 'XXXXXXXXX'  # 10 digit ID
password = 'XXXXXXXXX'
values = {'username': username, 'password': password}
data = urllib.parse.urlencode(values).encode("utf-8")
#cookies = cookielib.CookieJar()

https_handler = urllib.request.HTTPSHandler(context=ssl_context)
opener = urllib.request.build_opener(https_handler)
ret = opener.open(url, timeout=2)
print(ret)
I am getting the below error.
site-packages\certifi\cacert.pem
windows
Traceback (most recent call last):
File "C:\SLU_Tools\TDX_INT166\Lib\Certificate_testing.py", line 38, in <module>
ret = opener.open(url, timeout=2)
File "C:\Users\prasannakumaravel\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 523, in open
response = meth(req, response)
File "C:\Users\prasannakumaravel\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 632, in http_response
response = self.parent.error(
File "C:\Users\prasannakumaravel\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 561, in error
return self._call_chain(*args)
File "C:\Users\prasannakumaravel\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Users\prasannakumaravel\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 641, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 401: Unauthorized
I tried other ways as well, but nothing has worked so far. Is this doable?
When calling a RAAS this way, you need to authenticate using Basic Auth. The username/password needs to be Base64 encoded and the result is added as an HTTP header. See this answer for an example.
You will need to set up authorization using the Basic Auth type, encoding your username/password following the Basic Auth scheme: convert the credentials to Base64 first, then back to a string, and use that in the Authorization header.
I used code similar to the snippet below in our web applications:
import requests, base64
url = '<Workday_RaaS_API_Endpoint>'
username = '<your_RaaS_user_username>'
password = '<your_RaaS_user_password>' # replace with the password here
auth = 'Basic %s' % base64.b64encode(bytes('%s:%s' % (username, password), 'utf-8')).decode('utf-8')
headers = { 'Authorization': auth }
res = requests.get(url=url, headers=headers)
print(res.json())
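If you want to stay with urllib.request as in the question, the same Basic Auth header can be attached to a Request object. A rough sketch under that assumption, reusing the url, username, password, and opener defined in the question:
import base64
import urllib.request

# Build the Basic Auth token from the credentials.
token = base64.b64encode(('%s:%s' % (username, password)).encode('utf-8')).decode('utf-8')
req = urllib.request.Request(url, headers={'Authorization': 'Basic ' + token})
ret = opener.open(req, timeout=2)
print(ret.read().decode('utf-8'))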

How to remove dependency on "HTTP For Humans" Requests library and use urllib instead?

I need to use urllib for this, not urllib2, urllib3, or Requests.
The curl equivalent...
curl -H "Content-Type: application/json" -X POST -d '{"title":"My New Title","content":"blahblah","excerpt":"blah"}' http://localhost/wp-json/wp/v2/posts --user 'user:xxx'
... and the following code work fine:
r = requests.post(url + '/wp-json/wp/v2/posts',
                  auth=HTTPBasicAuth(username, password),
                  data=json.dumps(params),
                  headers={'content-type': 'application/json'})
if not r.ok:
    raise Exception('failed with rc=' + str(r.status_code))
return json.loads(r.text)
But this fails:
params = {
    'title': 'The Title',
    'content': content,
    'excerpt': 'blah'
}
postparam = json.dumps(params).encode('utf-8')

password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(handler)

req = urllib.request.Request(url + '/wp-json/wp/v2/posts', method='POST', data=postparam)
req.add_header('Content-Type', 'application/json')
r = opener.open(req)
if r.getcode() != 200:
    raise Exception('failed with rc=' + str(r.getcode()))
Traceback:
Traceback (most recent call last):
File "test_wp_api.py", line 10, in <module>
rs = create(endpoint, "test")
File "C:\...\wordpress.py", line 28, in create
r = opener.open(req)
File "c:\usr\Python37-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "c:\usr\Python37-32\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "c:\usr\Python37-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "c:\usr\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "c:\usr\Python37-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 401: Unauthorized
What's the cause, and how to fix it?
I will venture a guess. Please try this out, and if it does not address your issue I can ditch the answer (it won't easily fit into a comment as is).
I'm not sure about requests, but curl, when provided with credentials, does not wait for a challenge (an HTTP 401 listing the supported authentication methods) before sending them; urllib's HTTPBasicAuthHandler, by contrast, only sends credentials in response to such a challenge. Some servers never send the challenge and just expect a pre-authenticated session or preemptive authentication; I have to talk to some hosts like that too. If that is the case, you cannot change the server setup, and you are OK with assuming HTTP Basic is supported, you can force your credentials into every request. Reusing your opener, username, and password (you also need to import base64), this snippet does just that:
credentials = '{}:{}'.format(username, password).encode('ascii')
credentials = base64.b64encode(credentials)
credentials = b"Basic " + credentials
opener.addheaders.append(("Authorization", credentials))
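If you only want the header on a single request rather than on every request the opener makes, an equivalent sketch sets it on the Request object directly (reusing req, username, password, and opener from the question):
import base64

# Same preemptive Basic Auth, but scoped to one request.
credentials = '{}:{}'.format(username, password).encode('ascii')
credentials = base64.b64encode(credentials).decode('ascii')
req.add_header('Authorization', 'Basic ' + credentials)
r = opener.open(req)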

HTTP Error 403: Forbidden w/ User Agents Added

Usually I've been able to get around 403 errors once I've added a known user agent, but now I'm trying to log in (and eventually scrape) and cannot figure out how to bypass this error.
Code:
import urllib.parse
import urllib.request
import http.cookiejar

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

authentication_url = 'https://www.linkedin.com/'
payload = {
    'session_key': 'email',
    'session_password': 'password'
}
data = urllib.parse.urlencode(payload)
binary_data = data.encode('UTF-8')

req = urllib.request.Request(authentication_url, binary_data)
resp = urllib.request.urlopen(req)
contents = resp.read()
Traceback:
Traceback (most recent call last):
File "C:/Python34/loginLinked.py", line 16, in <module>
resp = urllib.request.urlopen(req)
File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\lib\urllib\request.py", line 469, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 579, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 507, in error
return self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 441, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 587, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
See my answer to this question:
why isn't Requests not signing into a website correctly?
I should start by stating that you really should use their API:
http://developer.linkedin.com/apis
There does not seem to be any POST login on the LinkedIn front page using those parameters.
This is the login URL you must POST to:
https://www.linkedin.com/uas/login-submit
Be aware that this probably won't work either, as you need at least the csrfToken parameter from the login form.
You probably need the loginCsrfParam too, also from the login form on the front page.
Something like this might work. It is not tested, and you might need to add the other POST parameters:
import requests

s = requests.session()

def get_csrf_tokens():
    url = "https://www.linkedin.com/"
    req = s.get(url).text
    csrf_token = req.split('name="csrfToken" value=')[1].split('" id="')[0]
    login_csrf_token = req.split('name="loginCsrfParam" value="')[1].split('" id="')[0]
    return csrf_token, login_csrf_token

def login(username, password):
    url = "https://www.linkedin.com/uas/login-submit"
    csrfToken, loginCsrfParam = get_csrf_tokens()
    data = {
        'session_key': username,
        'session_password': password,
        'csrfToken': csrfToken,
        'loginCsrfParam': loginCsrfParam
    }
    req = s.post(url, data=data)

login('username', 'password')
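If the login succeeds, the session object keeps the authentication cookies, so later scraping calls can reuse it. A minimal follow-up, reusing s from the snippet above (the URL here is just a placeholder for whatever page you want to scrape):
# The session carries the login cookies on every subsequent request.
page = s.get('https://www.linkedin.com/feed/')
print(page.status_code)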

Python Requests Timeout Value error

Good Evening,
I cannot get my https request to go through. I'm having to use SSLv3, so I'm specifying the protocol with:
import requests
from requests.adapters import HTTPAdapter
from urllib3.poolmanager import PoolManager
import ssl

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_SSLv3)

username = 'username'
password = 'password'
email = 'email@example.com'
url = 'https://api.example.com/'
headers = {'Accept': 'application/json', 'content-type': 'application/json'}
params = {'emailaddress': email}
auth = (username, password)

s = requests.Session()
s.mount(url, MyAdapter())
r = s.get(url+'customer.svc/search', params=params, auth=auth, headers=headers)
When I run my get request I get the following error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/requests/sessions.py", line 473, in get
return self.request('GET', url, **kwargs)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/requests/adapters.py", line 370, in send
timeout=timeout
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/urllib3/connectionpool.py", line 517, in urlopen
timeout_obj = self._get_timeout(timeout)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/urllib3/connectionpool.py", line 283, in _get_timeout
return Timeout.from_float(timeout)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/urllib3/util/timeout.py", line 152, in from_float
return Timeout(read=timeout, connect=timeout)
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/urllib3/util/timeout.py", line 95, in __init__
self._connect = self._validate_timeout(connect, 'connect')
File "/home/ubuntu/workspace/app/venv/lib/python2.7/site-packages/urllib3/util/timeout.py", line 125, in _validate_timeout
"int or float." % (name, value))
ValueError: Timeout value connect was Timeout(connect=None, read=None, total=None), but it must be an int or float.
Any ideas? I can't figure it out.
Additional Context: I'm on an Amazon EC2 Ubuntu Instance, running requests 2.5.1 and python 2.7.6
Looks like requests doesn't always play nice when you bring in an outside urllib3 distribution. What I ended up doing was using requests' internal (vendored) urllib3 instead:
from requests.packages.urllib3.poolmanager import PoolManager
instead of:
from urllib3.poolmanager import PoolManager
No issues since I started doing this.
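Putting it together, a sketch of the question's adapter with only the import swapped (assuming requests 2.5.1 as in the question, where urllib3 is vendored under requests.packages; the likely cause of the ValueError is that the external and vendored distributions each define their own Timeout class, so the Timeout object requests creates fails the external pool's type check):
import ssl
import requests
from requests.adapters import HTTPAdapter
# Import PoolManager from the urllib3 bundled with requests, so the
# Timeout objects requests passes around are the classes this pool expects.
from requests.packages.urllib3.poolmanager import PoolManager

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_SSLv3)

s = requests.Session()
s.mount('https://', MyAdapter())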

Python program to search videos using Bing

I have been trying to search for videos using the Bing search engine, but every time I try I get the error HTTPError: HTTP Error 403: Forbidden.
import urllib
import urllib2
import json

def main():
    query = "'pyscripter'"
    print bing_search(query, 'Video')

def bing_search(query, search_type):
    #search_type: Web, Image, News, Video
    key = 'LsE7jElMmTDfbrnCEmrCmCEBbaPxMG5BvKr9CsfmSNS'
    query = urllib.quote(query)
    #create credential for authentication
    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; FDM; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 1.1.4322)'
    credentials = (':%s' % key).encode('base64')[:-1]
    auth = 'Basic %s' % credentials
    url = 'https://api.datamarket.azure.com/Data.ashx/Bing/Search/'+search_type+'?Query=%27'+query+'%27&$top=5&$format=json'
    request = urllib2.Request(url)
    request.add_header('Authorization', auth)
    request.add_header('User-Agent', user_agent)
    request_opener = urllib2.build_opener()
    response = request_opener.open(request)
    response_data = response.read()
    json_result = json.loads(response_data)
    result_list = json_result['d']['results']
    print result_list
    return result_list

if __name__ == '__main__':
    main()
The error shown is:
Traceback (most recent call last):
File "<module1>", line 30, in <module>
File "<module1>", line 7, in main
File "<module1>", line 22, in bing_search
File "C:\Python27\lib\urllib2.py", line 410, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 448, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 403: Forbidden
Before trying this, I worked with the YouTube search API, which worked fine; the only problem was that it was limited to the videos in YouTube's database. What I want is a list of URLs of all the videos on the internet related to the keyword, so I started with the Bing search engine. Any help regarding this would be appreciated.
I had the same issue.
A web server may return a 403 Forbidden HTTP status code in response to a request from a client for a web page or resource, to indicate that the server can be reached and understood the request but refuses to take any further action. Status code 403 responses are the result of the web server being configured to deny the client access, for some reason, to the requested resource.
In my case, I had forgotten to activate the "Bing Search" subscription, so go to https://datamarket.azure.com/dataset/bing/search and activate the "Bing Search" subscription.
