Uploading Newline-delimited JSON From a File to BigQuery - python

I recently wrote a Python script that uploads local, newline-delimited JSON files to a BigQuery table. It's very similar to the example provided in the official documentation here. The problem I'm having is that non-ASCII characters in the file I'm trying to upload are making my POST request barf.
Here's the relevant part of the script...
def upload(dataFilePath, loadJob, recipeJSON, logger):
# Builds a multipart/related request body (a JSON job part followed by the
# file contents) and POSTs it to BIGQUERY_URL_BASE + projectId + "/jobs".
# This is the failing version: the traceback that follows ends in a
# UnicodeEncodeError raised while the body is written to the socket.
body = '--xxx\n'
body += 'Content-Type: application/json; charset=UTF-8\n\n'
body += loadJob
body += '\n--xxx\n'
body += 'Content-Type: application/octet-stream\n\n'
# io.open() in text mode with an encoding yields unicode text, so appending
# it turns `body` into unicode instead of a byte string.
dataFile = io.open(dataFilePath, 'r', encoding = 'utf-8')
body += dataFile.read()
dataFile.close()
body += '\n--xxx--\n'
credentials = buildCredentials(recipeJSON['keyPath'], recipeJSON['accountEmail'])
http = httplib2.Http()
http = credentials.authorize(http)
service = build('bigquery', 'v2', http=http)
projectId = recipeJSON['projectId']
url = BIGQUERY_URL_BASE + projectId + "/jobs"
# The boundary named here must match the '--xxx' markers used in `body`.
headers = {'Content-Type': 'multipart/related; boundary=xxx'}
response, content = http.request(url, method="POST", body=body, headers=headers)
...and here's the stack trace I get when it runs...
Traceback (most recent call last):
File "/usr/local/uploader/upload_data.py", line 179, in <module>
main(sys.argv)
File "/usr/local/uploader/upload_data.py", line 170, in main
if (upload(unprocessedFile, loadJob, recipeJSON, logger)):
File "/usr/local/uploader/upload_data.py", line 100, in upload
response, content = http.request(url, method="POST", body=body, headers=headers)
File "/usr/local/lib/python2.7/site-packages/oauth2client/util.py", line 128, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/oauth2client/client.py", line 490, in new_request
redirections, connection_type)
File "/usr/local/lib/python2.7/site-packages/httplib2/__init__.py", line 1570, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "/usr/local/lib/python2.7/site-packages/httplib2/__init__.py", line 1317, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/usr/local/lib/python2.7/site-packages/httplib2/__init__.py", line 1253, in _conn_request
conn.request(method, request_uri, body, headers)
File "/usr/local/lib/python2.7/httplib.py", line 973, in request
self._send_request(method, url, body, headers)
File "/usr/local/lib/python2.7/httplib.py", line 1007, in _send_request
self.endheaders(body)
File "/usr/local/lib/python2.7/httplib.py", line 969, in endheaders
self._send_output(message_body)
File "/usr/local/lib/python2.7/httplib.py", line 833, in _send_output
self.send(message_body)
File "/usr/local/lib/python2.7/httplib.py", line 805, in send
self.sock.sendall(data)
File "/usr/local/lib/python2.7/ssl.py", line 229, in sendall
v = self.send(data[count:])
File "/usr/local/lib/python2.7/ssl.py", line 198, in send
v = self._sslobj.write(data)
UnicodeEncodeError: 'ascii' codec can't encode characters in position 4586-4611: ordinal not in range(128)
I'm using Python 2.7 and the following libraries:
distribute (0.6.36)
google-api-python-client (1.1)
httplib2 (0.8)
oauth2client (1.1)
pyOpenSSL (0.13)
python-gflags (2.0)
wsgiref (0.1.2)
Has anyone else had this problem?
It seems like httplib2's request method takes "body" as a string, which means that it later needs to be encoded before being sent over the wire. I've been searching for a way to override the encoding to UTF-8, but no luck so far.
Thanks in advance!
EDIT:
I was able to resolve this by doing two things:
1.) Reading the contents of my file raw with no decoding. (I could have also just encoded the "body" in my first attempt above...)
2.) Encoding the URL and headers to bytes.
The code ended up looking like this:
def upload(dataFilePath, loadJob, recipeJSON, logger):
    """POST a file to the BigQuery jobs endpoint as a multipart/related body.

    The body is assembled entirely from bytes: the text parts of the
    multipart envelope are encoded as UTF-8 and the data file is read raw,
    so no implicit ASCII encoding happens when the request is sent.

    Args:
        dataFilePath: Path of the data file; its bytes are sent unmodified.
        loadJob: Job configuration text placed in the first (JSON) part.
        recipeJSON: Mapping providing 'keyPath', 'accountEmail' and
            'projectId'.
        logger: Not used in this excerpt.
    """
    part_one = '--xxx\n'
    part_one += 'Content-Type: application/json; charset=UTF-8\n\n'
    part_one += loadJob
    part_one += '\n--xxx\n'
    part_one += 'Content-Type: application/octet-stream\n\n'

    # Binary mode: no decoding. `with` closes the handle even if read() raises.
    with io.open(dataFilePath, 'rb') as dataFile:
        part_two = dataFile.read()

    part_three = '\n--xxx--\n'

    body = part_one.encode('utf-8')
    body += part_two
    body += part_three.encode('utf-8')

    credentials = buildCredentials(recipeJSON['keyPath'], recipeJSON['accountEmail'])
    http = httplib2.Http()
    http = credentials.authorize(http)
    # `service` is not referenced again in this excerpt; kept because the
    # remainder of the function is not shown.
    service = build('bigquery', 'v2', http=http)

    projectId = recipeJSON['projectId']
    url = BIGQUERY_URL_BASE + projectId + "/jobs"
    # Header names/values and the URL are encoded too, so nothing in the
    # request is a unicode object.
    headers = {'Content-Type'.encode('utf-8'): 'multipart/related; boundary=xxx'.encode('utf-8')}
    response, content = http.request(url.encode('utf-8'), method="POST", body=body, headers=headers)

io.open() will open the file as unicode text. Either use plain open(), or use binary mode:
dataFile = io.open(dataFilePath, 'rb')
You are sending the file contents straight out over the network, so you need to send bytes, not unicode, and as you found out, mixing Unicode and bytes leads to painful errors as python tries to automatically encode back to bytes using the ASCII codec when concatenating the two different types. There is no need to decode to Unicode at all here.

Related

Python download images with alternating variables

I was trying to download images from URLs that change, but got an error.
# Build the poster URL for the current title.
url_image = (
    "http://www.joblo.com/timthumb.php?src=/posters/images/full/"
    + str(title_2)
    + "-poster1.jpg&h=333&w=225"
)

# Send a browser-like User-Agent with the request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
headers = {'User-Agent': user_agent}
req = urllib.request.Request(url_image, None, headers)
print(url_image)

# Fetch the image bytes, then write them to disk.
with urllib.request.urlopen(req) as response:
    the_page = response.read()

with open('poster.jpg', 'wb') as f:
    f.write(the_page)
Traceback (most recent call last):
File "C:\Users\luke\Desktop\scraper\imager finder.py", line 97, in
with urllib.request.urlopen(req) as response:
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
My advice is to use urllib2. In addition, I've written a nice function (I think) that will also allow gzip encoding (to reduce bandwidth) if the server supports it. I use this for downloading social media files, but it should work for anything.
I would try to debug your code, but since it's just a snippet (and the error messages are formatted badly), it's hard to know exactly where your error is occurring (it's certainly not line 97 in your code snippet).
This isn't as short as it could be, but it's clear and reusable. This is Python 2.7; it looks like you're using Python 3, in which case you can search for other questions that address how to do the same with urllib.request, the Python 3 counterpart of urllib2.
import urllib2
import gzip
from StringIO import StringIO
def download(url):
    """
    Download and return the file specified in the URL; attempt to use
    gzip encoding if possible.

    The request advertises ``Accept-Encoding: gzip``; when the response
    carries ``Content-Encoding: gzip`` the payload is decompressed before
    it is returned.

    Raises:
        IOError: if opening the URL fails; the message includes the URL
            and the underlying error.
    """
    request = urllib2.Request(url)
    request.add_header('Accept-Encoding', 'gzip')
    try:
        response = urllib2.urlopen(request)
    except Exception as e:
        # A literal message is used: no _ERRORS table is defined in this
        # snippet, so looking one up here would raise NameError and hide
        # the real failure.
        raise IOError("Download failed (%s) %s" % (url, e))
    try:
        payload = response.read()
        content_encoding = response.info().get('Content-Encoding')
    finally:
        # Release the connection whether or not reading succeeded.
        response.close()
    if content_encoding == 'gzip':
        buf = StringIO(payload)
        payload = gzip.GzipFile(fileobj=buf).read()
    return payload
def save_media(filename, media):
    """Write the raw bytes *media* to *filename*, replacing any existing file.

    The file is opened in binary mode and is closed even if the write fails.
    """
    with open(filename, "wb") as file_handle:
        file_handle.write(media)
# Example: download one poster and save it as poster.jpg.
title_2 = "10-cloverfield-lane"
poster_url = (
    "http://www.joblo.com/timthumb.php?src=/posters/images/full/"
    + title_2
    + "-poster1.jpg&h=333&w=225"
)
media = download(poster_url)
save_media("poster.jpg", media)

Urllib Unicode Error, no unicode involved

EDIT: I've majorly edited the content of this post since the original to specify my problem:
I am writing a program to download webcomics, and I'm getting this weird error when downloading a page of the comic. The code I am running essentially boils down to the following line followed by the error. I do not know what is causing this error, and it is confusing me greatly.
>>> urllib.request.urlopen("http://abominable.cc/post/47699281401")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.4/urllib/request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.4/urllib/request.py", line 470, in open
response = meth(req, response)
File "/usr/lib/python3.4/urllib/request.py", line 580, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.4/urllib/request.py", line 502, in error
result = self._call_chain(*args)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 685, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.4/urllib/request.py", line 464, in open
response = self._open(req, data)
File "/usr/lib/python3.4/urllib/request.py", line 482, in _open
'_open', req)
File "/usr/lib/python3.4/urllib/request.py", line 442, in _call_chain
result = func(*args)
File "/usr/lib/python3.4/urllib/request.py", line 1211, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.4/urllib/request.py", line 1183, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.4/http/client.py", line 1137, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.4/http/client.py", line 1172, in _send_request
self.putrequest(method, url, **skips)
File "/usr/lib/python3.4/http/client.py", line 1014, in putrequest
self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 37-38: ordinal not in range(128)
The entirety of my program can be found here: https://github.com/nstephenh/pycomic
I was having the same problem. The root cause is that the remote server isn't playing by the rules. HTTP headers are supposed to be US-ASCII only, but apparently the leading HTTP web servers (Apache 2, nginx) don't care and send UTF-8 encoded strings directly.
However, in http.client the parse_header function decodes the headers as ISO-8859-1, and the default HTTPRedirectHandler in urllib doesn't quote the Location or URI header, resulting in the aforementioned error.
I was able to work around both things by overriding the default HTTPRedirectHandler and adding three lines that undo the Latin-1 decoding and quote the path:
import urllib.request
from urllib.error import HTTPError
from urllib.parse import (
urlparse, quote, urljoin, urlunparse)
class UniRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Redirect handler that copes with non-ASCII bytes in redirect targets.

    Some servers send raw UTF-8 in the ``Location``/``URI`` header even
    though headers should be US-ASCII. The header text is re-encoded as
    ISO-8859-1 to recover the original bytes, and the path is then
    percent-quoted so the follow-up request line is pure ASCII.
    """

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a redirect response, quoting the target path first.

        Returns whatever ``self.parent.open`` returns for the redirected
        request, or ``None`` when there is no redirect target or
        ``redirect_request`` declines. Raises ``HTTPError`` for a
        disallowed scheme or when the redirect limits are exceeded.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if "location" in headers:
            newurl = headers["location"]
        elif "uri" in headers:
            newurl = headers["uri"]
        else:
            return

        # fix a possible malformed URL
        urlparts = urlparse(newurl)

        # For security reasons we don't allow redirection to anything other
        # than http, https or ftp.
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise HTTPError(
                newurl, code,
                "%s - Redirection to url '%s' is not allowed" % (msg, newurl),
                headers, fp)

        urlparts = list(urlparts)
        if not urlparts[2]:
            urlparts[2] = "/"
        else:
            # Headers should only contain US-ASCII chars, but some servers
            # do send raw UTF-8 that must be quoted before it is reused.
            # Re-encode as iso-8859-1 first to cancel the effect of the
            # latin-1 decoding done by parse_header() in http/client.py.
            # '%' is kept safe so escapes already present in the header
            # (e.g. "%20") are not double-encoded into "%2520".
            urlparts[2] = quote(urlparts[2].encode('iso-8859-1'), safe='/%')
        newurl = urlunparse(urlparts)

        newurl = urljoin(req.full_url, newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                    len(visited) >= self.max_redirections):
                raise HTTPError(req.full_url, code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new, timeout=req.timeout)

    # All other redirect status codes share the same handling.
    http_error_301 = http_error_303 = http_error_307 = http_error_302
[...]
# Change the default redirect handler in urllib: build an opener that uses
# UniRedirectHandler and install it globally so later urllib.request.urlopen()
# calls go through it. Should be done once at the beginning of the program.
opener = urllib.request.build_opener(UniRedirectHandler())
urllib.request.install_opener(opener)
This is python3 code but should be easily adapted for python2 if need be.

JIRA OAuth process in Python

I'm using the below snippet to sign the request and get request tokens for JIRA OAuth process.
import base64
import urlparse
from tlslite.utils import keyfactory
import oauth2 as oauth
# Consumer key/secret handed to oauth.Consumer in the __main__ section.
consumer_key = 'oauth-sample-consumer'
consumer_secret = 'dont_care'
# JIRA OAuth servlet endpoints: request token, access token, user authorization.
request_token_url = 'https://localhost:8090/jira/plugins/servlet/oauth/request-token'
access_token_url = 'https://localhost:8090/jira/plugins/servlet/oauth/access-token'
authorize_url = 'https://localhost:8090/jira/plugins/servlet/oauth/authorize'
class SignatureMethod_RSA_SHA1(oauth.SignatureMethod):
    """RSA-SHA1 signature method that signs with the key in ../rsa.pem."""

    name = 'RSA-SHA1'

    def signing_base(self, request, consumer, token):
        """Return ``(key, raw)`` for *request*.

        ``raw`` is the escaped method, normalized URL and normalized
        parameters joined with '&'; ``key`` is the escaped consumer secret
        followed by '&' and, when a token is given, the escaped token secret.
        Raises ValueError if the request has no normalized URL.
        """
        has_base_url = (
            hasattr(request, 'normalized_url')
            and request.normalized_url is not None
        )
        if not has_base_url:
            raise ValueError("Base URL for request is not set.")

        base_parts = [
            oauth.escape(request.method),
            oauth.escape(request.normalized_url),
            oauth.escape(request.get_normalized_parameters()),
        ]

        secret_key = '%s&' % oauth.escape(consumer.secret)
        if token:
            secret_key += oauth.escape(token.secret)

        return secret_key, '&'.join(base_parts)

    def sign(self, request, consumer, token):
        """Sign the request's base string with the private key; return base64."""
        # Only the base string is signed; the key half is not used here.
        _unused_key, base_string = self.signing_base(request, consumer, token)

        with open('../rsa.pem', 'r') as pem_file:
            pem_text = pem_file.read()

        private_key = keyfactory.parsePrivateKey(pem_text.strip())
        return base64.b64encode(private_key.hashAndSign(base_string))
if __name__ == '__main__':
    # Request an OAuth request token, signing the call with RSA-SHA1.
    consumer = oauth.Consumer(consumer_key, consumer_secret)
    client = oauth.Client(consumer)
    client.set_signature_method(SignatureMethod_RSA_SHA1())

    resp, content = client.request(request_token_url, "POST")
    status = resp['status']
    if status != '200':
        raise Exception("Invalid response %s: %s" % (status, content))
I have added the public key to JIRA consumer application. Now executing the above snippet always gives me this error:
Traceback (most recent call last):
File "views.py", line 80, in <module>
resp, content = client.request(request_token_url, "GET")
File "/usr/local/lib/python2.7/dist-packages/oauth2/__init__.py", line 682, in request
connection_type=connection_type)
File "/usr/local/lib/python2.7/dist-packages/httplib2/__init__.py", line 1570, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "/usr/local/lib/python2.7/dist-packages/httplib2/__init__.py", line 1317, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/usr/local/lib/python2.7/dist-packages/httplib2/__init__.py", line 1252, in _conn_request
conn.connect()
File "/usr/local/lib/python2.7/dist-packages/httplib2/__init__.py", line 1044, in connect
raise SSLHandshakeError(e)
httplib2.SSLHandshakeError: [Errno 1] _ssl.c:503: error:14090086:SSL routines:SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
I actually deleted my public key and again entered it in my consumer app to make sure there are no white spaces.
JIRA doesn't give any option to upload a public key file, so it has to be copied anyhow.
I got it solved using this certifi package
sudo pip install certifi
In code:
client.ca_certs = certifi.where()

Error in getting OAuth authentication url

I have the following code in a Python desktop application that authorizes users before using the AppHarbor API. I am following the steps mentioned in the knowledge base and have the following authentication code:
def OnAuthenticate(self, event):
client_id = "" # My App's client id
client_secret_key = "" # My App's secret key
consumer = oauth2.Consumer(key=client_id, secret=client_secret_key)
request_token_url = "https://appharbor.com/user/authorizations/new?client_id="+client_id+"&redirect_uri=http://localhost:8095"
client = oauth2.Client( consumer )
resp, content = client.request(request_token_url, "GET")
...
However, on sending the request, the response is incorrect, this is the error:
client.request(request_token_url, "GET")
TypeError: must be string or buffer, not None
Is there something that I am missing here?
Edit: Following is the stack trace that is thrown up:
resp, content = client.request(request_token_url, "GET")
File "C:\Python27\Lib\site-packages\oauth2-1.5.211-py2.7.egg\oauth2\__init__.py", line 682, in request
connection_type=connection_type)
File "C:\Python27\lib\site-packages\httplib2-0.7.4-py2.7.egg\httplib2\__init__.py", line 1544, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "C:\Python27\lib\site-packages\httplib2-0.7.4-py2.7.egg\httplib2\__init__.py", line 1342, in _request
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
File "C:\Python27\Lib\site-packages\oauth2-1.5.211-py2.7.egg\oauth2\__init__.py", line 662, in request
req.sign_request(self.method, self.consumer, self.token)
File "C:\Python27\Lib\site-packages\oauth2-1.5.211-py2.7.egg\oauth2\__init__.py", line 493, in sign_request
self['oauth_body_hash'] = base64.b64encode(sha(self.body).digest())
TypeError: must be string or buffer, not None
Upon debugging into the call, I reached httplib2._request function that issued a request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
This resulted in the following page with error response = 302 (presented in content object)
<html><head><title>Object moved</title></head><body>
<h2>Object moved to <a href="https://appharbor.com/session/new?returnUrl=%2Fuser%
2Fauthorizations%2Fnew%3Foauth_body_hash%3D2jmj7l5rSw0yVb%252FvlWAYkK%252FYBwk%
253D%26oauth_nonce%3D85804131%26oauth_timestamp%3D1340873274%
26oauth_consumer_key%3D26bacb38-ce5a-4699-9342-8e496c16dc49%26oauth_signature_method%
3DHMAC-SHA1%26oauth_version%3D1.0%26redirect_uri%3Dhttp%253A%252F%252Flocalhost%
253A8095%26client_id%3D26bacb38-ce5a-4699-9342-8e496c16dc49%26oauth_signature%
3DXQtYvWIsvML9ZM6Wfs1Wp%252Fy3No8%253D">here</a>.</h2>
</body></html>
The function next removed the body from the content to call another request with body set to None, resulting in the error that was thrown.
(response, content) = self.request(location, redirect_method, body=body, headers = headers, redirections = redirections - 1)
I'm not familiar with the Python lib, but have you considered whether this is because you need to take the user through the three-legged OAuth flow (as with Twitter) and use the URL you mention in your question as the authorize_url? Once you have the code, you retrieve the token by POSTing to this URL: https://appharbor.com/tokens.
You might also want to take a closer look at the desktop OAuth .NET sample to get a better understanding of how this works.

python unhashable type - posting xml data

First, I'm not a python programmer. I'm an old C dog that's learned new Java and PHP tricks, but python looks like a pretty cool language.
I'm getting an error that I can't quite follow. The error follows the code below.
import httplib, urllib
url = "pdb-services-beta.nipr.com"
xml = '<?xml version="1.0"?><!DOCTYPE SCB_Request SYSTEM "http://www.nipr.com/html/SCB_XML_Request.dtd"><SCB_Request Request_Type="Create_Report"><SCB_Login_Data CustomerID="someuser" Passwd="somepass" /><SCB_Create_Report_Request Title=""><Producer_List><NIPR_Num_List_XML><NIPR_Num NIPR_Num="8980608" /><NIPR_Num NIPR_Num="7597855" /><NIPR_Num NIPR_Num="10166016" /></NIPR_Num_List_XML></Producer_List></SCB_Create_Report_Request></SCB_Request>'
params = {}
params['xmldata'] = xml
headers = {}
headers['Content-type'] = 'text/xml'
headers['Accept'] = '*/*'
headers['Content-Length'] = "%d" % len(xml)
connection = httplib.HTTPSConnection(url)
connection.set_debuglevel(1)
connection.request("POST", "/pdb-xml-reports/scb_xmlclient.cgi", params, headers)
response = connection.getresponse()
print response.status, response.reason
data = response.read()
print data
connection.close
Here's the error:
Traceback (most recent call last):
File "C:\Python27\tutorial.py", line 14, in connection.request("POST", "/pdb-xml-reports/scb_xmlclient.cgi", params, headers)
File "C:\Python27\lib\httplib.py", line 958, in request self._send_request(method, url, body, headers)
File "C:\Python27\lib\httplib.py", line 992, in _send_request self.endheaders(body)
File "C:\Python27\lib\httplib.py", line 954, in endheaders self._send_output(message_body)
File "C:\Python27\lib\httplib.py", line 818, in _send_output self.send(message_body)
File "C:\Python27\lib\httplib.py", line 790, in send self.sock.sendall(data)
File "C:\Python27\lib\ssl.py", line 229, in sendall v = self.send(data[count:])
TypeError: unhashable type
My log file says that the xmldata parameter is empty.
Any ideas?
I guess params has to be a string when passed to .request(). That would explain the error, since a dict is not hashable.
Try to encode your params first with
params = urllib.urlencode(params)
You can find another code example too at the bottom of:
http://docs.python.org/release/3.1.5/library/http.client.html
Thanks for the feedback.
I guess I was making this too hard. I went a different route and it seems to work.
import urllib2
# Full endpoint URL (scheme, host and path in one string).
URL = "https://pdb-services-beta.nipr.com/pdb-xml-reports/scb_xmlclient.cgi"
# Request body: a single 'xmldata' form field holding the XML document as-is.
DATA = 'xmldata=<?xml version="1.0"?><!DOCTYPE SCB_Request SYSTEM "http://www.nipr.com/html/SCB_XML_Request.dtd"><SCB_Request Request_Type="Create_Report"><SCB_Login_Data CustomerID="someuser" Passwd="somepass" /><SCB_Create_Report_Request Title=""><Producer_List><NIPR_Num_List_XML><NIPR_Num NIPR_Num="8980608" /></NIPR_Num_List_XML></Producer_List></SCB_Create_Report_Request></SCB_Request>'
# Supplying `data` makes urllib2 send the request as a POST.
req = urllib2.Request(url=URL, data=DATA)
f = urllib2.urlopen(req)
# Print the raw response body.
print f.read()

Categories

Resources