Python (requests) - incorrect encoding when fetching headers

I am using the requests library (Python 3.9) to get a filename from a URL.[1] For some reason the file name comes back incorrectly encoded.
I should get "Ogłoszenie_0320.pdf" instead of "OgÅ\x82oszenie_0320.pdf".
My code looks something like this:
import os
import re
import requests
from urllib.parse import urlparse

def getFilenameFromRequest(url: str, headers):
    # Parses from header information
    contentDisposition = headers.get('content-disposition')
    if contentDisposition:
        filename = re.findall('filename=(.+)', contentDisposition)
        print("oooooooooo: " + contentDisposition + " : " + str(filename))
        if len(filename) != 0:
            return filename[0]
    # Parses from url
    parsedUrl = urlparse(url)
    return os.path.basename(parsedUrl.path)

def getFilenameFromUrl(url: str):
    request = requests.head(url)
    headers = request.headers
    return getFilenameFromRequest(url, headers)

getFilenameFromUrl('https://przedszkolekw.bip.gov.pl' +
                   '/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html')
Any idea how to fix it?
I know that for a standard request I can set the encoding directly:
request.encoding = 'utf-8'
But what am I supposed to do in this case?
[1]
https://przedszkolekw.bip.gov.pl/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html

Only characters from the ASCII-based Latin-1 charset should be used as header values [RFC]. Here the file name has been escaped.
>>> s = "Ogłoszenie_0320.pdf"
>>> s.encode("utf8").decode("unicode-escape")
'OgÅ\x82oszenie_0320.pdf'
To reverse the process you can do
>>> sx = 'OgÅ\x82oszenie_0320.pdf'
>>> sx.encode("latin-1").decode("utf8")
'Ogłoszenie_0320.pdf'
(updated after conversation in comments)
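Applied to the question's helper, a minimal sketch (assuming the header value is always mojibake of this form, i.e. UTF-8 bytes decoded as Latin-1; fixMojibake is a hypothetical helper name):

def fixMojibake(value: str) -> str:
    # Re-encode the Latin-1-decoded header text and decode it as UTF-8;
    # fall back to the original string if it is not valid UTF-8.
    try:
        return value.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return value

# fixMojibake('OgÅ\x82oszenie_0320.pdf') -> 'Ogłoszenie_0320.pdf'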

Related

HTTP Error 400 bad request python

from urllib.request import *
import urllib.parse

def read_text():
    text = open("/home/pizzapablo666/Desktop/Test")
    contents_of_file = text.read()
    print(contents_of_file)
    text.close()
    check_profanity(contents_of_file)

def check_profanity(text_to_check):
    text_to_check = urllib.parse.quote_plus(text_to_check)
    connection = urlopen(
        "http://www.wdylike.appspot.com/?q=" + text_to_check)
    output = connection.read()
    print(output)
    connection.close()

read_text()
This is the updated version.
I am getting an HTTP 400 Bad Request error. What is the cause, and how can I fix it?
I think it is because you are not encoding your string before appending it to your URL.
For example, in Python 3 you should do the following to text_to_check before appending it to your URL:
text_to_check = urllib.parse.quote_plus(text_to_check)
Python 2 would be something like this (urllib was split into smaller components in Python 3):
text_to_check = urllib.quote_plus(text_to_check)
This means that when a string containing whitespace is appended to your URL, it will appear as something like "Am+I+cursing%3F" instead of "Am I cursing?".
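For instance, a quick check of what quote_plus produces (Python 3; a minimal sketch):

from urllib.parse import quote_plus

print(quote_plus("Am I cursing?"))   # -> Am+I+cursing%3F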
Full check_profanity() example:
def check_profanity(text_to_check):
    text_to_check = urllib.parse.quote_plus(text_to_check)
    connection = urlopen(
        "http://www.wdylike.appspot.com/?q=" + text_to_check)
    output = connection.read()
    print(output)
    connection.close()
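As a side note, the same request could also be made with the requests library, which URL-encodes query parameters for you; a minimal sketch, assuming requests is installed:

import requests

def check_profanity(text_to_check):
    # requests encodes the params dict into the query string automatically
    response = requests.get("http://www.wdylike.appspot.com/",
                            params={"q": text_to_check})
    print(response.text)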

Python re.search function regex issue

I have the following code:
#!/usr/bin/python
import time, uuid, hmac, hashlib, base64, json
import urllib3
import certifi
import datetime
import requests
import re
from datetime import datetime

http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',  # Force certificate check.
    ca_certs=certifi.where(),   # Path to the Certifi bundle.
)

# Get the status response from pritunl api
BASE_URL = 'https://www.vpn.trimble.cloud:443'
API_TOKEN = 'gvwrfQZQPryTbX3l03AQMwTyaE0aFywE'
API_SECRET = 'B0vZp5dDyOrshW1pmFFjAnIUyeGtFy9y'
LOG_PATH = '/var/log/developer_vpn/'

def auth_request(method, path, headers=None, data=None):
    auth_timestamp = str(int(time.time()))
    auth_nonce = uuid.uuid4().hex
    auth_string = '&'.join([API_TOKEN, auth_timestamp, auth_nonce,
                            method.upper(), path] + ([data] if data else []))
    auth_signature = base64.b64encode(hmac.new(
        API_SECRET, auth_string, hashlib.sha256).digest())
    auth_headers = {
        'Auth-Token': API_TOKEN,
        'Auth-Timestamp': auth_timestamp,
        'Auth-Nonce': auth_nonce,
        'Auth-Signature': auth_signature,
    }
    if headers:
        auth_headers.update(headers)
    return http.request(method, BASE_URL + path, headers=auth_headers, body=data)

response1 = auth_request('GET',
                         '/server',
                         )
if response1.status == 200:
    pritunlServResponse = (json.loads(response1.data))
    #print pritunlServResponse
    #print response1.data
    Name = [y['name'] for y in pritunlServResponse]
    Server_id = [x['id'] for x in pritunlServResponse]
    for srv_name, srv_id in zip(Name, Server_id):
        response2 = auth_request('GET',
                                 '/server/' + srv_id + '/output',
                                 )
        pritunlServResponse2 = (json.loads(response2.data))
        py_pritunlServResponse2 = pritunlServResponse2['output']
        print("value of srv_id: ", srv_id, "\n")
        print("value of srv_name: ", srv_name, "\n")
        logfile = open(LOG_PATH + srv_name + '_vpn_out.log', 'w')
        for log in py_pritunlServResponse2:
            if re.search(r'(?!52\.39\.62\.8)', log):
                logfile.write("%s\n" % log)
        logfile.close()
else:
    raise SystemExit
This code visits a website using authentication (the address has been redacted), grabs some JSON-formatted text, and parses two values from the output: "srv_name" and "srv_id". It then uses each "srv_id" to construct additional HTTP requests to get log files from the server, one per "srv_id", names them with the values obtained from "srv_name", and saves them on the local system.
I want to do some additional grep-style processing before the files are written to the local system. Specifically, I'd like to exclude any line containing "52.39.62.8" from being written. When I run the code above, it looks like the regex is not doing anything, as I still see "52.39.62.8" in my output files.
If the IP address is always flanked by specific characters, e.g. (52.39.62.8):, you can use the in operator for an exact substring check:
if '(52.39.62.8):' not in log:
    logfile.write(log + '\n')
re.search(r'(?!52\.39\.62\.8)', log)
You're matching any empty string that is not followed by the IP address; every string will match, because the pattern will match at the end of any string.
Reverse your logic and write the line to the log only if re.search for the IP address comes back as None:
if re.search(r'(?<!\d)52\.39\.62\.8(?!\d)', log) is None:
    logfile.write("%s\n" % log)
Note that this also includes its own negative look-behind and look-ahead assertions to ensure no digits precede or follow the IP address.
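A quick interactive check with a couple of hypothetical log lines shows why the original pattern matches everything while the reversed logic filters correctly:

import re

lines = ["client 52.39.62.8 connected", "client 10.0.0.1 connected"]

# A zero-width negative lookahead with nothing else matches somewhere in
# every string (at worst at the very end), so nothing gets filtered out.
print([bool(re.search(r'(?!52\.39\.62\.8)', l)) for l in lines])  # [True, True]

# Keep only lines where the IP address is absent.
print([l for l in lines if re.search(r'(?<!\d)52\.39\.62\.8(?!\d)', l) is None])
# ['client 10.0.0.1 connected']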

what's wrong with this python code

This is code for a web crawler.
I'm a beginner learning Python, so I don't know how to solve this.
It seems something is wrong with search().
# -*- coding:utf-8 -*-
import urllib, urllib2, re

class BDTB:
    def __init__(self, baseUrl, seeLZ):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz' + str(seeLZ)

    def getPage(self, pageNum):
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            #print response.read().decode('utf-8')
            return response
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                # "Failed to connect to Baidu Tieba, reason:"
                print u'连接百度贴吧失败,错误原因', e.reason
                return None

    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class.*?px">(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            print result.group(1)
            return result.group(1).strip()
        else:
            return None

baseURL = 'http://tieba.baidu.com/p/4095047339'
bdtb = BDTB(baseURL, 1)
bdtb.getTitle()
This will raise a TypeError: expected string or buffer because you are passing the object returned from urllib2.urlopen(request) to re.search() when it requires a str.
If you change the return value from:
return response  # returns the response object
to one that returns the text contained in the response:
return response.read()  # returns the text contained in the response
Your script works and after executing it returns:
广告兼职及二手物品交易集中贴 (roughly: "consolidated thread for advertisements, part-time jobs and second-hand item trading")
Additionally, since you're working with Python 2.x, you might want to change your class from class BDTB: to class BDTB(object): in order to use new-style classes.
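Putting the two suggestions together, a minimal sketch of only the parts that change (reusing the question's urllib2 import and the rest of its Python 2 code as-is):

class BDTB(object):  # new-style class
    def __init__(self, baseUrl, seeLZ):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz' + str(seeLZ)

    def getPage(self, pageNum):
        url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
        response = urllib2.urlopen(urllib2.Request(url))
        return response.read()  # return the page text, not the response object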

Issues using strip() on a variable

I'm trying to strip the characters b,'().
The issue I'm having is that it raises TypeError: 'str' does not support the buffer interface.
Here are the relevant parts of the code:
import urllib3

def command_uptime():
    http = urllib3.PoolManager()
    r = http.request('GET', 'https://nightdev.com/hosted/uptime.php?channel=TrippedNW')
    rawData = r.data
    liveTime = bytes(rawData.strip("b,\'()", rawData))
    message = "Tripped has been live for: ", liveTime
    send_message(CHAN, message)
What you have is binary data. It's not a string. You need to decode it first.
Also, you don't need to pass rawData to itself in the strip method.
import urllib3

def command_uptime():
    http = urllib3.PoolManager()
    r = http.request('GET', 'https://nightdev.com/hosted/uptime.php?channel=TrippedNW')
    strData = r.data.decode('utf-8')
    liveTime = strData.strip("b,\'()")
    message = "Tripped has been live for: %s" % liveTime
    print(message)

command_uptime()
Also be aware that your message variable is a tuple, not a string. I don't know if send_message expects that, so I formatted it into a single string.
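For illustration, the difference between the two forms of message (with a hypothetical liveTime value):

liveTime = "1 hour, 18 minutes"                            # hypothetical value
message_tuple = "Tripped has been live for: ", liveTime    # a 2-tuple
message_str = "Tripped has been live for: %s" % liveTime   # a single string
print(type(message_tuple), type(message_str))              # <class 'tuple'> <class 'str'>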
Just remove the second argument (and, since r.data is bytes, strip with a bytes set and decode before printing):
import urllib3

def command_uptime():
    http = urllib3.PoolManager()
    r = http.request('GET', 'https://nightdev.com/hosted/uptime.php?channel=TrippedNW')
    rawData = r.data
    liveTime = rawData.strip(b"b,'()")  # bytes require a bytes strip-set
    print("Tripped has been live for: %s" % liveTime.decode('utf-8'))

command_uptime()
Output:
Tripped has been live for: 1 hour, 18 minutes

OAuth 1.0 API issue with Python

I'm trying to get magiccardmarket.eu API authentication to work in Python, but no matter whether I'm using rauth or requests_oauthlib, I get 403.
My code is:
#!/usr/bin/python
import logging
import rauth
import requests_oauthlib

logging.basicConfig(level=logging.DEBUG)

mkm_app_token = 'B7VI9Qg2xh855WtR'
mkm_app_secret = '<cut>'
mkm_access_token = 'LQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo'
mkm_token_secret = '<cut>'

url = 'https://sandbox.mkmapi.eu/ws/v1.1/account'

# session = rauth.OAuth1Session(
#     consumer_key=mkm_app_token,
#     consumer_secret=mkm_app_secret,
#     access_token=mkm_access_token,
#     access_token_secret=mkm_token_secret,
# )

session = requests_oauthlib.OAuth1Session(
    mkm_app_token,
    client_secret=mkm_app_secret,
    resource_owner_key=mkm_access_token,
    resource_owner_secret=mkm_token_secret,
)

r = session.get(url)
print(r)
When I look at the debugging info, everything seems fine (aside from the 403 response, of course):
DEBUG:requests_oauthlib.oauth1_auth:Signing request <PreparedRequest [GET]> using client <Client nonce=None, signature_method=HMAC-SHA1, realm=None, encoding=utf-8, timestamp=None, resource_owner_secret=****, decoding=utf-8, verifier=None, signature_type=AUTH_HEADER, rsa_key=None, resource_owner_key=LQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo, client_secret=****, callback_uri=None, client_key=B7VI9Qg2xh855WtR>
DEBUG:requests_oauthlib.oauth1_auth:Including body in call to sign: False
DEBUG:oauthlib.oauth1.rfc5849:Collected params: [(u'oauth_nonce', u'87129670621454425921416648590'), (u'oauth_timestamp', u'1416648590'), (u'oauth_consumer_key', u'B7VI9Qg2xh855WtR'), (u'oauth_signature_method', u'HMAC-SHA1'), (u'oauth_version', u'1.0'), (u'oauth_token', u'LQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo')]
DEBUG:oauthlib.oauth1.rfc5849:Normalized params: oauth_consumer_key=B7VI9Qg2xh855WtR&oauth_nonce=87129670621454425921416648590&oauth_signature_method=HMAC-SHA1&oauth_timestamp=1416648590&oauth_token=LQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo&oauth_version=1.0
DEBUG:oauthlib.oauth1.rfc5849:Normalized URI: https://sandbox.mkmapi.eu/ws/v1.1/account
DEBUG:oauthlib.oauth1.rfc5849:Base signing string: GET&https%3A%2F%2Fsandbox.mkmapi.eu%2Fws%2Fv1.1%2Faccount&oauth_consumer_key%3DB7VI9Qg2xh855WtR%26oauth_nonce%3D87129670621454425921416648590%26oauth_signature_method%3DHMAC-SHA1%26oauth_timestamp%3D1416648590%26oauth_token%3DLQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo%26oauth_version%3D1.0
DEBUG:oauthlib.oauth1.rfc5849:Signature: 291LTesHZR6W4bjZ1NqSW5hEgoM=
DEBUG:oauthlib.oauth1.rfc5849:Encoding URI, headers and body to utf-8.
DEBUG:requests_oauthlib.oauth1_auth:Updated url: https://sandbox.mkmapi.eu/ws/v1.1/account
DEBUG:requests_oauthlib.oauth1_auth:Updated headers: {'Accept': '*/*', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate', 'Authorization': 'OAuth oauth_nonce="87129670621454425921416648590", oauth_timestamp="1416648590", oauth_version="1.0", oauth_signature_method="HMAC-SHA1", oauth_consumer_key="B7VI9Qg2xh855WtR", oauth_token="LQj2rUwOFUJsmuJvCTlny1UzGZSXzHjo", oauth_signature="291LTesHZR6W4bjZ1NqSW5hEgoM%3D"', 'User-Agent': 'python-requests/2.4.3 CPython/2.7.8 Darwin/14.0.0'}
DEBUG:requests_oauthlib.oauth1_auth:Updated body: None
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): sandbox.mkmapi.eu
DEBUG:requests.packages.urllib3.connectionpool:"GET /ws/v1.1/account HTTP/1.1" 403 None
This is not an issue with the authentication details, which are provided on the account profile page when you request dedicated application API access, since those details work fine with the PHP example provided by the site: https://www.mkmapi.eu/ws/documentation/API:Auth_libcurl
When I go through the site's documentation, nothing seems out of the ordinary: https://www.mkmapi.eu/ws/documentation/API:Auth_Overview
I honestly don't know where to go from here...
I realized that the code above with requests_oauthlib didn't build the header the way it is laid out in the documentation, so I ended up reinventing the wheel and building the header myself, following the steps outlined in the documentation: https://www.mkmapi.eu/ws/documentation/API:Auth_OAuthHeader
The following script is not very beautiful, but it does its job.
import requests
from urllib import quote_plus as rawurlencode
import time
import string
import random
import operator
from hashlib import sha1
from hmac import new as hmac

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

# personal Info - taken from https://www.mkmapi.eu/ws/documentation/API:Auth_Overview
mkmAppToken = 'bfaD9xOU0SXBhtBP'
mkmAppSecret = 'pChvrpp6AEOEwxBIIUBOvWcRG3X9xL4Y'
mkmAccessToken = 'lBY1xptUJ7ZJSK01x4fNwzw8kAe5b10Q'
mkmAccessSecret = 'hc1wJAOX02pGGJK2uAv1ZOiwS7I9Tpoe'

# Url to access on mkm
# note that this deviates from the example in the header documentation
# (https://www.mkmapi.eu/ws/documentation/API:Auth_OAuthHeader) which uses
# accessUrl = 'https://www.mkmapi.eu/ws/v1.1/account'
accessUrl = 'https://www.mkmapi.eu/ws/v1.1/output.json/account'

# Method for access
MyMethod = "GET"

baseString = MyMethod + "&" + rawurlencode(accessUrl) + "&"

# create a random string
# the documentation in https://www.mkmapi.eu/ws/documentation/API:Auth_OAuthHeader uses
# nonce = 53eb1f44909d6
nonce = id_generator(8)

# what time is it?
# the documentation in https://www.mkmapi.eu/ws/documentation/API:Auth_OAuthHeader uses
# now = 1407917892
now = str(int(time.time()))

MyOauthmethod = "HMAC-SHA1"
MyOauthver = "1.0"

# define Parameters and values, order doesn't matter
paramDict = {"oauth_consumer_key": mkmAppToken, "oauth_token": mkmAccessToken,
             "oauth_nonce": nonce, "oauth_timestamp": now,
             "oauth_signature_method": MyOauthmethod, "oauth_version": MyOauthver}

# sorting of parameters is done here
sorted_paramDict = sorted(paramDict.items(), key=operator.itemgetter(0))

# collect the full parameters string
paramStr = ''
for kv in sorted_paramDict:
    paramStr = paramStr + kv[0] + "=" + kv[1] + "&"
# and get rid of the trailing ampersand
paramStr = paramStr[:-1]

# concatenate request and oauth parameters
baseString = baseString + rawurlencode(paramStr)

# concatenate both keys
signingKey = rawurlencode(mkmAppSecret) + "&" + rawurlencode(mkmAccessSecret)

# and create a hashed signature with the key and the baseString
Signature = hmac(signingKey, baseString, sha1).digest().encode('base64')[:-1]

# construct the header from the parameters and the URL and the signature
MyHeader = 'OAuth ' + 'realm="' + accessUrl + '", '
for kv in sorted_paramDict:
    MyHeader += kv[0] + '="' + kv[1] + '",'
MyHeader += 'oauth_signature="' + Signature + '"'

headers = {'Authorization': MyHeader}

# and now requests can do its magic (pun intended)
r = requests.get(accessUrl, headers=headers)
outjson = r.json()
You need to provide the realm as an argument to the OAuth1Session, like so:
session = requests_oauthlib.OAuth1Session(
    mkm_app_token,
    client_secret=mkm_app_secret,
    resource_owner_key=mkm_access_token,
    resource_owner_secret=mkm_token_secret,
    realm=url
)
Other things I have run into in the past include the fact that the mkm api doesn't (or at least didn't) accept URI-escaped parameters, so you may need to unescape them.
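For example, if a parameter value arrives already percent-encoded, it can be unescaped with the standard library before signing; a minimal sketch (the value here is hypothetical):

from urllib.parse import unquote

search_term = "Birds%20of%20Paradise"   # hypothetical, already-escaped value
print(unquote(search_term))             # -> Birds of Paradise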
For anyone who's reading in 2020: there's no need to reinvent the wheel, just pass the OAuth1 auth object and the parameters to requests. Here's an example with metaproducts/find:
import requests
from requests_oauthlib import OAuth1
import json
import passwords

card_name = 'Tarmogoyf'
output = 'output.json'

base_url = 'https://api.cardmarket.com/ws/v2.0/' + output + '/'
url = base_url + 'metaproducts/find'
params = {'search': card_name}

headeroauth = OAuth1(
    realm=url,
    client_key=passwords.mkm_app_token,
    client_secret=passwords.mkm_app_secret,
    resource_owner_key=passwords.mkm_access_token,
    resource_owner_secret=passwords.mkm_token_secret,
)

response = requests.get(
    url,
    params,
    auth=headeroauth
)

if response.ok:
    json_response = response.json()
    print(json.dumps(json_response, indent=4, sort_keys=True))
else:
    print(str(response.status_code) + " " + response.reason)
    exit()
The /output.json/ part of the URL makes the output JSON instead of XML.
