It seems I've run into a problem with the encoding itself at the point where I need to pass chunks of Bing translation output along.
def _unicode_urlencode(params):
    """Return *params* URL-encoded, sending unicode keys and values as UTF-8.

    ``params`` may be a dict or an iterable of ``(key, value)`` pairs.
    ``urllib.urlencode`` calls ``str()`` on every key and value, which raises
    for non-ASCII ``unicode`` objects, so those are encoded to UTF-8
    bytestrings first. Anything that is not ``unicode`` is passed through
    untouched.
    """
    def _to_utf8(value):
        # Explicit conditional instead of the ``cond and a or b`` idiom,
        # which silently takes the wrong branch whenever ``a`` is falsy.
        return value.encode('utf-8') if isinstance(value, unicode) else value

    if isinstance(params, dict):
        params = params.items()
    return urllib.urlencode([(_to_utf8(k), _to_utf8(v)) for k, v in params])
def _run_query(args):
    """Send *args* to ``api_url`` as a query string and return the parsed JSON.

    The response body may start with a UTF-8 or UTF-16 byte order mark. When
    one is present it is removed and the body is decoded with the matching
    encoding before being handed to ``json.loads``; otherwise the raw body is
    parsed as-is.
    """
    data = _unicode_urlencode(args)
    sock = urllib.urlopen(api_url + '?' + data)
    try:
        result = sock.read()
    finally:
        # Release the connection even when the read fails.
        sock.close()
    # Remove exactly one leading BOM by slicing. ``lstrip(bom)`` treats its
    # argument as a *set* of bytes and keeps stripping, so it can also eat
    # the first bytes of the payload (e.g. a UTF-8 character starting with
    # 0xEF) and corrupt the decode that follows.
    for bom, encoding in (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    ):
        if result.startswith(bom):
            result = result[len(bom):].decode(encoding)
            break
    return json.loads(result)
def set_app_id(new_app_id):
    """Store *new_app_id* in the module-level ``app_id`` variable."""
    globals()['app_id'] = new_app_id
def translate(text, source, target, html=False):
    """
    Query the API for ``text`` with ``source`` as 'from' and ``target`` as 'to'.

    The content type sent is 'text/html' when ``html`` is true and
    'text/plain' otherwise; the category is always 'general'. Returns
    whatever ``_run_query`` returns for these arguments.

    Raises ValueError when the module-level ``app_id`` is not set.
    """
    if not app_id:
        raise ValueError("AppId needs to be set by set_app_id")
    content_type = 'text/html' if html else 'text/plain'
    return _run_query({
        'appId': app_id,
        'text': text,
        'from': source,
        'to': target,
        'contentType': content_type,
        'category': 'general',
    })
...
text = translate(sys.argv[2], 'en', 'tr')
HOST = '127.0.0.1'
PORT = 894
# Encode only when the result is still unicode. Calling .encode('utf8') on a
# bytestring makes Python 2 decode it as ASCII first, which raises
# UnicodeDecodeError as soon as it contains non-ASCII (e.g. Turkish) bytes.
payload = text.encode('utf8') if isinstance(text, unicode) else text
s = socket.socket()
try:
    s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    s.connect((HOST, PORT))
    # sendall() keeps writing until the whole message is out; send() may
    # transmit only part of it.
    s.sendall("Bing translation: " + payload + "\r")
finally:
    # Close the socket even if connecting or sending fails.
    s.close()
As you can see, if the translated text contains Turkish characters, the script fails to send the text to the socket.
Do you have any idea how to fix this?
Regards.
Your problem is entirely unrelated to the socket. text is already a bytestring, and you're trying to encode it. What happens is that Python first tries to convert the bytestring to unicode using the safe ASCII codec so that it can then encode it as UTF-8, and that step fails because the bytestring contains non-ASCII characters.
You should fix translate to return a unicode object, by using a JSON decoder that returns unicode objects.
Alternatively, if text is already a bytestring encoded as UTF-8, you can simply use
s.send("Bing translation: " + text + "\r")
# -*- coding:utf-8 -*-
text = u"text in you language"
# Build the whole message as unicode and encode it once at the socket
# boundary. Concatenating unicode with an already-encoded bytestring makes
# Python 2 decode those bytes as ASCII, which fails for non-ASCII text.
s.send((u"Bing translation: " + text + u"\r").encode('utf8'))
This should work. The source file itself must be saved in UTF-8 so that the text literal is read correctly.
Related
I have image urls coming from FB like this
I want to decode this using Python
from urllib.parse import unquote
unquote('https\3a //scontent-yyz1-1.xx.fbcdn.net/v/t39.30808-6/276224522_526712219026525_574582522520082091_n.jpg?stp\3d cp0_dst-jpg_e15_p160x160_q65\26 _nc_cat\3d 101\26 ccb\3d 1-7\26 _nc_sid\3d 110474\26 efg\3d eyJpIjoidCJ9\26 _nc_ohc\3d GYg2KoaviA4AX_3gvNu\26 _nc_ht\3d scontent-yyz1-1.xx\26 oh\3d 00_AT9qampQP8qPyoKOJo4hW9hKRepgLX4krubFMFteOMaizA\26 oe\3d 62F8E686')
It returns this
'https\x03a //scontent-yyz1-1.xx.fbcdn.net/v/t39.30808-6/276224522_526712219026525_574582522520082091_n.jpg?stp\x03d cp0_dst-jpg_e15_p160x160_q65\x16 _nc_cat\x03d 101\x16 ccb\x03d 1-7\x16 _nc_sid\x03d 110474\x16 efg\x03d eyJpIjoidCJ9\x16 _nc_ohc\x03d GYg2KoaviA4AX_3gvNu\x16 _nc_ht\x03d scontent-yyz1-1.xx\x16 oh\x03d 00_AT9qampQP8qPyoKOJo4hW9hKRepgLX4krubFMFteOMaizA\x16 oe\x03d 62F8E686'
How do I decode it to proper URL?
A crude solution that replaces the characters in the string
def parse_fb_url(string):
    """Repair a Facebook CDN URL whose CSS hex escapes were mangled.

    The URLs arrive with CSS-style escapes such as ``\\3a `` (``:``),
    ``\\3d `` (``=``) and ``\\26 `` (``&``). Pasted into a Python string
    literal, ``\\3`` and ``\\26`` are read as octal escapes, leaving the
    control characters ``\\x03`` and ``\\x16`` in the text. This undoes that
    damage and removes the separator spaces.

    The control characters are replaced directly rather than via a
    ``unicode_escape`` round-trip, which would also turn every non-ASCII
    character, backslash or newline in the URL into an escape sequence.
    """
    # manual inspection of the url reveals the encoding
    replacement_map = {
        "\x03a": ":",
        "\x03d": "=",
        "\x16": "&",
        " ": "",
    }
    for key, replacement in replacement_map.items():
        string = string.replace(key, replacement)
    return string
# Sample input: in this (non-raw) literal Python reads "\3" and "\26" as
# octal escapes, so the string holds control characters, not backslashes.
string = "https\3a //scontent-yyz1-1.xx.fbcdn.net/v/t39.30808-6/276224522_526712219026525_574582522520082091_n.jpg?stp\3d cp0_dst-jpg_e15_p160x160_q65\26 _nc_cat\3d 101\26 ccb\3d 1-7\26 _nc_sid\3d 110474\26 efg\3d eyJpIjoidCJ9\26 _nc_ohc\3d GYg2KoaviA4AX_3gvNu\26 _nc_ht\3d scontent-yyz1-1.xx\26 oh\3d 00_AT9qampQP8qPyoKOJo4hW9hKRepgLX4krubFMFteOMaizA\26 oe\3d 62F8E686"
# Run the sample through parse_fb_url and show the repaired URL.
parsed_string = parse_fb_url(string)
print(parsed_string)
This gave me the following output:
https://scontent-yyz1-1.xx.fbcdn.net/v/t39.30808-6/276224522_526712219026525_574582522520082091_n.jpg?stp=cp0_dst-jpg_e15_p160x160_q65&_nc_cat=101&ccb=1-7&_nc_sid=110474&efg=eyJpIjoidCJ9&_nc_ohc=GYg2KoaviA4AX_3gvNu&_nc_ht=scontent-yyz1-1.xx&oh=00_AT9qampQP8qPyoKOJo4hW9hKRepgLX4krubFMFteOMaizA&oe=62F8E686
I am using requests library (python 3.9) to get filename from URL.[1] For some reason a file name is incorrectly encoded.
I should get "Ogłoszenie_0320.pdf" instead of "OgÅ\x82oszenie_0320.pdf".
My code looks something like this:
import requests
import re
def getFilenameFromRequest(url : str, headers):
    """Return the file name for a download, from its headers or its URL.

    Lookup order:
      1. the extended ``filename*=charset'lang'value`` parameter of the
         ``content-disposition`` header, percent-decoded with its charset;
      2. the plain ``filename=`` parameter, with surrounding quotes removed;
      3. the last path component of *url*.

    ``headers`` only needs a ``get`` method (a plain dict works).

    A plain ``filename=`` value that is really UTF-8 but was decoded as
    Latin-1 (e.g. "OgÅ\\x82oszenie" for "Ogłoszenie") is repaired by
    re-encoding as Latin-1 and decoding as UTF-8. If that round-trip fails
    the value is returned unchanged.
    """
    # Imported locally: the URL fallback needs both and they are not among
    # the imports shown next to this function.
    import os
    from urllib.parse import unquote, urlparse

    contentDisposition = headers.get('content-disposition')
    if contentDisposition:
        # Extended form first: it carries its own charset.
        extended = re.search(
            r"\bfilename\*\s*=\s*([^';\s]*)'[^']*'([^;]+)",
            contentDisposition,
            flags=re.IGNORECASE,
        )
        if extended:
            try:
                return unquote(extended.group(2).strip(),
                               encoding=extended.group(1) or 'utf-8')
            except LookupError:
                pass  # unknown charset label: fall back to plain filename=
        # Stop at the closing quote or the next ';' so that quotes and
        # trailing parameters do not end up in the file name.
        plain = re.search(
            r'\bfilename\s*=\s*("[^"]*"|[^;]*)',
            contentDisposition,
            flags=re.IGNORECASE,
        )
        if plain:
            filename = plain.group(1).strip().strip('"')
            if filename:
                try:
                    return filename.encode('latin-1').decode('utf-8')
                except UnicodeError:
                    # Not mis-decoded UTF-8; keep the value as received.
                    return filename
    # Parses from url
    parsedUrl = urlparse(url)
    return os.path.basename(parsedUrl.path)
def getFilenameFromUrl(url : str):
    """Issue a HEAD request for *url* and derive the file name from it.

    The response headers are passed, together with *url*, to
    ``getFilenameFromRequest``, whose result is returned.
    """
    response = requests.head(url)
    return getFilenameFromRequest(url, response.headers)
getFilenameFromUrl('https://przedszkolekw.bip.gov.pl'+
'/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html')
Any idea how to fix it?
I know for standard request I can set encoding directly:
request.encoding = 'utf-8'
But what am I supposed to do with this case?
[1]
https://przedszkolekw.bip.gov.pl/fobjects/download/880287/ogloszenie-uzp-nr-613234-pdf.html
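Header values should only contain characters from Latin-1 (the ASCII-based ISO-8859-1) [rfc]. Here the server sent the file name as UTF-8 bytes, and those bytes were then decoded as Latin-1, which produces the garbled name: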
>>> s = "Ogłoszenie_0320.pdf"
>>> s.encode("utf8").decode("unicode-escape")
'OgÅ\x82oszenie_0320.pdf'
To reverse the process you can do
>>> sx = 'OgÅ\x82oszenie_0320.pdf'
>>> sx.encode("latin-1").decode("utf8")
'Ogłoszenie_0320.pdf'
(updated after conversation in comments)
I am working with an exchange for cryptocurrency which requires an encoding of the secret API key to gain access to private API calls. I have copied and pasted their Python code to begin executing my calls with it, but I receive this error every time I make a request.
TypeError: Unicode-objects must be encoded before hashing
I know what this means; I am a programmer. I can not find the root of the problem in the code I have received from the exchange, as I have not worked with hmac, hashlib, or base64. I have replaced all instances of the name of the exchange with the word "exchange" in the following code. No API keys are shown.
exchangeconfig = Exchange('key', 'secret')
base = 'https://exchange.com/'
def post_request(key, secret, path, data):
    """POST *data* to ``base + path`` with an HMAC-SHA512 signature.

    The signature covers ``path``, a NUL byte and the request body, keyed
    with *secret*, and is sent Base64-encoded in the ``Rest-Sign`` header.
    *key* is sent in ``Rest-Key``. The JSON response is parsed and returned.

    *secret* must be bytes. *path* is text; *data* may be text or bytes.
    Text is encoded as UTF-8 because ``hmac`` and the POST body both
    require bytes on Python 3.
    """
    payload = data if isinstance(data, bytes) else data.encode('utf-8')
    message = path.encode('utf-8') + b'\0' + payload
    hmac_obj = hmac.new(secret, message, hashlib.sha512)
    # Header values must be text, so decode the Base64 bytes.
    hmac_sign = base64.b64encode(hmac_obj.digest()).decode('ascii')
    header = {
        'Content-Type': 'application/json',
        'User-Agent': 'exchangev2 based client',
        'Rest-Key': key,
        'Rest-Sign': hmac_sign,
    }
    proxy = ProxyHandler({'http': '127.0.0.1:8888'})
    opener = build_opener(proxy)
    install_opener(opener)
    # The body travels with the Request object; no need to pass it again.
    request = Request(base + path, payload, header)
    with urlopen(request) as response:
        return json.load(response)
def gen_tonce():
    """Return the current time in whole microseconds as a decimal string."""
    microseconds = int(time.time() * 1e6)
    return "{}".format(microseconds)
class Exchange:
    """Client holding an API key and secret for signed requests."""

    def __init__(self, key, secret):
        """Store *key* and the Base64-decoded *secret*.

        *secret* is expected in Base64 form; the decoded bytes are what
        ``post_request`` receives as the signing key.
        """
        self.key = key
        self.secret = base64.b64decode(secret)

    def request(self, path, params=None):
        """Send *params* (plus a fresh ``tonce``) to *path* as JSON.

        Returns the ``data`` member of the response when its ``result`` is
        ``'success'``; otherwise raises ``Exception`` carrying the
        ``result`` value. The caller's *params* mapping is never modified.
        """
        # None instead of a shared mutable ``{}`` default; copy so the
        # added 'tonce' key does not leak into the caller's object.
        params = dict(params or {})
        params['tonce'] = gen_tonce()
        data = json.dumps(params)
        result = post_request(self.key, self.secret, path, data)
        if result['result'] == 'success':
            return result['data']
        raise Exception(result['result'])
exchangeconfig.request("api/3/account")
Please help me figure this out.
By the way: It seems to have a problem with this line in particular:
hmac_obj = hmac.new(secret, path + chr(0) + data, hashlib.sha512)
Thanks.
UPDATE: Fixed that error. Now onto this one:
TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
These hash libraries deal with bytes objects, so you should encode your strings to bytes first (assuming the decoding end uses UTF-8):
hmac_obj = hmac.new(secret, path.encode('utf-8') + b'\0' + data.encode('utf-8'), hashlib.sha512)
I am trying to encode a text string to base64.
i tried doing this :
name = "your name"
print('encoding %s in base64 yields = %s\n'%(name,name.encode('base64','strict')))
But this gives me the following error:
LookupError: 'base64' is not a text encoding; use codecs.encode() to handle arbitrary codecs
How do I go about doing this ? ( using Python 3.4)
Remember to import base64 and that b64encode takes bytes as an argument.
import base64
b = base64.b64encode(bytes('your string', 'utf-8')) # bytes
base64_str = b.decode('utf-8') # convert bytes to string
It turns out that this is important enough to get its own module...
import base64
base64.b64encode(b'your name') # b'eW91ciBuYW1l'
base64.b64encode('your name'.encode('ascii')) # b'eW91ciBuYW1l'
For py3, base64 encode and decode string:
import base64
def b64e(s):
    """Return the Base64 text for string *s* (encoded with the default codec)."""
    raw = s.encode()
    encoded = base64.b64encode(raw)
    return encoded.decode()
def b64d(s):
    """Decode Base64 input *s* and return the result as a string."""
    raw = base64.b64decode(s)
    return raw.decode()
1) This works without imports in Python 2:
>>>
>>> 'Some text'.encode('base64')
'U29tZSB0ZXh0\n'
>>>
>>> 'U29tZSB0ZXh0\n'.decode('base64')
'Some text'
>>>
>>> 'U29tZSB0ZXh0'.decode('base64')
'Some text'
>>>
(although this doesn't work in Python3 )
2) In Python 3 you'd have to import base64 and do base64.b64decode('...')
- will work in Python 2 too.
For compatibility with both py2 and py3:
import six
import base64
def b64encode(source):
    """Return the Base64 encoding of *source* as text.

    *source* may be text or bytes. Text is encoded as UTF-8 first; bytes
    are used as they are. ``bytes`` is ``str`` on Python 2, so one
    ``isinstance`` check covers both Python 2 and Python 3.
    """
    if not isinstance(source, bytes):
        source = source.encode('utf-8')
    # The original computed this value but never returned it.
    return base64.b64encode(source).decode('utf-8')
It is essential to call decode() if you want an actual string, even after calling base64.b64decode on a Base64-encoded value, because b64decode always returns bytes.
import base64
# Text -> bytes -> Base64 -> bytes -> text, printing every stage.
conv_bytes = 'your string'.encode('utf-8')
print(conv_bytes)  # b'your string'
encoded_str = base64.b64encode(conv_bytes)
print(encoded_str)  # b'eW91ciBzdHJpbmc='
decoded_bytes = base64.b64decode(encoded_str)
print(decoded_bytes)  # b'your string'
print(decoded_bytes.decode())  # your string
Whilst you can of course use the base64 module, you can also use the codecs module (referred to in your error message) for binary encodings (meaning non-standard & non-text encodings).
For example:
import codecs
# Sample payload for the bytes-to-bytes codecs below. Each call returns a
# new bytes object; the results are not stored here.
my_bytes = b"Hello World!"
# Base64 transform.
codecs.encode(my_bytes, "base64")
# Hexadecimal transform.
codecs.encode(my_bytes, "hex")
# "zip" is an alias of the zlib compression codec.
codecs.encode(my_bytes, "zip")
# bz2 compression.
codecs.encode(my_bytes, "bz2")
This can come in useful for large data as you can chain them to get compressed and json-serializable values:
# Compress, Base64-encode, then decode to text so the value can be put in JSON.
my_large_bytes = my_bytes * 10000
compressed = codecs.encode(my_large_bytes, "zip")
armored = codecs.encode(compressed, "base64")
codecs.decode(armored, "utf8")
Refs:
https://docs.python.org/3/library/codecs.html#binary-transforms
https://docs.python.org/3/library/codecs.html#standard-encodings
https://docs.python.org/3/library/codecs.html#text-encodings
Use the below code:
import base64
#Taking input through the terminal.
welcomeInput= raw_input("Enter 1 to convert String to Base64, 2 to convert Base64 to String: ")
# Non-numeric input is treated like any other invalid choice instead of
# crashing with ValueError.
try:
    choice = int(welcomeInput)
except ValueError:
    choice = None
#Code to Convert String to Base 64.
if choice == 1:
    inputString= raw_input("Enter the String to be converted to Base64:")
    # raw_input already returns a byte string on Python 2; calling
    # .encode() on it would first decode it as ASCII and fail on
    # non-ASCII input.
    base64Value = base64.b64encode(inputString)
    print("Base64 Value = " + base64Value)
#Code to Convert Base 64 to String.
elif choice == 2:
    inputString= raw_input("Enter the Base64 value to be converted to String:")
    stringValue = base64.b64decode(inputString).decode('utf-8')
    # This branch prints the decoded string, so label it as such.
    print("String Value = " + stringValue)
else:
    print("Please enter a valid value.")
Base64 encoding is a process of converting binary data to an ASCII
string format by converting that binary data into a 6-bit character
representation. The Base64 method of encoding is used when binary
data, such as images or video, is transmitted over systems that are
designed to transmit data in a plain-text (ASCII) format.
Follow this link for further details about understanding and working of base64 encoding.
For those who want to implement base64 encoding from scratch for the sake of understanding, here's the code that encodes the string to base64.
encoder.py
#!/usr/bin/env python3.10
class Base64Encoder:
    """Educational from-scratch Base64 encoder (standard alphabet, '=' padding)."""

    # Index -> character lookup table; the list index is the 6-bit value.
    # Built lazily on the first call to encode().
    base64Encoding:list = None

    @staticmethod
    def encode(data)->str:
        """Return the Base64 encoding of *data* as a string.

        *data* must be ``str`` or ``bytes``. A ``str`` is encoded as UTF-8
        first, which gives the same result as ASCII for ASCII-only text and
        additionally accepts non-ASCII text. Empty input yields ``""``.

        Raises AttributeError for any other type (kept for compatibility
        with existing callers).

        Declared as a static method so that it can be called on the class
        as well as on an instance.
        """
        if not isinstance(data, (str, bytes)):
            raise AttributeError(f"Expected {type('')} or {type(b'')} but found {type(data)}")
        if isinstance(data, str):
            data = data.encode("utf-8")

        if Base64Encoder.base64Encoding is None:
            # A-Z, a-z, 0-9, '+', '/' in that order: values 0..63.
            Base64Encoder.base64Encoding = list(
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "abcdefghijklmnopqrstuvwxyz"
                "0123456789+/"
            )

        if len(data) == 0:
            return ""

        table = Base64Encoder.base64Encoding
        # Number of '=' characters: the bytes missing to reach a multiple of 3.
        bytes_to_append = -len(data) % 3

        # All input bits as one string of '0'/'1', eight per byte.
        bits = "".join(f"{byte:08b}" for byte in data)
        # Zero-fill so the bit string splits evenly into 6-bit groups.
        bits += "0" * (-len(bits) % 6)

        # Each 6-bit group is an index into the lookup table.
        symbols = [table[int(bits[i:i + 6], 2)] for i in range(0, len(bits), 6)]
        return "".join(symbols) + "=" * bytes_to_append
testEncoder.py
#!/usr/bin/env python3.10
from encoder import Base64Encoder
if __name__ == "__main__":
    # Encode a few text samples, then the raw bytes of an image file.
    for sample in ("Hello", "1 2 10 13 -7", "A"):
        print(Base64Encoder.encode(sample))
    with open("image.jpg", "rb") as image_file:
        image_bytes = image_file.read()
        print(Base64Encoder.encode(image_bytes))
Output:
$ ./testEncoder.py
SGVsbG8=
MSAyIDEwIDEzIC03
QQ==
I am trying to work with sqlite on python:
from pysqlite2 import dbapi2 as sqlite
con = sqlite.connect('/home/argon/super.db')
cur = con.cursor()
cur.execute('select * from notes')
for i in cur.fetchall():
print i[2]
And I sometimes get something like this (I am from Russia):
ÐÑÐ²ÐµÑ etc...
And if I pass this string to this function(it helped me in other projects):
def unescape(text):
    """Replace HTML character references and named entities in *text*.

    Numeric references (``&#NNN;`` decimal, ``&#xHHH;`` hexadecimal) and
    names found in ``htmlentitydefs.name2codepoint`` are replaced by the
    corresponding unicode character. Anything that cannot be converted is
    left exactly as it appeared.
    """
    def fixup(match):
        entity = match.group(0)
        if entity.startswith("&#"):
            # character reference
            try:
                if entity.startswith("&#x"):
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except ValueError:
                pass
        else:
            # named entity
            try:
                entity = unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
I get even more weird result:
ÐÑвеÑиÑÑ Ñ ÑиÑиÑованием etc
What should I do to get normal Cyrillic symbols?
Ð followed by the byte \x9E looks like the UTF-8 byte pair \xD0\x9E, which is the code point U+041E (decimal 1054). Better known as the Cyrillic character О (capital O).
In other words, you have strangely encoded UTF-8 data on your hands. Turn the numeric codes into bytes (chr(208) would do for the first one), then decode from UTF-8:
>>> (chr(208) + chr(158)).decode('utf-8')
u'\u041e'
>>> print (chr(208) + chr(158)).decode('utf-8')
О
>>> print (chr(208) + chr(158) + chr(209) + chr(130) + chr(208) + chr(178)).decode('utf-8')
Отв