How to correctly encode responses from an AD search? - python

I search in an MS Active Directory with Python and I stumbled upon an issue with the encoding of an answer I get.
I use the code below to recursively go through the AD
import ldap
import pickle
class SearchAD():
    """Collect the DN and title of everyone below a manager in the directory.

    Results accumulate in ``self.all`` as dicts with the keys ``'dn'``,
    ``'title'`` and ``'m'`` (the depth at which the entry was found).
    """

    def __init__(self):
        # Open the LDAP connection (protocol v3, OPT_REFERRALS set to 0) and bind.
        self.l = ldap.initialize("ldap://ldap.example.com")
        self.l.protocol_version = ldap.VERSION3
        self.l.set_option(ldap.OPT_REFERRALS, 0)
        bind = self.l.simple_bind_s("user", "password")
        # Every search is a subtree search rooted here.
        self.base = "DC=example,DC=com"
        # One dict per matching entry, appended by searchmgr().
        self.all = list()

    def searchmgr(self, criteria, m):
        """Recursively record the entries whose ``manager`` equals *criteria*.

        criteria -- DN of the manager whose reports are searched for.
        m -- depth of *criteria*; entries found here are stored with ``m + 1``.
        """
        print criteria
        m += 1
        attributes = ['dn', 'title']
        # NOTE(review): this is the call the reported UnicodeDecodeError comes
        # from. On recursion `criteria` is a DN returned by search_s, i.e. a
        # UTF-8 byte string; concatenating it to the unicode literal
        # u'manager=' presumably triggers an implicit ASCII decode, which
        # fails on the first non-ASCII byte (0xc3).
        result = self.l.search_s(self.base, ldap.SCOPE_SUBTREE, u'manager='+criteria, attributes)
        for u in result:
            # Each result is a (dn, attribute-dict) pair.
            cn, t = u
            # Skip results without a DN, disabled users, and entries with no title.
            if cn is not None and "Disabled Users" not in cn and t.get('title'):
                self.all.append({'dn': cn, 'title': t['title'][0], 'm': m})
                # Descend: look for the people this entry manages.
                self.searchmgr(cn, m)
# Walk the tree starting from the top-level manager at depth 0.
s = SearchAD()
s.searchmgr("CN=EXAMPLE Top,DC=example,DC=com", 0)
# Serialize the collected list of entries to dir.pickle.
with open("dir.pickle", "wb") as f:
    pickle.dump(s.all, f)
and get
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position
16: ordinal not in range(128)
when calling
result = self.l.search_s(self.base, ldap.SCOPE_SUBTREE, 'manager='+criteria, attributes)
The search works fine (I get the expected output via the print) until the string 'CN=DOE Marie-h\xc3\xa9l\xc3\xa8ne,OU=User Accounts,...' which translates to 'CN=DOE Marie-hélène,OU=User Accounts,...'.
This is expected as AD returns its results in UTF-8
I therefore tried to .encode('utf-8') the string and use
result = self.l.search_s(self.base, ldap.SCOPE_SUBTREE, u'manager='+criteria.encode('utf-8'), attributes)
but I get the same error.
How should I handle the result returned by AD?

Related

VerQueryValueW issue python 3

I'm trying to get the version of files through GetFileVersionInfoSizeW and VerQueryValueW. Only part of the version is printed, not the whole string, and there are odd spaces between the characters of the file version. Does anyone have an idea what is wrong with it?
My guess is it is related to the Unicode interpretation of python3 since I had to change the GetFileVersionInfoSizeW and VerQueryValueW from the original GetFileVersionInfoSizeA and VerQueryValueA that ran normally in python2 (https://stackoverflow.com/a/38924793/7144869).
import array
from ctypes import *
def get_file_info(filename):
    """
    Extract information from a file.

    Returns 'Failed' when the file has no version resource, '' when no
    translation (codepage) entry is found, and otherwise the bytes that
    string_at reads at the FileVersion value.
    """
    # Get size needed for buffer (0 if no info)
    size = windll.version.GetFileVersionInfoSizeW(filename, None)
    # If no info in file -> report failure
    if not size:
        return 'Failed'
    # Create buffer
    res = create_string_buffer(size)
    # Load the file's version information into buffer res
    windll.version.GetFileVersionInfoW(filename, None, size, res)
    # r receives the address of the queried value, l its length
    r = c_uint()
    l = c_uint()
    # Look for codepages
    windll.version.VerQueryValueW(res, '\\VarFileInfo\\Translation',
                                  byref(r), byref(l))
    # If no codepage -> empty string
    if not l.value:
        return ''
    # Take the first codepage (what else ?)
    codepages = array.array('H', string_at(r.value, l.value))
    codepage = tuple(codepages[:2].tolist())
    # Extract information
    windll.version.VerQueryValueW(res, ('\\StringFileInfo\\%04x%04x\\'
                                        + 'FileVersion') % codepage, byref(r), byref(l))
    # NOTE(review): string_at reads l.value *bytes*. With the W API the value
    # is presumably UTF-16 text and l.value a character count, which would
    # explain the truncated, space-separated output described in the question.
    return string_at(r.value, l.value)


print (get_file_info(r'C:\WINDOWS\system32\calc.exe').decode())
The functions return what Microsoft calls "Unicode" strings, which are really UTF-16LE-encoded text that ctypes.wstring_at can convert. l.value is a count of UTF-16 characters, not bytes, so use the following to decode it properly. You won't need to .decode() the result as you are doing now.
return wstring_at(r.value, l.value)
Here's my working code:
from ctypes import *
from ctypes import wintypes as w
# Load version.dll and declare argument/return types for the three functions
# used below, so ctypes converts arguments explicitly instead of guessing.
ver = WinDLL('version')
# GetFileVersionInfoSizeW(filename, handle-out) -> size of the version block
ver.GetFileVersionInfoSizeW.argtypes = w.LPCWSTR, w.LPDWORD
ver.GetFileVersionInfoSizeW.restype = w.DWORD
# GetFileVersionInfoW(filename, handle, buffer size, buffer) -> success flag
ver.GetFileVersionInfoW.argtypes = w.LPCWSTR, w.DWORD, w.DWORD, w.LPVOID
ver.GetFileVersionInfoW.restype = w.BOOL
# VerQueryValueW(block, sub-block path, value-pointer-out, length-out) -> success flag
ver.VerQueryValueW.argtypes = w.LPCVOID, w.LPCWSTR, POINTER(w.LPVOID), w.PUINT
ver.VerQueryValueW.restype = w.BOOL
def get_file_info(filename):
    """Return the FileVersion string stored in a file's version resource.

    filename -- path of the file to inspect.

    Returns the text read with ``wstring_at`` from the ``FileVersion`` entry
    of the first language/codepage pair listed in the translation table.

    Raises RuntimeError when the file has no version information, when the
    version block cannot be loaded, or when either query fails.
    """
    # Imported locally: the import lines of this snippet only bring in ctypes,
    # so without this the array.array call below raises NameError.
    import array

    size = ver.GetFileVersionInfoSizeW(filename, None)
    if not size:
        raise RuntimeError('version info not found')
    res = create_string_buffer(size)
    if not ver.GetFileVersionInfoW(filename, 0, size, res):
        raise RuntimeError('GetFileVersionInfoW failed')
    # VerQueryValueW stores the value's address in buf and its length in length.
    buf = w.LPVOID()
    length = w.UINT()
    # Look for codepages
    if not ver.VerQueryValueW(res, r'\VarFileInfo\Translation', byref(buf), byref(length)):
        raise RuntimeError('VerQueryValueW failed to find translation')
    if length.value == 0:
        raise RuntimeError('no code pages')
    # The translation table is read as 16-bit values; the first two form the
    # language/codepage pair used in the StringFileInfo path below.
    codepages = array.array('H', string_at(buf.value, length.value))
    codepage = tuple(codepages[:2])
    # Extract information
    if not ver.VerQueryValueW(res, rf'\StringFileInfo\{codepage[0]:04x}{codepage[1]:04x}\FileVersion', byref(buf), byref(length)):
        raise RuntimeError('VerQueryValueW failed to find file version')
    # The value is wide-character text, so it is read with wstring_at.
    return wstring_at(buf.value, length.value)


print(get_file_info(r'c:\windows\system32\calc.exe'))
Output:
10.0.19041.1 (WinBuild.160101.0800)

Handling accents in Oracle from Python

I have the following code:
#!/usr/bin/python
# coding=UTF-8
import cx_Oracle
def oracle_connection(user, passwd, host, port, service):
    """Open and return a cx_Oracle connection.

    user, passwd -- database credentials.
    host, port, service -- location of the database; all are strings and are
        concatenated into the connect string.

    Returns the connection object from ``cx_Oracle.connect``.

    Raises cx_Oracle.DatabaseError when the connection cannot be opened. The
    error is logged first, then re-raised; previously the function fell
    through to the ``return`` with nothing bound and raised UnboundLocalError.
    """
    # Easy Connect form: user/password@host:port/service_name
    oracle_con_details = user + '/' + passwd + '@' + host + ':' + port + '/' + service
    try:
        # Named `connection` so the local no longer shadows this function.
        connection = cx_Oracle.connect(oracle_con_details)
    except cx_Oracle.DatabaseError as e:
        error, = e.args
        # ORA-01017: invalid username/password
        if error.code == 1017:
            log.warning('Please check your credentials.')
        else:
            log.error('Database connection error: ')
            log.error(e)
        raise
    return connection
# Placeholder connection settings.
user_oracle = "user"
passw_oracle = "pass"
host_oracle = "host"
port_oracle = "port"
service_oracle = "service"
con_oracle = oracle_connection(user_oracle, passw_oracle, host_oracle, port_oracle, service_oracle)
# The literal contains the non-ASCII character 'Ó'; executing it is what
# raises the UnicodeEncodeError quoted below.
query = """ SELECT COUNT(*) FROM TABLE WHERE MYDATA = 'REUNIÓN'"""
cursor_oracle = con_oracle.cursor()
cursor_oracle.execute(query)
data_tuple = cursor_oracle.fetchall()
Of course, Oracle credentials and query are just examples. Notice the query has 'Ó' character. This is the one giving me the following error:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xd3' in
position 49: ordinal not in range(128)
I've tried the solutions from some other questions here:
query.decode('utf-8')
query.encode('utf-8')
query.decode('unicode')
query.encode('unicode')
I understand my string (query) is encoded in unicode but I just don't understand why decoding it in utf-8 doesn't work.
Because of this my query doesn't get to Oracle how it should.
ADDITIONAL INFO:
Based on this answer I thought mystring.encode('utf-8') would work.
I checked the type of my string with this method just in case and the result is 'ordinary string'.
Adding this to my python code solved it.
import os
os.environ["NLS_LANG"] = "SPANISH_SPAIN.UTF8"

Mutagen 1.22 Encoding Issue

I am having an issue with character encoding with Mutagen.
I cast dict[key] to Unicode, but all I receive are errors. The character in question is U+00E9 (é), but what it prints is ├⌐. I am assuming the default character set for Mutagen is UTF-8, but is there a way to fix this?
Output:
Winter Wonderland.mp3
Album : Christmas
Album Artist: Michael Bublé
Artist : Michael Bublé
Composer : None
Disk : None
Encoded By : None
Genre : Christmas
Title : Winter Wonderland
Track : 17/19
Year : 2011
Code:
#!/usr/bin/env python
import os
import re
from mutagen.mp3 import MP3
# Matches any character followed by a capitalised word (upper-case letter plus
# lower-case letters); used to insert '_' before that word.
first_cap_re = re.compile(r'(.)([A-Z][a-z]+)')
# Matches a lower-case letter or digit directly followed by an upper-case letter.
all_cap_re = re.compile(r'([a-z0-9])([A-Z])')


def convertCamelCase2Underscore(name):
    """Return *name* converted from camelCase to lower-case snake_case.

    Example: 'albumArtist' -> 'album_artist'.
    """
    s1 = first_cap_re.sub(r'\1_\2', name)
    return all_cap_re.sub(r'\1_\2', s1).lower()


def convertCamelCase2CapitalizedWords(name):
    """Return *name* converted from camelCase to space-separated capitalised words.

    Example: 'albumArtist' -> 'Album Artist'.
    """
    words = convertCamelCase2Underscore(name).split('_')
    return ' '.join(word.capitalize() for word in words)
def safeValue(dict, key):
    """Return ``dict[key]``, or None when *key* is not in *dict*.

    dict -- any container supporting ``in`` and item lookup. The parameter
        keeps its original name, which shadows the builtin, so existing
        keyword callers keep working.
    key -- the key to look up.
    """
    return dict[key] if key in dict else None
class Track:
    """Selected ID3 frames of one MP3 file, printable as an aligned listing.

    Each attribute holds the frame object returned for its ID3 key, or None
    when the file has no such frame.
    """

    def __init__(self, path):
        """Read the ID3 frames of the MP3 file at *path*."""
        audio = MP3(path)
        self.title = safeValue(audio, 'TIT2')
        self.artist = safeValue(audio, 'TPE1')
        self.albumArtist = safeValue(audio, 'TPE2')
        self.album = safeValue(audio, 'TALB')
        self.genre = safeValue(audio, 'TCON')
        self.year = safeValue(audio, 'TDRL')
        self.encodedBy = safeValue(audio, 'TENC')
        self.composer = safeValue(audio, 'TXXX:TCM')
        self.track = safeValue(audio, 'TRCK')
        self.disk = safeValue(audio, 'TXXX:TPA')

    def __repr__(self):
        """Return one 'Label       : value' line per attribute, sorted by attribute name."""
        # Attribute names are unique, so sorting the (name, value) pairs never
        # needs to compare the values themselves.
        lines = [
            '{:12s}: {:s}\n'.format(convertCamelCase2CapitalizedWords(k), v)
            for k, v in sorted(self.__dict__.items())
        ]
        return ''.join(lines)
# Print the tag listing of every entry in the current directory.
files = os.listdir('.')
for filename in files:
    print filename
    # Printing a Track uses Track.__repr__ for the text.
    print Track(filename)
I am assuming the default character set for Mutagen is UTF-8
Mutagen returns Unicode strings, though wrapped in a TextFrame object. When you print that object it's an implicit str() conversion of the text property to bytes, and Mutagen (arbitrarily) chooses UTF-8 for that encoding.
Unfortunately the Windows console doesn't support UTF-8[1]. The encoding it uses varies but in your case you are getting the US DOS code page 437 where the byte sequence 0xC3 0xA9 represents ├⌐ and not é. You could try to print to the console in the encoding that it wants by explicitly encoding to it:
print unicode(audio['TIT2']).encode(sys.stdout.encoding) # 'cp437'
but this will still only allow you to print characters that are supported in that code page. 437 is OK for Michael Bublé, but not so good for 東京事変. There isn't a good way to get Unicode out to the Windows console.[2]
[1] There is code page 65001 which is supposed to be UTF-8, but there are bugs in the MS implementation which usually make it unusable.
[2] You can, if you must, call the Win32 API WriteConsoleW directly using ctypes, but then you have to take care to only do that when you are connected to a Windows console and not any other type of stream so you don't break everywhere else. It's usually not worth it; Windows users are assumed to be used to a console where non-ASCII characters just break all the time.

Python socket.send encoding

It seems i've run a problem with the encoding itself in where i need to pass Bing translation junks..
def _unicode_urlencode(params):
    """URL-encode *params*, encoding unicode values as UTF-8 first.

    params -- a dict or an iterable of (key, value) pairs.

    Returns the query string produced by ``urllib.urlencode``.
    """
    if isinstance(params, dict):
        params = params.items()
    # A conditional expression replaces the `cond and a or b` idiom, which
    # picks the wrong branch whenever the encoded value is falsy.
    return urllib.urlencode(
        [(k, v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in params]
    )
def _run_query(args):
    """Send *args* as a query string to ``api_url`` and return the parsed JSON reply.

    args -- dict (or pair list) of query parameters.

    A leading UTF-8 or UTF-16 byte order mark is removed and the body decoded
    accordingly before it is handed to ``json.loads``.
    """
    data = _unicode_urlencode(args)
    sock = urllib.urlopen(api_url + '?' + data)
    try:
        result = sock.read()
    finally:
        # Release the connection even when reading fails.
        sock.close()
    known_boms = (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    )
    for bom, encoding in known_boms:
        if result.startswith(bom):
            # Slice the BOM off as a prefix. lstrip(bom) treats its argument
            # as a *set* of bytes and can also remove payload bytes that
            # happen to be in that set.
            result = result[len(bom):].decode(encoding)
            break
    return json.loads(result)
def set_app_id(new_app_id):
    """Store *new_app_id* in the module-level ``app_id`` that translate() reads."""
    global app_id
    app_id = new_app_id
def translate(text, source, target, html=False):
    """Translate *text* from language *source* to language *target*.

    text -- the text to translate.
    source, target -- language codes, e.g. 'en' and 'tr'.
    html -- when true the text is submitted as 'text/html', otherwise as
        'text/plain'.

    Returns whatever ``_run_query`` returns for the request.

    Raises ValueError when no application id has been set with set_app_id().
    """
    # Looked up through globals() so that a missing app_id (set_app_id never
    # called) produces the intended ValueError rather than a NameError.
    current_app_id = globals().get('app_id')
    if not current_app_id:
        raise ValueError("AppId needs to be set by set_app_id")
    query_args = {
        'appId': current_app_id,
        'text': text,
        'from': source,
        'to': target,
        'contentType': 'text/html' if html else 'text/plain',
        'category': 'general',
    }
    return _run_query(query_args)
...
# Translate the second command-line argument from English to Turkish.
text = translate(sys.argv[2], 'en', 'tr')
HOST = '127.0.0.1'
PORT = 894
# Connect to the local listener with TCP_NODELAY set to 1.
s = socket.socket()
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
s.connect((HOST, PORT))
# NOTE(review): this is the line reported as failing when the translation
# contains Turkish characters; see the discussion of text.encode below.
s.send("Bing translation: " + text.encode('utf8') + "\r");
s.close()
As you can see, if the translated text contains some turkish characters, the script fails to send the text to the socket..
Do you have any idea on how to get rid of this?
Regards.
Your problem is entirely unrelated to the socket. text is already a bytestring, and you're trying to encode it. What happens is that Python tries to convert the bytestring to a unicode object via the safe ASCII encoding in order to be able to encode it as UTF-8, and then fails because the bytestring contains non-ASCII characters.
You should fix translate to return a unicode object, by using a JSON library that returns unicode objects.
Alternatively, if text is already encoded as UTF-8, you can simply use
s.send("Bing translation: " + text + "\r")
# -*- coding:utf-8 -*-
text = u"text in you language"
s.send(u"Bing translation: " + text.encode('utf8') + u"\r");
This must work. text must be spelled in utf-8 encoding.

encodings again

I am trying to work with sqlite on python:
from pysqlite2 import dbapi2 as sqlite
# Open the database and print the third column of every row in `notes`.
con = sqlite.connect('/home/argon/super.db')
cur = con.cursor()
cur.execute('select * from notes')
for i in cur.fetchall():
    print i[2]
And I sometimes get something like this (I am from Russia):
&#208;&#158;&#209;&#130;&#208;&#178;&#208;&#181;&#209;&#130; etc...
And if I pass this string to this function(it helped me in other projects):
def unescape(text):
    """Replace HTML character references and named entities in *text*.

    Decimal (``&#1054;``) and hexadecimal (``&#x41e;``) character references
    and known named entities (``&amp;``) are replaced by the character they
    stand for. Anything that cannot be resolved is left unchanged.

    Returns the text with the replacements applied.
    """
    # Local imports keep the function self-contained on Python 2 and 3; the
    # original relied on module-level `re` and `htmlentitydefs` imports that
    # the snippet never shows.
    import re
    try:
        from htmlentitydefs import name2codepoint  # Python 2
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint  # Python 3
        to_char = chr

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                text = to_char(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
I get even more weird result:
ÐÑвеÑиÑÑ Ñ ÑиÑиÑованием etc
What should I do to get normal Cyrillic symbols?
&#208;&#158; looks like a UTF-8 byte pair, \xD0\x9E, which decodes to U+041E (decimal 1054), better known as the Cyrillic character О (capital O).
In other words, you have strangely encoded UTF-8 data on your hands. Turn the &#nnn; digits into bytes (chr(208) would do), then decode from UTF-8:
>>> (chr(208) + chr(158)).decode('utf-8')
u'\u041e'
>>> print (chr(208) + chr(158)).decode('utf-8')
О
>>> print (chr(208) + chr(158) + chr(209) + chr(130) + chr(208) + chr(178)).decode('utf-8')
Отв

Categories

Resources