A faulty unicode string is one whose code points are really the bytes of an encoded string, accidentally carried over as-is.
For example:
Text: שלום
Windows-1255-encoded: \x99\x8c\x85\x8d
Unicode: u'\u05e9\u05dc\u05d5\u05dd'
Faulty Unicode: u'\x99\x8c\x85\x8d'
I sometimes bump into such strings when parsing ID3 tags in MP3 files. How can I fix these strings? (e.g. convert u'\x99\x8c\x85\x8d' into u'\u05e9\u05dc\u05d5\u05dd')
You could convert u'\x99\x8c\x85\x8d' to '\x99\x8c\x85\x8d' using the latin-1 encoding:
In [9]: x = u'\x99\x8c\x85\x8d'
In [10]: x.encode('latin-1')
Out[10]: '\x99\x8c\x85\x8d'
However, it seems like this is not a valid Windows-1255-encoded string. Did you perhaps mean '\xf9\xec\xe5\xed'? If so, then
In [22]: x = u'\xf9\xec\xe5\xed'
In [23]: x.encode('latin-1').decode('cp1255')
Out[23]: u'\u05e9\u05dc\u05d5\u05dd'
converts u'\xf9\xec\xe5\xed' to u'\u05e9\u05dc\u05d5\u05dd' which matches the desired unicode you posted.
If you really want to convert u'\x99\x8c\x85\x8d' into u'\u05e9\u05dc\u05d5\u05dd', then this happens to work:
In [27]: u'\x99\x8c\x85\x8d'.encode('latin-1').decode('cp862')
Out[27]: u'\u05e9\u05dc\u05d5\u05dd'
The above encoding/decoding chain was found using this script:
guess_chain_encodings.py
"""
Usage example: guess_chain_encodings.py "u'баба'" "u'\xe1\xe0\xe1\xe0'"
"""
import six
import argparse
import binascii
import zlib
import ast
import collections
import itertools
import random
import pkgutil
import encodings as encodings_pkg

def all_encodings():
    # Stand-in for the author's utils_string.all_encodings() helper (not shown
    # in the original post): enumerate the codec modules shipped in the stdlib
    # encodings package. Non-codec entries are filtered out later by the
    # try/except in candidates().
    return set(name for _, name, _ in pkgutil.iter_modules(encodings_pkg.__path__))

encodings = all_encodings()

Errors = (IOError, UnicodeEncodeError, UnicodeError, LookupError,
          TypeError, ValueError, binascii.Error, zlib.error)
def breadth_first_search(text, all=False):
    seen = set()
    tasks = collections.deque()
    tasks.append(([], text))
    while tasks:
        encs, text = tasks.popleft()
        for enc, newtext in candidates(text):
            if repr(newtext) not in seen:
                if not all:
                    seen.add(repr(newtext))
                newtask = encs + [enc], newtext
                tasks.append(newtask)
                yield newtask
def candidates(text):
    f = text.encode if isinstance(text, six.text_type) else text.decode
    results = []
    for enc in encodings:
        try:
            results.append((enc, f(enc)))
        except Errors:
            pass
    random.shuffle(results)
    for r in results:
        yield r
def fmt(encs, text):
    encode_decode = itertools.cycle(['encode', 'decode'])
    if not isinstance(text, six.text_type):
        next(encode_decode)
    chain = '.'.join("{f}('{e}')".format(f=func, e=enc)
                     for enc, func in zip(encs, encode_decode))
    return '{t!r}.{c}'.format(t=text, c=chain)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('start', type=ast.literal_eval, help='starting unicode')
    parser.add_argument('stop', type=ast.literal_eval, help='ending unicode')
    parser.add_argument('--all', '-a', action='store_true')
    args = parser.parse_args()
    min_len = None
    for encs, text in breadth_first_search(args.start, args.all):
        if min_len is not None and len(encs) > min_len:
            break
        if type(text) == type(args.stop) and text == args.stop:
            print(fmt(encs, args.start))
            min_len = len(encs)

if __name__ == '__main__':
    main()
Running
% guess_chain_encodings.py "u'\x99\x8c\x85\x8d'" "u'\u05e9\u05dc\u05d5\u05dd'" --all
yields
u'\x99\x8c\x85\x8d'.encode('latin_1').decode('cp862')
u'\x99\x8c\x85\x8d'.encode('charmap').decode('cp862')
u'\x99\x8c\x85\x8d'.encode('rot_13').decode('cp856')
etc.
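In general, once the real codec is known (or guessed), the repair is the same two-step round trip used above: re-encode with latin-1 to recover the raw bytes, then decode with the actual codec. A minimal helper along those lines (fix_faulty is an illustrative name, not a library function):

def fix_faulty(s, actual_codec):
    # each code point becomes one byte again, then the bytes are decoded properly
    return s.encode('latin-1').decode(actual_codec)

print(fix_faulty(u'\xf9\xec\xe5\xed', 'cp1255'))  # -> שלום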
Related
Let's say I have an incoming string over a socket, and it looks like this:
'text.....text {"foo": {"bar":100}} text {"bar":2} test {"foo"'
What is the best way/library to extract only the json objects out of the incoming string?
I have tried simplejson.JSONDecoder from the simplejson library. However, it is not finding only the objects, or I didn't know how to use it.
I have tried something like this so far
import simplejson as json
input_buffer = ""
def in_data(data):
    input_buffer += data
    try:
        dict, idx = json.JSONDecoder().raw_decode(input_buffer)
    except:
        pass  # handle exception in case nothing is found
    self.handle_input(dict)  # send the dictionary for processing
    input_buffer = input_buffer[idx:]
A pure-Python solution based on simple bracket counting and on trying to parse the gathered text:
import json

inp = list(r'text {"foo": {"bar":100}} text {"bar":2} test ')
stack = ''
bracket_counter = 0
jsons = []

while inp:
    char = inp.pop(0)
    if char == '{':
        bracket_counter += 1
    if bracket_counter:
        stack += char
    if char == '}':
        bracket_counter -= 1
        if bracket_counter == 0:
            try:
                parsed = json.loads(stack)
                jsons.append(parsed)
            except json.JSONDecodeError:
                pass
            stack = ''

print(jsons)  # -> [{'foo': {'bar': 100}}, {'bar': 2}]
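Alternatively, the raw_decode approach from the question can be made to work by scanning for each '{' and letting the decoder attempt a parse from there; unlike bare bracket counting, this also copes with braces inside strings. A minimal sketch (extract_json_objects is an illustrative name, not a library function):

import json

def extract_json_objects(text):
    # scan for each '{' and let the decoder try to parse an object from there
    decoder = json.JSONDecoder()
    pos = 0
    while True:
        start = text.find('{', pos)
        if start == -1:
            return
        try:
            obj, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pos = start + 1  # no valid object starts here; keep scanning
        else:
            yield obj
            pos = end

print(list(extract_json_objects(
    'text {"foo": {"bar":100}} text {"bar":2} test {"foo"')))
# -> [{'foo': {'bar': 100}}, {'bar': 2}]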
I'm trying to get the version of files through GetFileVersionInfoSizeW and VerQueryValueW. I got part of the version printed out, but not the entire thing, and there are some odd spaces between the characters of the file version. Does anyone have an idea what is wrong?
My guess is that it is related to Python 3's Unicode handling, since I had to change the original GetFileVersionInfoSizeA and VerQueryValueA, which ran normally in Python 2 (https://stackoverflow.com/a/38924793/7144869), to GetFileVersionInfoSizeW and VerQueryValueW.
import array
from ctypes import *

def get_file_info(filename):
    """
    Extract information from a file.
    """
    # Get size needed for buffer (0 if no info)
    size = windll.version.GetFileVersionInfoSizeW(filename, None)
    # If no info in file -> empty string
    if not size:
        return 'Failed'
    # Create buffer
    res = create_string_buffer(size)
    # Load file information into buffer res
    windll.version.GetFileVersionInfoW(filename, None, size, res)
    r = c_uint()
    l = c_uint()
    # Look for codepages
    windll.version.VerQueryValueW(res, '\\VarFileInfo\\Translation',
                                  byref(r), byref(l))
    # If no codepage -> empty string
    if not l.value:
        return ''
    # Take the first codepage (what else?)
    codepages = array.array('H', string_at(r.value, l.value))
    codepage = tuple(codepages[:2].tolist())
    # Extract information
    windll.version.VerQueryValueW(res, ('\\StringFileInfo\\%04x%04x\\'
                                        + 'FileVersion') % codepage, byref(r), byref(l))
    return string_at(r.value, l.value)

print(get_file_info(r'C:\WINDOWS\system32\calc.exe').decode())
The functions return what Microsoft calls "Unicode" strings, but they are really UTF-16LE-encoded, which ctypes.wstring_at can convert. l.value is a count of UTF-16 characters, not bytes, so use the following to decode it properly. You then won't need the .decode() you are calling now.
return wstring_at(r.value, l.value)
Here's my working code:
import array
from ctypes import *
from ctypes import wintypes as w

ver = WinDLL('version')
ver.GetFileVersionInfoSizeW.argtypes = w.LPCWSTR, w.LPDWORD
ver.GetFileVersionInfoSizeW.restype = w.DWORD
ver.GetFileVersionInfoW.argtypes = w.LPCWSTR, w.DWORD, w.DWORD, w.LPVOID
ver.GetFileVersionInfoW.restype = w.BOOL
ver.VerQueryValueW.argtypes = w.LPCVOID, w.LPCWSTR, POINTER(w.LPVOID), w.PUINT
ver.VerQueryValueW.restype = w.BOOL

def get_file_info(filename):
    size = ver.GetFileVersionInfoSizeW(filename, None)
    if not size:
        raise RuntimeError('version info not found')
    res = create_string_buffer(size)
    if not ver.GetFileVersionInfoW(filename, 0, size, res):
        raise RuntimeError('GetFileVersionInfoW failed')
    buf = w.LPVOID()
    length = w.UINT()
    # Look for codepages
    if not ver.VerQueryValueW(res, r'\VarFileInfo\Translation', byref(buf), byref(length)):
        raise RuntimeError('VerQueryValueW failed to find translation')
    if length.value == 0:
        raise RuntimeError('no code pages')
    codepages = array.array('H', string_at(buf.value, length.value))
    codepage = tuple(codepages[:2])
    # Extract information
    if not ver.VerQueryValueW(res, rf'\StringFileInfo\{codepage[0]:04x}{codepage[1]:04x}\FileVersion',
                              byref(buf), byref(length)):
        raise RuntimeError('VerQueryValueW failed to find file version')
    return wstring_at(buf.value, length.value)

print(get_file_info(r'c:\windows\system32\calc.exe'))
Output:
10.0.19041.1 (WinBuild.160101.0800)
I'd like to transform all occurrences of "assert_equal(a, b)" in a Python module to "assert a == b" via the standard library's lib2to3. I wrote a script that takes source as input:
# convert.py
# convert assert_equal(a, b) to assert a == b
from lib2to3 import refactor
refac = refactor.RefactoringTool(['fix_assert_equal'])
result = refac.refactor_string('assert_equal(123, 456)\n', 'assert equal')
print(result)
and the actual fixer in a separate module:
# fix_assert_equal.py, in same folder as convert.py
from lib2to3 import fixer_base, pygram, pytree, pgen2
import ast
import logging
grammar = pygram.python_grammar
logger = logging.getLogger("RefactoringTool")
driver = pgen2.driver.Driver(grammar, convert=pytree.convert, logger=logger)
dest_tree = driver.parse_string('assert a == b\n')
class FixAssertEqual(fixer_base.BaseFix):
    BM_compatible = True
    PATTERN = """
    power< 'assert_equal'
           trailer<
               '('
               arglist<
                   obj1=any ','
                   obj2=any
               >
               ')'
           >
    >
    """

    def transform(self, node, results):
        assert results
        obj1 = results["obj1"]
        obj1 = obj1.clone()
        obj1.prefix = ""
        obj2 = results["obj2"]
        obj2 = obj2.clone()
        obj2.prefix = ""
        prefix = node.prefix
        dest_tree2 = dest_tree.clone()
        node = dest_tree2.children[0].children[0].children[1]
        node.children[0] = obj1
        node.children[2] = obj2
        dest_tree2.prefix = prefix
        return dest_tree2
However, this produces the output assert123 ==456 instead of assert 123 == 456. Any idea how to fix this?
I found the solution: obj2.prefix should not be set to ""; that assignment is what removes the space before the object.
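Concretely, the cloned prefixes just need to preserve the spacing. A sketch of a corrected transform under the same dest_tree as above (the self-answer only mentions obj2; obj1 is given a single-space prefix here for the same reason, so it does not collide with assert):

def transform(self, node, results):
    obj1 = results["obj1"].clone()
    obj1.prefix = " "                # keep a space between 'assert' and the value
    obj2 = results["obj2"].clone()   # do NOT clear obj2.prefix; it carries the space after '=='
    dest_tree2 = dest_tree.clone()
    comparison = dest_tree2.children[0].children[0].children[1]
    comparison.children[0] = obj1
    comparison.children[2] = obj2
    dest_tree2.prefix = node.prefix
    return dest_tree2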
I am trying to encode a text string to base64.
I tried doing this:
name = "your name"
print('encoding %s in base64 yields = %s\n'%(name,name.encode('base64','strict')))
But this gives me the following error:
LookupError: 'base64' is not a text encoding; use codecs.encode() to handle arbitrary codecs
How do I go about doing this? (using Python 3.4)
Remember to import base64 and that b64encode takes bytes as an argument.
import base64
b = base64.b64encode(bytes('your string', 'utf-8')) # bytes
base64_str = b.decode('utf-8') # convert bytes to string
It turns out that this is important enough to get its own module...
import base64
base64.b64encode(b'your name') # b'eW91ciBuYW1l'
base64.b64encode('your name'.encode('ascii')) # b'eW91ciBuYW1l'
For Python 3, to base64-encode and -decode a string:
import base64

def b64e(s):
    return base64.b64encode(s.encode()).decode()

def b64d(s):
    return base64.b64decode(s).decode()
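For example:
>>> b64e('your name')
'eW91ciBuYW1l'
>>> b64d('eW91ciBuYW1l')
'your name'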
1) This works without imports in Python 2:
>>> 'Some text'.encode('base64')
'U29tZSB0ZXh0\n'
>>> 'U29tZSB0ZXh0\n'.decode('base64')
'Some text'
>>> 'U29tZSB0ZXh0'.decode('base64')
'Some text'
(although this doesn't work in Python 3)
2) In Python 3 you'd have to import base64 and do base64.b64decode('...'), which will work in Python 2 too.
For compatibility with both Python 2 and Python 3:
import six
import base64

def b64encode(source):
    if six.PY3:
        source = source.encode('utf-8')
    content = base64.b64encode(source).decode('utf-8')
    return content
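A matching decode helper following the same pattern might look like this (a hypothetical sketch, not part of the original answer):

def b64decode(source):
    content = base64.b64decode(source)
    if six.PY3:
        content = content.decode('utf-8')  # bytes -> str on Python 3 only
    return content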
It looks like it's essential to call decode() to make use of the actual string data even after calling base64.b64decode over a base64-encoded string, because it always returns bytes:
import base64
conv_bytes = bytes('your string', 'utf-8')
print(conv_bytes) # b'your string'
encoded_str = base64.b64encode(conv_bytes)
print(encoded_str) # b'eW91ciBzdHJpbmc='
print(base64.b64decode(encoded_str)) # b'your string'
print(base64.b64decode(encoded_str).decode()) # your string
Whilst you can of course use the base64 module, you can also use the codecs module (referred to in your error message) for binary encodings (meaning non-standard and non-text encodings).
For example:
import codecs
my_bytes = b"Hello World!"
codecs.encode(my_bytes, "base64")
codecs.encode(my_bytes, "hex")
codecs.encode(my_bytes, "zip")
codecs.encode(my_bytes, "bz2")
This can come in useful for large data as you can chain them to get compressed and json-serializable values:
my_large_bytes = my_bytes * 10000
codecs.decode(
    codecs.encode(
        codecs.encode(
            my_large_bytes,
            "zip"
        ),
        "base64"),
    "utf8"
)
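To get the original bytes back, the chain runs in reverse (a quick sketch using the same my_large_bytes):

import codecs

packed = codecs.decode(
    codecs.encode(codecs.encode(my_large_bytes, "zip"), "base64"),
    "utf8")  # a compressed, json-serializable str
restored = codecs.decode(
    codecs.decode(codecs.encode(packed, "utf8"), "base64"),
    "zip")
assert restored == my_large_bytes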
Refs:
https://docs.python.org/3/library/codecs.html#binary-transforms
https://docs.python.org/3/library/codecs.html#standard-encodings
https://docs.python.org/3/library/codecs.html#text-encodings
Use the code below:
import base64

# Taking input through the terminal.
welcomeInput = input("Enter 1 to convert String to Base64, 2 to convert Base64 to String: ")
if int(welcomeInput) == 1 or int(welcomeInput) == 2:
    # Code to convert String to Base64.
    if int(welcomeInput) == 1:
        inputString = input("Enter the String to be converted to Base64: ")
        base64Value = base64.b64encode(inputString.encode()).decode()
        print("Base64 Value = " + base64Value)
    # Code to convert Base64 to String.
    elif int(welcomeInput) == 2:
        inputString = input("Enter the Base64 value to be converted to String: ")
        stringValue = base64.b64decode(inputString).decode('utf-8')
        print("String Value = " + stringValue)
else:
    print("Please enter a valid value.")
Base64 encoding is a process of converting binary data to an ASCII string format by converting that binary data into a 6-bit character representation. The Base64 method of encoding is used when binary data, such as images or video, is transmitted over systems that are designed to transmit data in a plain-text (ASCII) format.
For those who want to implement base64 encoding from scratch for the sake of understanding, here's the code that encodes the string to base64.
encoder.py
#!/usr/bin/env python3.10
class Base64Encoder:
    # base64Encoding maps an integer to its encoded character; since it is a
    # list, the index acts as the key
    base64Encoding: list = None

    # data must be of type str or bytes
    def encode(data) -> str:
        if not isinstance(data, str) and not isinstance(data, bytes):
            raise AttributeError(f"Expected {type('')} or {type(b'')} but found {type(data)}")
        if isinstance(data, str):
            data = data.encode("ascii")
        if Base64Encoder.base64Encoding is None:
            # constructing base64Encoding
            Base64Encoder.base64Encoding = list()
            # mapping A-Z
            for key in range(0, 26):
                Base64Encoder.base64Encoding.append(chr(key + 65))
            # mapping a-z
            for key in range(0, 26):
                Base64Encoder.base64Encoding.append(chr(key + 97))
            # mapping 0-9
            for key in range(0, 10):
                Base64Encoder.base64Encoding.append(chr(key + 48))
            # mapping +
            Base64Encoder.base64Encoding.append('+')
            # mapping /
            Base64Encoder.base64Encoding.append('/')
        if len(data) == 0:
            return ""
        length = len(data)
        bytes_to_append = -(length % 3) + (3 if length % 3 != 0 else 0)
        # turn every byte into eight bits
        binary_list = []
        for s in data:
            binary = f"{s:08b}"
            for bit in binary:
                binary_list.append(bit)
        length = len(binary_list)
        bits_to_append = -(length % 6) + (6 if length % 6 != 0 else 0)
        binary_list.extend(['0'] * bits_to_append)
        base64 = []
        value = 0
        for index, bit in enumerate(reversed(binary_list)):
            # converting each block of 6 bits to an integer value
            value += (2 ** (index % 6) if bit == '1' else 0)
            if (index + 1) % 6 == 0:
                base64.append(Base64Encoder.base64Encoding[value])
                # resetting value for the next block
                value = 0
        # padding with '=' for the missing bytes and returning the result
        return ''.join(reversed(base64)) + ''.join(['='] * bytes_to_append)
testEncoder.py
#!/usr/bin/env python3.10
from encoder import Base64Encoder
if __name__ == "__main__":
    print(Base64Encoder.encode("Hello"))
    print(Base64Encoder.encode("1 2 10 13 -7"))
    print(Base64Encoder.encode("A"))
    with open("image.jpg", "rb") as file_data:
        print(Base64Encoder.encode(file_data.read()))
Output:
$ ./testEncoder.py
SGVsbG8=
MSAyIDEwIDEzIC03
QQ==
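As a quick sanity check, the from-scratch encoder can be compared against the stdlib:

import base64
assert Base64Encoder.encode("Hello") == base64.b64encode(b"Hello").decode()  # both yield 'SGVsbG8='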
I am writing scripts to process (very large) files by repeatedly unpickling objects until EOF. I would like to partition the file and have separate processes (in the cloud) unpickle and process separate parts.
However, my partitioner is not intelligent: it does not know about the boundaries between pickled objects in the file (since those boundaries depend on the object types being pickled, etc.).
Is there a way to scan a file for a "start pickled object" sentinel? The naive way would be to attempt unpickling at successive byte offsets until an object is successfully unpickled, but that yields unexpected errors. It seems that for certain combinations of input, the unpickler falls out of sync and returns nothing for the rest of the file (see code below).
import cPickle
import os

def stream_unpickle(file_obj):
    while True:
        start_pos = file_obj.tell()
        try:
            yield cPickle.load(file_obj)
        except (EOFError, KeyboardInterrupt):
            break
        except (cPickle.UnpicklingError, ValueError, KeyError, TypeError, ImportError):
            file_obj.seek(start_pos + 1, os.SEEK_SET)

if __name__ == '__main__':
    import random
    from StringIO import StringIO
    # create some data
    sio = StringIO()
    [cPickle.dump(random.random(), sio, cPickle.HIGHEST_PROTOCOL) for _ in xrange(1000)]
    sio.flush()
    # read from subsequent offsets and find discontinuous jumps in object count
    size = sio.tell()
    last_count = None
    for step in xrange(size):
        sio.seek(step, os.SEEK_SET)
        count = sum(1 for _ in stream_unpickle(sio))
        if last_count is None or count == last_count - 1:
            last_count = count
        elif count != last_count:
            # if successful, these should never print (but they do...)
            print '%d elements read from byte %d' % (count, step)
            print '(%d elements read from byte %d)' % (last_count, step - 1)
            last_count = count
The pickletools module has a dis function that shows the opcodes. It shows that there is a STOP opcode that you may scan for:
>>> import pickle, pickletools, StringIO
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> p = s.getvalue()
>>> pickletools.dis(p)
0: S STRING 'abc'
7: p PUT 0
10: . STOP
highest protocol among opcodes = 0
Note, using the STOP opcode is a bit tricky because the codes are of variable length, but it may serve as a useful hint about where the cutoffs are.
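pickletools.genops can report those STOP positions programmatically; continuing with the pickle p from above (a quick sketch):
>>> [pos for opcode, arg, pos in pickletools.genops(p) if opcode.name == 'STOP']
[10]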
If you control the pickling step on the other end, then you can improve the situation by adding your own unambiguous alternative separator:
>>> sep = '\xDE\xAD\xBE\xEF'
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> s.write(sep)
>>> pickle.dump([10, 20], s)
>>> s.write(sep)
>>> pickle.dump('def', s)
>>> s.write(sep)
>>> pickle.dump([30, 40], s)
>>> p = s.getvalue()
Before unpacking, split into separate pickles using the known separator:
>>> for pick in p.split(sep):
...     print pickle.loads(pick)
abc
[10, 20]
def
[30, 40]
In the pickled file, some opcodes have an argument -- a data value that follows the opcode. The data values vary in length, and can contain bytes identical to opcodes. Therefore, if you start reading the file from an arbitrary position, you have no way of knowing if you are looking at an opcode or in the middle of an argument. You must read the file from beginning and parse the opcodes.
I cooked up this function that skips one pickle from a file, i.e. reads it and parses opcodes, but does not construct the objects. It seems slightly faster than cPickle.loads on some files I have. You could rewrite this in C for more speed (after testing it properly).
Then, you can make one pass over the whole file to get the seek position of each pickle.
from pickletools import code2op, UP_TO_NEWLINE, TAKEN_FROM_ARGUMENT1, TAKEN_FROM_ARGUMENT4
from marshal import loads as mloads

def skip_pickle(f):
    """Skip one pickle from file.

    'f' is a file-like object containing the pickle.
    """
    while True:
        code = f.read(1)
        if not code:
            raise EOFError
        opcode = code2op[code]
        if opcode.arg is not None:
            n = opcode.arg.n
            if n > 0:
                f.read(n)
            elif n == UP_TO_NEWLINE:
                f.readline()
            elif n == TAKEN_FROM_ARGUMENT1:
                n = ord(f.read(1))
                f.read(n)
            elif n == TAKEN_FROM_ARGUMENT4:
                n = mloads('i' + f.read(4))
                f.read(n)
        if code == '.':
            break
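That single pass could then look like this (a small sketch; index_pickles is an illustrative name):

def index_pickles(f):
    # record the byte offset at which each pickle in the file begins
    offsets = []
    while True:
        pos = f.tell()
        try:
            skip_pickle(f)
        except EOFError:
            return offsets
        offsets.append(pos)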
Sorry to answer my own question, and thanks to @RaymondHettinger for the idea of adding sentinels.
Here's what worked for me. I created readers and writers that use a sentinel '#S', followed by a data-block length, at the beginning of each 'record'. The writer has to take care to find any occurrences of '#' in the data being written and double them (into '##'). The reader then uses a look-behind regex to find sentinels, distinct from any doubled values that might be in the original stream, and also verifies the number of bytes between one sentinel and the next.
RecordWriter is a context manager (so multiple calls to write() can be encapsulated into a single record if needed). RecordReader is a generator.
Not sure how this performs. Any faster or more elegant solutions are welcome.
import os
import re
import cPickle
from functools import partial
from cStringIO import StringIO
SENTINEL = '#S'
# when scanning look for #S, but NOT ##S
sentinel_pattern = '(?<!#)#S' # uses negative look-behind
sentinel_re = re.compile(sentinel_pattern)
find_sentinel = sentinel_re.search
# when writing replace single # with double ##
write_pattern = '#'
write_re = re.compile(write_pattern)
fix_write = partial(write_re.sub, '##')
# when reading, replace double ## with single #
read_pattern = '##'
read_re = re.compile(read_pattern)
fix_read = partial(read_re.sub, '#')
class RecordWriter(object):
    def __init__(self, stream):
        self._stream = stream
        self._write_buffer = None

    def __enter__(self):
        self._write_buffer = StringIO()
        return self

    def __exit__(self, et, ex, tb):
        if self._write_buffer.tell():
            self._stream.write(SENTINEL)  # start
            cPickle.dump(self._write_buffer.tell(), self._stream, cPickle.HIGHEST_PROTOCOL)  # byte length of user's original data
            self._stream.write(fix_write(self._write_buffer.getvalue()))
        self._write_buffer = None
        return False

    def write(self, data):
        if not self._write_buffer:
            raise ValueError("Must use RecordWriter as a context manager")
        self._write_buffer.write(data)
class BadBlock(Exception): pass

def verify_length(block):
    fobj = StringIO(block)
    try:
        stated_length = cPickle.load(fobj)
    except (ValueError, IndexError, cPickle.UnpicklingError):
        raise BadBlock
    data = fobj.read()
    if len(data) != stated_length:
        raise BadBlock
    return data
def RecordReader(stream):
    ' Read one record '
    accum = StringIO()
    seen_sentinel = False
    data = ''
    while True:
        m = find_sentinel(data)
        if not m:
            if seen_sentinel:
                accum.write(data)
            data = stream.read(80)
            if not data:
                if accum.tell():
                    try:
                        yield verify_length(fix_read(accum.getvalue()))
                    except BadBlock:
                        pass
                return
        else:
            if seen_sentinel:
                accum.write(data[:m.start()])
                try:
                    yield verify_length(fix_read(accum.getvalue()))
                except BadBlock:
                    pass
                accum = StringIO()
            else:
                seen_sentinel = True
            data = data[m.end():]  # toss
if __name__ == '__main__':
    import random
    stream = StringIO()
    data = [str(random.random()) for _ in xrange(3)]
    # test with a string containing sentinel and almost-sentinel
    data.append('abc12#jeoht38#SoSooihetS#')
    count = len(data)
    for i in data:
        with RecordWriter(stream) as r:
            r.write(i)
    size = stream.tell()
    start_pos = int(random.random() * size)
    stream.seek(start_pos, os.SEEK_SET)
    read_data = [s for s in RecordReader(stream)]
    print 'Original data: ', data
    print 'After seeking to %d, RecordReader returned: %s' % (start_pos, read_data)