encodings again - python

I am trying to work with sqlite on python:
from pysqlite2 import dbapi2 as sqlite
con = sqlite.connect('/home/argon/super.db')
cur = con.cursor()
cur.execute('select * from notes')
for i in cur.fetchall():
print i[2]
And I sometimes get something like this (I am from Russia):
Ответ etc...
And if I pass this string to this function(it helped me in other projects):
def unescape(text):
def fixup(m):
text = m.group(0)
if text[:2] == "&#":
# character reference
try:
if text[:3] == "&#x":
return unichr(int(text[3:-1], 16))
else:
return unichr(int(text[2:-1]))
except ValueError:
pass
else:
# named entity
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
except KeyError:
pass
return text # leave as is
return re.sub("&#?\w+;", fixup, text)
I get even more weird result:
ÐÑвеÑиÑÑ Ñ ÑиÑиÑованием etc
What should I do to get normal Cyrillic symbols?

О looks like a UTF-8 byte pair for \xD0\x9E, or \u1054. Better known as the cyrillic character О (Capital O).
In other words, you have strangely encoded UTF-8 data on your hand. Turn the { digits into bytes (chr(208) would do) then decode from UTF-8:
>>> (chr(208) + chr(158)).decode('utf-8')
u'\u1054'
>>> print (chr(208) + chr(158)).decode('utf-8')
О
>>> print (chr(208) + chr(158) + chr(209) + chr(130) + chr(208) + chr(178)).decode('utf-8')
Отв

Related

How to escape all HTML entities in show_popup() method and fix Parse Error in Sublime Text 3 plugin?

I am making a plugin for Sublime Text 3. It contacts my server in Java and receives a response in the form of a list of strings, that contains C code.
To display this code in a popup window you need to pass a string in HTML format to the method show_popup. Accordingly, all C-code characters that can be recognized by the parser as HTML entities should be replaced with their names (&name;) or numbers (&#number;). At first, I just replaced the most common characters with replace(), but it didn't always work out - Parse Error was displayed in the console:
Parse Error: <br> printf ("Decimals: %d %ld\n", 1977, 650000L);
<br> printf ("Preceding with blanks:&nbs
...
y</a></li><p><b>____________________________________________________</b></p>
</ul>
</body>
code: Unexpected character
I've tried to escape html entities with Python's html library:
import html
...
html.escape(string)
But Sublime doesn't see import and print in console that I was using a function without defining it - I guess he didn't see that I connected this library(Whyyy?). cgi.escape - is depricated, so I can't use this. I decided to write the function myself.
Then I saw a very interesting way to replace all the characters whose code is >127 and some other characters (&, <,>) with their numbers:
def escape_html (s):
out = ""
i = 0
while i < len(s):
c = s[i]
number = ord(c)
if number > 127 or c == '"' or c == '\'' or c == '<' or c == '>' or c == '&':
out += "&#"
out += str(number)
out += ";"
else:
out += c
i += 1
out = out.replace(" ", " ")
out = out.replace("\n", "<br>")
return out
This code works perfectly for displaying characters in a browser, but unfortunately it is not supported by Sublime Text 3.
As a result, I came to the conclusion that these characters should be replaced with their equivalent names:
def dumb_escape_html(s):
entities = [["&", "&"], ["<", "<"], [">", ">"], ["\n", "<br>"],
[" ", " "]]
for entity in entities:
s = s.replace(entity[0], entity[1])
return s
But again I faced an obstacle: not all names are supported in Sublime. And again an error Parse Error.
I also attach a link to JSON file, which contains answer from my server, content of which should be displayed in pop-up window: Example of data from sever (codeshare.io)
I absolutely do not understand, in what I make a mistake - I hope, that great programmers know how to solve my problem.
Edit. Minimal, Reproducible Example:
import sublime
import sublime_plugin
import string
import sys
import json
def get_func_name(line, column):
return "printf"
def get_const_data(func_name):
input_file = open ("PATH_TO_JSON/data_printf.json")
results = json.load(input_file)
return results
def dumb_escape_html(s):
entities = [["&", "&"], ["<", "<"], [">", ">"], ["\n", "<br>"],
[" ", " "]]
for entity in entities:
s = s.replace(entity[0], entity[1])
return s
def dumb_unescape_html(s):
entities = [["<", "<"], [">", ">"], ["<br>", "\n"],
[" ", " "], ["&", "&"]]
for entity in entities:
s = s.replace(entity[0], entity[1])
return s
class CoderecsysCommand(sublime_plugin.TextCommand):
def run(self, edit):
v = self.view
cur_line = v.substr(v.line(v.sel()[0]))
for sel in v.sel():
line_begin = v.rowcol(sel.begin())[0]
line_end = v.rowcol(sel.end())[0]
pos = v.rowcol(v.sel()[0].begin()) # (row, column)
try:
func_name = get_func_name(cur_line, pos[1]-1)
li_tree = ""
final_data = get_const_data(func_name)
for i in range(len(final_data)):
source = "source: " + final_data[i]["source"]
escaped = dumb_escape_html(final_data[i]["code"])
divider = "<b>____________________________________________________</b>"
li_tree += "<li><p>%s</p>%s <a href='%s'>Copy</a></li><p>%s</p>" %(source, escaped, escaped, divider)
# The html to be shown.
html = """
<body id=copy-multiline>
Examples of using <b>%s</b> function.
<ul>
%s
</ul>
</body>
""" %(func_name, li_tree)
self.view.show_popup(html, max_width=700, on_navigate=lambda example: self.copy_example(example, func_name, source))
except Exception as ex:
self.view.show_popup("<b style=\"color:#1c87c9\">CodeRec Error:</b> " + str(ex), max_width=700)
def copy_example(self, example, func_name, source):
# Copies the code to the clipboard.
unescaped = dumb_unescape_html(example)
unescaped = "// " + source + unescaped
sublime.set_clipboard(unescaped)
self.view.hide_popup()
sublime.status_message('Example of using ' + func_name + ' copied to clipboard !')

VerQueryValueW issue python 3

I'm trying to get version of files through GetFileVersionInfoSizeW and VerQueryValueW. I got partial of the version printed out but not the entire thing. It also has some weird spaces between each character of the file version. Anyone has an idea what is wrong with it?
My guess is it is related to the Unicode interpretation of python3 since I had to change the GetFileVersionInfoSizeW and VerQueryValueW from the original GetFileVersionInfoSizeA and VerQueryValueA that ran normally in python2 (https://stackoverflow.com/a/38924793/7144869).
import array
from ctypes import *
def get_file_info(filename):
"""
Extract information from a file.
"""
# Get size needed for buffer (0 if no info)
size = windll.version.GetFileVersionInfoSizeW(filename, None)
# If no info in file -> empty string
if not size:
return 'Failed'
# Create buffer
res = create_string_buffer(size)
# Load file informations into buffer res
windll.version.GetFileVersionInfoW(filename, None, size, res)
r = c_uint()
l = c_uint()
# Look for codepages
windll.version.VerQueryValueW(res, '\\VarFileInfo\\Translation',
byref(r), byref(l))
# If no codepage -> empty string
if not l.value:
return ''
# Take the first codepage (what else ?)
codepages = array.array('H', string_at(r.value, l.value))
codepage = tuple(codepages[:2].tolist())
# Extract information
windll.version.VerQueryValueW(res, ('\\StringFileInfo\\%04x%04x\\'
+ 'FileVersion') % codepage, byref(r), byref(l))
return string_at(r.value, l.value)
print (get_file_info(r'C:\WINDOWS\system32\calc.exe').decode())
The functions return what Microsoft calls "Unicode" strings, but it is really encoded UTF-16LE that ctypes.wstring can convert. l.value is a count of UTF16 characters, not bytes, so use the following to decode it properly. You won't need to .decode() the result as you are doing now.
return wstring_at(r.value, l.value)
Here's my working code:
from ctypes import *
from ctypes import wintypes as w
ver = WinDLL('version')
ver.GetFileVersionInfoSizeW.argtypes = w.LPCWSTR, w.LPDWORD
ver.GetFileVersionInfoSizeW.restype = w.DWORD
ver.GetFileVersionInfoW.argtypes = w.LPCWSTR, w.DWORD, w.DWORD, w.LPVOID
ver.GetFileVersionInfoW.restype = w.BOOL
ver.VerQueryValueW.argtypes = w.LPCVOID, w.LPCWSTR, POINTER(w.LPVOID), w.PUINT
ver.VerQueryValueW.restype = w.BOOL
def get_file_info(filename):
size = ver.GetFileVersionInfoSizeW(filename, None)
if not size:
raise RuntimeError('version info not found')
res = create_string_buffer(size)
if not ver.GetFileVersionInfoW(filename, 0, size, res):
raise RuntimeError('GetFileVersionInfoW failed')
buf = w.LPVOID()
length = w.UINT()
# Look for codepages
if not ver.VerQueryValueW(res, r'\VarFileInfo\Translation', byref(buf), byref(length)):
raise RuntimeError('VerQueryValueW failed to find translation')
if length.value == 0:
raise RuntimeError('no code pages')
codepages = array.array('H', string_at(buf.value, length.value))
codepage = tuple(codepages[:2])
# Extract information
if not ver.VerQueryValueW(res, rf'\StringFileInfo\{codepage[0]:04x}{codepage[1]:04x}\FileVersion', byref(buf), byref(length)):
raise RuntimeError('VerQueryValueW failed to find file version')
return wstring_at(buf.value,length.value)
print(get_file_info(r'c:\windows\system32\calc.exe'))
Output:
10.0.19041.1 (WinBuild.160101.0800)

How to encode text to base64 in python

I am trying to encode a text string to base64.
i tried doing this :
name = "your name"
print('encoding %s in base64 yields = %s\n'%(name,name.encode('base64','strict')))
But this gives me the following error:
LookupError: 'base64' is not a text encoding; use codecs.encode() to handle arbitrary codecs
How do I go about doing this ? ( using Python 3.4)
Remember to import base64 and that b64encode takes bytes as an argument.
import base64
b = base64.b64encode(bytes('your string', 'utf-8')) # bytes
base64_str = b.decode('utf-8') # convert bytes to string
It turns out that this is important enough to get it's own module...
import base64
base64.b64encode(b'your name') # b'eW91ciBuYW1l'
base64.b64encode('your name'.encode('ascii')) # b'eW91ciBuYW1l'
For py3, base64 encode and decode string:
import base64
def b64e(s):
return base64.b64encode(s.encode()).decode()
def b64d(s):
return base64.b64decode(s).decode()
1) This works without imports in Python 2:
>>>
>>> 'Some text'.encode('base64')
'U29tZSB0ZXh0\n'
>>>
>>> 'U29tZSB0ZXh0\n'.decode('base64')
'Some text'
>>>
>>> 'U29tZSB0ZXh0'.decode('base64')
'Some text'
>>>
(although this doesn't work in Python3 )
2) In Python 3 you'd have to import base64 and do base64.b64decode('...')
- will work in Python 2 too.
To compatibility with both py2 and py3
import six
import base64
def b64encode(source):
if six.PY3:
source = source.encode('utf-8')
content = base64.b64encode(source).decode('utf-8')
It looks it's essential to call decode() function to make use of actual string data even after calling base64.b64decode over base64 encoded string. Because never forget it always return bytes literals.
import base64
conv_bytes = bytes('your string', 'utf-8')
print(conv_bytes) # b'your string'
encoded_str = base64.b64encode(conv_bytes)
print(encoded_str) # b'eW91ciBzdHJpbmc='
print(base64.b64decode(encoded_str)) # b'your string'
print(base64.b64decode(encoded_str).decode()) # your string
Whilst you can of course use the base64 module, you can also to use the codecs module (referred to in your error message) for binary encodings (meaning non-standard & non-text encodings).
For example:
import codecs
my_bytes = b"Hello World!"
codecs.encode(my_bytes, "base64")
codecs.encode(my_bytes, "hex")
codecs.encode(my_bytes, "zip")
codecs.encode(my_bytes, "bz2")
This can come in useful for large data as you can chain them to get compressed and json-serializable values:
my_large_bytes = my_bytes * 10000
codecs.decode(
codecs.encode(
codecs.encode(
my_large_bytes,
"zip"
),
"base64"),
"utf8"
)
Refs:
https://docs.python.org/3/library/codecs.html#binary-transforms
https://docs.python.org/3/library/codecs.html#standard-encodings
https://docs.python.org/3/library/codecs.html#text-encodings
Use the below code:
import base64
#Taking input through the terminal.
welcomeInput= raw_input("Enter 1 to convert String to Base64, 2 to convert Base64 to String: ")
if(int(welcomeInput)==1 or int(welcomeInput)==2):
#Code to Convert String to Base 64.
if int(welcomeInput)==1:
inputString= raw_input("Enter the String to be converted to Base64:")
base64Value = base64.b64encode(inputString.encode())
print "Base64 Value = " + base64Value
#Code to Convert Base 64 to String.
elif int(welcomeInput)==2:
inputString= raw_input("Enter the Base64 value to be converted to String:")
stringValue = base64.b64decode(inputString).decode('utf-8')
print "Base64 Value = " + stringValue
else:
print "Please enter a valid value."
Base64 encoding is a process of converting binary data to an ASCII
string format by converting that binary data into a 6-bit character
representation. The Base64 method of encoding is used when binary
data, such as images or video, is transmitted over systems that are
designed to transmit data in a plain-text (ASCII) format.
Follow this link for further details about understanding and working of base64 encoding.
For those who want to implement base64 encoding from scratch for the sake of understanding, here's the code that encodes the string to base64.
encoder.py
#!/usr/bin/env python3.10
class Base64Encoder:
#base64Encoding maps integer to the encoded text since its a list here the index act as the key
base64Encoding:list = None
#data must be type of str or bytes
def encode(data)->str:
#data = data.encode("UTF-8")
if not isinstance(data, str) and not isinstance(data, bytes):
raise AttributeError(f"Expected {type('')} or {type(b'')} but found {type(data)}")
if isinstance(data, str):
data = data.encode("ascii")
if Base64Encoder.base64Encoding == None:
#construction base64Encoding
Base64Encoder.base64Encoding = list()
#mapping A-Z
for key in range(0, 26):
Base64Encoder.base64Encoding.append(chr(key + 65))
#mapping a-z
for key in range(0, 26):
Base64Encoder.base64Encoding.append(chr(key + 97))
#mapping 0-9
for key in range(0, 10):
Base64Encoder.base64Encoding.append(chr(key + 48))
#mapping +
Base64Encoder.base64Encoding.append('+')
#mapping /
Base64Encoder.base64Encoding.append('/')
if len(data) == 0:
return ""
length=len(data)
bytes_to_append = -(length%3)+(3 if length%3 != 0 else 0)
#print(f"{bytes_to_append=}")
binary_list = []
for s in data:
ascii_value = s
binary = f"{ascii_value:08b}"
#binary = bin(ascii_value)[2:]
#print(s, binary, type(binary))
for bit in binary:
binary_list.append(bit)
length=len(binary_list)
bits_to_append = -(length%6) + (6 if length%6 != 0 else 0)
binary_list.extend([0]*bits_to_append)
#print(f"{binary_list=}")
base64 = []
value = 0
for index, bit in enumerate(reversed(binary_list)):
#print (f"{bit=}")
#converting block of 6 bits to integer value
value += ( 2**(index%6) if bit=='1' else 0)
#print(f"{value=}")
#print(bit, end = '')
if (index+1)%6 == 0:
base64.append(Base64Encoder.base64Encoding[value])
#print(' ', end="")
#resetting value
value = 0
pass
#print()
#padding if there is less bytes and returning the result
return ''.join(reversed(base64))+''.join(['=']*bytes_to_append)
testEncoder.py
#!/usr/bin/env python3.10
from encoder import Base64Encoder
if __name__ == "__main__":
print(Base64Encoder.encode("Hello"))
print(Base64Encoder.encode("1 2 10 13 -7"))
print(Base64Encoder.encode("A"))
with open("image.jpg", "rb") as file_data:
print(Base64Encoder.encode(file_data.read()))
Output:
$ ./testEncoder.py
SGVsbG8=
MSAyIDEwIDEzIC03
QQ==

parsing a string in python for #hashtag

I am wondering, how could I make an algorithm that parses a string for the hashtag symbol ' # ' and returns the full string, but where ever a word starts with a '#' symbol, it becomes a link. I am using python with Google app engine: webapp2 and Jinja2 and I am building a blog.
Thanks
A more efficient and complete way to find the "hashwords":
import functools
def hash_position(string):
return string.find('#')
def delimiter_position(string, delimiters):
positions = filter(lambda x: x >= 0, map(lambda delimiter: string.find(delimiter), delimiters))
try:
return functools.reduce(min, positions)
except TypeError:
return -1
def get_hashed_words(string, delimiters):
maximum_length = len(string)
current_hash_position = hash_position(string)
string = string[current_hash_position:]
results = []
counter = 0
while current_hash_position != -1:
current_delimiter_position = delimiter_position(string, delimiters)
if current_delimiter_position == -1:
results.append(string)
else:
results.append(string[0:current_delimiter_position])
# Update offsets and the haystack
string = string[current_delimiter_position:]
current_hash_position = hash_position(string)
string = string[current_hash_position:]
return results
if __name__ == "__main__":
string = "Please #clarify: What do you #mean with returning somthing as a #link. #herp"
delimiters = [' ', '.', ',', ':']
print(get_hashed_words(string, delimiters))
Imperative code with updates of the haystack looks a little bit ugly but hey, that's what we get for (ab-)using mutable variables.
And I still have no idea what do you mean with "returning something as a link".
Hope that helps.
not sure where do you get the data for the link, but maybe something like:
[('%s' % word) for word in input.split() if word[0]=='#']
Are you talking about twitter? Maybe this?
def get_hashtag_link(hashtag):
if hashtag.startswith("#"):
return '%s' % (hashtag[1:], hashtag)
>>> get_hashtag_link("#stackoverflow")
'#stackoverflow'
It will return None if hashtag is not a hashtag.

Python socket.send encoding

It seems i've run a problem with the encoding itself in where i need to pass Bing translation junks..
def _unicode_urlencode(params):
if isinstance(params, dict):
params = params.items()
return urllib.urlencode([(k, isinstance(v, unicode) and v.encode('utf-8') or v) for k, v in params])
def _run_query(args):
data = _unicode_urlencode(args)
sock = urllib.urlopen(api_url + '?' + data)
result = sock.read()
if result.startswith(codecs.BOM_UTF8):
result = result.lstrip(codecs.BOM_UTF8).decode('utf-8')
elif result.startswith(codecs.BOM_UTF16_LE):
result = result.lstrip(codecs.BOM_UTF16_LE).decode('utf-16-le')
elif result.startswith(codecs.BOM_UTF16_BE):
result = result.lstrip(codecs.BOM_UTF16_BE).decode('utf-16-be')
return json.loads(result)
def set_app_id(new_app_id):
global app_id
app_id = new_app_id
def translate(text, source, target, html=False):
"""
action=opensearch
"""
if not app_id:
raise ValueError("AppId needs to be set by set_app_id")
query_args = {
'appId': app_id,
'text': text,
'from': source,
'to': target,
'contentType': 'text/plain' if not html else 'text/html',
'category': 'general'
}
return _run_query(query_args)
...
text = translate(sys.argv[2], 'en', 'tr')
HOST = '127.0.0.1'
PORT = 894
s = socket.socket()
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
s.connect((HOST, PORT))
s.send("Bing translation: " + text.encode('utf8') + "\r");
s.close()
As you can see, if the translated text contains some turkish characters, the script fails to send the text to the socket..
Do you have any idea on how to get rid of this?
Regards.
Your problem is entirely unrelated to the socket. text is already a bytestring, and you're trying to encode it. What happens is that Python tries to converts the bytestring to a unicode via the safe ASCII encoding in order to be able to encode as UTF-8, and then fails because the bytestring contains non-ASCII characters.
You should fix translate to return a unicode object, by using a JSON variable that returns unicode objects.
Alternatively, if it is already encoding text encoded as UTF-8, you can simply use
s.send("Bing translation: " + text + "\r")
# -*- coding:utf-8 -*-
text = u"text in you language"
s.send(u"Bing translation: " + text.encode('utf8') + u"\r");
This must work. text must be spelled in utf-8 encoding.

Categories

Resources