I want to convert the betacode in an existing .tex-File to normal greek letters.
For example: I want to replace:
\bcode{lo/gos}
with simple:
λόγος
And so on for all other glyphs. Fortunately there seems to be a python-script that is supposed to do just that. But, being completely inexperienced I simply don’t know how to run it.
Here is the code of the python sript:
# beta2unicode.py
#
# Version 2004-11-23
#
# James Tauber
# http://jtauber.com/
#
# You are free to redistribute this, but please inform me of any errors
#
# USAGE:
#
# trie = beta2unicodeTrie()
# beta = "LO/GOS\n";
# unicode, remainder = trie.convert(beta)
#
# - to get final sigma, string must end in \n
# - remainder will contain rest of beta if not all can be converted
class Trie:
def __init__(self):
self.root = [None, {}]
def add(self, key, value):
curr_node = self.root
for ch in key:
curr_node = curr_node[1].setdefault(ch, [None, {}])
curr_node[0] = value
def find(self, key):
curr_node = self.root
for ch in key:
try:
curr_node = curr_node[1][ch]
except KeyError:
return None
return curr_node[0]
def findp(self, key):
curr_node = self.root
remainder = key
for ch in key:
try:
curr_node = curr_node[1][ch]
except KeyError:
return (curr_node[0], remainder)
remainder = remainder[1:]
return (curr_node[0], remainder)
def convert(self, keystring):
valuestring = ""
key = keystring
while key:
value, key = self.findp(key)
if not value:
return (valuestring, key)
valuestring += value
return (valuestring, key)
def beta2unicodeTrie():
t = Trie()
t.add("*A", u"\u0391")
t.add("*B", u"\u0392")
t.add("*G", u"\u0393")
t.add("*D", u"\u0394")
t.add("*E", u"\u0395")
t.add("*Z", u"\u0396")
t.add("*H", u"\u0397")
t.add("*Q", u"\u0398")
t.add("*I", u"\u0399")
t.add("*K", u"\u039A")
t.add("*L", u"\u039B")
t.add("*M", u"\u039C")
t.add("*N", u"\u039D")
t.add("*C", u"\u039E")
t.add("*O", u"\u039F")
t.add("*P", u"\u03A0")
t.add("*R", u"\u03A1")
t.add("*S", u"\u03A3")
t.add("*T", u"\u03A4")
t.add("*U", u"\u03A5")
t.add("*F", u"\u03A6")
t.add("*X", u"\u03A7")
t.add("*Y", u"\u03A8")
t.add("*W", u"\u03A9")
t.add("A", u"\u03B1")
t.add("B", u"\u03B2")
t.add("G", u"\u03B3")
t.add("D", u"\u03B4")
t.add("E", u"\u03B5")
t.add("Z", u"\u03B6")
t.add("H", u"\u03B7")
t.add("Q", u"\u03B8")
t.add("I", u"\u03B9")
t.add("K", u"\u03BA")
t.add("L", u"\u03BB")
t.add("M", u"\u03BC")
t.add("N", u"\u03BD")
t.add("C", u"\u03BE")
t.add("O", u"\u03BF")
t.add("P", u"\u03C0")
t.add("R", u"\u03C1")
t.add("S\n", u"\u03C2")
t.add("S,", u"\u03C2,")
t.add("S.", u"\u03C2.")
t.add("S:", u"\u03C2:")
t.add("S;", u"\u03C2;")
t.add("S]", u"\u03C2]")
t.add("S#", u"\u03C2#")
t.add("S_", u"\u03C2_")
t.add("S", u"\u03C3")
t.add("T", u"\u03C4")
t.add("U", u"\u03C5")
t.add("F", u"\u03C6")
t.add("X", u"\u03C7")
t.add("Y", u"\u03C8")
t.add("W", u"\u03C9")
t.add("I+", U"\u03CA")
t.add("U+", U"\u03CB")
t.add("A)", u"\u1F00")
t.add("A(", u"\u1F01")
t.add("A)\\", u"\u1F02")
t.add("A(\\", u"\u1F03")
t.add("A)/", u"\u1F04")
t.add("A(/", u"\u1F05")
t.add("E)", u"\u1F10")
t.add("E(", u"\u1F11")
t.add("E)\\", u"\u1F12")
t.add("E(\\", u"\u1F13")
t.add("E)/", u"\u1F14")
t.add("E(/", u"\u1F15")
t.add("H)", u"\u1F20")
t.add("H(", u"\u1F21")
t.add("H)\\", u"\u1F22")
t.add("H(\\", u"\u1F23")
t.add("H)/", u"\u1F24")
t.add("H(/", u"\u1F25")
t.add("I)", u"\u1F30")
t.add("I(", u"\u1F31")
t.add("I)\\", u"\u1F32")
t.add("I(\\", u"\u1F33")
t.add("I)/", u"\u1F34")
t.add("I(/", u"\u1F35")
t.add("O)", u"\u1F40")
t.add("O(", u"\u1F41")
t.add("O)\\", u"\u1F42")
t.add("O(\\", u"\u1F43")
t.add("O)/", u"\u1F44")
t.add("O(/", u"\u1F45")
t.add("U)", u"\u1F50")
t.add("U(", u"\u1F51")
t.add("U)\\", u"\u1F52")
t.add("U(\\", u"\u1F53")
t.add("U)/", u"\u1F54")
t.add("U(/", u"\u1F55")
t.add("W)", u"\u1F60")
t.add("W(", u"\u1F61")
t.add("W)\\", u"\u1F62")
t.add("W(\\", u"\u1F63")
t.add("W)/", u"\u1F64")
t.add("W(/", u"\u1F65")
t.add("A)=", u"\u1F06")
t.add("A(=", u"\u1F07")
t.add("H)=", u"\u1F26")
t.add("H(=", u"\u1F27")
t.add("I)=", u"\u1F36")
t.add("I(=", u"\u1F37")
t.add("U)=", u"\u1F56")
t.add("U(=", u"\u1F57")
t.add("W)=", u"\u1F66")
t.add("W(=", u"\u1F67")
t.add("*A)", u"\u1F08")
t.add("*)A", u"\u1F08")
t.add("*A(", u"\u1F09")
t.add("*(A", u"\u1F09")
#
t.add("*(\A", u"\u1F0B")
t.add("*A)/", u"\u1F0C")
t.add("*)/A", u"\u1F0C")
t.add("*A(/", u"\u1F0F")
t.add("*(/A", u"\u1F0F")
t.add("*E)", u"\u1F18")
t.add("*)E", u"\u1F18")
t.add("*E(", u"\u1F19")
t.add("*(E", u"\u1F19")
#
t.add("*(\E", u"\u1F1B")
t.add("*E)/", u"\u1F1C")
t.add("*)/E", u"\u1F1C")
t.add("*E(/", u"\u1F1D")
t.add("*(/E", u"\u1F1D")
t.add("*H)", u"\u1F28")
t.add("*)H", u"\u1F28")
t.add("*H(", u"\u1F29")
t.add("*(H", u"\u1F29")
t.add("*H)\\", u"\u1F2A")
t.add(")\\*H", u"\u1F2A")
t.add("*)\\H", u"\u1F2A")
#
t.add("*H)/", u"\u1F2C")
t.add("*)/H", u"\u1F2C")
#
t.add("*)=H", u"\u1F2E")
t.add("(/*H", u"\u1F2F")
t.add("*(/H", u"\u1F2F")
t.add("*I)", u"\u1F38")
t.add("*)I", u"\u1F38")
t.add("*I(", u"\u1F39")
t.add("*(I", u"\u1F39")
#
#
t.add("*I)/", u"\u1F3C")
t.add("*)/I", u"\u1F3C")
#
#
t.add("*I(/", u"\u1F3F")
t.add("*(/I", u"\u1F3F")
#
t.add("*O)", u"\u1F48")
t.add("*)O", u"\u1F48")
t.add("*O(", u"\u1F49")
t.add("*(O", u"\u1F49")
#
#
t.add("*(\O", u"\u1F4B")
t.add("*O)/", u"\u1F4C")
t.add("*)/O", u"\u1F4C")
t.add("*O(/", u"\u1F4F")
t.add("*(/O", u"\u1F4F")
#
t.add("*U(", u"\u1F59")
t.add("*(U", u"\u1F59")
#
t.add("*(/U", u"\u1F5D")
#
t.add("*(=U", u"\u1F5F")
t.add("*W)", u"\u1F68")
t.add("*W(", u"\u1F69")
t.add("*(W", u"\u1F69")
#
#
t.add("*W)/", u"\u1F6C")
t.add("*)/W", u"\u1F6C")
t.add("*W(/", u"\u1F6F")
t.add("*(/W", u"\u1F6F")
t.add("*A)=", u"\u1F0E")
t.add("*)=A", u"\u1F0E")
t.add("*A(=", u"\u1F0F")
t.add("*W)=", u"\u1F6E")
t.add("*)=W", u"\u1F6E")
t.add("*W(=", u"\u1F6F")
t.add("*(=W", u"\u1F6F")
t.add("A\\", u"\u1F70")
t.add("A/", u"\u1F71")
t.add("E\\", u"\u1F72")
t.add("E/", u"\u1F73")
t.add("H\\", u"\u1F74")
t.add("H/", u"\u1F75")
t.add("I\\", u"\u1F76")
t.add("I/", u"\u1F77")
t.add("O\\", u"\u1F78")
t.add("O/", u"\u1F79")
t.add("U\\", u"\u1F7A")
t.add("U/", u"\u1F7B")
t.add("W\\", u"\u1F7C")
t.add("W/", u"\u1F7D")
t.add("A)/|", u"\u1F84")
t.add("A(/|", u"\u1F85")
t.add("H)|", u"\u1F90")
t.add("H(|", u"\u1F91")
t.add("H)/|", u"\u1F94")
t.add("H)=|", u"\u1F96")
t.add("H(=|", u"\u1F97")
t.add("W)|", u"\u1FA0")
t.add("W(=|", u"\u1FA7")
t.add("A=", u"\u1FB6")
t.add("H=", u"\u1FC6")
t.add("I=", u"\u1FD6")
t.add("U=", u"\u1FE6")
t.add("W=", u"\u1FF6")
t.add("I\\+", u"\u1FD2")
t.add("I/+", u"\u1FD3")
t.add("I+/", u"\u1FD3")
t.add("U\\+", u"\u1FE2")
t.add("U/+", u"\u1FE3")
t.add("A|", u"\u1FB3")
t.add("A/|", u"\u1FB4")
t.add("H|", u"\u1FC3")
t.add("H/|", u"\u1FC4")
t.add("W|", u"\u1FF3")
t.add("W|/", u"\u1FF4")
t.add("W/|", u"\u1FF4")
t.add("A=|", u"\u1FB7")
t.add("H=|", u"\u1FC7")
t.add("W=|", u"\u1FF7")
t.add("R(", u"\u1FE4")
t.add("*R(", u"\u1FEC")
t.add("*(R", u"\u1FEC")
# t.add("~", u"~")
# t.add("-", u"-")
# t.add("(null)", u"(null)")
# t.add("&", "&")
t.add("0", u"0")
t.add("1", u"1")
t.add("2", u"2")
t.add("3", u"3")
t.add("4", u"4")
t.add("5", u"5")
t.add("6", u"6")
t.add("7", u"7")
t.add("8", u"8")
t.add("9", u"9")
t.add("#", u"#")
t.add("$", u"$")
t.add(" ", u" ")
t.add(".", u".")
t.add(",", u",")
t.add("'", u"'")
t.add(":", u":")
t.add(";", u";")
t.add("_", u"_")
t.add("[", u"[")
t.add("]", u"]")
t.add("\n", u"")
return t
t = beta2unicodeTrie()
import sys
for line in file(sys.argv[1]):
a, b = t.convert(line)
if b:
print a.encode("utf-8"), b
raise Exception
print a.encode("utf-8")
And here is a little .tex-file with which it should work.
\documentclass[12pt]{scrbook}
\usepackage[polutonikogreek, ngerman]{babel}
\usepackage[ngerman]{betababel}
\usepackage{fontspec}
%\defaultfontfeatures{Ligatures=TeX}
%\newfontfeature{Microtype}{protrusion=default;expansion=default;}
\begin{document}
\bcode{lo/gos}
\end{document}
In case the script does not work: would it be possible to convert all the strings within the \bcode-Makro with something like regex? For example the "o/" to the ό and so on? What would be the weapon of choice here?
Do I have python installed?
Try python -V at a shell prompt. Your code is python 2 code, so you will a python 2 version.
I need to install Python
Most straight forward way if you don't need a complex environment (and you don't for this problem) is just to go to python.org. Don't forget you need python 2.
Running the program
Generally it will be as simple as:
python beta2unicode.py myfile.tex-file
And to capture the output:
python beta2unicode.py myfile.tex-file > myfile.not-tex-file
Does the script work?
Almost. You will need to replace the code at the end of the script that starts the same way this does, with this:
import sys
t = beta2unicodeTrie()
import re
BCODE = re.compile(r'\\bcode{[^}]*}')
for line in open(sys.argv[1]):
matches = BCODE.search(line)
for match in BCODE.findall(line):
bcode = match[7:-1]
a, b = t.convert(bcode.upper())
if b:
raise IOError("failed conversion '%s' in '%s'" % (b, line))
converted = a.encode("utf-8")
line = line.replace(match, converted)
print(line.rstrip())
Results
\documentclass[12pt]{scrbook}
\usepackage[polutonikogreek, ngerman]{babel}
\usepackage[ngerman]{betababel}
\usepackage{fontspec}
%\defaultfontfeatures{Ligatures=TeX}
%\newfontfeature{Microtype}{protrusion=default;expansion=default;}
\begin{document}
λόγοσ
\end{document}
Using a Korean Input Method Editor (IME), it's possible to type 버리 + 어 and it will automatically become 버려.
Is there a way to programmatically do that in Python?
>>> x, y = '버리', '어'
>>> z = '버려'
>>> ord(z[-1])
47140
>>> ord(x[-1]), ord(y)
(47532, 50612)
Is there a way to compute that 47532 + 50612 -> 47140?
Here's some more examples:
가보 + 아 -> 가봐
끝나 + ㄹ -> 끝날
I'm a Korean. First, if you type 버리 + 어, it becomes 버리어 not 버려. 버려 is an abbreviation of 버리어 and it's not automatically generated. Also 가보아 cannot becomes 가봐 automatically during typing by the same reason.
Second, by contrast, 끝나 + ㄹ becomes 끝날 because 나 has no jongseong(종성). Note that one character of Hangul is made of choseong(초성), jungseong(중성), and jongseong. choseong and jongseong are a consonant, jungseong is a vowel. See more at Wikipedia. So only when there's no jongseong during typing (like 끝나), there's a chance that it can have jongseong(ㄹ).
If you want to make 버리 + 어 to 버려, you should implement some Korean language grammar like, especially for this case, abbreviation of jungseong. For example ㅣ + ㅓ = ㅕ, ㅗ + ㅏ = ㅘ as you provided. 한글 맞춤법 chapter 4. section 5 (I can't find English pages right now) defines abbreviation like this. It's possible, but not so easy job especially for non-Koreans.
Next, if what you want is just to make 끝나 + ㄹ to 끝날, it can be a relatively easy job since there're libraries which can handle composition and decomposition of choseong, jungseong, jongseong. In case of Python, I found hgtk. You can try like this (nonpractical code):
# hgtk methods take one character at a time
cjj1 = hgtk.letter.decompose('나') # ('ㄴ', 'ㅏ', '')
cjj2 = hgtk.letter.decompose('ㄹ') # ('ㄹ', '', '')
if cjj1[2]) == '' and cjj2[1]) == '':
cjj = (cjj1[0], cjj1[1], cjj2[0])
cjj2 = None
Still, without proper knowledge of Hangul, it will be very hard to get it done.
You could use your own Translation table.
The drawback is you have to input all pairs manual or you have a file to get it from.
For instance:
# Sample Korean chars to map
k = [[('버리', '어'), ('버려')], [('가보', '아'), ('가봐')], [('끝나', 'ㄹ'), ('끝날')]]
class Korean(object):
def __init__(self):
self.map = {}
for m in k:
key = m[0][0] + m[0][1]
self.map[hash(key)] = m[1]
def __getitem__(self, item):
return self.map[hash(item)]
def translate(self, s):
return [ self.map[hash(token)] for token in s]
if __name__ == '__main__':
k_map = Korean()
k_chars = [ m[0][0] + m[0][1] for m in k]
print('Input: %s' % k_chars)
print('Output: %s' % k_map.translate(k_chars))
one_char_3 = k[0][0][0] + k[0][0][1]
print('%s = %s' % (one_char_3, k_map[ one_char_3 ]) )
Input: ['버리어', '가보아', '끝나ㄹ']
Output: ['버려', '가봐', '끝날']
버리어 = 버려
Tested with Python:3.4.2
I have a program in python that takes two strings. One is the plain text string, another is the cipher key. what it does is go over each of the characters and xors the bits with the cipher characters. But when going back and forth a few of the letter do not seem to change properly. Here is the code:
//turns int into bin string length 8
def bitString(n):
bin_string = bin(n)[2:]
bin_string = ("0" * (8 - len(bin_string))) + bin_string
return bin_string
//xors the bits
def bitXOR(b0, b1):
nb = ""
for x in range(min(len(b0), len(b1))):
nb += "0" if b0[x] == b1[x] else "1"
return nb
//takes 2 chars, turns them into bin strings, xors them, then returns the new char
def cypherChar(c0, c1):
return chr(int(bitXOR(bitString(ord(c0)), bitString(ord(c1))), 2))
//takes s0 (the plaintext) and encrypts it using the cipher key (s1)
def cypherString(s0, s1):
ns = ""
for x in range(len(s0)):
ns += cypherChar(s0[x], s1[x%len(s1)])
return ns
For example sometimes in a long string the word 'test' will cipher back into 'eest', and stuff like that
I have checked over the code a dozen times and I can't figure out whats causing some of the characters to change. Is it possible some characters just behave strangely?
EDIT:
example:
This is a test
Due to the fact that in the last test
Some symbols: !##$%^&*()
were not changed properly
I am retesting
END
using the cipher key : 'cypher key'
translates back to :
This is a test
Due to toe aact that in the last sest
Some symbols: !##$%^&*()
were not changed properly
I am retestiig
END
Sorry it its a little messy, I put it together real quick
from binascii import hexlify, unhexlify
from sys import version_info
def bit_string(string):
if version_info >= (3, 0):
return bin(int.from_bytes(string.encode(), 'big'))
else:
return bin(int(hexlify(string), 16))
def bitXOR_encrypt(plain_text, key):
encrypted_list = []
for j in range(2, len(plain_text)):
encrypted_list.append(int(plain_text[j]) ^ int(key[j])) #Assume the key and string are the same length
return encrypted_list
def decrypt(cipher_text, key):
decrypted_list = []
for j in range(2, len(cipher_text)): #or xrange
decrypted_list.append(int(cipher_text[j]) ^ int(key[j])) #Again assumes key is the same length as the string
decrypted_list = [str(i) for i in decrypted_list]
add_binary = "0b" + "".join(decrypted_list)
decrypted_string = int(add_binary, 2)
if version_info >= (3, 0):
message = decrypted_string.to_bytes((decrypted_string.bit_length() + 7) // 8, 'big').decode()
else:
message = unhexlify('%x' % decrypted_string)
return message
def main():
plain_text = "Hello"
plain_text_to_bits = bit_string(plain_text)
key_to_bits = bit_string("candy")
#Encrypt
cipher_text = bitXOR_encrypt(plain_text_to_bits, key_to_bits)
#make Strings
cipher_text_string = "".join([str(i) for i in cipher_text])
key_string = "".join([str(i) for i in key_to_bits])
#Decrypt
decrypted_string = decrypt("0B"+cipher_text_string, key_string)
print("plain text: %s" % plain_text)
print("plain text to bits: % s" % plain_text_to_bits)
print("key string in bits: %s" % key_string)
print("Ciphered Message: %s" %cipher_text_string)
print("Decrypted String: %s" % decrypted_string)
main()
for more details or example code you can visit my repository either on github
https://github.com/marcsantiago/one_time_pad_encryption
Also, I know that in this example the key is the same length as the string. If you want to use a string that is smaller than the string try wrapping it like in a vigenere cipher (http://en.wikipedia.org/wiki/Vigenère_cipher)
I think you are overcomplicating things:
def charxor(s1, s2):
return chr(ord(s1) ^ ord(s2))
def wordxor(w1, w2):
return ''.join(charxor(w1[i], w2[i]) for i in range(min(len(w1), len(w2))))
word = 'test'
key = 'what'
cyphered = wordxor(word, key)
uncyphered = wordxor(cyphered, key)
print(repr(cyphered))
print(uncyphered)
You get
'\x03\r\x12\x00'
test
There is a fairly good explanation of Python's bit arithmetic in How do you get the logical xor of two variables in Python?
I could find nothing wrong with the results of your functions when testing with your input data and key. To demonstrate, you could try this test code which should not fail:
import random
def random_string(n):
return ''.join(chr(random.getrandbits(8)) for _ in range(n))
for i in range(1000):
plaintext = random_string(500)
key = random_string(random.randrange(1,100))
ciphertext = cypherString(plaintext, key)
assert cypherString(ciphertext, key) == plaintext
If you can provide a definitive sample of plain text, key, and cipher text that fails, I can look further.
I want to compare two strings in a python unittest which contain html.
Is there a method which outputs the result in a human friendly (diff like) version?
A simple method is to strip whitespace from the HTML and split it into a list. Python 2.7's unittest (or the backported unittest2) then gives a human-readable diff between the lists.
import re
def split_html(html):
return re.split(r'\s*\n\s*', html.strip())
def test_render_html():
expected = ['<div>', '...', '</div>']
got = split_html(render_html())
self.assertEqual(expected, got)
If I'm writing a test for working code, I usually first set expected = [], insert a self.maxDiff = None before the assert and let the test fail once. The expected list can then be copy-pasted from the test output.
You might need to tweak how whitespace is stripped depending on what your HTML looks like.
I submitted a patch to do this some years back. The patch was rejected but you can still view it on the python bug list.
I doubt you would want to hack your unittest.py to apply the patch (if it even still works after all this time), but here's the function for reducing two strings a manageable size while still keeping at least part of what differs. So long as all you didn't want the complete differences this might be what you want:
def shortdiff(x,y):
'''shortdiff(x,y)
Compare strings x and y and display differences.
If the strings are too long, shorten them to fit
in one line, while still keeping at least some difference.
'''
import difflib
LINELEN = 79
def limit(s):
if len(s) > LINELEN:
return s[:LINELEN-3] + '...'
return s
def firstdiff(s, t):
span = 1000
for pos in range(0, max(len(s), len(t)), span):
if s[pos:pos+span] != t[pos:pos+span]:
for index in range(pos, pos+span):
if s[index:index+1] != t[index:index+1]:
return index
left = LINELEN/4
index = firstdiff(x, y)
if index > left + 7:
x = x[:left] + '...' + x[index-4:index+LINELEN]
y = y[:left] + '...' + y[index-4:index+LINELEN]
else:
x, y = x[:LINELEN+1], y[:LINELEN+1]
left = 0
cruncher = difflib.SequenceMatcher(None)
xtags = ytags = ""
cruncher.set_seqs(x, y)
editchars = { 'replace': ('^', '^'),
'delete': ('-', ''),
'insert': ('', '+'),
'equal': (' ',' ') }
for tag, xi1, xi2, yj1, yj2 in cruncher.get_opcodes():
lx, ly = xi2 - xi1, yj2 - yj1
edits = editchars[tag]
xtags += edits[0] * lx
ytags += edits[1] * ly
# Include ellipsis in edits line.
if left:
xtags = xtags[:left] + '...' + xtags[left+3:]
ytags = ytags[:left] + '...' + ytags[left+3:]
diffs = [ x, xtags, y, ytags ]
if max([len(s) for s in diffs]) < LINELEN:
return '\n'.join(diffs)
diffs = [ limit(s) for s in diffs ]
return '\n'.join(diffs)
Maybe this is a quite 'verbose' solution. You could add a new 'equality function' for your user defined type (e.g: HTMLString) which you have to define first:
class HTMLString(str):
pass
Now you have to define a type equality function:
def assertHTMLStringEqual(first, second):
if first != second:
message = ... # TODO here: format your message, e.g a diff
raise AssertionError(message)
All you have to do is format your message as you like. You can also use a class method in your specific TestCase as a type equality function. This gives you more functionality to format your message, since unittest.TestCase does this a lot.
Now you have to register this equality function in your unittest.TestCase:
...
def __init__(self):
self.addTypeEqualityFunc(HTMLString, assertHTMLStringEqual)
The same for a class method:
...
def __init__(self):
self.addTypeEqualityFunc(HTMLString, 'assertHTMLStringEqual')
And now you can use it in your tests:
def test_something(self):
htmlstring1 = HTMLString(...)
htmlstring2 = HTMLString(...)
self.assertEqual(htmlstring1, htmlstring2)
This should work well with python 2.7.
I (the one asking this question) use BeautfulSoup now:
def assertEqualHTML(string1, string2, file1='', file2=''):
u'''
Compare two unicode strings containing HTML.
A human friendly diff goes to logging.error() if there
are not equal, and an exception gets raised.
'''
from BeautifulSoup import BeautifulSoup as bs
import difflib
def short(mystr):
max=20
if len(mystr)>max:
return mystr[:max]
return mystr
p=[]
for mystr, file in [(string1, file1), (string2, file2)]:
if not isinstance(mystr, unicode):
raise Exception(u'string ist not unicode: %r %s' % (short(mystr), file))
soup=bs(mystr)
pretty=soup.prettify()
p.append(pretty)
if p[0]!=p[1]:
for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2):
logging.error(line)
raise Exception('Not equal %s %s' % (file1, file2))