How can I parse Python's triple-quote f-strings? - python

I have this code that parses and processes normal "f-string" template strings (See the usage part below for an example):
from string import Formatter
import sys
# Maps an f-string conversion flag (the letter after "!") to its function.
_conversions = {'a': ascii, 'r': repr, 's': str}


def z(template, locals_=None):
    """Render *template* like an f-string, evaluating its fields at call time.

    The template is split with ``string.Formatter().parse``; every field
    name is evaluated as a Python expression, then the optional conversion
    (``!a``, ``!r``, ``!s``) and format spec are applied.

    Args:
        template: Text containing ``{expression[!conv][:spec]}`` fields.
        locals_: Mapping of names visible to the expressions.  When omitted,
            the caller's globals overlaid with the caller's locals are used.

    Returns:
        The rendered string.

    NOTE: field names are passed to ``eval``; never call this with a
    template that comes from an untrusted source.
    """
    if locals_ is None:
        caller = sys._getframe(1)
        # Overlay locals on globals so that module-level names are visible
        # even when the caller is a function.
        namespace = dict(caller.f_globals)
        namespace.update(caller.f_locals)
    else:
        # Work on a copy: eval() inserts "__builtins__" into its globals.
        namespace = dict(locals_)

    result = []
    for literal_text, field_name, format_spec, conversion in Formatter().parse(template):
        if literal_text:
            result.append(literal_text)
        if not field_name:
            continue
        value = eval(field_name, namespace)
        if conversion:
            value = _conversions[conversion](value)
        if format_spec:
            value = format(value, format_spec)
        else:
            value = str(value)
        result.append(value)
    return ''.join(result)
Usage:
a = 'World'
b = 10
z('Hello {a} --- {a:^30} --- {67+b} --- {a!r}')
# "Hello World --- World --- 77 --- 'World'"
But it doesn't work if the template string is something like this:
z('''
echo monkey {z("curl -s https://www.poemist.com/api/v1/randompoems | jq --raw-output '.[0].content'")} end | sed -e 's/monkey/start/'
echo --------------
''')
It gives this error:
File "<string>", line 1
z("curl -s https
^
SyntaxError: EOL while scanning string literal
I am willing to even copy code from Python's source code to get this to work, if it's not possible normally.

Thanks to the tip by @ForceBru, I finished this. The following code parses and processes triple-quote f-strings (ignore the processing parts):
_conversions = {'a': ascii, 'r': repr, 's': str}
def zstring(self, template, locals_=None, getframe=1):
    """Expand *template* as a triple-quoted f-string and build a command line.

    The template is wrapped in ``f'''...'''`` and parsed with ``ast``; each
    replacement field is evaluated, converted and formatted.  Two extra
    pseudo format flags (separated by ``:``) are recognised and removed
    before the real format spec is applied:

    * ``e``    - do not pass the value through ``self.zsh_quote``.
    * ``bool`` - pass the value through ``boolsh`` first.

    Args:
        template: Body of the f-string (without the ``f'''`` wrapper).  It
            must not itself contain ``'''``, which would end the wrapper.
        locals_: Namespace for evaluating the fields; defaults to the locals
            of the frame ``getframe`` levels up the call stack.
        getframe: Stack depth used when ``locals_`` is not given.

    Returns:
        The expanded string.

    NOTE: field expressions are evaluated with ``eval``; only use trusted
    templates.
    """
    if locals_ is None:
        locals_ = sys._getframe(getframe).f_locals

    def asteval(astNode):
        # Evaluate one expression node in the chosen namespace.
        if astNode is None:
            return None
        code = compile(ast.Expression(astNode), filename='<string>', mode='eval')
        return eval(code, locals_)

    def eatFormat(format_spec, code):
        # Remove pseudo-flag `code` from the spec; report whether it was set.
        res = False
        if format_spec:
            flags = format_spec.split(':')
            res = code in flags
            format_spec = [flag for flag in flags if flag != code]
        return ':'.join(format_spec), res

    p = ast.parse(f"f'''{template}'''")
    result = []
    for part in p.body[0].value.values:
        if isinstance(part, ast.FormattedValue):
            value = asteval(part.value)
            conversion = part.conversion
            if conversion >= 0:
                # The parser stores the conversion as a character code and
                # does not support custom conversions.
                value = self._conversions[chr(conversion)](value)
            format_spec = asteval(part.format_spec) or ''
            format_spec, fmt_eval = eatFormat(format_spec, 'e')
            format_spec, fmt_bool = eatFormat(format_spec, 'bool')
            if format_spec:
                value = format(value, format_spec)
            if fmt_bool:
                value = boolsh(value)
            value = str(value)
            if not fmt_eval:
                value = self.zsh_quote(value)
            result.append(value)
        elif isinstance(part, ast.Constant):
            # Python 3.8+ yields ast.Constant for literal text; the exact
            # type is never ast.Str there, so test for Constant instead.
            result.append(part.value)
        elif hasattr(part, 's'):
            # Python 3.6/3.7 literal text node (ast.Str).
            result.append(part.s)
    return ''.join(result)

Related

Code conversion from Python to Lua almost completed

I found a Python script that I'm trying to convert to Lua. I believe I have it just about converted, but the code isn't quite working properly, so I need assistance as I do not know Python at all, and can only guess at the intentions. This is merely a color converter to convert RGB color to xterm 256. The table is quite huge, so I've truncated it for ease of reading.
Python code:
import sys, re
# (xterm colour code, RGB hex) pairs.  This listing is truncated to the
# first eight entries of the table.
CLUT = [ # color look-up table
# 8-bit, RGB hex
# Primary 3-bit (8 colors). Unique representation!
('00', '000000'),
('01', '800000'),
('02', '008000'),
('03', '808000'),
('04', '000080'),
('05', '800080'),
('06', '008080'),
('07', 'c0c0c0'),
]
def _str2hex(hexstr):
    """Return the integer value of the hexadecimal string *hexstr*."""
    return int(hexstr, 16)
def _strip_hash(rgb):
    """Return *rgb* without any leading ``#`` characters."""
    return rgb.lstrip('#')
def _create_dicts():
    """Build the two lookup tables derived from ``CLUT``.

    Returns:
        ``(rgb2short_dict, short2rgb_dict)``: RGB hex -> xterm code and
        xterm code -> RGB hex.  If several codes share one RGB value, the
        code that appears last in ``CLUT`` wins in ``rgb2short_dict``.
    """
    short2rgb_dict = dict(CLUT)
    rgb2short_dict = {}
    for short, rgb in short2rgb_dict.items():
        rgb2short_dict[rgb] = short
    return rgb2short_dict, short2rgb_dict
def short2rgb(short):
    """Return the RGB hex string for xterm colour code *short*.

    Raises:
        KeyError: if *short* is not a key of ``SHORT2RGB_DICT`` (keys are
            the code strings exactly as written in ``CLUT``).
    """
    return SHORT2RGB_DICT[short]
def print_all():
    """Print every colour in ``CLUT`` as a background and a foreground sample."""
    for short, rgb in CLUT:
        # 48;5;N selects background colour N, 38;5;N the foreground colour.
        sys.stdout.write('\033[48;5;%sm%s:%s' % (short, short, rgb))
        sys.stdout.write("\033[0m ")
        sys.stdout.write('\033[38;5;%sm%s:%s' % (short, short, rgb))
        sys.stdout.write("\033[0m\n")
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Printed all codes.")
    print("You can translate a hex or 0-255 code by providing an argument.")
def rgb2short(rgb):
    """ Find the closest xterm-256 approximation to the given RGB value.
    @param rgb: Hex code representing an RGB value, eg, 'abcdef'
    @returns: Tuple ``(short, approx_rgb)``: the xterm code string and the
        hex value of the approximated colour.
    >>> rgb2short('123456')
    ('23', '005f5f')
    >>> rgb2short('ffffff')
    ('231', 'ffffff')
    >>> rgb2short('0DADD6') # vimeo logo
    ('38', '00afd7')
    """
    rgb = _strip_hash(rgb)
    incs = (0x00, 0x5f, 0x87, 0xaf, 0xd7, 0xff)
    # Break 6-char RGB code into 3 integer vals.
    parts = [int(h, 16) for h in re.split(r'(..)(..)(..)', rgb)[1:4]]
    snapped = []
    for part in parts:
        # Snap the channel to the nearest step.  Scanning from the largest
        # step down makes an exact tie resolve to the bigger step.
        snapped.append(min(reversed(incs), key=lambda inc: abs(inc - part)))
    res = ''.join('%02x' % value for value in snapped)
    equiv = RGB2SHORT_DICT[res]
    return equiv, res
RGB2SHORT_DICT, SHORT2RGB_DICT = _create_dicts()
#---------------------------------------------------------------------
if __name__ == '__main__':
    # Command-line entry point: with no argument list every colour, with a
    # 0-255 code print its RGB value, otherwise treat the argument as RGB hex.
    import doctest
    doctest.testmod()
    if len(sys.argv) == 1:
        print_all()
        raise SystemExit
    arg = sys.argv[1]
    # isdigit() keeps short non-decimal arguments (e.g. 'fff') away from
    # int(), which would otherwise raise ValueError.
    if len(arg) < 4 and arg.isdigit() and int(arg) < 256:
        rgb = short2rgb(arg)
        sys.stdout.write('xterm color \033[38;5;%sm%s\033[0m -> RGB exact \033[38;5;%sm%s\033[0m' % (arg, arg, arg, rgb))
        sys.stdout.write("\033[0m\n")
    else:
        short, rgb = rgb2short(arg)
        sys.stdout.write('RGB %s -> xterm color approx \033[38;5;%sm%s (%s)' % (arg, short, short, rgb))
        sys.stdout.write("\033[0m\n")
And my nearly complete translated Lua code:
-- Colour look-up table: xterm colour code (string key) -> RGB hex string.
CLUT = {
-- Primary 3-bit (8 colors). Unique representation!
['00'] = '000000',
['01'] = '800000',
['02'] = '008000',
['03'] = '808000',
['04'] = '000080',
['05'] = '800080',
['06'] = '008080',
['07'] = 'c0c0c0',
}
-- Convert a hexadecimal string to its numeric value.
function _str2hex(hexstr)
  local value = tonumber(hexstr, 16)
  return value
end
-- Strip a leading "#" from rgb, if present.
function _strip_hash(rgb)
  -- string.gsub returns the new string AND the number of substitutions;
  -- the extra parentheses truncate the result to the string only, so the
  -- count does not leak into callers' argument lists.
  return (rgb:gsub("^#", ""))
end
-- (Re)build the global lookup tables from CLUT:
--   short2rgb_dict: xterm code -> RGB hex (this is CLUT itself)
--   rgb2short_dict: RGB hex -> xterm code
-- Returns rgb2short_dict, short2rgb_dict.
function _create_dicts()
  local inverse = {}
  for code, hex in pairs(CLUT) do
    inverse[hex] = code
  end
  -- Both tables are deliberately global; other functions read them by name.
  short2rgb_dict, rgb2short_dict = CLUT, inverse
  return rgb2short_dict, short2rgb_dict
end
-- Return the RGB hex string for the xterm colour code `short`
-- (nil when the code is not in the table).
function short2rgb(short)
  -- The lookup table is a global that only exists after _create_dicts()
  -- has run; build it on first use instead of indexing nil.
  if short2rgb_dict == nil then
    _create_dicts()
  end
  return short2rgb_dict[short]
end
-- First-draft translation of the Python rgb2short.  The accompanying text
-- states that it does not return the proper equiv/res values; the comments
-- below mark where it differs from the Python code it was translated from.
function rgb2short(rgb)
-- Find closest xterm-256 approximation to the given RGB value
_create_dicts()
rgb = _strip_hash(rgb)
local res = ""
local equiv = ""
local incs = {"0x00", "0x5f", "0x87", "0xaf", "0xd7", "0xff"}
-- Walk the string two characters (one colour channel) at a time.
for part in string.gmatch(rgb, "(..)") do
part = tonumber(part, 16)
i = 1
while i < #incs - 1 do
s, b = tonumber(incs[i]), tonumber(incs[i+1])
if s <= part and part <= b then
s1 = math.abs(s - part)
b1 = math.abs(b - part)
-- This `end` closes the range check, so the comparison below runs on
-- every iteration, not only when `part` lies between s and b.
end
if s1 < b1 then
closest = s
else
closest = b
-- The append and break sit inside the else branch only, and `closest`
-- is appended as a decimal number rather than as two hex digits.
res = res .. closest
break
end
i = i + 1
end
end
equiv = rgb2short_dict[res]
return equiv, res
end
I realize that I'm missing the printing portion of the code, but I wasn't sure if that was at all relevant, and I know some of the code I've translated is not correct at all, as the script would be working otherwise. The failures I get are with the rgb2short function with it not returning the proper equiv and res values. How far off am I with my revision? What changes do I need to make to make it absolutely work?
I wound up figuring it out on my own after some hardcore trial and error. The function rgb2short should have been:
-- Find the closest xterm-256 approximation to the given RGB value.
--   rgb: hex string such as "abcdef", optionally prefixed with "#".
-- Returns equiv, res: the xterm code (nil when the approximated colour is
-- not in the table) and the approximated colour as a 6-digit hex string.
function rgb2short(rgb)
  _create_dicts()
  rgb = _strip_hash(rgb)
  local res = ""
  local incs = {0x00, 0x5f, 0x87, 0xaf, 0xd7, 0xff}
  -- Walk the string two characters (one colour channel) at a time.
  for hex in string.gmatch(rgb, "(..)") do
    local part = tonumber(hex, 16)
    -- Lua tables are 1-based: the adjacent pairs are (1,2) .. (#incs-1,#incs).
    -- Stopping one pair early would skip the 0xd7..0xff interval.
    for i = 1, #incs - 1 do
      local s, b = incs[i], incs[i + 1]
      if s <= part and part <= b then
        -- Pick the nearer bound; an exact tie goes to the bigger one.
        local closest = b
        if math.abs(s - part) < math.abs(b - part) then
          closest = s
        end
        res = res .. string.format("%02x", closest)
        break
      end
    end
  end
  local equiv = rgb2short_dict[res]
  return equiv, res
end

Windows/Python Error WindowsError: [Error 3] The system cannot find the path specified

Hi, I am new to Python and I need some help. I am trying to run a file on Windows 10 with Python 2.7.
import os
import re
import codecs
import numpy as np
import theano
models_path = "./models"
eval_path = "./evaluation"
eval_temp = os.path.join(eval_path, "temp")
eval_script = os.path.join(eval_path, "conlleval")
def get_name(parameters):
    """
    Generate a model name from its parameters.

    String values containing "/" are reduced to the part after the last
    slash, commas are removed from values, and the characters
    ``\\ / : * ? < > |`` are removed from the final name.
    """
    pairs = []
    for k, v in parameters.items():
        if isinstance(v, str) and "/" in v:
            v = v.rsplit("/", 1)[-1]
        pairs.append((k, v))
    name = ",".join("%s=%s" % (k, str(v).replace(",", "")) for k, v in pairs)
    return "".join(ch for ch in name if ch not in "\\/:*?<>|")
def set_values(name, param, pretrained):
    """
    Initialize a network parameter with pretrained values.
    We check that sizes are compatible.

    ``param`` must provide ``get_value()`` and ``set_value()``; the
    pretrained array is reshaped to the parameter's shape and stored as
    float32.

    Raises:
        ValueError: if the number of elements differs.
    """
    param_value = param.get_value()
    if pretrained.size != param_value.size:
        raise ValueError(
            "Size mismatch for parameter %s. Expected %i, found %i."
            % (name, param_value.size, pretrained.size)
        )
    reshaped = np.reshape(pretrained, param_value.shape)
    param.set_value(reshaped.astype(np.float32))
def shared(shape, name):
    """
    Create a shared object of a numpy array.

    One-dimensional shapes (biases) are initialized with zeros; any other
    shape is drawn uniformly from ``[-drange, drange)`` with
    ``drange = sqrt(6 / sum(shape))``.
    """
    if len(shape) == 1:
        value = np.zeros(shape)  # bias are initialized with zeros
    else:
        drange = np.sqrt(6. / (np.sum(shape)))
        value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
    return theano.shared(value=value.astype(theano.config.floatX), name=name)
def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.

    Returns a dict mapping every item to its number of occurrences.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            dico[item] = dico.get(item, 0) + 1
    return dico
def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency; ties are broken by the
    items' own sort order.

    Returns:
        ``(item_to_id, id_to_item)``
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: item for i, (item, _) in enumerate(sorted_items)}
    item_to_id = {item: i for i, item in id_to_item.items()}
    return item_to_id, id_to_item
def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    return re.sub(r'\d', '0', s)
def iob2(tags):
    """
    Check that tags have a valid IOB format.
    Tags in IOB1 format are converted to IOB2.

    The list is modified in place.  Returns False as soon as a malformed
    tag is found (tags before it may already have been converted), True
    otherwise.
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:
            return False
        if split[0] == 'B':
            continue
        # An "I-" tag that opens an entity (first tag, after "O", or after a
        # different entity type) becomes "B-": conversion IOB1 to IOB2.
        starts_entity = (
            i == 0
            or tags[i - 1] == 'O'
            or tags[i - 1][1:] != tag[1:]
        )
        if starts_entity:
            tags[i] = 'B' + tag[1:]
    return True
def iob_iobes(tags):
    """
    IOB -> IOBES

    A "B-" tag not followed by an "I-" tag becomes "S-"; an "I-" tag not
    followed by another "I-" tag becomes "E-".

    Raises:
        Exception: if a tag is not "O" and does not start with "B" or "I".
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
            continue
        prefix = tag.split('-')[0]
        next_is_inside = (
            i + 1 < len(tags) and tags[i + 1].split('-')[0] == 'I'
        )
        if prefix == 'B':
            new_tags.append(tag if next_is_inside else tag.replace('B-', 'S-'))
        elif prefix == 'I':
            new_tags.append(tag if next_is_inside else tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags
def iobes_iob(tags):
    """
    IOBES -> IOB

    "S-" becomes "B-", "E-" becomes "I-"; "B", "I" and "O" tags are kept.

    Raises:
        Exception: if a tag starts with any other prefix.
    """
    new_tags = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix in ('B', 'I', 'O'):
            new_tags.append(tag)
        elif prefix == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif prefix == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        else:
            raise Exception('Invalid format!')
    return new_tags
def insert_singletons(words, singletons, p=0.5):
    """
    Replace singletons by the unknown word with a probability p.

    Every word found in ``singletons`` is replaced by 0 with probability
    ``p``; all other words are returned unchanged.
    """
    return [
        0 if word in singletons and np.random.uniform() < p else word
        for word in words
    ]
def pad_word_chars(words):
    """
    Pad the characters of the words in a sentence.
    Input:
        - list of lists of ints (list of words, a word being a list of char indexes)
    Output:
        - padded list of lists of ints
        - padded list of lists of ints (where chars are reversed)
        - list of ints corresponding to the index of the last character of each word

    An empty sentence yields three empty lists.
    """
    # max() raises on an empty sequence, so guard the empty sentence.
    max_length = max(len(word) for word in words) if words else 0
    char_for = []
    char_rev = []
    char_pos = []
    for word in words:
        padding = [0] * (max_length - len(word))
        char_for.append(word + padding)
        char_rev.append(word[::-1] + padding)
        char_pos.append(len(word) - 1)
    return char_for, char_rev, char_pos
def create_input(data, parameters, add_label, singletons=None):
    """
    Take sentence data and return an input for
    the training or the evaluation function.

    The returned list contains, in this order and only when the matching
    parameter is set: words (``word_dim``), forward chars, reversed chars
    (``char_bidirect``) and char positions (``char_dim``), capitalization
    features (``cap_dim``), and finally the tags when ``add_label`` is true.
    """
    words = data['words']
    chars = data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    char_for, char_rev, char_pos = pad_word_chars(chars)
    # Named `inputs` so the builtin input() is not shadowed.
    inputs = []
    if parameters['word_dim']:
        inputs.append(words)
    if parameters['char_dim']:
        inputs.append(char_for)
        if parameters['char_bidirect']:
            inputs.append(char_rev)
        inputs.append(char_pos)
    if parameters['cap_dim']:
        inputs.append(data['caps'])
    if add_label:
        inputs.append(data['tags'])
    return inputs
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, dictionary_tags, eval_id):
    """
    Evaluate current model using CoNLL script.

    Predictions are written to ``eval_temp/eval.<eval_id>.output``, the
    external ``eval_script`` is run on that file, its report is printed
    together with a confusion matrix, and the last field of the second
    report line is returned as a float.  ``dictionary_tags`` is accepted
    but not used.
    """
    n_tags = len(id_to_tag)
    predictions = []
    count = np.zeros((n_tags, n_tags), dtype=np.int32)
    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        inputs = create_input(data, parameters, False)
        if parameters['crf']:
            # The first and last predicted positions are dropped.
            y_preds = np.array(f_eval(*inputs))[1:-1]
        else:
            y_preds = f_eval(*inputs).argmax(axis=1)
        y_reals = np.array(data['tags']).astype(np.int32)
        assert len(y_preds) == len(y_reals)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        r_tags = [id_to_tag[y_real] for y_real in y_reals]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
            r_tags = iobes_iob(r_tags)
        for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
            new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
            predictions.append(new_line)
            count[y_real, y_pred] += 1
        predictions.append("")

    # Write predictions to disk and run CoNLL script externally
    output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
    scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    # NOTE(review): this is a shell command built by string formatting; it
    # relies on the paths containing no shell metacharacters.
    os.system("%s < %s > %s" % (eval_script, output_path, scores_path))

    # CoNLL evaluation results (read inside `with` so the file is closed)
    with codecs.open(scores_path, 'r', 'utf8') as f:
        eval_lines = [l.rstrip() for l in f]
    for line in eval_lines:
        print(line)

    # Confusion matrix with accuracy for each tag
    row_format = "{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)
    print(row_format.format(
        "ID", "NE", "Total",
        *([id_to_tag[i] for i in range(n_tags)] + ["Percent"])
    ))
    for i in range(n_tags):
        print(row_format.format(
            str(i), id_to_tag[i], str(count[i].sum()),
            *([count[i][j] for j in range(n_tags)] +
              ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
        ))

    # Global accuracy
    print("%i/%i (%.5f%%)" % (
        count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
    ))

    # F1 on all entities
    return float(eval_lines[1].strip().split()[-1])
When I run the code as it is, I always get the error below. I think it is either because of the path-length restriction in Windows or because it needs different slashes. I don't know what to add or remove in order to resolve the problem.
run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):
File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in
model = Model(parameters=parameters, models_path=models_path)
File "model.py", line 36, in init
os.makedirs(self.model_path)
File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
mkdir(name, mode)
WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
On Windows, paths are written with a backslash \ instead of the forward slash / that is used on Linux/Unix.
Try it like below if the folder is one level up:
models_path = "..\models"
eval_path = "..\evaluation"

SHA256 doesn't yield same result

I'm following on this tutorial and in part 2 (picture below) it shows that the SHA256 yields a result different than what I get when I ran my python code:
the string is: 0450863AD64A87AE8A2FE83C1AF1A8403CB53F53E486D8511DAD8A04887E5B23522CD470243453A299FA9E77237716103ABC11A1DF38855ED6F2EE187E9C582BA6
While the tutorial SHA256 comes to: 600FFE422B4E00731A59557A5CCA46CC183944191006324A447BDB2D98D4B408
My short python shows:
sha_result = sha256(bitconin_addresss).hexdigest().upper()
print sha_result
32511E82D56DCEA68EB774094E25BAB0F8BDD9BC1ECA1CEEDA38C7A43ACEDDCE
in fact, any online sha256 shows the same python result; so am I missing here something?
You're hashing the string when you're supposed to be hashing the bytes represented by that string.
>>> hashlib.sha256('0450863AD64A87AE8A2FE83C1AF1A8403CB53F53E486D8511DAD8A04887E5B23522CD470243453A299FA9E77237716103ABC11A1DF38855ED6F2EE187E9C582BA6'.decode('hex')).hexdigest().upper()
'600FFE422B4E00731A59557A5CCA46CC183944191006324A447BDB2D98D4B408'
You could use Gavin's "base58.py", which I believe he no longer shares on his GitHub page. However, you can probably find different versions of it on GitHub with a quick search.
Here is one version edited a little by me:
#!/usr/bin/env python
"""encode/decode base58 in the same way that Bitcoin does"""
import math
import sys
# Bitcoin's base58 alphabet: no 0, O, I or l.
__b58chars = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
__b58base = len(__b58chars)


def b58encode(v):
    """ encode v, which is a string of bytes, to base58.

    Returns the encoding as text.  Every leading zero byte of the input
    becomes one leading '1'; empty input encodes to the empty string.
    """
    # bytearray yields integers on both Python 2 and Python 3.
    data = bytearray(v)
    long_value = 0
    for byte in data:
        long_value = (long_value << 8) | byte

    digits = []
    # Emit digits only while a value remains, so all-zero input is
    # represented by its padding alone.
    while long_value > 0:
        long_value, mod = divmod(long_value, __b58base)
        digits.append(__b58chars[mod])

    # Bitcoin does a little leading-zero-compression:
    # leading 0-bytes in the input become leading-1s
    nPad = len(data) - len(data.lstrip(b'\0'))
    return (__b58chars[0] * nPad) + ''.join(reversed(digits))
def b58decode(v):
    """ decode the base58 text v into a byte string.

    Every leading '1' of the input becomes one leading zero byte.

    Raises:
        ValueError: if v contains a character outside the base58 alphabet.
    """
    long_value = 0
    for c in v:
        digit = __b58chars.find(c)
        if digit < 0:
            # find() returns -1 for unknown characters; without this check
            # they would silently be added in as the value -1.
            raise ValueError("invalid base58 character: %r" % (c,))
        long_value = long_value * __b58base + digit

    # Bytes are collected least-significant first and reversed at the end.
    result = bytearray()
    while long_value > 0:
        long_value, mod = divmod(long_value, 256)
        result.append(mod)

    nPad = len(v) - len(v.lstrip(__b58chars[0]))
    result.extend(b'\0' * nPad)
    result.reverse()
    return bytes(result)
# Probe once for the hash support the address helpers need.
try:
    import hashlib
    hashlib.new('ripemd160')
    have_crypto = True
except (ImportError, ValueError):
    # hashlib.new() raises ValueError (not ImportError) when the underlying
    # crypto library does not provide the requested algorithm.
    have_crypto = False
def hash_160(public_key):
    """Return RIPEMD-160(SHA-256(public_key)) as a byte string.

    Returns '' when the required hash functions are unavailable.
    """
    if not have_crypto:
        return ''
    h1 = hashlib.sha256(public_key).digest()
    r160 = hashlib.new('ripemd160')
    r160.update(h1)
    return r160.digest()
def hash_160_to_bc_address(h160, version=b"\x00"):
    """Encode a 160-bit hash as a base58 address.

    The version byte is prepended, the first four bytes of the double
    SHA-256 of that payload are appended as a checksum, and the result is
    base58-encoded.  Returns '' when hashing is unavailable.
    """
    if not have_crypto:
        return ''
    # Bytes literal for the default so the concatenation also works on
    # Python 3 (on Python 2 it is the same object type as before).
    vh160 = version + h160
    h3 = hashlib.sha256(hashlib.sha256(vh160).digest()).digest()
    addr = vh160 + h3[0:4]
    return b58encode(addr)
def public_key_to_bc_address(public_key, version=b"\x00"):
    """Return the base58 address for *public_key* (a byte string).

    Returns '' when hashing is unavailable or *public_key* is None.
    """
    if not have_crypto or public_key is None:
        return ''
    h160 = hash_160(public_key)
    return hash_160_to_bc_address(h160, version=version)
def sec_to_bc_key(sec, version=b"\x80"):
    """Encode the secret *sec* (a byte string) as a base58 key.

    The payload is ``version + sec + 0x01`` followed by the first four
    bytes of its double SHA-256.  Returns '' when hashing is unavailable
    or *sec* is None.
    """
    if not have_crypto or sec is None:
        return ''
    vsec = version + sec + b"\x01"
    hvsec = hashlib.sha256(hashlib.sha256(vsec).digest()).digest()
    return b58encode(vsec + hvsec[0:4])
def bc_key_to_sec(prv):
    """Return bytes 1..32 of the base58-decoded key *prv*."""
    return b58decode(prv)[1:33]
def bc_address_to_hash_160(addr):
    """Return bytes 1..20 of the base58-decoded address *addr*."""
    # Named `decoded` so the builtin `bytes` is not shadowed.
    decoded = b58decode(addr)
    return decoded[1:21]
if __name__ == '__main__':
    # Command-line front end: "./base58.py <option> <value>".
    import binascii

    def _to_hex(raw):
        # hexlify returns bytes on Python 3; decode it for printing.
        return binascii.hexlify(raw).decode('ascii')

    # Every option needs a value, so require two arguments; otherwise show
    # the usage text instead of failing on sys.argv[2].
    if len(sys.argv) > 2:
        option, value = sys.argv[1], sys.argv[2]
        if option == '-en':
            print(b58encode(binascii.unhexlify(value)))
        if option == '-de':
            print(_to_hex(b58decode(value)))
        if option == '-pub':
            print(public_key_to_bc_address(binascii.unhexlify(value)))
        if option == '-adr':
            print(_to_hex(bc_address_to_hash_160(value)))
        if option == '-sec':
            print(sec_to_bc_key(binascii.unhexlify(value)))
        if option == '-prv':
            print(_to_hex(bc_key_to_sec(value)))
    else:
        print('')
        print('Usage: ./base58.py [options]')
        print('')
        print(' -en converts hex to base58')
        print(' -de converts base58 to hex')
        print('')
        print(' -pub public_key_to_bc_address')
        print(' -adr bc_address_to_hash_160')
        print('')
        print(' -sec sec_to_bc_key')
        print(' -prv bc_key_to_sec')
        print('')
To answer your specific question, based on above code you could use this command:
hashlib.sha256('0450863AD64A87AE8A2FE83C1AF1A8403CB53F53E486D8511DAD8A04887E5B23522CD470243453A299FA9E77237716103ABC11A1DF38855ED6F2EE187E9C582BA6'.decode('hex_codec')).digest().encode('hex_codec').upper()

Parsing Event Information Table files

My dreambox compatible video recorder stores event information table ".eit" files with every recording. I'd like to work with this information to rearrange my recordings.
A similar question came up in http://www.i-have-a-dreambox.com/wbb2/thread.php?threadid=186234&sid=3b36acb1ba62e4724cb47216ce08a564
The format seems to be a binary format as outlined in:
https://de.wikipedia.org/wiki/Event_Information_Table
and
http://www.etsi.org/deliver/etsi_en/300400_300499/300468/01.14.01_60/en_300468v011401p.pdf
I am now looking for a parser for such files. Where could I find one that works with files and does not assume a broadcast stream as input?
What did i try so far?
I searched the web and found the following links and pointers:
There seems to be a java library
https://docs.oracle.com/javame/config/cdc/opt-pkgs/api/jsr927/javax/tv/service/guide/ProgramEvent.html
which is part of the JSR 927 https://jcp.org/en/jsr/detail?id=927
specification.
As it looks this libary is only available for Java-ME see https://en.wikipedia.org/wiki/Java_TV
I found some DVB-related EIT code snippets, e.g.
https://github.com/jinfeng-geeya/3202C/blob/master/SRC/lib/libdvb/libepg/eit_parser.c
or
http://linuxtv.org/docs/libdvbv5/eit_8h.html
as part of the Kamaelia DVB Tools Project http://www.kamaelia.org/Developers/Projects/DVBTools.html there seems to be a python solution:
http://pydoc.net/Python/Kamaelia/0.6.0/Kamaelia.Device.DVB.Parse.ParseEventInformationTable/
The closest I found so far was from a hint at http://forums.openpli.org/topic/29141-eit-file-format/ which points to:
https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
Currently I am pursuing to go from this Open Source Python Code.
This is a Python script that seems to be a valid start.
It's available as opensource at https://github.com/WolfgangFahl/eitparser where you'll find the latest python3 compatible version and documentation.
When you call it with
python EitParser.py SomeEitFile
it will print out the name and description of the eit file.
Add your language codes as needed, e.g. from https://github.com/libo/Enigma2/blob/master/lib/python/Tools/ISO639.py
#!/usr/bin/python
# encoding: utf-8
#
# EitSupport
# Copyright (C) 2011 betonme
# Copyright (C) 2016 Wolfgang Fahl
#
# This EITParser is based on:
# https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
#
# In case of reuse of this source code please do not remove this copyright.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For more information on the GNU General Public License see:
# <http://www.gnu.org/licenses/>.
#
import os
import struct
import time
from datetime import datetime
#from Components.config import config
#from Components.Language import language
#from EMCTasker import emcDebugOut
#from IsoFileSupport import IsoSupport
#from MetaSupport import getInfoFile
#def crc32(data):
# poly = 0x4c11db7
# crc = 0xffffffffL
# for byte in data:
# byte = ord(byte)
# for bit in range(7,-1,-1): # MSB to LSB
# z32 = crc>>31 # top bit
# crc = crc << 1
# if ((byte>>bit)&1) ^ z32:
# crc = crc ^ poly
# crc = crc & 0xffffffffL
# return crc
# Replacement tables applied by convertCharSpecHR / convertCharSpecCZSK:
# each key is the text to search for, each value is its replacement.
# The CZSK keys are two-character sequences (a marker character followed
# by a base letter) that map to a single accented letter.
decoding_charSpecHR = {u'Ć': u'\u0106', u'æ': u'\u0107', u'®': u'\u017D', u'¾': u'\u017E', u'©': u'\u0160', u'¹': u'\u0161', u'Č': u'\u010C', u'è': u'\u010D', u'ð': u'\u0111'}
decoding_charSpecCZSK = {u'Ï'+u'C': u'Č',u'Ï'+u'E': u'Ě',u'Ï'+u'L': u'Ľ',u'Ï'+u'N': u'Ň',u'Ï'+u'R': u'Ř',u'Ï'+u'S': u'Š',u'Ï'+u'T': u'Ť',u'Ï'+u'Z': u'Ž',u'Ï'+u'c': u'č',u'Ï'+u'd': u'ď',u'Ï'+u'e': u'ě',u'Ï'+u'l': u'ľ', u'Ï'+u'n': u'ň',
u'Ï'+u'r': u'ř',u'Ï'+u's': u'š',u'Ï'+u't': u'ť',u'Ï'+u'z': u'ž',u'Ï'+u'D': u'Ď',u'Â'+u'A': u'Á',u'Â'+u'E': u'É',u'Â'+u'I': u'Í',u'Â'+u'O': u'Ó',u'Â'+u'U': u'Ú',u'Â'+u'a': u'á',u'Â'+u'e': u'é',u'Â'+u'i': u'í',u'Â'+u'o': u'ó',
u'Â'+u'u': u'ú',u'Â'+u'y': u'ý',u'Ã'+u'o': u'ô',u'Ã'+u'O': u'Ô',u'Ê'+u'u': u'ů',u'Ê'+u'U': u'Ů',u'È'+u'A': u'Ä',u'È'+u'E': u'Ë',u'È'+u'I': u'Ï',u'È'+u'O': u'Ö',u'È'+u'U': u'Ü',u'È'+u'Y': u'Ÿ',u'È'+u'a': u'ä',u'È'+u'e': u'ë',
u'È'+u'i': u'ï',u'È'+u'o': u'ö',u'È'+u'u': u'ü',u'È'+u'y': u'ÿ'}
def convertCharSpecHR(text):
    """Apply every replacement in ``decoding_charSpecHR`` to *text*."""
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for old, new in decoding_charSpecHR.items():
        text = text.replace(old, new)
    return text
def convertCharSpecCZSK(text):
    """Apply every replacement in ``decoding_charSpecCZSK`` to *text*."""
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for old, new in decoding_charSpecCZSK.items():
        text = text.replace(old, new)
    return text
def parseMJD(MJD):
    """Convert a Modified Julian Date to a calendar date.

    Parse 16 bit unsigned int containing Modified Julian Date,
    as per DVB-SI spec, returning ``(year, month, day)``.
    """
    YY = int((MJD - 15078.2) / 365.25)
    MM = int((MJD - 14956.1 - int(YY * 365.25)) / 30.6001)
    D = MJD - 14956 - int(YY * 365.25) - int(MM * 30.6001)
    # Intermediate months 14 and 15 are January and February of the
    # following year.
    K = 1 if MM in (14, 15) else 0
    return (1900 + YY + K), (MM - 1 - K * 12), D
def unBCD(byte):
    """Decode one packed-BCD byte: high nibble is tens, low nibble is units."""
    tens, units = byte >> 4, byte & 0xf
    return tens * 10 + units
#from Tools.ISO639 import LanguageCodes
# -*- coding: iso-8859-2 -*-
# Language code -> (language name, language family).  The same tuple is
# registered under each of the codes of a language.
LanguageCodes = {}
LanguageCodes["deu"] = LanguageCodes["ger"] = LanguageCodes["de"] = ("German", "Germanic")
LanguageCodes["fra"] = LanguageCodes["fre"] = LanguageCodes["fr"] = ("French", "Romance")


def language_iso639_2to3(alpha2):
    """Return a three-letter code registered for the same language as *alpha2*.

    The first three-letter key of ``LanguageCodes`` whose entry equals the
    entry of *alpha2* is returned.  If *alpha2* is unknown, or no
    three-letter key exists for it, *alpha2* itself is returned.
    """
    if alpha2 in LanguageCodes:
        language = LanguageCodes[alpha2]
        for alpha, name in LanguageCodes.items():
            if name == language and len(alpha) == 3:
                return alpha
    return alpha2
#TEST
#print LanguageCodes["sv"]
#print language_iso639_2to3("sv")
# Eit File support class
# Description
# http://de.wikipedia.org/wiki/Event_Information_Table
class EitList():
    """Read one ``.eit`` (Event Information Table) file and expose its data.

    The file is read once by the constructor.  The parsed values are kept
    in ``self.eit`` under the keys ``when``, ``startdate``, ``starttime``,
    ``duration``, ``name`` and ``description``; ``self.eit`` stays empty
    when the file is missing or shorter than the 12-byte event header.

    Description of the format:
    http://de.wikipedia.org/wiki/Event_Information_Table
    """

    EIT_SHORT_EVENT_DESCRIPTOR = 0x4d
    EIT_EXTENDED_EVENT_DESCRIPOR = 0x4e

    def __init__(self, path=None):
        self.eit_file = None
        self.eit = {}
        self.iso = None
        self.__newPath(path)
        self.__readEitFile()

    def __newPath(self, path):
        # Remember the path only when one is given and it differs.
        if path and self.eit_file != path:
            self.eit_file = path

    def __mk_int(self, s):
        return int(s) if s else 0

    def __toDate(self, d, t):
        # d is (year, month, day), t is (hour, minute[, second]).
        if d and t:
            try:
                return datetime(int(d[0]), int(d[1]), int(d[2]), int(t[0]), int(t[1]))
            except ValueError:
                return None
        return None

    @staticmethod
    def __decodeText(raw):
        # The files may be UTF-8 or cp1252, so try both in that order.  If
        # neither decodes cleanly, fall back to UTF-8 with replacement
        # characters so that callers always get text.
        if not raw:
            return u""
        for codec in ("utf-8", "cp1252"):
            try:
                return raw.decode(codec)
            except UnicodeDecodeError:
                continue
        return raw.decode("utf-8", "replace")

    ##############################################################################
    ## Get Functions
    def getEitsid(self):
        return self.eit.get('service', "")  # TODO

    def getEitTsId(self):
        return self.eit.get('transportstream', "")  # TODO

    def getEitWhen(self):
        return self.eit.get('when', "")

    def getEitStartDate(self):
        return self.eit.get('startdate', "")

    def getEitStartTime(self):
        return self.eit.get('starttime', "")

    def getEitDuration(self):
        return self.eit.get('duration', "")

    def getEitName(self):
        return self.eit.get('name', "").strip()

    def getEitDescription(self):
        return self.eit.get('description', "").strip()

    # Wrapper
    def getEitShortDescription(self):
        return self.getEitName()

    def getEitExtendedDescription(self):
        return self.getEitDescription()

    def getEitLengthInSeconds(self):
        # duration is stored as (hours, minutes, seconds).
        length = self.eit.get('duration', "")
        if len(length) > 2:
            return self.__mk_int((length[0] * 60 + length[1]) * 60 + length[2])
        elif len(length) > 1:
            return self.__mk_int(length[0] * 60 + length[1])
        else:
            return self.__mk_int(length)

    def getEitDate(self):
        return self.__toDate(self.getEitStartDate(), self.getEitStartTime())

    ##############################################################################
    ## File IO Functions
    def __readEitFile(self):
        data = b""
        path = self.eit_file
        lang = language_iso639_2to3("de")
        if not (path and os.path.exists(path)):
            return

        try:
            with open(path, 'rb') as f:
                data = f.read()
        except (IOError, OSError) as e:
            # Best effort: report the problem and continue with no data.
            print("[META] Exception in readEitFile: " + str(e))

        if not data or len(data) < 12:
            # No data, clear all
            self.eit = {}
            return

        # 12-byte event header: id, MJD date, BCD start time, BCD duration,
        # then running status (3 bits), CA flag (1) and loop length (12).
        header = struct.unpack(">HHBBBBBBH", data[0:12])
        date = parseMJD(header[1])  # Y, M, D
        start_time = unBCD(header[2]), unBCD(header[3]), unBCD(header[4])  # HH, MM, SS
        duration = unBCD(header[5]), unBCD(header[6]), unBCD(header[7])  # HH, MM, SS
        running_status = (header[8] & 0xe000) >> 13
        if running_status in [1, 2]:
            self.eit['when'] = "NEXT"
        elif running_status in [3, 4]:
            self.eit['when'] = "NOW"
        self.eit['startdate'] = date
        self.eit['starttime'] = start_time
        self.eit['duration'] = duration

        # bytearray indexes to integers on both Python 2 and Python 3.
        buf = bytearray(data)
        pos = 12
        short_texts = []        # short descriptors in the wanted language
        short_texts_multi = []  # short descriptors in any language
        ext_texts = []
        ext_texts_multi = []
        endpos = len(buf) - 1
        while pos < endpos:
            rec = buf[pos]
            length = buf[pos + 1] + 2  # payload length plus tag and length bytes
            if rec == self.EIT_SHORT_EVENT_DESCRIPTOR:
                # Layout: tag, length, 3-byte language code, name length, name.
                code = bytes(buf[pos + 2:pos + 5]).decode("ascii", "replace")
                name_length = buf[pos + 5] if pos + 5 < len(buf) else 0
                text = bytes(buf[pos + 6:pos + 6 + name_length])
                if code.lower() == lang.lower():
                    short_texts.append(text)
                short_texts_multi.append(text)
            elif rec == self.EIT_EXTENDED_EVENT_DESCRIPOR:
                code = bytes(buf[pos + 2:pos + 5]).decode("ascii", "replace")
                text = bytearray()
                for byte in buf[pos + 8:pos + length]:
                    if byte == 138:
                        # 0x8A is turned into a line break.
                        text.extend(b'\n')
                    elif byte in (0x10, 0x00, 0x02):
                        # These control bytes are dropped.
                        continue
                    else:
                        text.append(byte)
                text = bytes(text)
                if code.lower() == lang.lower():
                    ext_texts.append(text)
                ext_texts_multi.append(text)
            # All other descriptors (component, content, linkage, parental
            # rating, ...) are skipped; nothing is read from them.
            pos += length

        # Prefer the text in the wanted language; otherwise use everything.
        name = self.__decodeText(b"".join(short_texts or short_texts_multi))
        description = self.__decodeText(b"".join(ext_texts or ext_texts_multi))
        if lang in ("cs", "sk"):
            name = convertCharSpecCZSK(name)
            description = convertCharSpecCZSK(description)
        if lang == "hr":
            name = convertCharSpecHR(name)
            description = convertCharSpecHR(description)
        self.eit['name'] = name
        self.eit['description'] = description
"""Module docstring.
Read Eit File and show the information.
"""
import sys
import getopt
def readeit(eitfile):
    """Print the name, start date and description stored in *eitfile*."""
    eitlist = EitList(eitfile)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(eitlist.getEitName())
    print(eitlist.getEitStartDate())
    print(eitlist.getEitDescription())
def main():
    """Command-line entry point: print the contents of every given EIT file.

    Exits with status 2 on an unknown option and with status 0 after
    printing the help text for ``-h`` / ``--help``.
    """
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            # __doc__ is None unless the module starts with a docstring;
            # fall back to a short usage line in that case.
            print(__doc__ or "usage: python EitParser.py EITFILE [EITFILE ...]")
            sys.exit(0)
    # process arguments
    for arg in args:
        readeit(arg)
if __name__ == "__main__":
main()

I have 2 expressions which should be valid that are not recognized as such

import re
cards1 = "'F'*4 + 'H'*10"; cards2 = 'FFHH'
def find_number_of_cards(cards):
    """Extract the F and H card counts from an expression string.

    Returns:
        * ``"The expression given is not valid."`` when the pattern does not
          match anywhere in *cards*;
        * ``(F, H)`` (the two counts as strings) when the ``'F'*n + 'H'*m``
          alternative matched;
        * ``"Blank."`` when the plain ``[FH]+`` alternative matched.
    """
    # The pattern is kept exactly as written.  The spaces on both sides of
    # "|" are literal: the first alternative requires a trailing space and
    # the second a leading space in the input.
    regexp = re.compile(r"(?P<FandH>[FH]+) | (('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
    result = regexp.search(cards)
    if result is None:
        return ("The expression given is not valid.")
    if result.group('FandH') is None:
        return result.group('F'), result.group('H')
    return "Blank."
print(find_number_of_cards(cards1))
print(find_number_of_cards(cards2))
Change this:
regexp = re.compile(r"(?P<FandH>[FH]+) | (('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
to this:
regexp = re.compile(r"(?P<FandH>[FH]+)|(('F')[*](?P<F>[0-9]+)\s*[+]\s*('H')[*](?P<H>[0-9]+))")
It's looking for a space in the string, which isn't there.

Categories

Resources