Convert a file to static C string declaration - python

I would like to convert a number of files to static string declarations in C. I have tried writing a quick script in Python (shown below), but it doesn't seem exactly simple, and a number of issues came up when trying to compile the output.
import os, sys
from glob import glob
from re import sub
# Directories whose *.py files are turned into C test functions.
test_dirs = ('basics', 'float', 'import', 'io', 'misc')
# Flatten the per-directory glob results into one sorted list of paths.
tests = sorted(test_file for test_files in (glob('{}/*.py'.format(test_dir)) for test_dir in test_dirs) for test_file in test_files)


def cfunc_name(t):
    """Return path *t* with every '/', '.' and '-' replaced by '_'."""
    return sub(r'/|\.|-', '_', t)


# Emit one C function per test file, embedding the file's text as a string.
for t in tests:
    print("void {}(void* data) {{".format(cfunc_name(t)))
    with open(t) as f:
        lines = ''.join(f.readlines())
    # NOTE: only double quotes and newlines are handled here; backslashes
    # in the input are passed through unescaped.
    cstr = sub('"', '\\"', lines)
    cstr = sub('\n', '\"\n\"', cstr)
    print(" const char * pystr = \"\"\n\"{}\";".format(cstr))
    print("end:\n ;\n}")

# Emit the table of test cases.
# NOTE: the table refers to test_<name>_fn, while the functions above are
# emitted as plain <name>.
print("struct testcase_t core_tests[] = {")
for t in tests:
    print(" {{ \"{}\", test_{}_fn, TT_ENABLED_, 0, 0 }},".format(t, cfunc_name(t)))
print("END_OF_TESTCASES };")
Finding an existing tool is not exactly straightforward (maybe my search keywords are not quite right)... Is there a simple UNIX tool that does this, or has anyone come across something similar?

Does this work for you? https://code.google.com/p/txt2cs/
The main issue I can think of is new lines and escaping if you want to roll your own.

I have used the txt2cs implementation as a reference and ended-up with just a few lines of Python that do all the escaping. As I didn't want to add extra things to the build system, it's easier to have this done in Python. This is going to be integrated in test automation, which is already a complex beast.
The main takeaway is that RE substitutions have to be done in a certain order and aren't the ideal tool for this purpose.
import os, sys
from glob import glob
from re import sub
def escape(s):
    """Return the text *s* as a C string literal spread over several lines.

    The result starts with an empty literal (``""``) followed by a line
    break, and a new adjacent literal is started after every newline of
    *s*, so the generated C source keeps the line structure of the input.
    NUL, tab, newline, carriage return, backslash and double quote are
    escaped; every other character is copied unchanged.
    """
    lookup = {
        # Three-digit octal form: a bare "\0" followed by a digit in the
        # input would otherwise be read by the C compiler as a different
        # octal escape (e.g. "\0" + "1" -> "\01").
        '\0': '\\000',
        '\t': '\\t',
        # Escape the newline, then close the literal and open a new one
        # on the next output line.
        '\n': '\\n\"\n\"',
        '\r': '\\r',
        '\\': '\\\\',
        '\"': '\\\"',
    }
    return "\"\"\n\"{}\"".format(''.join(lookup.get(x, x) for x in s))
def chew_filename(t):
    """Derive the C identifiers for the test script at path *t*.

    Returns a dict with:
      'func' -- "test_<path>_fn", where '/', '.' and '-' in the path are
                replaced by '_';
      'desc' -- the second '/'-separated component of *t* (the file name
                for paths of the form "<dir>/<file>").

    Raises IndexError if *t* contains no '/'.
    """
    return {
        'func': "test_{}_fn".format(sub(r'/|\.|-', '_', t)),
        'desc': t.split('/')[1],
    }
def script_to_map(t):
    """Return the template fields for the test script at path *t*.

    The result has 'name' (the C function name from chew_filename) and
    'script' (the file's contents as an escaped C string literal).
    """
    r = {'name': chew_filename(t)['func']}
    with open(t) as f:
        r['script'] = escape(f.read())
    return r
# C source templates, filled in with str.format(); doubled braces produce
# literal braces in the generated code.

# One test function: stores the script text and passes it to do_str().
test_function = "void {name}(void* data) {{\n const char * pystr = {script};\n do_str(pystr);\n}}"

# Array of the test cases belonging to one group.
testcase_struct = "struct testcase_t {name}_tests[] = {{\n{body}\n END_OF_TESTCASES\n}};"

# One entry of a test-case array.
testcase_member = " {{ \"{desc}\", {func}, TT_ENABLED_, 0, 0 }},"

# Array listing every group.
testgroup_struct = "struct testgroup_t groups[] = {{\n{body}\n END_OF_GROUPS\n}};"

# One entry of the group array.
testgroup_member = " {{ \"{name}/\", {name}_tests }},"
# Directories whose *.py files become test groups.
test_dirs = ('basics', 'float', 'import', 'io', 'misc')

output = []

for group in test_dirs:
    # glob() returns paths in arbitrary order; sort so the generated
    # source is the same on every run.
    tests = sorted(glob('{}/*.py'.format(group)))
    # One C function per script, then the group's table of test cases.
    output.extend([test_function.format(**script_to_map(test)) for test in tests])
    testcase_members = [testcase_member.format(**chew_filename(test)) for test in tests]
    output.append(testcase_struct.format(name=group, body='\n'.join(testcase_members)))

# Finally, the table that lists every group.
testgroup_members = [testgroup_member.format(name=group) for group in test_dirs]
output.append(testgroup_struct.format(body='\n'.join(testgroup_members)))

print('\n\n'.join(output))
Below is what the output looks like, as you can see the initial ""\n and '\\n\"\n\"' make it quite a bit more readable:
void test_basics_break_py_fn(void* data) {
const char * pystr = ""
"while True:\n"
" break\n"
"\n"
"for i in range(4):\n"
" print('one', i)\n"
" if i > 2:\n"
" break\n"
" print('two', i)\n"
"\n"
"for i in [1, 2, 3, 4]:\n"
" if i == 3:\n"
" break\n"
" print(i)\n"
"";
do_str(pystr);
}

Related

PyParsing: parse if not a keyword

I am trying to parse a file as follows:
testp.txt
title = Test Suite A;
timeout = 10000
exp_delay = 500;
log = TRUE;
sect
{
type = typeA;
name = "HelloWorld";
output_log = "c:\test\out.log";
};
sect
{
name = "GoodbyeAll";
type = typeB;
comm1_req = 0xDEADBEEF;
comm1_resp = (int, 1234366);
};
The file first contains a section with parameters and then some sects. I can parse a file containing just parameters and I can parse a file just containing sects but I can't parse both.
from pyparsing import *
from pathlib import Path
# A request value: a single run of letters/digits (e.g. 0xDEADBEEF).
command_req = Word(alphanums)
# A response value: "(" then a comma-separated list of alphanumeric words then ")".
command_resp = "(" + delimitedList(Word(alphanums)) + ")"
# One value token: an identifier-like word, a request word, or a response list.
kW = Word(alphas+'_', alphanums+'_') | command_req | command_resp
# A parameter name: must not be the literal "sect" and must be followed by "=".
keyName = ~Literal("sect") + Word(alphas+'_', alphanums+'_') + FollowedBy("=")
# A value: a double-quoted string (quotes removed) or value tokens up to the end of the line.
keyValue = dblQuotedString.setParseAction( removeQuotes ) | OneOrMore(kW,stopOn=LineEnd())
# "name = value" pairs with an optional trailing ";", collected as a dictionary.
param = dictOf(keyName, Suppress("=")+keyValue+Optional(Suppress(";")))
# A section: "sect", "{", one or more parameters, "};" -- grouped in the results.
node = Group(Literal("sect") + Literal("{") + OneOrMore(param) + Literal("};"))
# Top level: either only sections or only parameters, not a mix of both.
final = OneOrMore(node) | OneOrMore(param)
# Enable pyparsing's debug output for the `param` expression.
param.setDebug()
# testp.txt is looked up next to this script.
p = Path(__file__).with_name("testp.txt")
with open(p) as f:
    try:
        # parseAll=True: the grammar must consume the whole file.
        x = final.parseFile(f, parseAll=True)
        print(x)
        print("...")
        dx = x.asDict()
        print(dx)
    except ParseException as pe:
        print(pe)
The issue I have is that param matches against sect so it expects a =. So I tried putting in ~Literal("sect") in keyName but that just leads to another error:
Exception raised:Found unwanted token, "sect", found '\n' (at char 188), (line:4, col:56)
Expected end of text, found 's' (at char 190), (line:6, col:1)
How do I get it to use one parse method for sect and another (param) if not sect?
My final goal would be to have the whole lot in a Dict with the global params and sects included.
EDIT
Think I've figured it out:
This line...
final = OneOrMore(node) | OneOrMore(param)
...should be:
final = ZeroOrMore(param) + ZeroOrMore(node)
But I wonder if there is a more structured way (as I'd ultimately like a dict)?

Tiny Language compiler using python and regex

Hello Stack Overflow users,
I hope you are having a good day.
I'm writing a compiler for the TINY language for my homework.
I tried using regex,
but the output is weird.
First of all, I get an identifier called 't', which is not used in my input.
It also doesn't separate the identifier 'x' from the semicolon.
Thanks in advance for your help.
Here is my input
read x; {input an integer }
if 0 < x then { don’t compute if x <= 0 }
fact := 1;
repeat
fact := fact * x;
x := x - 1
until x = 0;
write fact { output factorial of x }
end
And that's my code using regex
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020
#author: PC
"""
class OwnCompiler (object):
    """Line-by-line, regex-based token dumper for a source file.

    The input file is read when the object is created; compileOutput()
    writes one "<type> <text>" line per recognised item to output.txt and
    prints a NO_... message for each line that a pattern fails to match.
    """

    def __init__ (self,file):
        """Read every line of *file* and prepare the symbol table and patterns."""
        import re
        # The context manager closes the input file once it has been read.
        with open(file,"r") as source:
            self.file=source.readlines()
        # Token names for the symbols looked up by getTokensSymbols().
        self.symbols = {
            "+":"PLUS_OP",
            "-":"MINUS_OP",
            "*":"MUL_OP",
            "/":"DIV_OP",
            "=":"EQUAL_OP",
            "<":"LESS_OP",
            ">":"GREATER_OP",
            "(":"LEFT_PARENTHESIS",
            ")":"RIGHT_PARENTHESIS",
            ":=":"ASSIGN",
            ";":"SEMICOLON",
        }
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        """Run every matcher over each input line, writing results to output.txt."""
        self.fileWrite=open("output.txt","w")
        try:
            self.fileWrite.write("Type Token\n==================\n")
            for i in self.file :
                print(i)
                self.getComment(i)
                self.getReserveWord(i)
                self.getIdentify(i)
        finally:
            # Close the output file even if a matcher raises unexpectedly.
            self.fileWrite.close()#end

    def getComment(self,text):
        """Write a COMMENT line if *text* contains a {...} comment."""
        try:
            self.fileWrite.write("COMMENT "+self.commentPattern.match(text).group(1)+"\n")
        except Exception:
            # match() returned None: no comment on this line.
            print("NO_COMMENT")

    def getReserveWord(self,text):
        """Write a RESERVE_WORD line if *text* starts with a reserved word."""
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD "+self.Compiled.group(1)+"\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                # NOTE: getSymbols() has replaced self.Compiled with a match
                # of symbolPattern, which has a single group, so group(3)
                # raises here and the message below is printed.
                self.fileWrite.write("RESERVE_WORD "+self.Compiled.group(3)+"\n")
            except Exception:
                print("NO_RESERVE_WORD2")
        except Exception:
            print("NO_RESERVE_WORD")

    def getSymbols(self,text):
        """Match symbolPattern against *text* and write its token line.

        Stores the match in self.Compiled (overwriting the caller's match).
        Raises AttributeError when the pattern does not match, because the
        lookup below is outside the try block; callers catch it.
        """
        self.Compiled= self.symbolPattern.match(text)
        # group() is the whole matched text, not just the symbol.
        self.GOT_TOKEN= self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN+" "+self.Compiled.group()+"\n")
        except Exception:
            print("NO_SYMBOLS")

    def getIdentify(self,text):
        """Write an IDENTIFIER line if *text* looks like "<name> := <value>"."""
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER "+self.Compiled.group(1)+"\n")
            self.getSymbols(text)
            # NOTE: self.Compiled now refers to the symbolPattern match, so
            # group(3) raises before the loop body can run.
            for i in self.Compiled.group(3):
                if i ==" " :
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER ")
                else:
                    self.fileWrite.write("WORD ")
            self.fileWrite.write(self.Compiled.group(3)+"\n")
        except Exception:
            print("NO_IDENTIFIRES")

    def getTokensSymbols(self,symbol):
        """Return the token name for *symbol*, or "UNKNOWN" if it is not in the table."""
        try:
            return self.symbols[symbol]
        except KeyError:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self,text):
        """Return True if int(text) succeeds, otherwise False."""
        try:
            int(text)
            return True
        except (TypeError, ValueError):
            return False
if __name__ == "__main__":
    # Tokenise input.txt; the results are written to output.txt.
    instance = OwnCompiler("input.txt")
    instance.compileOutput()
And here is my output
Type Token
==================
COMMENT { Sample program in TINY language – computes factorial }
COMMENT {input an integer }
RESERVE_WORD read
UNKNOWN x;
COMMENT { don’t compute if x <= 0 }
RESERVE_WORD if
UNKNOWN 0 < x then { don’t compute if x <=
IDENTIFIER t
UNKNOWN fact := 1;
RESERVE_WORD repeat
IDENTIFIER t
UNKNOWN fact := fact * x;
IDENTIFIER x
UNKNOWN x := x -
RESERVE_WORD until
UNKNOWN x = 0;
COMMENT { output factorial of x }
RESERVE_WORD write
RESERVE_WORD end
If you are going to parse a language you need a 'lexer' that will return individual tokens ignoring whitespace and comments. Along these lines, just as an example:
import re, collections
class Lexer(object):
    """Regex-based tokenizer that yields one token at a time from a file.

    Each token is a Token(type, value) named tuple. Whitespace and
    {...} comments are recognised but not returned. Text that no pattern
    matches is returned as an 'ERROR' token, and the stream always ends
    with an 'EOF' token.
    """

    # Each pattern is a named group; the group name becomes the token type.
    WHITESPACE = r'(?P<WHITESPACE>\s+)'
    COMMENT = r'(?P<COMMENT>{[^}]*})'
    READ = r'(?P<READ>\bread\b)'
    WRITE = r'(?P<WRITE>\bwrite\b)'
    IF = r'(?P<IF>\bif\b)'
    THEN = r'(?P<THEN>\bthen\b)'
    ELSE = r'(?P<ELSE>\belse\b)'
    END = r'(?P<END>\bend\b)'
    REPEAT = r'(?P<REPEAT>\brepeat\b)'
    UNTIL = r'(?P<UNTIL>\buntil\b)'
    OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
    LPAREN = r'(?P<LPAREN>\()'
    RPAREN = r'(?P<RPAREN>\))'
    IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
    INTEGER = r'(?P<INTEGER>\d+)'
    SEMICOLON = r'(?P<SEMICOLON>;)'

    # Alternatives are tried in this order, so the keywords must come
    # before IDENTIFIER.
    regex = re.compile('|'.join([
        WHITESPACE,
        COMMENT,
        READ,
        WRITE,
        IF,
        THEN,
        ELSE,
        END,
        REPEAT,
        UNTIL,
        OPERATOR,
        LPAREN,
        RPAREN,
        IDENTIFIER,
        INTEGER,
        SEMICOLON
    ]))

    Token = collections.namedtuple('Token', ['type','value'])

    def __init__ (self, file):
        """Read *file* completely and set up the token generator."""

        def generate_tokens(text):
            Token = Lexer.Token
            last_end = 0
            for m in Lexer.regex.finditer(text):
                start = m.start()
                end = m.end()
                if start != last_end:
                    # finditer skipped over text to find the next token,
                    # so that text is unrecognizable: an "error token"
                    yield Token('ERROR', text[last_end:start])
                last_end = end
                token = Token(m.lastgroup, m.group())
                if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                    yield token
            if last_end != len(text):
                # unrecognizable text after the last token
                yield Token('ERROR', text[last_end:])
            yield Token('EOF', '<end-of-file>')

        with open(file, "r") as f:
            text = f.read()
        self._token_generator = generate_tokens(text)

    def next_token(self):
        """Return the next token.

        Raises StopIteration if called again after the 'EOF' token.
        """
        return next(self._token_generator)
lexer = Lexer('input.txt')
# Print every token up to and including EOF.
while True:
    token = lexer.next_token()
    print(token)
    if token.type == 'EOF':
        break
Prints:
Token(type='READ', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IF', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='THEN', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='SEMICOLON', value=';')
Token(type='REPEAT', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='UNTIL', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='SEMICOLON', value=';')
Token(type='WRITE', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='END', value='end')
Token(type='EOF', value='<end-of-file>')

How to split a file into multiple files based on a repeated string?

I have a file and I want to split it into different files based on the string "async". The output I get is a little messy. I tried to use a word ("async") as the key to divide the file, but each generated file starts with the header line of the next function, followed by the body of the previous one. For example, the file is:
'use strict';
const shim = require('fabric-shim');
const util = require('util');
let Chaincode = class {
async Init(stub) {
let ret = stub.getFunctionAndParameters();
console.info(ret);
console.info('=========== Instantiated Marbles Chaincode ===========');
return shim.success();
}
async Invoke(stub) {
console.info('Transaction ID: ' + stub.getTxID());
console.info(util.format('Args: %j', stub.getArgs()));
let ret = stub.getFunctionAndParameters();
console.info(ret);
let method = this[ret.fcn];
if (!method) {
console.log('no function of name:' + ret.fcn + ' found');
throw new Error('Received unknown function ' + ret.fcn + ' invocation');
}
try {
let payload = await method(stub, ret.params, this);
return shim.success(payload);
} catch (err) {
console.log(err);
return shim.error(err);
}
}
async initMarble(stub, args, thisClass) {
if (args.length != 4) {
throw new Error('Incorrect number of arguments. Expecting 4');
}
// ==== Input sanitation ====
console.info('--- start init marble ---')
if (args[0].lenth <= 0) {
throw new Error('1st argument must be a non-empty string');
}
if (args[1].lenth <= 0) {
throw new Error('2nd argument must be a non-empty string');
}
if (args[2].lenth <= 0) {
throw new Error('3rd argument must be a non-empty string');
}
if (args[3].lenth <= 0) {
throw new Error('4th argument must be a non-empty string');
}
let marbleName = args[0];
let color = args[1].toLowerCase();
let owner = args[3].toLowerCase();
let size = parseInt(args[2]);
if (typeof size !== 'number') {
throw new Error('3rd argument must be a numeric string');
}
let marbleState = await stub.getState(marbleName);
if (marbleState.toString()) {
throw new Error('This marble already exists: ' + marbleName);
}
// ==== Create marble object and marshal to JSON ====
let marble = {};
marble.docType = 'marble';
marble.name = marbleName;
marble.color = color;
marble.size = size;
marble.owner = owner;
await stub.putState(marbleName, Buffer.from(JSON.stringify(marble)));
let indexName = 'color~name'
let colorNameIndexKey = await stub.createCompositeKey(indexName, [marble.color, marble.name]);
console.info(colorNameIndexKey);
console.info('- end init marble');
}
i tried this:
import re
import os
filetype = '.js'
result = ''
count = 0
start = 0
name = 'functions'
matchedLine = ''
stringToMatch = 'async'
with open ('myjson.js', 'r') as f:
    for x in f.read().split("\n"):
        if stringToMatch in x:
            if (start == 1):
                # A later "async" line: write it first, followed by the
                # lines collected *before* it.
                with open (name + str(count) + '.js', 'w') as opf:
                    matchedLine = x
                    opf.write(matchedLine + '\n' + result)
                    opf.close()
                    result = ''
                    print (count)
                    count+= 1
                    matchedLine = ''
            else:
                # First "async" line: only remember that one was seen.
                start = 1
        else:
            # Any other line is appended to the pending text.
            if (result == ''):
                result = x
            else:
                result = result + '\n' + x
but the output is a little bit messy
function0.js:
async Invoke(stub) {
'use strict';
const shim = require('fabric-shim');
const util = require('util');
let Chaincode = class {
let ret = stub.getFunctionAndParameters();
console.info(ret);
console.info('=========== Instantiated Marbles Chaincode ===========');
return shim.success();
}
function1.js:
async initMarble(stub, args, thisClass) {
console.info('Transaction ID: ' + stub.getTxID());
console.info(util.format('Args: %j', stub.getArgs()));
let ret = stub.getFunctionAndParameters();
console.info(ret);
let method = this[ret.fcn];
if (!method) {
console.log('no function of name:' + ret.fcn + ' found');
throw new Error('Received unknown function ' + ret.fcn + ' invocation');
}
try {
let payload = await method(stub, ret.params, this);
return shim.success(payload);
} catch (err) {
console.log(err);
return shim.error(err);
}
}
There must be many ways to do this. Here is one:
import re
class Writer:
    """Write data to a sequence of numbered files: file0.js, file1.js, ...

    Data written before the first start_file() call, or after close(),
    is discarded.
    """

    def __init__(self):
        self._num = 0      # index used for the next file name
        self._fh = None    # currently open output file, if any

    def close(self):
        """Close the current file, if one is open."""
        if self._fh:
            self._fh.close()
            # Forget the handle so a later write() is skipped instead of
            # hitting a closed file.
            self._fh = None

    def start_file(self):
        """Close the current file and open the next numbered one."""
        self.close()
        self._fh = open("file{}.js".format(self._num), "w")
        self._num += 1

    def write(self, data):
        """Write *data* to the current file; do nothing if none is open."""
        if self._fh:
            self._fh.write(data)
writer = Writer()
with open('myjson.js') as f:
    for line in f:
        # A line that begins (after optional spaces) with "async " starts
        # a new output file; the line itself goes into that new file.
        if re.match(' *async ', line):
            writer.start_file()
        writer.write(line)
writer.close()
If your goal is to separate all the sections that have async code into individual files, one method you might try would be to count the curly brackets for open, and then closed. To do this, you would set a variable that positively increments for every { and negatively for each } e.g (not optimized/pretty, just explaining).
# Sketch only: `code` (an iterable of source lines) and
# `write_buffer_to_file_here` are placeholders to be filled in.
brackets = 0
buffer = ""
found_async = False
for line_of_code in code:
    # Start collecting at a line that mentions "async".
    if "async" in line_of_code:
        found_async = True
    if found_async:
        # Track nesting depth; this counts at most one brace of each kind
        # per line.
        if "{" in line_of_code:
            brackets += 1
        if "}" in line_of_code:
            brackets -= 1
        buffer += line_of_code
        # Depth is back to zero: the collected section is complete.
        if brackets == 0:
            write_buffer_to_file_here
            buffer = ""
            found_async = False
As a concept, this will probably not work as is, but should give you an idea of what I'm trying to say.

Escape reserved characters in a list by adding backslash in front of it

reserved_chars = "? & | ! { } [ ] ( ) ^ ~ * : \ " ' + -"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']
What is the fastest way to loop through every element in a list and add a \ in front of each reserved character that an element contains?
desired output:
fixed_list = ['gold\-bear#gmail.com', 'P\&G#dom.com', 'JACKSON\! BOT', 'annoying\\name']
You could make a translation table with str.maketrans() and pass that into translate. This takes a little setup, but you can reuse the translation table and it's quite fast:
reserved_chars = '''?&|!{}[]()^~*:\\"'+-'''
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\\name']

# Each reserved character maps to itself preceded by a backslash.
replace = []
for ch in reserved_chars:
    replace.append('\\' + ch)
char_to_escaped = dict(zip(reserved_chars, replace))
trans = str.maketrans(char_to_escaped)

# Apply the (reusable) table to every string.
fixed_list = []
for s in list_vals:
    fixed_list.append(s.translate(trans))
print("\n".join(fixed_list))
Prints:
gold\-bear#gmail.com
P\&G#dom.com
JACKSON\! BOT
annoying\\name
There is no fast way - you have strings, strings are immutable, so you need to create new ones.
Probably best way is to build your own translation dictionary and do the grunt work yourself:
# The backslash is written as "\\" so the literal contains no invalid
# escape sequence; the resulting string is the same. The spaces between
# the characters are part of the string and therefore get escaped too.
reserved = """? & | ! { } [ ] ( ) ^ ~ * : \\ " ' + -"""
# Map every character of `reserved` to itself preceded by a backslash.
tr = {c: f"\\{c}" for c in reserved}
print(tr)
# 'annoying\name' deliberately contains a newline ("\n"), not a backslash.
data = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', 'annoying\name']
transformed = [''.join(tr.get(letter, letter) for letter in word) for word in data]
for word in transformed:
    print(word)
Output:
# translation dictionary
{'?': '\\?', ' ': '\\ ', '&': '\\&', '|': '\\|', '!': '\\!', '{': '\\{',
'}': '\\}', '[': '\\[', ']': '\\]', '(': '\\(', ')': '\\)', '^': '\\^',
'~': '\\~', '*': '\\*', ':': '\\:', '\\': '\\\\', '"': '\\"', "'": "\\'",
'+': '\\+', '-': '\\-'}
# transformed strings
gold\-bear#gmail.com
P\&G#dom.com
JACKSON\!\ BOT
annoying
ame
Sidenotes:
Your example forgot to escape the space inside 'JACKSON\! BOT'.
The repr() of the transformed list looks "wrongly" escaped because printing a list escapes each '\' again - to see what is actually stored, print the words one by one, as above.
Definitely not the fastest, but could be the easiest to code. Make a regex that does it for you, and run re.sub, like this:
import re
reserved_chars = "?&|!{}[]()^~*:\\\"'+-"
# Build a character class from \xNN escapes so that no reserved character
# needs special treatment inside the brackets.
hex_escapes = ['\\x%x' % ord(ch) for ch in reserved_chars]
replace_regex = "([" + ''.join(hex_escapes) + "])"
list_vals = ['gold-bear#gmail.com', 'P&G#dom.com', 'JACKSON! BOT', r'annoying\name']
# Put a backslash in front of every matched character.
escaped_vals = []
for val in list_vals:
    escaped_vals.append(re.sub(replace_regex, r"\\\1", val))
Again, just to clarify, regexes are SLOW.

Extracting data from string with specific format using Python

I am a novice with Python and am currently trying to use it to parse a string in a custom output format. The format contains named lists of floats and lists of tuples of floats. I wrote a function, but it looks excessive. How can it be done in a more Pythonic way?
import re
def extract_line(line):
    """Parse one line of "name = { ... } ;" entries into a dict of lists.

    Leading digits, '#' and spaces are dropped first. The entry named
    'points' is read as "( a , b )" pairs and stored as a list of float
    tuples; every other entry is read as space-separated numbers and
    stored as a list of floats. An entry with no values adds no key.

    Raises IndexError for an entry without "=", AttributeError for a
    point that is not two unsigned decimal numbers, and ValueError for a
    value that float() rejects.
    """
    line = line.lstrip('0123456789# ')
    measurement = {}
    # Entries are separated by ';'; filter() drops the empty trailing piece.
    for elem in filter(None, re.split(r'\s*;\s*', line)):
        elem_list = list(filter(None, re.split(r'\s*=\s*', elem)))
        name = elem_list[0]
        body = elem_list[1].strip(' {}')
        if name == 'points':
            # Splitting on either parenthesis leaves the "a , b" texts.
            for point in filter(None, re.split(r'\s*\(\s*|\s*\)\s*', body)):
                p = re.match(r'\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\s*', point).groups()
                measurement.setdefault('points', []).append(tuple(map(float, p)))
        else:
            for value in filter(None, body.split(' ')):
                measurement.setdefault(name, []).append(float(value))
    return measurement
to_parse = '#10 points = { ( 2.96296 , 0.822213 ) ( 3.7037 , 0.902167 ) } ; L = { 5.20086 } ; P = { 3.14815 3.51852 } ;'
print(extract_line(to_parse))
You can do it using re.findall:
import re
to_parse = '#10 points = { ( 2.96296 , 0.822213 ) ( 3.7037 , 0.902167 ) } ; L = { 5.20086 } ; P = { 3.14815 3.51852 } ;'
# Each match is a (name, text between the braces) pair.
m_list = re.findall(r'(\w+)\s*=\s*{([^}]*)}', to_parse)
measurements = {}
for k, v in m_list:
    if k == 'points':
        # Pairs of numbers separated by a comma become float tuples.
        elts = re.findall(r'([0-9.]+)\s*,\s*([0-9.]+)', v)
        measurements[k] = [tuple(map(float, elt)) for elt in elts]
    else:
        measurements[k] = [float(x) for x in v.split()]
print(measurements)
Feel free to put it in a function and to check if keys don't already exists.
This:
import re
a=re.findall(r' ([\d\.eE-]*) ',to_parse)
map(float, a)
>> [2.96296, 0.822213, 3.7037, 0.902167, 5.20086, 3.14815]
Will give you your list of numbers, is that what you look for?

Categories

Resources