Tiny language compiler using Python and regex
Hello Stack Overflow users,
I hope you're having a good day.
I'm writing a tiny language compiler for my homework and tried using regex,
but the output is weird.
First of all, I get an identifier called 't' which is not used in my input.
Also, it doesn't separate the identifier 'x' from the semicolon.
Thanks in advance for your help.
Here is my input
read x; {input an integer }
if 0 < x then { don’t compute if x <= 0 }
fact := 1;
repeat
fact := fact * x;
x := x - 1
until x = 0;
write fact { output factorial of x }
end
And here is my code using regex:
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 04:11:06 2020

@author: PC
"""
import re


class OwnCompiler(object):

    def __init__(self, file):
        self.file = open(file, "r").readlines()
        self.symbols = {
            "+": "PLUS_OP",
            "-": "MINUS_OP",
            "*": "MUL_OP",
            "/": "DIV_OP",
            "=": "EQUAL_OP",
            "<": "LESS_OP",
            ">": "GREATER_OP",
            "(": "LEFT_PARENTHESIS",
            ")": "RIGHT_PARENTHESIS",
            ":=": "ASSIGN",
            ";": "SEMICOLON",
        }
        self.commentPattern = re.compile(r".*({\n*\s*.*\s*})")
        self.reservePattern = re.compile(r"\s*(read|write|if|then|else|end|repeat|until)+\s*(.*)(then)*")
        self.symbolPattern = re.compile(r".*(\+|\*|-|/|=|<|>|\(|\)|;)")
        self.identifierSymbol = re.compile(r".*(\w+)\s+(:=)\s+(.*)")

    def compileOutput(self):
        self.fileWrite = open("output.txt", "w")
        self.fileWrite.write("Type Token\n==================\n")
        for i in self.file:
            print(i)
            self.getComment(i)
            self.getReserveWord(i)
            self.getIdentify(i)
        self.fileWrite.close()  # end

    def getComment(self, text):
        try:
            self.fileWrite.write("COMMENT " + self.commentPattern.match(text).group(1) + "\n")
        except:
            print("NO_COMMENT")

    def getReserveWord(self, text):
        self.Compiled = self.reservePattern.match(text)
        try:
            self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(1) + "\n")
            self.getSymbols(self.Compiled.group(2))
            try:
                self.fileWrite.write("RESERVE_WORD " + self.Compiled.group(3) + "\n")
            except:
                print("NO_RESERVE_WORD2")
        except:
            print("NO_RESERVE_WORD")

    def getSymbols(self, text):
        self.Compiled = self.symbolPattern.match(text)
        self.GOT_TOKEN = self.getTokensSymbols(self.Compiled.group())
        try:
            self.fileWrite.write(self.GOT_TOKEN + " " + self.Compiled.group() + "\n")
        except:
            print("NO_SYMBOLS")

    def getIdentify(self, text):
        self.Compiled = self.identifierSymbol.match(text)
        try:
            self.fileWrite.write("IDENTIFIER " + self.Compiled.group(1) + "\n")
            self.getSymbols(text)
            for i in self.Compiled.group(3):
                if i == " ":
                    continue
                if self.isNumber(i):
                    self.fileWrite.write("NUMBER ")
                else:
                    self.fileWrite.write("WORD ")
                self.fileWrite.write(self.Compiled.group(3) + "\n")
        except:
            print("NO_IDENTIFIRES")

    def getTokensSymbols(self, symbol):
        try:
            return self.symbols[symbol]
        except:
            print("NOT_DEFINED_IN_SYMBOL_DICT")
            return "UNKNOWN"

    def isNumber(self, text):
        try:
            int(text)
            return True
        except:
            return False


if __name__ == "__main__":
    instance = OwnCompiler("input.txt")
    instance.compileOutput()
And here is my output
Type Token
==================
COMMENT { Sample program in TINY language – computes factorial }
COMMENT {input an integer }
RESERVE_WORD read
UNKNOWN x;
COMMENT { don’t compute if x <= 0 }
RESERVE_WORD if
UNKNOWN 0 < x then { don’t compute if x <=
IDENTIFIER t
UNKNOWN fact := 1;
RESERVE_WORD repeat
IDENTIFIER t
UNKNOWN fact := fact * x;
IDENTIFIER x
UNKNOWN x := x -
RESERVE_WORD until
UNKNOWN x = 0;
COMMENT { output factorial of x }
RESERVE_WORD write
RESERVE_WORD end
If you are going to parse a language you need a 'lexer' that returns individual tokens, skipping whitespace and comments. The usual regex approach is one big alternation of named groups: m.lastgroup tells you which token type matched, and the keyword patterns are listed before IDENTIFIER so that reserved words win. Along these lines, just as an example:
import re, collections

class Lexer(object):

    WHITESPACE = r'(?P<WHITESPACE>\s+)'
    COMMENT = r'(?P<COMMENT>{[^}]*})'
    READ = r'(?P<READ>\bread\b)'
    WRITE = r'(?P<WRITE>\bwrite\b)'
    IF = r'(?P<IF>\bif\b)'
    THEN = r'(?P<THEN>\bthen\b)'
    ELSE = r'(?P<ELSE>\belse\b)'
    END = r'(?P<END>\bend\b)'
    REPEAT = r'(?P<REPEAT>\brepeat\b)'
    UNTIL = r'(?P<UNTIL>\buntil\b)'
    OPERATOR = r'(?P<OPERATOR>(?:[+*/=<>-]|:=))'
    LPAREN = r'(?P<LPAREN>\()'
    RPAREN = r'(?P<RPAREN>\))'
    IDENTIFIER = r'(?P<IDENTIFIER>[a-z]+)'
    INTEGER = r'(?P<INTEGER>\d+)'
    SEMICOLON = r'(?P<SEMICOLON>;)'

    regex = re.compile('|'.join([
        WHITESPACE,
        COMMENT,
        READ,
        WRITE,
        IF,
        THEN,
        ELSE,
        END,
        REPEAT,
        UNTIL,
        OPERATOR,
        LPAREN,
        RPAREN,
        IDENTIFIER,
        INTEGER,
        SEMICOLON
    ]))

    def __init__(self, file):
        def generate_tokens(text):
            Token = collections.namedtuple('Token', ['type', 'value'])
            scanner = Lexer.regex.finditer(text)
            last_end = 0
            for m in scanner:
                start = m.start()
                end = m.end()
                if start != last_end:
                    # we skipped over text to find the next token, which implies
                    # there was unrecognizable text: emit an "error token"
                    error_text = text[last_end:start]
                    yield Token('ERROR', error_text)
                last_end = end
                token = Token(m.lastgroup, m.group())
                if token.type != 'WHITESPACE' and token.type != 'COMMENT':
                    yield token
            yield Token('EOF', '<end-of-file>')

        with open(file, "r") as f:
            text = f.read()
        self._token_generator = generate_tokens(text)

    def next_token(self):
        # if you call this past the "EOF" token you will get a StopIteration exception
        return next(self._token_generator)


lexer = Lexer('input.txt')
while True:
    token = lexer.next_token()
    print(token)
    if token.type == 'EOF':
        break
Prints:
Token(type='READ', value='read')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IF', value='if')
Token(type='INTEGER', value='0')
Token(type='OPERATOR', value='<')
Token(type='IDENTIFIER', value='x')
Token(type='THEN', value='then')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='INTEGER', value='1')
Token(type='SEMICOLON', value=';')
Token(type='REPEAT', value='repeat')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='fact')
Token(type='OPERATOR', value='*')
Token(type='IDENTIFIER', value='x')
Token(type='SEMICOLON', value=';')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value=':=')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='-')
Token(type='INTEGER', value='1')
Token(type='UNTIL', value='until')
Token(type='IDENTIFIER', value='x')
Token(type='OPERATOR', value='=')
Token(type='INTEGER', value='0')
Token(type='SEMICOLON', value=';')
Token(type='WRITE', value='write')
Token(type='IDENTIFIER', value='fact')
Token(type='END', value='end')
Token(type='EOF', value='<end-of-file>')
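For what it's worth, the two oddities you asked about both come from the leading greedy .* in your patterns, not from your input. In identifierSymbol, .* swallows as much as it can and leaves (\w+) a single character, which is where the phantom 't' (the last letter of 'fact') comes from; and in getSymbols you look up group() -- the entire match -- in your symbols dict, so 'x;' is looked up instead of ';' and falls through to UNKNOWN. A small standalone demonstration, using your patterns verbatim:

import re

# the greedy .* eats 'fac', so group(1) captures only 't'
m = re.match(r".*(\w+)\s+(:=)\s+(.*)", "fact := 1;")
print(m.group(1))   # -> t

# group() is the whole match, so the identifier sticks to the semicolon;
# group(1) would give the symbol alone
m = re.match(r".*(\+|\*|-|/|=|<|>|\(|\)|;)", "x;")
print(m.group())    # -> x;
print(m.group(1))   # -> ;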
Related
How to replace and insert a new node into an AST tree using esprima for Python?
I am having trouble trying to replace and insert a new node into the AST tree using esprima for Python. There is an example on GitHub, but it replaces all the nodes with the same node that I created; I just want one of them changed while keeping the rest of the tree intact.

from __future__ import print_function
import json
import esprima
from jscodegen_py import jscodegen

# Build a CallExpression expression statement manually:
# callee = esprima.nodes.Identifier("alert")
# args = [esprima.nodes.Literal("other alert", "'other alert'")]
# call = esprima.nodes.CallExpression(callee, args)
# other_alert = esprima.nodes.ExpressionStatement(call)

generator = jscodegen.CodeGenerator(indent=2)

def js2ast(js: str):
    return esprima.parseScript(js)

def ast2js(ast: dict):
    return generator.generate(ast)

# Add a few expression statements using `parse()`:
af = {'Lg': {'RawString': 'var Lg = function(WN5, AN5) {\n return WN5 > AN5;\n };',
             'RawValue': 'WN5 > AN5',
             'operator': '>'}}
accessory_function_expression_statements = {}
for name in af:
    accessory_function_expression_statements[name] = esprima.parse(af[name]['RawValue']).body[0]

class MyVisitor(esprima.NodeVisitor):
    def transform_CallExpression(self, node, metadata):
        # If the callee is an `alert()`, change it to `console.log()`:
        if node.callee.name == 'Lg':
            new_node_arguments = []
            for item in node.arguments:
                new_node_arguments.append(esprima.parse(generator.generate_expression(item.toDict(), 0)).body[0])
            new_node = accessory_function_expression_statements['Lg'].expression
            new_node.left = new_node_arguments[0].expression
            new_node.right = new_node_arguments[1].expression
            print(f'new_node: {new_node}')
            # every time this is called it will walk down the tree from the beginning
            return self.transform_Object(new_node, metadata)

visitor = MyVisitor()
tree = esprima.parse("""
if (Lg(GP5["length"], 5)) {
    var kP5 = window["parseInt"](GP5[5], 10);
    lP5 = window["isNaN"](kP5) || dK(hA(1), kP5) ? window["Number"]["MAX_VALUE"] : kP5, lP5 = kP5;
    var abc = Boolean(Lg(Jj, 21))
}
""", delegate=visitor)
print(ast2js(tree.toDict()))

But using this code it gives this result:

if (Jj > 21) {
    var kP5 = window["parseInt"](GP5[5], 10);
    lP5 = window["isNaN"](kP5) || dK(hA(1), kP5) ? window["Number"]["MAX_VALUE"] : kP5, lP5 = kP5;
    var abc = Boolean(Jj > 21);
}

As you can see, it replaces the condition of every if statement with '(Jj > 21)', but I want it like this:

if (GP5.length > 5) {
    var kP5 = window["parseInt"](GP5[5], 10);
    lP5 = window["isNaN"](kP5) || dK(hA(1), kP5) ? window["Number"]["MAX_VALUE"] : kP5, lP5 = kP5;
    var abc = Boolean(Jj > 21);
}

How can I do this using esprima in Python?
PyParsing: parse if not a keyword
I am trying to parse a file as follows:

testp.txt

title = Test Suite A;
timeout = 10000
exp_delay = 500;
log = TRUE;

sect {
    type = typeA;
    name = "HelloWorld";
    output_log = "c:\test\out.log";
};

sect {
    name = "GoodbyeAll";
    type = typeB;
    comm1_req = 0xDEADBEEF;
    comm1_resp = (int, 1234366);
};

The file first contains a section with parameters and then some sects. I can parse a file containing just parameters, and I can parse a file containing just sects, but I can't parse both.

from pyparsing import *
from pathlib import Path

command_req = Word(alphanums)
command_resp = "(" + delimitedList(Word(alphanums)) + ")"
kW = Word(alphas+'_', alphanums+'_') | command_req | command_resp
keyName = ~Literal("sect") + Word(alphas+'_', alphanums+'_') + FollowedBy("=")
keyValue = dblQuotedString.setParseAction(removeQuotes) | OneOrMore(kW, stopOn=LineEnd())
param = dictOf(keyName, Suppress("=") + keyValue + Optional(Suppress(";")))

node = Group(Literal("sect") + Literal("{") + OneOrMore(param) + Literal("};"))

final = OneOrMore(node) | OneOrMore(param)

param.setDebug()

p = Path(__file__).with_name("testp.txt")
with open(p) as f:
    try:
        x = final.parseFile(f, parseAll=True)
        print(x)
        print("...")
        dx = x.asDict()
        print(dx)
    except ParseException as pe:
        print(pe)

The issue I have is that param matches against sect, so it then expects a =. So I tried putting ~Literal("sect") in keyName, but that just leads to another error:

Exception raised: Found unwanted token, "sect", found '\n' (at char 188), (line:4, col:56)
Expected end of text, found 's' (at char 190), (line:6, col:1)

How do I get it to use one parse method for sect and another (param) if not sect? My final goal would be to have the whole lot in a Dict, with the global params and the sects included.

EDIT

Think I've figured it out. This line...

final = OneOrMore(node) | OneOrMore(param)

...should be:

final = ZeroOrMore(param) + ZeroOrMore(node)

But I wonder if there is a more structured way (as I'd ultimately like a dict)?
Avoiding repetition of if statement
I prepared functions and sorted data for this task (it's actually AoC day 4, but a quick explanation to make it clear). I have already sorted the data into this 'structure':

byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933

[and so on, +250 packages (by package I mean a set of byr, ecl, eyr, ... separated by a new line).]

and prepared this code:

def check_fields(list):
    comparison_list = ['byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid']
    statement = True
    for i in comparison_list:
        statement = statement and (i in list)
    return statement

def check_byr_iyr_eyr(line):
    prefix, value = line.split(':')
    cases = {'byr': {'min': 1920, 'max': 2002},
             'iyr': {'min': 2010, 'max': 2020},
             'eyr': {'min': 2020, 'max': 2030}}
    return cases[prefix]['min'] <= int(value) <= cases[prefix]['max']

def check_hgt(line):
    unit = line[len(line)-2] + line[len(line)-1]
    value = line[line.index(':')+1: -2]
    cases = {'cm': {'min': 150, 'max': 193},
             'in': {'min': 59, 'max': 76}}
    return cases[unit]['min'] <= int(value) <= cases[unit]['max']

def check_hcl(line):
    statement = True
    if line[line.index(':')+1] != '#' or len(line[line.index(':')+2:]) != 6:
        return False
    else:
        string = line[line.index('#')+1:]
        for i in string:
            statement = statement and (97 <= ord(i) <= 102 or 48 <= ord(i) <= 57)
        return statement

def check_ecl(line):
    comparison_list = ['amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth']
    if line[line.index(':')+1:] in comparison_list:
        return True
    return False

def check_pid(line):
    if len(line[line.index(':')+1:]) != 9:
        return False
    try:
        int(line[line.index(':')+1:])
        return True
    except:
        return False

line_list = []
valid_passports = 0
with open('results.txt', 'r') as f:
    for line in f:
        if line != '\n':
            ''' add line to line_list '''
            pass
        else:
            ''' check lines from line_list using the functions declared above;
                if every line is ok:
                    valid_passports += 1 '''

I have to check that every package contains every key except cid, and then check that every value for each key is proper:

byr (Birth Year) - four digits; at least 1920 and at most 2002.
iyr (Issue Year) - four digits; at least 2010 and at most 2020.
eyr (Expiration Year) - four digits; at least 2020 and at most 2030.
hgt (Height) - a number followed by either cm or in: if cm, the number must be at least 150 and at most 193; if in, the number must be at least 59 and at most 76.
hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
pid (Passport ID) - a nine-digit number, including leading zeroes.
cid (Country ID) - ignored, missing or not.

(the above rules are ensured by the functions declared earlier)

And the question/problem is: how can I avoid repetition of the if statement when checking every line added to line_list (this refers to the part with the multi-line comment with "pseudo code")? I mean, I could do it like

if line[0:3] == "byr":
    check_byr(line)
# and so on: many if statements checking the first 3 letters
# to pick the proper function to use

but that doesn't seem like a proper and elegant solution. Maybe you could give me hints on how to deal with that, or another idea for solving the problem in a different way that I didn't use. Please help, thanks.
Can't you have a mapping from prefix to target function? Something like:

line = # ...
prefix = # ... either "hgt" or "pid" or other

def check_hgt(line):
    pass

def check_pid(line):
    pass

# ... other checker functions

checker_functions_pool = {"hgt": check_hgt, "pid": check_pid}

checker_function = checker_functions_pool[prefix]
checker_function(line)
@viGor207, this is another way to approach it (part 2 example):

import re

passports = [
    dict(line.split(':') for line in pas.split())
    for pas in open('input').read().split('\n\n')
]

required = {'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid'}

def valid(pas):
    return bool(
        1920 <= int(pas['byr']) <= 2002
        and 2010 <= int(pas['iyr']) <= 2020 <= int(pas['eyr']) <= 2030
        and re.fullmatch(r'[0-9]{2,3}(cm|in)', pas['hgt'])
        and ((pas['hgt'][-2:] == 'cm' and 150 <= int(pas['hgt'][:-2]) <= 193)
             or (pas['hgt'][-2:] == 'in' and 59 <= int(pas['hgt'][:-2]) <= 79))
        and re.fullmatch(r'#[0-9a-f]{6}', pas['hcl'])
        and pas['ecl'] in {'amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth'}
        and re.fullmatch(r'[0-9]{9}', pas['pid'])
    )

print(sum(
    all(r in pas for r in required) and valid(pas)
    for pas in passports
))
To make it complete, here is part one:

passports = [
    dict(line.split(':') for line in pas.split())
    for pas in open('input').read().split('\n\n')
]

required = {'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid'}

print(sum(
    all(r in pas for r in required)
    for pas in passports
))
The task states:

Each passport is represented as a sequence of key:value pairs separated by spaces or newlines. Passports are separated by blank lines.

"A sequence of key:value pairs" is screaming use a list of dicts in Python, but your method could be used. You could use a dict, field_to_checker, that maps field names to the function that checks that field's line. I took your example parsed inputs as a list of your parsed lines, added a checker for cid that just returns True, and created the following code snippet:

def check_cid(line):
    return True

field_to_checker = {
    'byr': check_byr_iyr_eyr,
    'cid': check_cid,
    'ecl': check_ecl,
    'eyr': check_byr_iyr_eyr,
    'hcl': check_hcl,
    'hgt': check_hgt,
    'iyr': check_byr_iyr_eyr,
    'pid': check_pid,
}

line_data = """byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933""".split('\n')

valid_passports = 0
ok_passport = True  # accumulating over all fields of one passport
for line in line_data + ['\n']:  # add an empty line to force processing the last passport
    line = line.rstrip()
    if line:  # not a blank line
        if ok_passport:  # one False value in a passport will prevail
            key = line[:3]
            ok_passport = (key in field_to_checker
                           and field_to_checker[key](line))
    else:  # blank line, end of passport record
        if ok_passport:
            valid_passports += 1
        ok_passport = True

In the for line in line_data + ['\n'] loop, the count of valid_passports is only updated when a blank line ends a passport record. The last passport needs a blank line after it to be counted properly, hence the extra blank line appended to line_data. The above is untested, but should give you tips on how to extend what you have started with.
I would suggest placing the dictionary values in variables as early as possible to make the logic simpler to write and read. That should allow you to make single-line conditions that are legible:

data = \
"""byr:1991
eyr:2022
hcl:#341e13
iyr:2016
pid:729933757
hgt:167cm
ecl:gry

hcl:231d64
cid:124
ecl:gmt
eyr:2039
hgt:189in
pid:#9c3ea1

ecl:#1f58f9
pid:#758e59
iyr:2022
hcl:z
byr:2016
hgt:68
eyr:1933"""

...

# iterator to get packages
def getPackages(d):
    package = dict()
    for line in d:
        if line:
            field, value = line.split(":", 1)
            package[field] = value
        else:
            yield package.copy()
            package.clear()
    if package:
        yield package

fields = ['byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid']
for package in getPackages(data.split("\n")):
    values = [package.get(f, "") for f in fields]
    byr, iyr, eyr, hgt, hcl, ecl, pid = values
    isValid = "" not in values[:-1] \
        and int(byr) in range(1920, 2001+1) \
        and int(iyr) in range(2010, 2020+1) \
        and int(eyr) in range(2020, 2030+1) \
        and int(hgt[:-2]) in {"cm": range(150, 193+1), "in": range(59, 76+1)}.get(hgt[-2:], []) \
        and hcl.startswith("#") and len(hcl) == 7 \
        and all(97 <= ord(i) <= 102 or 48 <= ord(i) <= 57 for i in hcl[1:]) \
        and ecl in {'amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth'} \
        and (pid == "" or pid.isdigit() and len(pid) == 9)
    print(pid, isValid)

"""
729933757 True
#9c3ea1 False
#758e59 False
"""
Reupload: How do I convert a PHP file to a Python dictionary?
After going over the forums, I did not find something that could solve this issue properly. I want to convert a file written in PHP to a Python dictionary. In this case the file is a converted TrueType font file:

<?php
$type = 'TrueType';
$name = 'Calibri';
$desc = array('Ascent'=>750,'Descent'=>-250,'CapHeight'=>632,'Flags'=>32,'FontBBox'=>'[-503 -313 1240 1026]','ItalicAngle'=>0,'StemV'=>70,'MissingWidth'=>507);
$up = -113;
$ut = 65;
$cw = array(
	chr(0)=>507,chr(1)=>507,chr(2)=>507,chr(3)=>507,chr(4)=>507,chr(5)=>507,chr(6)=>507,chr(7)=>507,chr(8)=>507,chr(9)=>507,chr(10)=>507,chr(11)=>507,chr(12)=>507,chr(13)=>507,chr(14)=>507,chr(15)=>507,chr(16)=>507,chr(17)=>507,chr(18)=>507,chr(19)=>507,chr(20)=>507,chr(21)=>507,
	chr(22)=>507,chr(23)=>507,chr(24)=>507,chr(25)=>507,chr(26)=>507,chr(27)=>507,chr(28)=>507,chr(29)=>507,chr(30)=>507,chr(31)=>507,' '=>226,'!'=>326,'"'=>401,'#'=>498,'$'=>507,'%'=>715,'&'=>682,'\''=>221,'('=>303,')'=>303,'*'=>498,'+'=>498,
	','=>250,'-'=>306,'.'=>252,'/'=>386,'0'=>507,'1'=>507,'2'=>507,'3'=>507,'4'=>507,'5'=>507,'6'=>507,'7'=>507,'8'=>507,'9'=>507,':'=>268,';'=>268,'<'=>498,'='=>498,'>'=>498,'?'=>463,'@'=>894,'A'=>579,
	'B'=>544,'C'=>533,'D'=>615,'E'=>488,'F'=>459,'G'=>631,'H'=>623,'I'=>252,'J'=>319,'K'=>520,'L'=>420,'M'=>855,'N'=>646,'O'=>662,'P'=>517,'Q'=>673,'R'=>543,'S'=>459,'T'=>487,'U'=>642,'V'=>567,'W'=>890,
	'X'=>519,'Y'=>487,'Z'=>468,'['=>307,'\\'=>386,']'=>307,'^'=>498,'_'=>498,'`'=>291,'a'=>479,'b'=>525,'c'=>423,'d'=>525,'e'=>498,'f'=>305,'g'=>471,'h'=>525,'i'=>229,'j'=>239,'k'=>455,'l'=>229,'m'=>799,
	'n'=>525,'o'=>527,'p'=>525,'q'=>525,'r'=>349,'s'=>391,'t'=>335,'u'=>525,'v'=>452,'w'=>715,'x'=>433,'y'=>453,'z'=>395,'{'=>314,'|'=>460,'}'=>314,'~'=>498,chr(127)=>507,chr(128)=>507,chr(129)=>507,chr(130)=>250,chr(131)=>305,
	chr(132)=>418,chr(133)=>690,chr(134)=>498,chr(135)=>498,chr(136)=>395,chr(137)=>1038,chr(138)=>459,chr(139)=>339,chr(140)=>867,chr(141)=>507,chr(142)=>468,chr(143)=>507,chr(144)=>507,chr(145)=>250,chr(146)=>250,chr(147)=>418,chr(148)=>418,chr(149)=>498,chr(150)=>498,chr(151)=>905,chr(152)=>450,chr(153)=>705,
	chr(154)=>391,chr(155)=>339,chr(156)=>850,chr(157)=>507,chr(158)=>395,chr(159)=>487,chr(160)=>226,chr(161)=>326,chr(162)=>498,chr(163)=>507,chr(164)=>498,chr(165)=>507,chr(166)=>498,chr(167)=>498,chr(168)=>393,chr(169)=>834,chr(170)=>402,chr(171)=>512,chr(172)=>498,chr(173)=>306,chr(174)=>507,chr(175)=>394,
	chr(176)=>339,chr(177)=>498,chr(178)=>336,chr(179)=>334,chr(180)=>292,chr(181)=>550,chr(182)=>586,chr(183)=>252,chr(184)=>307,chr(185)=>246,chr(186)=>422,chr(187)=>512,chr(188)=>636,chr(189)=>671,chr(190)=>675,chr(191)=>463,chr(192)=>579,chr(193)=>579,chr(194)=>579,chr(195)=>579,chr(196)=>579,chr(197)=>579,
	chr(198)=>763,chr(199)=>533,chr(200)=>488,chr(201)=>488,chr(202)=>488,chr(203)=>488,chr(204)=>252,chr(205)=>252,chr(206)=>252,chr(207)=>252,chr(208)=>625,chr(209)=>646,chr(210)=>662,chr(211)=>662,chr(212)=>662,chr(213)=>662,chr(214)=>662,chr(215)=>498,chr(216)=>664,chr(217)=>642,chr(218)=>642,chr(219)=>642,
	chr(220)=>642,chr(221)=>487,chr(222)=>517,chr(223)=>527,chr(224)=>479,chr(225)=>479,chr(226)=>479,chr(227)=>479,chr(228)=>479,chr(229)=>479,chr(230)=>773,chr(231)=>423,chr(232)=>498,chr(233)=>498,chr(234)=>498,chr(235)=>498,chr(236)=>229,chr(237)=>229,chr(238)=>229,chr(239)=>229,chr(240)=>525,chr(241)=>525,
	chr(242)=>527,chr(243)=>527,chr(244)=>527,chr(245)=>527,chr(246)=>527,chr(247)=>498,chr(248)=>529,chr(249)=>525,chr(250)=>525,chr(251)=>525,chr(252)=>525,chr(253)=>453,chr(254)=>525,chr(255)=>453);
$enc = 'cp1252';
$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96));
$file = 'calibri.z';
$originalsize = 77252;
$subsetted = true;
?>

to:

font = {"type": "TrueType", "name": "Calibri", "desc": {"Ascent": 750, etc...}, etc......}

I thank all in advance!

P.S. I reuploaded this question (my previous one was closed) to share my solution in case someone else needs it.
The solution I found was just writing the parsing myself:

import re
import regex

def parse_php(fontfile):
    font_dict = {}
    for item in php_chunks(fontfile):
        key, attr = item.split(" = ")
        attr = attr.replace("\t", "").strip()
        attr = re.sub("^(.*);", r"\1", attr)
        # re.split("[,](?!'=>)", data["cw"])
        if re.match("'(.*)'", attr):
            attr = re.sub("'(.*)'", r"\1", attr)
        try:
            attr = eval(attr)
            font_dict[key.replace("$", "").strip()] = attr
        except:
            if "array" in attr:
                if re.match("^array\(", attr):
                    attr_dict = {}
                    attr = re.sub("array\((.*)\)", r"\1", attr)
                    attr = regex.split("(?<!array\(\d*)[,](?!'=>)", attr)
                    for row in attr:
                        dict_key, dict_item = row.strip().split("=>")
                        try:
                            attr_dict[str(eval(dict_key))] = eval(dict_item)
                        except:
                            attr_dict[str(eval(dict_key))] = dict_item
                    font_dict[key.replace("$", "").strip()] = attr_dict
            else:
                font_dict[key.replace("$", "").strip()] = attr
    return font_dict

def php_chunks(raw):
    raw = raw.read()
    chunk = ""
    for idx, line in enumerate(raw.splitlines()):
        if line.startswith("$"):
            if idx != 1:
                yield chunk
                chunk = ""
            chunk = "".join(line)
        else:
            chunk = "".join([chunk, line])
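A quick usage sketch (my own example, not part of the original post; the filename is hypothetical, and parse_php must be given the open file object, since php_chunks calls .read() on it):

# hypothetical input path; pass the open file object, not the path
with open('calibri.php') as f:
    font = parse_php(f)

print(font['name'])            # -> Calibri
print(font['desc']['Ascent'])  # -> 750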
How to split a file into multiple files based on a repeated string?
I have a file and I want to split it into different files based on the string "async". The expected output is a little messy: I use the word "async" as a key to divide the file, but each generated file gets the first line of one function together with the body of the following function. For example, the file is:

'use strict';
const shim = require('fabric-shim');
const util = require('util');

let Chaincode = class {
    async Init(stub) {
        let ret = stub.getFunctionAndParameters();
        console.info(ret);
        console.info('=========== Instantiated Marbles Chaincode ===========');
        return shim.success();
    }

    async Invoke(stub) {
        console.info('Transaction ID: ' + stub.getTxID());
        console.info(util.format('Args: %j', stub.getArgs()));
        let ret = stub.getFunctionAndParameters();
        console.info(ret);
        let method = this[ret.fcn];
        if (!method) {
            console.log('no function of name:' + ret.fcn + ' found');
            throw new Error('Received unknown function ' + ret.fcn + ' invocation');
        }
        try {
            let payload = await method(stub, ret.params, this);
            return shim.success(payload);
        } catch (err) {
            console.log(err);
            return shim.error(err);
        }
    }

    async initMarble(stub, args, thisClass) {
        if (args.length != 4) {
            throw new Error('Incorrect number of arguments. Expecting 4');
        }
        // ==== Input sanitation ====
        console.info('--- start init marble ---')
        if (args[0].lenth <= 0) {
            throw new Error('1st argument must be a non-empty string');
        }
        if (args[1].lenth <= 0) {
            throw new Error('2nd argument must be a non-empty string');
        }
        if (args[2].lenth <= 0) {
            throw new Error('3rd argument must be a non-empty string');
        }
        if (args[3].lenth <= 0) {
            throw new Error('4th argument must be a non-empty string');
        }
        let marbleName = args[0];
        let color = args[1].toLowerCase();
        let owner = args[3].toLowerCase();
        let size = parseInt(args[2]);
        if (typeof size !== 'number') {
            throw new Error('3rd argument must be a numeric string');
        }
        let marbleState = await stub.getState(marbleName);
        if (marbleState.toString()) {
            throw new Error('This marble already exists: ' + marbleName);
        }
        // ==== Create marble object and marshal to JSON ====
        let marble = {};
        marble.docType = 'marble';
        marble.name = marbleName;
        marble.color = color;
        marble.size = size;
        marble.owner = owner;
        await stub.putState(marbleName, Buffer.from(JSON.stringify(marble)));
        let indexName = 'color~name'
        let colorNameIndexKey = await stub.createCompositeKey(indexName, [marble.color, marble.name]);
        console.info(colorNameIndexKey);
        console.info('- end init marble');
    }

I tried this:

import re
import os

filetype = '.js'
result = ''
count = 0
start = 0
name = 'functions'
matchedLine = ''
stringToMatch = 'async'

with open('myjson.js', 'r') as f:
    for x in f.read().split("\n"):
        if stringToMatch in x:
            if start == 1:
                with open(name + str(count) + '.js', 'w') as opf:
                    matchedLine = x
                    opf.write(matchedLine + '\n' + result)
                    opf.close()
                    result = ''
                print(count)
                count += 1
                matchedLine = ''
            else:
                start = 1
        else:
            if result == '':
                result = x
            else:
                result = result + '\n' + x

but the output is a little bit messy.

function0.js:

async Invoke(stub) {
'use strict';
const shim = require('fabric-shim');
const util = require('util');

let Chaincode = class {
        let ret = stub.getFunctionAndParameters();
        console.info(ret);
        console.info('=========== Instantiated Marbles Chaincode ===========');
        return shim.success();
    }

function1.js:

async initMarble(stub, args, thisClass) {
        console.info('Transaction ID: ' + stub.getTxID());
        console.info(util.format('Args: %j', stub.getArgs()));
        let ret = stub.getFunctionAndParameters();
        console.info(ret);
        let method = this[ret.fcn];
        if (!method) {
            console.log('no function of name:' + ret.fcn + ' found');
            throw new Error('Received unknown function ' + ret.fcn + ' invocation');
        }
        try {
            let payload = await method(stub, ret.params, this);
            return shim.success(payload);
        } catch (err) {
            console.log(err);
            return shim.error(err);
        }
    }
There must be many ways to do this. Here is one:

import re

class Writer:
    def __init__(self):
        self._num = 0
        self._fh = None

    def close(self):
        if self._fh:
            self._fh.close()

    def start_file(self):
        self.close()
        self._fh = open("file{}.js".format(self._num), "w")
        self._num += 1

    def write(self, data):
        if self._fh:
            self._fh.write(data)

writer = Writer()
with open('myjson.js') as f:
    for line in f:
        if re.match(' *async ', line):
            writer.start_file()
        writer.write(line)
writer.close()
If your goal is to separate all the sections that have async code into individual files, one method you might try would be to count the curly brackets as they open and close. To do this, you would keep a counter that increments for every { and decrements for every }, e.g. (not optimized/pretty, just explaining):

brackets = 0
buffer = ""
found_async = False

for line_of_code in code:
    if "async" in line_of_code:
        found_async = True
    if "{" in line_of_code:
        brackets += 1
    if "}" in line_of_code:
        brackets -= 1
    buffer += line_of_code
    if brackets == 0:
        write_buffer_to_file_here
        buffer = ""

As a concept this will probably not work as is, but it should give you an idea of what I'm trying to say.
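Building on that idea, here is a minimal runnable sketch (my own code, not from the answer above; the output names part0.js, part1.js, ... are assumptions, and it assumes each async function's opening { sits on the async line and that braces inside strings or comments don't unbalance the count):

# sketch of the bracket-counting split; file names and the rule
# "start collecting at a line containing 'async'" are assumptions
def split_async_functions(path):
    count = 0          # output file index
    buffer = []        # lines of the async function being collected
    depth = 0          # current { } nesting depth within that function
    collecting = False
    with open(path) as f:
        for line in f:
            if not collecting and 'async' in line:
                collecting = True          # an async function starts here
            if collecting:
                buffer.append(line)
                depth += line.count('{') - line.count('}')
                if depth == 0:             # braces balanced: function complete
                    with open('part{}.js'.format(count), 'w') as out:
                        out.writelines(buffer)
                    count += 1
                    buffer = []
                    collecting = False

split_async_functions('myjson.js')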