Flatten nested array in Spark DataFrame - python

I'm reading in some JSON of the form:
{"a": [{"b": {"c": 1, "d": 2}}]}
That is, the array items are unnecessarily nested. Now, because this happens inside an array, the answers given in How to flatten a struct in a Spark dataframe? don't apply directly.
This is how the dataframe looks when parsed:
root
|-- a: array
| |-- element: struct
| | |-- b: struct
| | | |-- c: integer
| | | |-- d: integer
I'm looking to transform the dataframe into this:
root
|-- a: array
| |-- element: struct
| | |-- b_c: integer
| | |-- b_d: integer
How do I go about aliasing the columns inside the array to effectively unnest it?

You can use transform:
# transform() rewrites each element x of array column `a`: the nested
# x.b.c / x.b.d fields are lifted into a new struct with flattened
# names b_c / b_d, and the result is re-aliased as `a`.
df2 = df.selectExpr("transform(a, x -> struct(x.b.c as b_c, x.b.d as b_d)) as a")

Using the method presented in the accepted answer I wrote a function to recursively unnest a dataframe (recursing into nested arrays as well):
from pyspark.sql.types import ArrayType, StructType


def flatten(df, sentinel="x"):
    """Recursively flatten every struct column of *df*, descending into
    arrays of structs as well.

    Builds one big SQL expression string out of nested ``transform`` /
    ``struct`` calls and applies it with ``selectExpr``.  Nested field
    names are joined with underscores (``a.b.c`` -> ``a_b_c``).

    :param df: input Spark DataFrame.
    :param sentinel: lambda-variable name used inside ``transform`` calls
        and as the alias of the outermost struct; it must not collide with
        a real top-level column name.
    """

    def _gen_flatten_expr(schema, indent, parents, last, transform=False):
        # Yields (indent_level, text) pairs that together form the body of
        # a struct(...) expression covering every leaf field of *schema*.
        # *last* suppresses the trailing comma on the final field.

        def handle(field, last):
            path = parents + (field.name,)
            # Alias for the flattened column: underscore-joined path.
            # Inside a transform() lambda the sentinel itself is dropped
            # from the alias (path[1:]).
            alias = (
                " as "
                + "_".join(path[1:] if transform else path)
                + ("," if not last else "")
            )
            if isinstance(field.dataType, StructType):
                # Plain struct: recurse with the extended dotted path.
                yield from _gen_flatten_expr(
                    field.dataType, indent, path, last, transform
                )
            elif (
                isinstance(field.dataType, ArrayType) and
                isinstance(field.dataType.elementType, StructType)
            ):
                # Array of structs: wrap the recursion in
                # transform(<path>, <sentinel> -> struct(...)).
                yield indent, "transform("
                yield indent + 1, ".".join(path) + ","
                yield indent + 1, sentinel + " -> struct("
                yield from _gen_flatten_expr(
                    field.dataType.elementType,
                    indent + 2,
                    (sentinel,),
                    True,
                    True
                )
                yield indent + 1, ")"
                yield indent, ")" + alias
            else:
                # Leaf field: plain dotted reference plus its alias.
                yield (indent, ".".join(path) + alias)

        try:
            *fields, last_field = schema.fields
        except ValueError:
            # Empty struct: nothing to emit.
            pass
        else:
            for field in fields:
                yield from handle(field, False)
            yield from handle(last_field, last)

    lines = []
    for indent, line in _gen_flatten_expr(df.schema, 0, (), True):
        spaces = " " * 4 * indent
        lines.append(spaces + line)
    # Wrap everything in one struct aliased to the sentinel, then unpack
    # that struct back into top-level columns.
    expr = "struct(" + "\n".join(lines) + ") as " + sentinel
    return df.selectExpr(expr).select(sentinel + ".*")

Simplified Approach:
from pyspark.sql.functions import col


def flatten_df(nested_df):
    """Flatten all (possibly nested) struct columns of *nested_df*.

    Iterative traversal with an explicit stack of (path-prefix, projection)
    pairs; each struct column is projected with ``.*`` and pushed for
    further inspection.  NOTE(review): only ``struct`` columns are
    descended into — arrays (including arrays of structs) are kept as-is.
    """
    stack = [((), nested_df)]
    columns = []
    while len(stack) > 0:
        parents, df = stack.pop()
        # Non-struct columns: select by dotted path, alias by
        # underscore-joined path so the flat names stay unique.
        flat_cols = [
            col(".".join(parents + (c[0],))).alias("_".join(parents + (c[0],)))
            for c in df.dtypes
            if c[1][:6] != "struct"
        ]
        # Struct columns still needing expansion (dtype string starts
        # with "struct").
        nested_cols = [
            c[0]
            for c in df.dtypes
            if c[1][:6] == "struct"
        ]
        columns.extend(flat_cols)
        for nested_col in nested_cols:
            # Project the struct's fields and queue them for the next pass.
            projected_df = df.select(nested_col + ".*")
            stack.append((parents + (nested_col,), projected_df))
    # All col() expressions are rooted at the original frame, so one final
    # select produces the flattened result.
    return nested_df.select(columns)
ref: https://learn.microsoft.com/en-us/azure/synapse-analytics/how-to-analyze-complex-schema

Related

How to align strings in columns?

I am trying to print out a custom format but am facing an issue.
# Column headers and one data row, unpacked into individual variables.
header = ['string', 'longer string', 'str']
header1, header2, header3 = header
data = ['string', 'str', 'longest string']
data1, data2, data3 = data
# Lengths of the individual cells.
# NOTE(review): these are per-cell lengths, not per-column widths
# (max over the column) — which is why the printed columns below do
# not line up.
len1 = len(header1)
len2 = len(header2)
len3 = len(header3)
len_1 = len(data1)
len_2 = len(data2)
len_3 = len(data3)
# Top rule sized to the sum of all six cell lengths.
un = len1 + len2 + len3 + len_1 + len_2 + len_3
un_c = '_' * un
print(f"{un_c}\n|{header1} |{header2} |{header3}| \n |{data1} |{data2} |{data3}|")
Output:
_____________________________________________
|string |longer string |str|
|string |str |longest string|
The output I want is this:
_______________________________________
|string |longer string |str |
|string |str |longest string|
I want it to work for all lengths of strings using the len to add extra spacing to each string to make it aligned, but I can't figure it out at all.
There is a package called tabulate this is very good for this (https://pypi.org/project/tabulate/). Similar post here.
Each cell is constructed according to the longest content, with additional spaces for any shortfall, printing a | at the beginning of each line, and the rest of the | is constructed using the end parameter of print
The content is placed in a nested list to facilitate looping, other ways of doing this are possible, the principle is the same and adding some content does not affect it
# Render the rows as a pipe-delimited table, padding every cell to the
# widest entry of its column.
items = [
    ['string', 'longer string', 'str'],
    ['string', 'str', 'longest string'],
    ['longer string', 'str', 'longest string'],
]
# Per-column widths: the longest cell in each column position.
length = [max(len(row[col]) for row in items) for col in range(len(items[0]))]
max_length = sum(length)
# Top rule; +4 leaves headroom for the separators.
print("_" * (max_length + 4))
for row in items:
    print("|", end="")
    for col, width in enumerate(length):
        # ljust pads the cell with spaces up to the column width.
        print(row[col].ljust(width), end="|")
    print()
OUTPUT:
____________________________________________
|string |longer string|str |
|string |str |longest string|
|longer string|str |longest string|
Do it in two parts. First, figure out the size of each column. Then, do the printing based on those sizes.
# Sample data: three header rows followed by three data rows.
header = ['string','longer string','str']
data = ['string','str','longest string']
lines = [header] * 3 + [data] * 3

def getsizes(lines):
    """Return per-column widths: longest cell in each column, plus one
    space of padding."""
    widths = [0] * len(lines[0])
    for row in lines:
        for i, cell in enumerate(row):
            if len(cell) + 1 > widths[i]:
                widths[i] = len(cell) + 1
    return widths

def maketable(lines):
    """Print *lines* as a '|'-separated table with a leading rule line."""
    sizes = getsizes(lines)
    total = sum(sizes)
    # One extra '_' per column accounts for each cell's trailing '|'.
    print('_' * (total + len(sizes)))
    for row in lines:
        print('|', end='')
        for width, cell in zip(sizes, row):
            print(cell.ljust(width), end='|')
        print()

maketable(lines)
Output:
_______________________________________
|string |longer string |str |
|string |longer string |str |
|string |longer string |str |
|string |str |longest string |
|string |str |longest string |
|string |str |longest string |
You could change it to build up a single string, if you need that.
It accepts an arbitrary number of rows. It is assumed that each row contains string-valued entries.
def table(*rows, padding=2, sep='|'):
    """Print *rows* as an aligned text table.

    :param rows: any number of equal-length sequences of strings; the
        first row is conventionally the header.
    :param padding: total horizontal padding around each separator.
    :param sep: separator character between columns and at the edges.

    BUG FIX: the original read the module-level ``header`` inside the
    function (``len(header)`` and ``for line in (header, *rows)``), which
    printed the first row twice and broke whenever ``header`` was not
    defined by the caller.  The function now depends only on its
    arguments.
    """
    sep_middle = ' ' * (padding // 2) + sep + ' ' * (padding // 2)
    template = '{{:{}}}'
    # Width of each column = longest cell in that column.
    col_sizes = [max(map(len, col)) for col in zip(*rows)]
    table_template = sep_middle.join(map(template.format, col_sizes))
    # Top rule spans the cells, the inner separators, both outer
    # separators and the outer padding.
    print('_' * (sum(col_sizes) + len(sep_middle) * (len(col_sizes) - 1)
                 + 2 * len(sep) + 2 * (len(sep) * padding // 2)))
    for line in rows:
        print(sep + ' ' * (padding // 2) + table_template.format(*line)
              + ' ' * (padding // 2) + sep)

header = ['string', 'longer string', 'str', '21']
data1 = ['string', 'str', 'longest stringhfykhj', 'null']
data2 = ['this', 'is', 'a', 'test']
# test 1
table(header, data1, data2)
# test 2
table(header, data1, data2, padding=4, sep=':')
Output
# first test
________________________________________________________
| string | longer string | str | 21 |
| string | longer string | str | 21 |
| string | str | longest stringhfykhj | null |
| this | is | a | test |
# second test
________________________________________________________________
: string : longer string : str : 21 :
: string : longer string : str : 21 :
: string : str : longest stringhfykhj : null :
: this : is : a : test :

Parsing a total.txt file by keywords in it

I'm having trouble parsing a file. I have code that parses a file by the word Total: if its value is greater than 20.0 and returns the data. I need to change the search keyword to Tokens eth: with a value greater than 20.0 and output all data between separators ======== and additionally write all sorted values into sort.txt file. I would be grateful for professional help)
Code:
# Accumulate the lines of one "====" -delimited record; print the record
# at the closing separator when its "Total:" value exceeds 20.0.
outlist = []
flag = False

def dump(list_, flag_):
    """Print the accumulated record if it was flagged; always return a
    fresh (empty-list, False) accumulator state.

    BUG FIX: the original returned a value only on the printing path and
    fell off the end (returning None) otherwise, so the tuple unpacking
    at the call site crashed on the file's very first separator line.
    A separator always starts a new record, so reset unconditionally.
    """
    if list_ and flag_:
        print('\n'.join(list_))
    return [], False

with open('total.txt') as file:
    for line in map(str.strip, file):
        if line.startswith('='):
            # Separator: flush (maybe print) and reset the accumulator.
            outlist, flag = dump(outlist, flag)
        else:
            tokens = line.split()
            # Lines like "| Total: 47.62874464666479$" -> 3 tokens;
            # strip the trailing '$' before converting.
            if len(tokens) == 3 and tokens[1] == 'Total:':
                try:
                    flag = float(tokens[2][:-1]) > 20.0
                except ValueError:
                    pass
            outlist.append(line)
# Flush whatever is left after the final line.
dump(outlist, flag)
total.txt
============
| hafuia
| 0xb34a47885262f9d8673dc77de7b583961134f09fb03620b29d282c32ee6932be
| 0xD0b2612a6eE3111114b43b25322C6F08A251D38D
| Total: 47.62874464666479$
|
|
| Tokens eth:
| 20.608732$ MANA
|
| Protocols cro:
| 17.840052$ VVS Finance
| 8.953779$ V3S Finance
============
| asdf
| 0x72e164aa187feaff7cb28a74b7ff800a0dfe916594c70f141069669e9df5a23b
| 0xC7dFe558ed09F0f3b72eBb0A04e9d4e99af0bd0D
| Total: 22.908481672796988$
|
|
| Tokens eth:
| 22.376087$ SOS
============
| asdf
| 0xbce666bca3c862a2ee44651374f95aca677de16b4922c6d5e7d922cc0ac42a3d
| 0x5870923a244f52fF2D119fbf5525421E32EC006e
| Total: 9.077030269778557$
|
|
| Tokens eth:
| 8.942218$ SOS
============
This is how you can parse the file.
def parse_output(filename):
    """Collect the '====='-delimited blocks of *filename* whose 'Total:'
    value exceeds 20, returned as a list of lists of stripped lines.

    NOTE(review): a block is only appended when the *next* separator is
    seen, so the file must end with a separator line for the final block
    to be considered.
    """
    outlist = []
    with open(filename) as file:
        new_block = False
        to_write = False
        lines_arr = []
        for line in map(str.strip, file):
            if line.startswith('======='):
                # new_block is always False here (reset below), so each
                # separator toggles it to True exactly once.
                new_block = not new_block
            if new_block:
                # Separator reached: keep the finished block if flagged,
                # then reset all per-block state.
                if to_write:
                    outlist.append(lines_arr)
                lines_arr = []
                new_block = False
                to_write = False
            else:
                lines_arr.append(line)
                # Lines like "| Total: 47.62...$": last token minus the
                # trailing '$' is the amount.
                if 'Total:' in line:
                    num = float(line.split()[-1][:-1])
                    if num > 20:
                        to_write = True
    return outlist

def write_output(outlist, filename):
    """Append every kept block to *filename*, separating blocks with a
    '=======' line.

    NOTE(review): the file is re-opened for every single line, which is
    correct but wasteful; one enclosing open() would suffice.
    """
    for block in outlist:
        for line in block:
            with open(filename, 'a') as out_file:
                out_file.write(line + '\n')
        with open(filename, 'a') as out_file:
            out_file.write('=======' + '\n')

if __name__ == '__main__':
    write_output(parse_output('total.txt'), 'output.txt')
I missed the sorted wallet thing. For sorting, while appending array to outlist, you can use another array for order, or prepend the number to array, sort the outputs, and skip first element while writing.
This is written in such a way that it's easy to get fe. the addresses as well. sorting done with a simple lambda function.
from pprint import pprint

# Markers of the wallet dump format.
wallet_splitter = "============"
wallet_content_start = "Tokens eth:"
wallet_line_start = "|"

with open("totals.txt") as infile:
    # Split the whole file into one chunk per wallet.
    wallets = infile.read().split(wallet_splitter)
    print(wallets)

wallets_above_20 = []
for wallet in wallets:
    total = 0
    separate = []
    contents = False
    for line in wallet.splitlines():
        if wallet_content_start in line:
            # Start of the "Tokens eth:" section.
            contents = True
        elif contents:
            if "$" in line:
                # Amount is the text before '$', with the leading '|'
                # removed; keep it as a string and also sum it.
                separate.append(line.replace(wallet_line_start, "").split("$")[0])
                total += float(separate[-1])
            else:
                # Blank/non-amount line ends the section.
                contents = False
    # NOTE(review): this appends the wallet once per amount above 20, so
    # a wallet with several large token lines is listed multiple times —
    # verify that is intended.
    for amount in separate:
        if float(amount) > 20:
            wallets_above_20.append({
                "total": total,
                "data": wallet
            })
# Largest totals first.
pprint(sorted(wallets_above_20, key = lambda i: i['total'],reverse=True))
This is another simple extensible approach you can use to achieve what you need. The comments will explain the code.
# Create a simple representational object with data for every record.
class RateObject:
    # You can change the delimiter to whatever you want.
    def __init__(self, text_lines: list, delimiter="Tokens eth:"):
        # Raw lines of one record, separators excluded.
        self.text_lines = text_lines
        # Index of the first line containing the delimiter; raises
        # IndexError if the record has no such line.
        index = [i for i, x in enumerate(text_lines) if delimiter in x][0]
        # Get the value from the delimiter line.
        self.value = self._get_value(index)

    # Override this method to change the way the value is extracted
    # (same line, a different line, etc.).
    def _get_value(self, delimiter_index: int):
        # Case of "Tokens eth:": the amount sits on the following line.
        value = self.text_lines[delimiter_index + 1]
        value = value.strip()
        # Crude numeric scrape: keep only digits and dots.  NOTE(review):
        # this would mangle a line containing more than one number.
        number = "".join([x for x in value if x.isdigit() or x == "."])
        if number:
            return float(number)
        else:
            # Assume 0 for unknown values.
            return 0.0

    def __str__(self):
        # Return the record's lines unchanged.
        return "".join(self.text_lines)

    def __repr__(self):
        return "".join(self.text_lines)

# Read the source file.
with open("src.txt", "r") as src:
    line_texts = src.readlines()
# Indices of the "============" separator lines.
splitters = [index for index, text in enumerate(line_texts) if text == "============\n"]
# One RateObject per slice between consecutive separators.  NOTE(review):
# anything after the final separator is dropped.
raw_objects = [RateObject(lt) for lt in [line_texts[splitters[i]:splitters[i + 1]] for i in range(len(splitters) - 1)]]
# Filter the objects, keeping only the ones with value > 20.
selected_objects = list(filter(lambda x: x.value > 20.0, raw_objects))
# Sort the objects by value, largest first.
sorted_objects = sorted(selected_objects, key=lambda x: x.value, reverse=True)
# Write the sorted objects to a file.
with open("sorted.txt", "w") as dst:
    dst.write("\n".join([str(x) for x in sorted_objects]))
Here's a simple generator-based approach.
def items(file):
    """
    Generator to yield items from filename
    whose "Tokens eth:" is above 20.0
    """
    with open(file) as lines:
        record = []
        amount = 0
        grab_next = False
        for line in lines:
            if line == "============\n":
                # A separator closes the current record: emit it if its
                # token amount qualifies, then start fresh.
                if amount > 20.0:
                    yield amount, record
                record = []
                amount = 0
                continue
            if grab_next:
                # This is the line right after "Tokens eth:"; the amount
                # is its second-to-last token, minus the trailing '$'.
                amount = float(line.strip().split()[-2].rstrip("$"))
                grab_next = False
            if line.startswith("| Tokens eth:"):
                # Remember to parse the amount on the next line.
                grab_next = True
            record.append(line)

def main():
    import sys
    # Re-emit qualifying records sorted by token amount, reinstating the
    # separators that the generator stripped.
    print("============")
    for amount, record in sorted(list(items(sys.argv[1]))):
        print("".join(record), end="")
        print("============")

if __name__ == "__main__":
    main()
For simplicity, I made the generator also perform filtering, though it would be easy to remove items with a lower total on the caller's side if you wanted to make this reusable.
Demo: https://ideone.com/UKuC6C
In fact, I would recommend that you parse this haphazard file format just once, and convert it to a standard format like CSV or JSON for further processing if this is more than a one-off.
Using regular expressions from the re module of the standard library you can, for example, split the text into blocks enclosed by the separator, then find the amount of eth in each block, sort and finally filter them.
# BUG FIX: the snippet used the `re` module without importing it.
import re

# parameters
total_txt = """from question"""
sorted_file_name = 'sort.txt'
THRESHOLD = 20.
as_dicreasing_order = False

# body
# Every "============" separator marks the start of a block.
separators = re.finditer('='*12, total_txt)
separators = list(separators)
# Each block is the slice from one separator up to the next one.
blocks = map(total_txt.__getitem__, [slice(m1.start(), m2.start()) for m1, m2 in zip(separators, separators[1:])])
# Pair each block with the amount found after its "Tokens eth:" line.
amount_block_pairs = [(float(re.search(r'Tokens eth:\n\| (\d*\.\d*)\$', block, re.M).group(1)), block) for block in blocks]
# reverse=False for increasing order, True for the opposite
sorted_blocks = sorted(amount_block_pairs, reverse=as_dicreasing_order)
filtered_blocks = [block for amount, block in sorted_blocks if amount >= THRESHOLD]
with open(sorted_file_name, 'w') as fd:
    fd.write(''.join(filtered_blocks))
One another option is to use python ttp template to parse your data. In the following code, it checks your total values, finds out the value lower than 20.0. Then, the code asks a value to enter which will replace with the Tokens eth: which is lower than 20.
from ttp import ttp
import json

with open('total.txt') as f:
    data_to_parse = f.read()

# Template: capture the Total amount and each token amount; {{ignore}}
# swallows the token symbol.
ttp_template = '''
| Total: {{total}}$
| {{tokens_eth}}$ {{ignore}}
'''

parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# print result in JSON format
results = parser.result(format='json')[0]
#print(results)
#converting str to json.
result = json.loads(results)
# print(result)
for i in result[0]:
    # print(i)
    # For every record whose total is below 20, interactively ask for a
    # replacement "Tokens eth:" value and patch it into the raw text.
    if float(i['total']) < 20:
        new_tokens_eth = float(input(f"Total value is {i['total']} lower than 20. Enter a new 'Tokens eth:' value: "))
        if i['tokens_eth'] in data_to_parse:
            # NOTE(review): plain substring replacement — if the same
            # amount appears in several records, all occurrences change.
            data_to_parse = data_to_parse.replace(i['tokens_eth'], str(new_tokens_eth))
print(data_to_parse)
See the parsed data:
See the output after the code is run.

Translating an EBNF grammar to pyparsing give error

I am making a parser to convert a simple DSL into elasticsearch query. some of the possible queries are:
response:success
response:success AND extension:php OR extension:css
response:sucess AND (extension:php OR extension:css)
time >= 2020-01-09
time >= 2020-01-09 AND response:success OR os:windows
NOT reponse:success
response:success AND NOT os:windows
I have written the following EBNF grammar for this :
<expr> ::= <or>
<or> ::= <and> (" OR " <and>)*
<and> ::= <unary> ((" AND ") <unary>)*
<unary> ::= " NOT " <unary> | <equality>
<equality> ::= (<word> ":" <word>) | <comparison>
<comparison> ::= "(" <expr> ")" | (<word> (" > " | " >= " | " < " | " <= ") <word>)+
<word> ::= ("a" | "b" | "c" | "d" | "e" | "f" | "g"
| "h" | "i" | "j" | "k" | "l" | "m" | "n"
| "o" | "p" | "q" | "r" | "s" | "t" | "u"
| "v" | "w" | "x" | "y" | "z")+
The precedence of operators in the DSL is:
() > NOT > AND > OR
Also, exact matching (i.e. ':') has higher precedence than the comparison operators.
I believe the above grammar capture the idea of my DSL. I am having a difficult time translating it to pyparsing, this is what i have now:
from pyparsing import *

AND = Keyword('AND') | Keyword('and')
OR = Keyword('OR') | Keyword('or')
NOT = Keyword('NOT') | Keyword('not')
word = Word(printables, excludeChars=':')
expr = Forward()
# NOTE(review): at this point the name 'Or' resolves to pyparsing's Or
# *class* (pulled in by the star-import), not the 'Or' parser defined
# below — which is exactly the AttributeError in the traceback.  The
# Forward must be filled in *after* 'Or' is assigned.
expr << Or
Comparison = Literal('(') + expr + Literal(')') + OneOrMore(word + ( Literal('>') | Literal('>=') | Literal('<') | Literal('<=')) + word)
Equality = (word + Literal(':') + word) | Comparison
Unary = Forward()
Unary << (NOT + Unary) | Equality
And = Unary + ZeroOrMore(AND + Unary)
Or = And + ZeroOrMore(OR + And)
The error i get is :
Traceback (most recent call last):
File "qql.py", line 54, in <module>
expr << Or
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pyparsing.py", line 5006, in __lshift__
self.mayIndexError = self.expr.mayIndexError
AttributeError: type object 'Or' has no attribute 'mayIndexError'
I think it's because I am unable to understand Forward() correctly.
Question: how can I correctly translate the above grammar to pyparsing?
**EDIT **: when i changed the pyparsing code to:
AND = Keyword('AND')
OR = Keyword('OR')
NOT = Keyword('NOT')
word = Word(printables, excludeChars=':')
expr = Forward()
Comparison = Literal('(') + expr + Literal(')') + OneOrMore(word + ( Literal('>') | Literal('>=') | Literal('<') | Literal('<=')) + word)
Equality = (word + Literal(':') + word) | Comparison
Unary = Forward()
Unary << ((NOT + Unary) | Equality)
# NOTE(review): as written, And demands *two* Unary operands (with any
# number of bare ANDs between them).  After "OR", the single remaining
# term "extension: css" cannot satisfy a second Unary, so "OR + And"
# fails and ZeroOrMore silently matches nothing — hence the truncated
# parse below.  The intended form is ZeroOrMore(AND + Unary).
And = Unary + ZeroOrMore(AND) + Unary
Or = And + ZeroOrMore(OR + And)
expr << Or
Q = """response : 200 \
AND extesnion: php \
OR extension: css \
"""
print(expr.parseString(Q))
I get this output:
['response', ':', '200', 'AND', 'extesnion', ':', 'php']
why OR expression is not parsed?

pyparsing can only find one instance

I want to parse Windows Resource File with pyparsing, because Menu can have deep nesting structure. It would be very difficult to parse such structure with regular expressions.
Everything works properly, but today I found that my code can only find one instance.
To make me clear, here is the contents of *.rc file(E:\tool\res\my.rc, to save space only show the error prone part):
#include "../include/resource.h"
IDR_MENU_OPTION MENU BEGIN
POPUP "Options"
BEGIN
MENUITEM "List Layers for &All Pages", IDM_SHOW_ALL
MENUITEM "List Layers for &Visible Pages", IDM_SHOW_VISIBLE
MENUITEM SEPARATOR
MENUITEM "&Reset to Initial Visibility", IDM_RESET_INIT
MENUITEM SEPARATOR
MENUITEM "E&xpand All", IDM_EXPAND_ALL
MENUITEM "C&ollapse All", IDM_COLLAPSE_ALL
END
POPUP ""
BEGIN
MENUITEM "List Layers for &All Pages", IDM_LIST_ALL
MENUITEM "List Layers for &Visible Pages", IDM_LIST_VISIBLE
MENUITEM SEPARATOR
MENUITEM "&Reset to Initial Visibility", IDM_RESET_INIT
MENUITEM SEPARATOR
MENUITEM "E&xpand All", IDM_EXPAND_ALL
MENUITEM "C&ollapse All", IDM_COLLAPSE_ALL
MENUITEM SEPARATOR
MENUITEM "Layer &Properties...", IDM_LAYER_PROPERTIES
END END
IDR_MENU_PRPPERTIES MENU BEGIN // the menu block is skiped by pyparsing
POPUP ""
BEGIN
MENUITEM "&Show Layers", IDM_SHOW
MENUITEM "&Properties...", IDM_PROPERTIES
END
MENUITEM "", 65535 END
#endif // not APSTUDIO_INVOKED
my Python code can't find IDR_MENU_PRPPERTIES MENU,
the output now is:
IDM_COLLAPSE_ALL
IDM_EXPAND_ALL
IDM_LAYER_PROPERTIES
IDM_LIST_ALL
IDM_LIST_VISIBLE
IDM_RESET_INIT
IDM_SHOW_ALL
IDM_SHOW_VISIBLE
IDR_MENU_OPTION
but the expected output should be:
IDM_COLLAPSE_ALL
IDM_EXPAND_ALL
IDM_LAYER_PROPERTIES
IDM_LIST_ALL
IDM_LIST_VISIBLE
IDM_RESET_INIT
IDM_SHOW_ALL
IDM_SHOW_VISIBLE
IDR_MENU_OPTION
IDR_MENU_PRPPERTIES
IDM_SHOW
IDM_PROPERTIES
and here is my code:
import re
import os
import codecs
import fnmatch
from bs4 import UnicodeDammit
from pyparsing import restOfLine, cStyleComment, Word, alphanums, alphas, \
    Optional, SkipTo, ZeroOrMore, Group, Keyword, quotedString, delimitedList, \
    nums, commaSeparatedList, Forward, Combine


class RcParser:
    """Extract resource identifiers (dialog, menu, string-table and image
    IDs) from a Windows resource (.rc) file using a pyparsing grammar."""

    def __init__(self, rc_file):
        self.rc_file = rc_file
        handle = open(rc_file, 'rb')
        binary_data = handle.read()
        handle.close()
        # Let UnicodeDammit sniff the file's encoding (RC files are often
        # UTF-16 or a local codepage).
        dammit = UnicodeDammit(binary_data)
        self.rc_src = dammit.unicode_markup
        self.encoding = dammit.original_encoding
        # ID sets populated by parse().
        self.string_table_id = set()
        self.dialog_id = set()
        self.menu_id = set()
        self.img_id = set()
        self.parse(self.rc_src)

    def get_rc_header(self):
        """Return the path of a sibling resource.h next to the .rc file,
        or None.  Assumes a Windows-style ('\\'-separated) path."""
        inx = self.rc_file.rfind('\\')
        path = self.rc_file[: inx + 1]
        file_lists = [path + file for file in os.listdir(path) if file.lower().endswith('resource.h')]
        if not file_lists:
            return None
        return file_lists[0]

    def id_by_parsing_rc(self):
        """Return the union of every ID set collected from the .rc file."""
        rc_id = self.img_id | self.menu_id | self.dialog_id | self.string_table_id
        return rc_id

    def rc_statement(self):
        """ Generate a RC statement parser that can be used to parse a RC file
        :rtype: pyparsing.ParserElement
        """
        one_line_comment = '//' + restOfLine
        comments = cStyleComment ^ one_line_comment
        # Preprocessor directives such as #include / #endif.
        precompiler = Word('#', alphanums) + restOfLine
        language_definition = "LANGUAGE" + Word(alphas + '_').setResultsName(
            "language") + Optional(',' + Word(alphas + '_').setResultsName("sublanguage"))
        # Blocks may be delimited by {} or BEGIN/END.
        block_start = (Keyword('{') | Keyword("BEGIN")).setName("block_start")
        block_end = (Keyword('}') | Keyword("END")).setName("block_end")
        reserved_words = block_start | block_end
        name_id = ~reserved_words + \
            Word(alphas, alphanums + '_').setName("name_id")
        numbers = Word(nums)
        integerconstant = numbers ^ Combine('0x' + numbers)
        constant = Combine(
            Optional(Keyword("NOT")) + (name_id | integerconstant), adjacent=False, joinString=' ')
        # Style flags combined with '|'.
        combined_constants = delimitedList(constant, '|')
        block_options = Optional(SkipTo(
            Keyword("CAPTION"), failOn=block_start)("pre_caption") + Keyword("CAPTION") + quotedString(
            "caption")) + SkipTo(
            block_start)("post_caption")
        # Generic "ID value, value, ..." control line inside a block.
        undefined_control = Group(name_id.setResultsName(
            "id_control") + delimitedList(quotedString ^ constant ^ numbers ^ Group(combined_constants)).setResultsName(
            "values_"))
        block = block_start + \
            ZeroOrMore(undefined_control)("controls") + block_end
        dialog = name_id(
            "block_id") + (Keyword("DIALOGEX") | Keyword("DIALOG"))("block_type") + block_options + block
        string_table = Keyword("STRINGTABLE")(
            "block_type") + block_options + block
        menu_item = Keyword(
            "MENUITEM")("block_type") + (commaSeparatedList("values_") | Keyword("SEPARATOR"))
        popup_block = Forward()
        popup_block <<= Group(Keyword("POPUP")("block_type") + Optional(quotedString("caption")) + block_start +
                              ZeroOrMore(Group(menu_item | popup_block))("elements") + block_end)("popups*")
        # NOTE(review): a menu body here only admits POPUP blocks.  A menu
        # whose top level contains a bare MENUITEM (as in the sample's
        # IDR_MENU_PRPPERTIES) will not match this rule — see the
        # suggested ZeroOrMore(popup_block | menu_item) fix.
        menu = name_id("block_id") + \
            Keyword("MENU")("block_type") + block_options + \
            block_start + ZeroOrMore(popup_block) + block_end
        statem = comments ^ precompiler ^ language_definition ^ dialog ^ string_table ^ menu
        return statem

    def generate_menu_pre_name(self, block_type, block_id):
        """Return the pre-name generated for elements of a menu."""
        return "%s.%s" % (block_type, block_id)

    def generate_popup_pre_name(self, pre_name, caption):
        """Return the pre-name generated for subelements of a popup.
        :param pre_name: The pre_name that already have the popup.
        :param caption: The caption (whitout quotes) of the popup.
        :return: The subelements pre-name based in the pre-name of the popup and
        its caption.
        """
        return "%s.%s" % (pre_name, caption.replace(" ", "_"))

    def add_popup_units(self, pre_name, popup):
        """Transverses the popup tree making new units as needed."""
        for element in popup.elements:
            if element.block_type and element.block_type == "MENUITEM":
                # values_ looks like ['"caption"', 'IDM_XXX']; keep the ID
                # unless it is a numeric placeholder.
                if element.values_ and len(element.values_) >= 2:
                    var = element.values_[1]
                    if not var.isdigit():
                        self.menu_id.add(var)
                # Else it can be a separator.
            elif element.popups:
                # Nested POPUP: recurse with an extended pre-name.
                for sub_popup in element.popups:
                    self.add_popup_units(self.generate_popup_pre_name(pre_name, popup.caption[1:-1]), sub_popup)

    def parse(self, rcsrc):
        """Read the source of a .rc file in and include them as units."""
        # Parse the strings into a structure.
        results = self.rc_statement().searchString(rcsrc)
        for statement in results:
            if not statement.block_type:
                continue
            if statement.block_type in ("DIALOG", "DIALOGEX"):
                helper = statement.block_id[0]
                self.dialog_id.add(statement.block_id[0])
                # Control keywords whose following value carries the ID.
                control_type = [
                    "AUTOCHECKBOX", "AUTORADIOBUTTON", "CAPTION", "CHECKBOX"
                    , "CTEXT", "CONTROL", "DEFPUSHBUTTON", "GROUPBOX"
                    , "LTEXT", "PUSHBUTTON", "RADIOBUTTON", "RTEXT"
                    , "COMBOBOX"
                ]
                for control in statement.controls:
                    fk = (control.id_control[0] in control_type)
                    # If the first value is a quoted caption, the ID is the
                    # second value; otherwise it is the first.
                    flag = (control.values_[0].startswith('"') or control.values_[0].startswith("'"))
                    if control.id_control[0] in control_type:
                        if flag:
                            self.dialog_id.add(control.values_[1])
                        else:
                            self.dialog_id.add(control.values_[0])
                continue
            if statement.block_type in ("MENU"):
                pre_name = self.generate_menu_pre_name(statement.block_type, statement.block_id[0])
                self.menu_id.add(statement.block_id[0])
                for popup in statement.popups:
                    self.add_popup_units(pre_name, popup)
                continue
            if statement.block_type in ("STRINGTABLE"):
                for text in statement.controls:
                    self.string_table_id.add(text.id_control[0])
                continue
        # Image resources (BITMAP/PNG/XML/ICON lines) are matched with a
        # plain regex on each line instead of the pyparsing grammar.
        lines = rcsrc.splitlines()
        for line in lines:
            line = line.rstrip()
            m = re.match(r'(\w+)\s+(\bBITMAP\b|\bPNG\b|\bXML\b|\bICON\b)\s+(\".*\")$', line)
            if not m:
                continue
            self.img_id.add(m.group(1))


def main():
    x = RcParser(r'E:\tool\res\my.rc')
    print('\n'.join(sorted(x.id_by_parsing_rc())))


if __name__ == "__main__":
    main()
Your definition of a menu is:
# Original rule: the menu body only accepts POPUP blocks.
menu = name_id("block_id") + \
    Keyword("MENU")("block_type") + block_options + \
    block_start + ZeroOrMore(popup_block) + block_end
Within your block_start/block_end you only allow popup_blocks. In the menu that does not match, there is a menu_item in the menu that is not part of a popup_block. You may need something like:
# Fixed rule: also accept bare MENUITEMs at the top level of a menu.
menu = name_id("block_id") + \
    Keyword("MENU")("block_type") + block_options + \
    block_start + ZeroOrMore(popup_block | menu_item) + block_end

Sql to Pymongo generation using Pyparsing

I am using the "awesomest" parsing library in the world existing right now. Pyparsing. The problem at hand is to generate a PyMongo dictionary from a given SQL string (For select statements). The grammar def I am using is following :
# SELECT statement grammar: keyword, column list (or '*'), FROM, table
# list, and an optional WHERE clause (default empty string).
sql_stmt = (select_key_word + ('*' | column_list).setResultsName
            ("columns") + form_key_word + table_name_list.setResultsName
            ("collections") +
            Optional(where_condition, "").setResultsName("where"))
Here select_key_word, column_list, etc. are valid grammar definitions, and using this I can parse a string like "Select * from collection_1 where (Sal = 1000 or Sal=5000) AND Car>2".
The problem i have is that, the where part is being parsed is like this :
[[u'where', [u'(', [u'Sal', '=', u'1000'], 'or', [u'Sal', '=', u'5000'], u')'], 'and', [u'Car', '>', u'2']]]
Which is fine if i want it translated into something sqlish. But a valid representation of that same in pymongo would be something like this :
{u'$or': [{u'$and': [{u'Sal': u'1000'}, {u'Sal': u'5000'}]}, {u'Car': {u'$gte': u'2'}}]}
That is where I am stuck. Can anybody give me a direction? it seems to me that setParseAction will be a way to go, but just can't figure that out
the code for the where_contidion is :
# Forward-declared so a parenthesized sub-expression can nest the whole
# where-expression recursively.
where_expr = Forward()
and_keyword = get_conjunction_as_grammar("and")
or_keyword = get_conjunction_as_grammar("or")
in_operation = get_operation_as_grammar("in")
column_value = get_real_number_as_grammar() | get_int_as_grammar() | \
    quotedString
binary_operator = get_bin_op_as_grammar()
col_name = get_column_name_as_grammar()
# One condition: comparison, IN-list, or a parenthesized sub-expression.
where_condn = Group(
    (col_name + binary_operator + column_value) |
    (col_name + in_operation + "(" + delimitedList(column_value) + ")" ) |
    ("(" + where_expr + ")")
)
# Conditions chained by and/or (right-recursive; no precedence between
# and/or at this level).
where_expr << where_condn + ZeroOrMore((and_keyword | or_keyword)
    + where_expr)
where_condition = Group(CaselessLiteral("where") + where_expr)
Thanks in advance. Please let me know if you need any other information.
Yes, parse actions are just the thing for this kind of project. Also, if you are trying to evaluate an expression that can have parenthetical nesting of operations of varying precedence, then operatorPrecedence is often a handy shortcut:
# NOTE(review): this snippet is Python 2 (bare `print` statements below).
from pyparsing import *

and_keyword = CaselessKeyword("and")
or_keyword = CaselessKeyword("or")
in_operation = CaselessKeyword("in")
value = quotedString | Word(alphanums)
comparisonOp = oneOf("= != > < >= <=")
LPAR,RPAR = map(Suppress,"()")
valueList = LPAR + delimitedList(value) + RPAR
comparisonExpr = value + comparisonOp + value | value + in_operation + Group(valueList)

def makePymongoComparison(tokens):
    # Rewrite "<v1> <op> <v2>" into a pymongo-style dict-literal string:
    # '=' maps to direct equality; other operators map to their $-form;
    # 'in' maps to {'$in': [...]}.
    v1,op,v2 = tokens
    if op != 'in':
        if op != '=':
            op = {
                "!=" : "$ne",
                ">" : "$gt",
                "<" : "$lt",
                ">=" : "$gte",
                "<=" : "$lte",
                }[op]
            v2 = "{'%s': '%s'}" % (op, v2)
        return "{'%s': '%s'}" % (v1, v2)
    else:
        return "{'%s': {'$in': [%s]}}" % (v1, ','.join("'%s'"%v for v in v2))

comparisonExpr.setParseAction(makePymongoComparison)

def handleBinaryOp(op):
    # tokens[0] is [operand, keyword, operand, keyword, ...]; taking every
    # second element keeps just the operands.
    def pa(tokens):
        return "{'$%s': %s}" % (op, ', '.join(tokens.asList()[0][::2]))
    return pa

handleAnd = handleBinaryOp("and")
handleOr = handleBinaryOp("or")

whereOperand = comparisonExpr
# operatorPrecedence handles grouping/parentheses; 'and' binds tighter
# than 'or' by its position in the list.
where_expr = operatorPrecedence(whereOperand,
    [
    (and_keyword, 2, opAssoc.LEFT, handleAnd),
    (or_keyword, 2, opAssoc.LEFT, handleOr),
    ])
where_condition = Group(CaselessLiteral("where") + where_expr)

print where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car>2")[0]
print where_expr.parseString("(Sal = 1000 or Sal=5000) AND Car in (1,2,3)")[0]
prints:
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': '{'$gt': '2'}'}}
{'$and': {'$or': {'Sal': '1000'}, {'Sal': '5000'}}, {'Car': {'$in': ['1','2','3']}}}
Still needs a few tweaks, but I hope this gets you further along.

Categories

Resources