diff --git a/examples/abnf/url_parser.py b/examples/abnf/url_parser.py new file mode 100644 index 00000000..b2355834 --- /dev/null +++ b/examples/abnf/url_parser.py @@ -0,0 +1,132 @@ +""" +Creating URL Parser from ABNF grammar in internet standards (RFC3986) +================================================================== + +Usage: + python3 -m examples.abnf.url_parser https://github.com/lark-parser/lark#readme + python3 -m examples.abnf.url_parser http://localhost:8000/search?q=lark%2dparser?user=me + +It outputs parse tree for an URI passed as first argument. + +""" +import sys + +from lark import Lark, Transformer, v_args, Token, Visitor, Tree +from lark.load_grammar import FromPackageLoader + +grammar_in_abnf =""" + +%import rfc3986 ; import from examples/grammars/rfc3986.abnf using custom loader +%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf + +; Terminals need to be specified via %terminal directive to control +; automatic parse-tree construction by lark. 
+%terminal ALPHA, DIGIT +%terminal HEXDIG +%terminal unreserved +""" + + +class SimplifyABNFTree_Visitor(Visitor): + def __init__(self, unwrap_children=(), keep=(), *args, **kwargs): + super(SimplifyABNFTree_Visitor, self).__init__(*args, **kwargs) + self.unwrap = unwrap_children + self.keep = keep + + def visit(self, tree: Tree) -> Tree: + # override self.visit(), since _unwrap_and_flatten() assumes top-down visitor + self.visit_topdown(tree) + + def _unwrap_and_flatten(self, tree, unwrap_recursive=False): + """ a generator to flatten tree into list or tuple """ + do_unwrap = True if tree.data in self.unwrap or unwrap_recursive else False + + for x in tree.children: + if isinstance(x, Tree) and do_unwrap: + if x.data in self.keep: + yield self._concat_tokens(x, unwrap_recursive=True) + else: + for item in list(self._unwrap_and_flatten(x, unwrap_recursive=True)): + yield item + elif isinstance(x, Token): + yield x + else: + yield x + + + def _concat_tokens(self, tree, unwrap_recursive=False): + """ concatenate multiple tokens in tree.children into single token. + leave it as it is if there is a tree in tree.children. 
+ """ + items = [None] + words = [] + children = list(self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive)) + + for x in children: + if isinstance(x, Token): + words.append(x.value) + if not isinstance(items[-1], Token): + items.append(x) + else: + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + items.append(x) + words=[] + + if len(words) > 1: + items[-1] = items[-1].update(value=''.join(words)) + + tree.children = items[1:] + return tree; + + def __default__(self, tree): + return self._concat_tokens(tree) + + +class pct_encoded_conv(Transformer): + def pct_encoded(self, items): # alias for pct-encoded + # items = "%" HEXDIG HEXDIG + + # extract hexadecimal digits, convert it to a character, + # then return modified token + char_in_hex = ''.join(items[1:]) + char_ = bytearray.fromhex(char_in_hex).decode() + token = items[0].update(value=char_) + return token + +def main(): + url = sys.argv[1] + + custom_loader = FromPackageLoader('examples', ('grammars', )) + url_parser = Lark(grammar_in_abnf, + # using ABNF grammar + syntax='abnf', + start='URI', + # use earley parser since RFC3986 is too complex for LALR. + parser='earley', + # often needed to set keep_all_tokens=True when ABNF grammar is used. + keep_all_tokens=True, + import_paths=[custom_loader], + ) + tree = url_parser.parse(url) + + # Convert pct-encoded (e.g. '%2D' in given URL) to ascii characters + transformer=pct_encoded_conv() + tree = transformer.transform(tree) + + + # We need some post-processing to unwrap unwanted tree node and concatenate ABNF tokens + # to construct a token that we actually want since many ABNF grammar + # in RFCs split every input into too small units like a single character. 
+ + unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name', + 'segment', 'query', 'fragment', + 'path_abempty', 'path_absolute', 'path_noscheme', 'path_rootless') + simplifier = SimplifyABNFTree_Visitor(unwrap_children=unwrap) + simplifier.visit(tree) + + print(tree.pretty()) + + +if __name__ == '__main__': + main() diff --git a/examples/grammars/rfc3986.abnf b/examples/grammars/rfc3986.abnf new file mode 100644 index 00000000..d127daf7 --- /dev/null +++ b/examples/grammars/rfc3986.abnf @@ -0,0 +1,87 @@ +; ABNF grammar from RFC3986 +; Uniform Resource Identifier (URI): Generic Syntax +; +; some terminals (e.g. DIGIT, ALPHA, ..) is defined in ABNF core rules in RFC5234. +; + +URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + +hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty + +URI-reference = URI / relative-ref + +absolute-URI = scheme ":" hier-part [ "?" query ] + +relative-ref = relative-part [ "?" query ] [ "#" fragment ] + +relative-part = "//" authority path-abempty + / path-absolute + / path-noscheme + / path-empty + +scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + +authority = [ userinfo "@" ] host [ ":" port ] +userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +host = IP-literal / IPv4address / reg-name +port = *DIGIT + +IP-literal = "[" ( IPv6address / IPvFuture ) "]" +IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + +IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + +h16 = 1*4HEXDIG +ls32 = ( h16 ":" h16 ) / IPv4address +IPv4address = dec-octet "." dec-octet "." dec-octet "." 
dec-octet + +dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + +reg-name = *( unreserved / pct-encoded / sub-delims ) + +path = path-abempty ; begins with "/" or is empty + / path-absolute ; begins with "/" but not "//" + / path-noscheme ; begins with a non-colon segment + / path-rootless ; begins with a segment + / path-empty ; zero characters + +path-abempty = *( "/" segment ) +path-absolute = "/" [ segment-nz *( "/" segment ) ] +path-noscheme = segment-nz-nc *( "/" segment ) +path-rootless = segment-nz *( "/" segment ) +path-empty = 0 + + +segment = *pchar +segment-nz = 1*pchar +segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) + ; non-zero-length segment without any colon ":" + +pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + +query = *( pchar / "/" / "?" ) +fragment = *( pchar / "/" / "?" ) + +pct-encoded = "%" HEXDIG HEXDIG + +unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +reserved = gen-delims / sub-delims +gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +sub-delims = "!" 
/ "$" / "&" / "'" / "(" / ")" + / "*" / "+" / "," / ";" / "=" + diff --git a/lark/grammars/abnf.lark b/lark/grammars/abnf.lark new file mode 100644 index 00000000..73e8149a --- /dev/null +++ b/lark/grammars/abnf.lark @@ -0,0 +1,84 @@ +// +// Lark's EBNF grammar to parse ABNF grammar (RFC5234) +// + + +_LPAR: "(" +_RPAR: ")" +_LBRA: "[" +_RBRA: "]" +_STAR: "*" +_SLASH: "/" +EQ: "=" +EQ_ALT: "=/" +_IGNORE_CASE: "%i" +_CASE_SENSITIVE: "%s" + +RULE: /[a-zA-Z][a-zA-Z0-9\-]*/ + +QSTRING: /"[ !#$%&\'\(\)\*\+,\-\.\/0-9:;<=>\?@A-Z\[\\\]\^_`a-z\{|\}~]*"/ +PROSE_VAL: /<[ !"#$%&\'\(\)\*\+,\-\.\/0-9:;<=\?@A-Z\[\\\]\^_`a-z\{|\}~]*>/ + +NUMBER: /[0-9]+/ +DEC_VAL: /%d([0-9]+(\.[0-9]+)+|[0-9]+\-[0-9]+|[0-9]+)/ +HEX_VAL: /%x([0-9A-F]+(\.[0-9A-F]+)+|[0-9A-F]+\-[0-9A-F]+|[0-9A-F]+)/ +BIN_VAL: /%b([01]+(\.[01]+)+|[01]+\-[01]+|[01]+)/ + +_C_NL: /(;[^\n]*)*\r?\n/ +_C_WSP: /((;[^\n]*)*\r?\n)?[ \t]+/ + +// terminals for nonstandard extensions +_IMPORT: "%import" +_DOT: "." +_COMMA: "," + + +start: _rulelist +_rulelist: (rule | abnf_import | terminal_def | (_C_WSP* _C_NL))+ + +rule: RULE _defined_as _elements _C_NL + +_defined_as: _C_WSP* (EQ|EQ_ALT) _C_WSP* +_elements: alternation _C_WSP* +alternation: concatenation (_C_WSP* _SLASH _C_WSP* concatenation)* +concatenation: repetition (_C_WSP+ repetition)* +repetition: repeat? _element + +// repeat = 1*DIGIT / (*DIGIT "*" *DIGIT) +repeat: (repeat_min _STAR repeat_max)|(repeat_min _STAR)|(_STAR repeat_max)|_STAR|repeat_n +repeat_n: NUMBER +repeat_min: NUMBER +repeat_max: NUMBER + +_element: rule_ref|_group|option|char_val|num_val|prose_val +rule_ref: RULE +// 'group' is inlined intentionally. +_group: _LPAR _C_WSP* alternation _C_WSP* _RPAR +option: _LBRA _C_WSP* alternation _C_WSP* _RBRA + +char_val: case_insensitive_string|case_sensitive_string +case_insensitive_string: _IGNORE_CASE? 
QSTRING +case_sensitive_string: _CASE_SENSITIVE QSTRING + +num_val: dec_val|bin_val|hex_val +dec_val: DEC_VAL +hex_val: HEX_VAL +bin_val: BIN_VAL + +prose_val: PROSE_VAL + +// nonstandard extensions to ABNF grammar +// (%import) +abnf_import: _import1 +_import1: _IMPORT _C_WSP+ _import_path _C_WSP* name_list? _C_WSP* _C_NL +_import_path: import_from_lib|import_relpath +import_from_lib: _import_args +import_relpath: _DOT _import_args +_import_args: PATHNAME (_DOT PATHNAME)* +name_list: _LPAR _C_WSP* RULE (_C_WSP* _COMMA _C_WSP* RULE)* _C_WSP* _RPAR + +PATHNAME: /[!#$%&\'\+,\-0-9;=@A-Z\[\]\^_a-z`\{\}~]+/ + +// (%terminal) +terminal_def: _TERMINAL _C_WSP+ RULE (_C_WSP* _COMMA _C_WSP* RULE)* +_TERMINAL: "%terminal" diff --git a/lark/grammars/core-rules.abnf b/lark/grammars/core-rules.abnf new file mode 100644 index 00000000..7ccf69f5 --- /dev/null +++ b/lark/grammars/core-rules.abnf @@ -0,0 +1,39 @@ +; ABNF Core Rules (RFC5234 Appendix.B) + +ALPHA = %x41-5A / %x61-7A ; A-Z / a-z +BIT = "0" / "1" +CHAR = %x01-7F + ; any 7-bit US-ASCII character, + ; excluding NUL +CR = %x0D + ; carriage return +CRLF = CR LF + ; Internet standard newline +CTL = %x00-1F / %x7F + ; controls +DIGIT = %x30-39 + ; 0-9 +DQUOTE = %x22 + ; " (Double Quote) +HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F" +HTAB = %x09 + ; horizontal tab +LF = %x0A + ; linefeed +LWSP = *(WSP / CRLF WSP) + ; Use of this linear-white-space rule + ; permits lines containing only white + ; space that are no longer legal in + ; mail headers and have caused + ; interoperability problems in other + ; contexts. + ; Do not use when defining mail + ; headers and use with caution in + ; other contexts. 
+OCTET = %x00-FF + ; 8 bits of data +SP = %x20 +VCHAR = %x21-7E + ; visible (printing) characters +WSP = SP / HTAB + ; white space diff --git a/lark/lark.py b/lark/lark.py index 6c6560f1..5e72caa0 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -98,6 +98,11 @@ class LarkOptions(Serialize): Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``) tree_class Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``. + syntax + Syntax for grammar specification. + + - "lark" (default): Lark's EBNF based syntax + - "abnf" : ABNF syntax, described in RFC5234. Various extentions in Lark's EBNF syntax are not supported. **=== Algorithm Options ===** @@ -169,6 +174,7 @@ class LarkOptions(Serialize): 'use_bytes': False, 'import_paths': [], 'source_path': None, + 'syntax': 'lark', } def __init__(self, options_dict): @@ -328,7 +334,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None: # Parse the grammar file and compose the grammars - self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens, self.options.syntax) else: assert isinstance(grammar, Grammar) self.grammar = grammar diff --git a/lark/load_grammar.py b/lark/load_grammar.py index cba27987..3d1a76a8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -2,6 +2,7 @@ import hashlib import os.path import sys +from importlib import import_module from collections import namedtuple from copy import copy, deepcopy import pkgutil @@ -18,6 +19,7 @@ from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .utils import classify, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput +from .exceptions import 
ConfigurationError from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -1056,20 +1058,64 @@ def __init__(self, is_term, tree, params=(), options=None): self.params = tuple(params) self.options = options +class GrammarLoaderBase: + used_files: Dict[str, str] + import_paths: List[Union[str, Callable]] + + def __init__(self, + import_paths: Optional[List[Union[str, Callable]]]=None, + used_files: Optional[Dict[str, str]]=None) -> None: + self.import_paths = import_paths or [] + self.used_files = used_files or {} + + def read_grammar_from_file(self, base_path: Optional[str], grammar_path: str): + to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: + if callable(source): + joined_path, text = source(base_path, grammar_path) + else: + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + except IOError: + continue + else: + h = hashlib.md5(text.encode('utf8')).hexdigest() + if self.used_files.get(joined_path, h) != h: + raise RuntimeError("Grammar file was changed during importing") + self.used_files[joined_path] = h + return joined_path, text + else: + # Search failed. Make Python throw a nice error. 
+ open(grammar_path, encoding='utf8') + assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (grammar_path,) + + +class LarkGrammarLoader(GrammarLoaderBase): + """ default grammar loader for Lark's EBNF grammar """ + GRAMMAR_FILE_EXT = '.lark' + + def parse_grammar(self, gramma_text, grammar_name): + return _parse_grammar(gramma_text, grammar_name) + + class GrammarBuilder: global_keep_all_tokens: bool - import_paths: List[Union[str, Callable]] - used_files: Dict[str, str] def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None: + + self.loader = LarkGrammarLoader(import_paths, used_files) self.global_keep_all_tokens = global_keep_all_tokens - self.import_paths = import_paths or [] - self.used_files = used_files or {} self._definitions = {} self._ignore_names = [] + @property + def used_files(self): + return self.loader.used_files + def _grammar_error(self, is_term, msg, *names): args = {} for i, name in enumerate(names, start=1): @@ -1207,8 +1253,24 @@ def _unpack_definition(self, tree, mangle): return name, is_term, exp, params, opts + def set_syntax(self, syntax: Union[str, object]): + if isinstance(syntax, str): + if syntax == 'lark': + self.loader = LarkGrammarLoader(self.loader.import_paths, self.loader.used_files) + else: + # load plugin from lark/syntax/ to support alternative grammars + try: + module_name = 'lark.syntax.%s' % syntax + loader = import_module(module_name, '.') + except Exception as e: + raise ConfigurationError("invalid syntax option: %s" % syntax) + + self.loader = loader.get_grammar_loader(self.loader.import_paths, self.loader.used_files) + else: + self.loader = syntax + def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: - tree = _parse_grammar(grammar_text, grammar_name) + tree = 
self.loader.parse_grammar(grammar_text, grammar_name) imports = {} for stmt in tree.children: @@ -1277,36 +1339,17 @@ def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], alia assert dotted_path mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle) grammar_path = os.path.join(*dotted_path) + EXT - to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] - for source in to_try: - try: - if callable(source): - joined_path, text = source(base_path, grammar_path) - else: - joined_path = os.path.join(source, grammar_path) - with open(joined_path, encoding='utf8') as f: - text = f.read() - except IOError: - continue - else: - h = hashlib.md5(text.encode('utf8')).hexdigest() - if self.used_files.get(joined_path, h) != h: - raise RuntimeError("Grammar file was changed during importing") - self.used_files[joined_path] = h - - gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) - gb.load_grammar(text, joined_path, mangle) - gb._remove_unused(map(mangle, aliases)) - for name in gb._definitions: - if name in self._definitions: - raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) - - self._definitions.update(**gb._definitions) - break - else: - # Search failed. Make Python throw a nice error. - open(grammar_path, encoding='utf8') - assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) + + joined_path, text = self.loader.read_grammar_from_file(base_path, grammar_path) + + gb = GrammarBuilder(self.global_keep_all_tokens, self.loader.import_paths, self.loader.used_files) + gb.load_grammar(text, joined_path, mangle) + gb._remove_unused(map(mangle, aliases)) + for name in gb._definitions: + if name in self._definitions: + raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." 
% (name, grammar_path)) + + self._definitions.update(**gb._definitions) def validate(self) -> None: @@ -1380,7 +1423,8 @@ def list_grammar_imports(grammar, import_paths=[]): builder.load_grammar(grammar, '') return list(builder.used_files.keys()) -def load_grammar(grammar, source, import_paths, global_keep_all_tokens): +def load_grammar(grammar, source, import_paths, global_keep_all_tokens, syntax='lark'): builder = GrammarBuilder(global_keep_all_tokens, import_paths) + builder.set_syntax(syntax) builder.load_grammar(grammar, source) return builder.build(), builder.used_files diff --git a/lark/syntax/abnf.py b/lark/syntax/abnf.py new file mode 100644 index 00000000..f1c666d9 --- /dev/null +++ b/lark/syntax/abnf.py @@ -0,0 +1,401 @@ +""" + Grammar loader plugin for ABNF grammar (RFC5234 and RFC7405). + + It parses ABNF grammar and creates grammar objects as default grammar loader does. +""" + +import sys, os +import hashlib + +from lark import Lark, Transformer, Visitor, Tree, Token +from lark import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput +from typing import List, Tuple, Union, Callable, Dict, Optional +from lark.load_grammar import stdlib_loader, GrammarLoaderBase, PackageResource +from lark.grammar import Terminal, NonTerminal + +#inline_args = v_args(inline=True) +ABNF_EXT = '.abnf' + +ABNF_GRAMMAR_ERRORS = [ + ('Unexpected line endings', ['a = \n', 'a = ( \n']), + ('Unclosed parenthesis', ['a = ( x \n']), + ('Unclosed bracket', ['a = [ x \n']), + ('Unmatched closing parenthesis', ['a = x )\n', 'a = x ]\n']), + ('Incorrect type of value', ['a = 1\n']), + ('Unexpected character (missing "=" or "=/" after rule name, or unusable character in rule name)', + ['a\n', 'a A\n', 'a /= A\n', 'a == A\n', 'a@rule = x']), +] + +def _translate_parser_exception(parse, e): + error = e.match_examples(parse, ABNF_GRAMMAR_ERRORS, use_accepts=True) + return error + +class ABNFToLarkTransformer(Transformer): + """ convert parse-tree of ABNF 
grammar into Lark's EBNF parse-tree. """ + def __init__(self, terminals=(), *args, **kwargs): + super(ABNFToLarkTransformer, self).__init__(*args, **kwargs) + self._terminals = terminals + + def char_val(self, items): + char_val = items[0] + literal = char_val.children[0] + text = literal[1:-1] # remove double quotes + if char_val.data == 'case_insensitive_string': + flags = 'i' + else: + flags = '' + + token = literal.update(type_='STRING', value='"{}"{}'.format(text, flags)) + return Tree('value', [ Tree('literal', [token]) ]) + + def _char_to_pattern(self, num_val_literal, base): + char = int(num_val_literal, base=base) + if char > 0xffffffff: + raise GrammarError("Terminal value characters larger than 0xffffffff is not supported.") + elif char > 0xffff: + regexp = r'\U{:08x}'.format(char) + elif char > 0xff: + regexp = r'\u{:04x}'.format(char) + else: + regexp = r'\x{:02x}'.format(char) + return regexp + + def _value_range_to_pattern(self, num_val, base=10): + #num_val = tree.children[0] + literal = num_val.value[2:] + if literal.find('.') > 0: + # '.' 
concatenation of values + nums = ( self._char_to_pattern(num, base) for num in literal.split('.') ) + regexp = ''.join(nums) + + elif literal.find('-') > 0: + # '-' value range + start, end = ( self._char_to_pattern(num, base) for num in literal.split('-') ) + regexp = r'[%s-%s]' % (start, end) + else: + regexp = self._char_to_pattern(literal, base) + + token = num_val.update(type_='REGEXP', value='/{}/'.format(regexp)) + #tree.children = [token] + #return tree + return Tree('literal', [token]) + + def hex_val(self, items): + return self._value_range_to_pattern(items[0], base=16) + def dec_val(self, items): + return self._value_range_to_pattern(items[0], base=10) + def bin_val(self, items): + return self._value_range_to_pattern(items[0], base=2) + def num_val(self, items): + return Tree('value', items) + + def concatenation(self, items): + # rename 'concatenation' in ABNF to 'expansion' in EBNF + return Tree('expansion', items) + + def alternation(self, items): + # rename 'alternation' in ABNF to 'expansions' in EBNF + return Tree('expansions', items) + + def option(self, items): + # rename to 'expr' and add '?' 
+ items.append(Token('OP', '?')) + return Tree('expr', items) + + def rule_ref(self, items): + # replace hyphens in rule name with underscores + rulename = items[0].replace('-', '_') + if rulename in self._terminals: + return Tree('value', [Terminal(rulename)]) + else: + return Tree('value', [NonTerminal(rulename)]) + + def rule(self, items): + # remove '=' or '=/' + assert items[1].type in ('EQ_ALT', 'EQ') + items.pop(1) + + # replace hyphens in rule name with underscores + if items[0].find('-') > 0: + items[0] = items[0].update(value=items[0].replace('-', '_')) + + rulename = items[0].value + if rulename in self._terminals: + # rename 'rule' to 'term' + return Tree('term', items) + else: + # insert empty 'template params', 'priority', and rule modifiers + items[1:1] = [Tree('template_params', []), Tree('priority', [])] + items.insert(0, Tree('rule_modifiers', [])) + return Tree('rule', items) + + def repetition(self, items): + """ rewrite repetition in Lark's EBNF form """ + assert len(items) > 0 + if len(items) == 1: + # no repetition + return items[0] + + repeat, element = items + + rmin = [ x for x in repeat.find_data('repeat_min') ] + rmax = [ x for x in repeat.find_data('repeat_max') ] + rnum = [ x for x in repeat.find_data('repeat_n') ] + + rmin = int(rmin[0].children[0].value) if len(rmin) else 0 + rmax = int(rmax[0].children[0].value) if len(rmax) else None + rnum = int(rnum[0].children[0].value) if len(rnum) else None + + if rnum is not None: + # Specific Repetition 'nRule' + if rnum == 0: + # generate empty rule + return Tree('expansion', [] ) + else: + return Tree('expr', [ element, Token('TILDE', '~'), Token('NUMBER', str(rnum))]) + + # Variable Repetition '*Rule', where and are optional + if rmax is None: + # '*Rule' or '*Rule' + if rmin < 0: + raise GrammarError("Negative repetition is not possible") + elif rmin == 0: + # '*Rule' or '0*Rule' + return Tree('expr', [ element, Token('OP', '*') ]) + else: + # '*Rule' + expr1 = Tree('expr', [ element, 
Token('TILDE', '~'), Token('NUMBER', str(rmin)) ]) + expr2 = Tree('expr', [ element, Token('OP', '*') ]) + # concatenate them + return Tree('expansion', [expr1, expr2]) + + # '*Rule' or '*Rule' + if rmax < rmin or rmin < 0: + raise GrammarError("Bad repetition (%d*%d isn't allowed)" % (rmin, rmax)) + + if rmin == 0: + # '*Rule' or '0*Rule' + expr1 = Tree('expansion', []) # empty + expr2 = Tree('expansion', + [ Tree('expr', [ element, Token('TILDE', '~'), + Token('NUMBER', "1"), Token('NUMBER', str(rmax))])]) + # alternation of them + tree =Tree('expansions', [expr1, expr2]) + else: + '*Rule' + tree = Tree('expr', [ element, Token('TILDE', '~'), + Token('NUMBER', str(rmin)), Token('NUMBER', str(rmax))]) + return tree + + +class TreeValidator(Visitor): + def prose_val(self, tree): + # prose-val is a informal description for humans. + # we can't generate valid parser if prose-val existed in parse-tree. + prose = tree.children[0] + raise GrammarError("This ABNF cannot be used to generate parsers " + "since it has prose (informal) descriptions at line %s column %s" + % (prose.line, prose.column)) + + + +def _find_used_symbols(tree) -> set: + return {t for x in tree.find_data('rule_ref') + for t in x.scan_values(lambda t: t.type in ('RULE'))} + +def _find_used_symbols_recursive(stmt, rules: Dict[str, Tree]) -> set: + used_syms = set() + + depends = _find_used_symbols(stmt) + used_syms.update(depends) + for sym in used_syms.copy(): + used_syms.update(_find_used_symbols_recursive(rules[sym], rules)) + + return used_syms + +class ABNFGrammarLoader(GrammarLoaderBase): + + def __init__(self, *args, **kwargs): + super(ABNFGrammarLoader, self).__init__(*args, **kwargs) + + pkgres_, g = stdlib_loader(None, 'abnf.lark') + self.parser = Lark(g, parser='earley') + + + def _unpack_abnf_import(self, stmt, grammar_name): + if len(stmt.children) > 1: + path_node, name_list = stmt.children + rules_allowlist = {n.value:n.value for n in name_list.children} + else: + path_node, = 
stmt.children + rules_allowlist = {} + + # '%import topdir.subdir.file' --> dotted_path=('topdir','subdir','file') + dotted_path = tuple(path_node.children) + + if path_node.data == 'import_from_lib': # Import from lark/grammars/ + base_path = None + else: # Relative import + if grammar_name == '': + # Import relative to script file path if grammar is coded in script + try: + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None + else: + # Import relative to grammar file path if external grammar file + base_file = grammar_name + if base_file: + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) + else: + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + + return dotted_path, base_path, rules_allowlist + + + def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], + allowlist: Dict[str, str]): + + assert dotted_path + grammar_path = os.path.join(*dotted_path) + ABNF_EXT + + joined_path, text = self.read_grammar_from_file(base_path, grammar_path) + + imported_rules, directives = self._parse_abnf_grammar(text, joined_path) + + rules_to_import = {} + for rulename, stmt in imported_rules.items(): + # import all rules if allowlist is empty + if len(allowlist) > 0: + if rulename not in allowlist: + continue + for sym in _find_used_symbols_recursive(stmt, imported_rules): + rules_to_import[sym] = imported_rules[sym] + + rules_to_import[rulename] = stmt + + if len(rules_to_import) == 0: + raise GrammarError("Nothing was imported from `%s`" % import_path) + + return rules_to_import, directives, grammar_path + + + def _parse_abnf_grammar(self, abnf_grammar_text: str, grammar_name:str): + + rules = {} + casefold_rules = {} + + def add_rule(rulename:str, stmt, grammar_name:str): + if rulename in rules: + if stmt.children[1].type != 'EQ_ALT': + raise GrammarError("Rule '%s' is already 
defined in %s" + % (rulename, grammar_name)) + # merge incremental alternation into alternation + alt = rules[rulename].children[2] + alt_incr = stmt.children[2] + assert alt.data == 'alternation' + alt.children.extend(alt_incr.children) + else: + # case insensitive check for duplicated rule names. + # (rule names are case insensitive in ABNF.) + cf_rulename = rulename.casefold() + try: + r = casefold_rules[cf_rulename] + except KeyError: + casefold_rules[cf_rulename] = stmt + else: + raise GrammarError("Rule '%s' is already defined as '%s'" + % (rulename, r.children[0])) + + rules[rulename] = stmt + + try: + tree = self.parser.parse(abnf_grammar_text) + except UnexpectedCharacters as e: + context = e.get_context(abnf_grammar_text) + error = _translate_parser_exception(self.parser.parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, grammar_name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = _translate_parser_exception(self.parser.parse, e) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + raise + + imports = {} + unhandled_directives = [] + for stmt in tree.children: + if stmt.data == 'rule': + rulename = stmt.children[0] + add_rule(rulename, stmt, grammar_name) + + elif stmt.data == 'abnf_import': + dotted_path, base_path, allowlist = self._unpack_abnf_import(stmt, grammar_name) + imported_rules, directives, import_path = self.do_import(dotted_path, base_path, allowlist) + + unhandled_directives.extend(directives) + + for rulename, stmt in imported_rules.items(): + if rulename in rules: + raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." 
+                             % (stmt.children[0], import_path))
+                add_rule(rulename, stmt, import_path)
+
+            else:
+                unhandled_directives.append(stmt)
+
+        return rules, unhandled_directives
+
+
+    def parse_grammar(self, abnf_grammar_text: str, grammar_name:str):
+
+        rules, unhandled_directives = self._parse_abnf_grammar(abnf_grammar_text, grammar_name)
+
+        tree = Tree('start', list(rules.values()))
+
+        #======
+        # make a list of terminals from %terminal directives
+        #======
+        terminals = set()
+        for stmt in unhandled_directives:
+            if stmt.data == 'terminal_def':
+                for rulename in stmt.children:
+                    terminals.add(rulename.replace('-','_'))
+                    if rulename not in rules:
+                        # NOTE: rulename is a Token, so it carries the position of the
+                        # offending symbol for the error message.
+                        raise GrammarError("Symbol '%s' is not defined as a rule, "
+                                           "at line %d column %d ." % (rulename, rulename.line, rulename.column))
+                    terminals.update(_find_used_symbols_recursive(rules[rulename], rules))
+            else:
+                assert False
+
+        #======
+        # Convert ABNF parse tree to Lark's EBNF tree.
+        # Note:
+        #  - Hyphens in rule names are replaced with underscores.
+        #    Otherwise we can't access such rules via visitors and transformers
+        #    since hyphen is not a python identifier.
+        #
+        #  - Rules specified via %terminal directive are converted into terminals.
+        #======
+        transformer = ABNFToLarkTransformer(terminals)
+        tree = transformer.transform(tree)
+
+        #======
+        # Error checking
+        #======
+        validator = TreeValidator()
+        validator.visit(tree)
+
+        return tree
+
+def get_grammar_loader(import_paths: Optional[List[Union[str, Callable]]]=None,
+                       used_files: Optional[Dict[str, str]]=None):
+    """ entry point of this syntax plugin. """
+    return ABNFGrammarLoader(import_paths, used_files)
diff --git a/tests/__main__.py b/tests/__main__.py
index b8d39712..6c576a4e 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -8,6 +8,7 @@
 from .test_tools import TestStandalone
 from .test_cache import TestCache
 from .test_grammar import TestGrammar
+from .test_grammar_abnf import TestABNFGrammar
 from .test_reconstructor import TestReconstructor
 from .test_tree_forest_transformer import TestTreeForestTransformer
 from .test_lexer import TestLexer
diff --git a/tests/grammars/ab.abnf b/tests/grammars/ab.abnf
new file mode 100644
index 00000000..9b0f97c4
--- /dev/null
+++ b/tests/grammars/ab.abnf
@@ -0,0 +1,8 @@
+startab = expr
+
+expr = A B
+     / A expr B
+
+A = "a"
+B = "b"
+
diff --git a/tests/test_grammar_abnf.py b/tests/test_grammar_abnf.py
new file mode 100644
index 00000000..6eb7ecd3
--- /dev/null
+++ b/tests/test_grammar_abnf.py
@@ -0,0 +1,308 @@
+from __future__ import absolute_import
+
+import os
+from unittest import TestCase, main
+
+from lark import Lark, Token, Tree, ParseError, UnexpectedInput, UnexpectedCharacters
+from lark.load_grammar import GrammarError
+from lark.load_grammar import FromPackageLoader
+from lark.syntax.abnf import ABNF_GRAMMAR_ERRORS
+
+class TestABNFGrammar(TestCase):
+    def setUp(self):
+        pass
+
+    def test_charval_case_insensitive(self):
+        p = Lark('rulename = %i"aBc" / "xyz"\n', syntax='abnf', start='rulename')
+        abcs = ["abc", "Abc", "aBc", "abC", "ABc", "aBC", "AbC", "ABC"]
+        xyzs = ["xyz", "Xyz", "XYZ" ]
+        for i in abcs + xyzs:
+            self.assertEqual(p.parse(i), Tree('rulename', []))
+
+    def test_charval_case_sensitive(self):
+        p = Lark('rulename = %s"aBc" / %s"xyZ"\n', syntax='abnf', start='rulename')
+        for i in ('aBc', 'xyZ'):
+            self.assertEqual(p.parse(i), Tree('rulename', []))
+
+        for i in ('abc', 'xYy'):
+            self.assertRaises(UnexpectedCharacters, p.parse, i)
+
+    def test_inline_numval(self):
+        # test for anonymous rules generated for inline num-val (%x22)
+        g = ('cat = %x40 "cat" %x40\n')
+        l = Lark(g, syntax='abnf', start='cat', keep_all_tokens=True)
+        self.assertEqual(l.parse('@cat@'),
+                         Tree('cat', [Token('__ANON_0', '@'), Token('CAT', 'cat'), Token('__ANON_0', '@')]))
+
+    def test_basic_abnf(self):
+        # test for alternatives, concatenation, and grouping
+        g1 = ('beef = "bEEf" / boeuf / (BE EF) \n'
+              'BE = %xBE\n'
+              'EF = %xEF\n'
+              'boeuf = "boeuf"\n')
+
+        # the same rule in multiple lines with comments
+        g2 = (' ; rules \n'
+              'beef = "bEEf" \n'
+              ' / boeuf ; beef in french \n'
+              ' / (BE EF) ; bytes sequence [0xbe,0xef] \n'
+              ';terminals \n'
+              'BE = %xBE\n'
+              'EF = %xEF\n'
+              'boeuf = "boeuf"\n')
+
+        # the same rule using incremental alternatives
+        g3 = ('beef = "bEEf"\n'
+              'beef =/ boeuf \n'
+              'beef =/ (BE EF)\n'
+              'BE = %xBE\n'
+              'EF = %xEF\n'
+              'boeuf = "boeuf"\n')
+
+        for g in (g1, g2, g3):
+            l = Lark(g, syntax='abnf', start='beef', keep_all_tokens=True)
+            self.assertEqual(l.parse(u'beef'), Tree('beef', [Token('BEEF', 'beef')]))
+            self.assertEqual(l.parse(u'bEEf'), Tree('beef', [Token('BEEF', 'bEEf')]))
+            self.assertEqual(l.parse(u'boeuf'), Tree('beef', [Tree('boeuf', [Token('BOEUF', 'boeuf')])]))
+            self.assertEqual(l.parse(u'\xbe\xef'), Tree('beef', [Tree('BE', [Token('__ANON_0', '¾')]),
+                                                                 Tree('EF', [Token('__ANON_1', 'ï')])]))
+
+        # undefined rule
+        g = g3 + 'unused-rule = BE EF beef3\n'
+        self.assertRaises(GrammarError, Lark, g, syntax='abnf', start='beef')
+
+    def test_optional(self):
+        g = ('start = [ foo ] bar\n'
+             'foo = "foo"\n'
+             'bar = "bar"\n')
+        l = Lark(g, syntax='abnf', keep_all_tokens=True)
+        self.assertEqual(l.parse('foobar'),
+                         Tree('start', [Tree('foo', ['foo']), Tree('bar', ['bar'])]))
+        self.assertEqual(l.parse('bar'),
+                         Tree('start', [Tree('bar', ['bar'])]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, 'foo')
+
+    def test_empty_match_as_prose_val(self):
+        # some RFCs express empty match using prose-val (e.g. empty = 0 )
+        g1 = ('start = ( foo / empty ) bar\n'
+              'foo = "foo"\n'
+              'bar = "bar"\n'
+              'empty = 0\n')
+        l = Lark(g1, syntax='abnf', keep_all_tokens=False)
+        self.assertEqual(l.parse('foobar'),
+                         Tree('start', [Tree('foo', []), Tree('bar', [])]))
+        self.assertEqual(l.parse('bar'),
+                         Tree('start', [Tree('empty', []), Tree('bar', [])]))
+        g2 = ('start = ( foo / anychar ) bar\n'
+              'foo = "foo"\n'
+              'bar = "bar"\n'
+              'anychar = 1\n')
+        # GrammarError is raised if prose-val is used without zero repetition
+        self.assertRaises(GrammarError, Lark, g2, syntax='abnf')
+
+
+    def test_repetition(self):
+        g = ('start = rep-inf / rep-fixed \n'
+             'rep-inf = *"X"\n'
+             'rep-fixed = 3"F"\n')
+        l = Lark(g, syntax='abnf', keep_all_tokens=False)
+        self.assertEqual(l.parse('XXX'), Tree('start', [Tree('rep_inf', [])]))
+        self.assertEqual(l.parse(''), Tree('start', [Tree('rep_inf', [])]))
+        self.assertEqual(l.parse('FFF'), Tree('start', [Tree('rep_fixed', [])]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FF')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'FFFF')
+
+    def test_repetition_range(self):
+        g = ('start = rep-range / rep-atleast / rep-atmost\n'
+             'rep-range = 2*4%s"R"\n'
+             'rep-atleast = 3*"L"\n'
+             'rep-atmost = *5"M"\n')
+        l = Lark(g, syntax='abnf', keep_all_tokens=False)
+
+        self.assertEqual(l.parse('RRR'), Tree('start', [Tree('rep_range', [])]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRRRR')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'R')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'RRr') # case sensitive
+
+        self.assertEqual(l.parse('LlL'), Tree('start', [Tree('rep_atleast', [])])) # case insensitive
+        self.assertEqual(l.parse('LLLL'), Tree('start', [Tree('rep_atleast', [])]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'LL')
+
+        self.assertEqual(l.parse('mmm'), Tree('start', [Tree('rep_atmost', [])]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'mmmmmm')
+
+    def test_zero_repetition(self):
+        g1 = ('start = ("cat" / "dog" / empty) "food" \n'
+              'empty = 0\n')
+        l = Lark(g1, syntax='abnf', keep_all_tokens=True)
+        self.assertEqual(l.parse("catfood"), Tree('start', [Token('CAT', 'cat'), Token('FOOD', 'food')]))
+        self.assertEqual(l.parse("dogfood"), Tree('start', [Token('DOG', 'dog'), Token('FOOD', 'food')]))
+        self.assertEqual(l.parse("food"), Tree('start', [Tree('empty', []), Token('FOOD', 'food')]))
+        self.assertRaises((UnexpectedInput), l.parse, u"petfood")
+
+    def test_literal_range(self):
+
+        g1 = ('start = LALPHA UALPHA 1*DIGIT\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'DIGIT = %x30-39\n')
+        g2 = ('start = LALPHA UALPHA 1*DIGIT\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'DIGIT = %d48-57 \n')
+        g3 = ('start = LALPHA UALPHA 1*DIGIT\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'DIGIT = %b00110000-00111001 \n')
+        for g in (g1, g2, g3):
+            l = Lark(g, syntax='abnf')
+            for i in (0,1,2,3,4,5,6,7,8,9):
+                self.assertEqual(l.parse('lU%d' % i),
+                                 Tree('start', [Tree('LALPHA', ['l']), Tree('UALPHA', ['U']),
+                                                Tree('DIGIT', ['%d' % i])]))
+            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'lU0123456789:')
+
+
+    def test_literal_concatenation(self):
+        g1 = ('start = digits12345\n'
+              'digits12345 = %x31.32.33.34.35\n')
+        g2 = ('start = digits12345\n'
+              'digits12345 = %b00110001.00110010.00110011.00110100.00110101\n')
+        g3 = ('start = digits12345\n'
+              'digits12345 = %d49.50.51.52.53\n')
+        for g in (g1, g2, g3):
+            l = Lark(g, syntax='abnf', keep_all_tokens=False)
+            self.assertEqual(l.parse('12345'), Tree('start', [Tree('digits12345', ['12345'])]))
+
+    def test_operator_precedence(self):
+        # concatenation has higher precedence than alternation
+        g = ('start = "a" / "b" "c"\n')
+        l = Lark(g, syntax='abnf', keep_all_tokens=True)
+        self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')]))
+        self.assertEqual(l.parse('a'), Tree('start', [Token('A', 'a')]))
+
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, 'ac')
+
+        # grouping
+        g = ('start = ("a" / "b") "c"\n')
+        l = Lark(g, syntax='abnf', keep_all_tokens=True)
+        self.assertEqual(l.parse('bc'), Tree('start', [Token('B', 'b'), Token('C', 'c')]))
+        self.assertEqual(l.parse('ac'), Tree('start', [Token('A', 'a'), Token('C', 'c')]))
+
+    def test_unicode_match(self):
+        # test for 16bit unicode character
+        char_vals = ('%x2227', '%d8743', '%b0010001000100111')
+        unicode_char = '∧'
+
+        template = ('start = sym1\n'
+                    'sym1 = %s\n')
+        grammars = [ template % i for i in char_vals]
+        for g in grammars:
+            l = Lark(g, syntax='abnf', keep_all_tokens=True)
+            self.assertEqual(l.parse(unicode_char), Tree('start', [Tree('sym1', [unicode_char])]))
+
+    def test_unicode_match_emoji(self):
+        # test for 32bit unicode character
+        char_vals = ('%x1F431', '%d128049', '%b00011111010000110001')
+        cat_face_in_unicode = '🐱'
+
+        template = ('start = thecat\n'
+                    'thecat = %s\n')
+        grammars = [ template % i for i in char_vals]
+        for g in grammars:
+            l = Lark(g, syntax='abnf', keep_all_tokens=True)
+            tree = l.parse(cat_face_in_unicode)
+            self.assertEqual(l.parse(cat_face_in_unicode),
+                             Tree('start', [Tree('thecat', [cat_face_in_unicode])]))
+
+
+    def test_terminal(self):
+        # '%terminal lineending' expected to turn CRLF, CR and LF into terminals (recursive search)
+        g = ('start = 1*(ALPHA/SP) lineending\n'
+             'ALPHA = %x41-5A / %x61-7A\n'
+             'SP = %x20\n'
+             'lineending = CRLF\n'
+             'CRLF = CR LF\n'
+             'CR = %x0D\n'
+             'LF = %x0A\n'
+             '%terminal ALPHA, SP\n'
+             '%terminal lineending\n')
+        l = Lark(g, syntax='abnf')
+        msg = 'lorem ipsum\r\n'
+        tree = l.parse(msg)
+        self.assertEqual(l.parse(msg),Tree('start', [c for c in 'lorem ipsum'] + ['\r\n']))
+
+
+    def test_terminal_rulename_with_hyphen(self):
+        # Test to make sure that hyphens in rule names are replaced with underscores
+        # so that they will not cause problems (LALR parser can't handle it)
+        g = ('start = L-ALPHA U-ALPHA 1*DIGIT \n'
+             'U-ALPHA = %x41-5A \n'
+             'L-ALPHA = %x61-7A \n'
+             'DIGIT = %d48-57 \n'
+             '%terminal U-ALPHA, L-ALPHA\n')
+        for p in ('earley', 'lalr'):
+            l = Lark(g, syntax='abnf', parser=p)
+            self.assertEqual(l.parse(u'aA1'),
+                             Tree('start', [Token('L_ALPHA', 'a'), Token('U_ALPHA', 'A'), Tree('DIGIT', ['1'])]))
+
+    def test_errors(self):
+        for msg, examples in ABNF_GRAMMAR_ERRORS:
+            for example in examples:
+                try:
+                    p = Lark(example, syntax='abnf')
+                except GrammarError as e:
+                    assert msg in str(e)
+                else:
+                    assert False, "example did not raise an error"
+
+    def test_import_from_custom_sources(self):
+        custom_loader = FromPackageLoader('tests', ('grammars', ))
+        g1 = ('start = startab \n'
+              '%import ab\n')
+        p = Lark(g1, syntax='abnf', start='start', import_paths=[custom_loader])
+        self.assertEqual(p.parse('ab'),
+                         Tree('start', [Tree('startab', [Tree('expr', [Tree('A', []), Tree('B', [])])])]))
+
+    def test_import(self):
+        g1 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'DIGIT = %x30-39\n'
+              '%import core-rules\n')
+        # GrammarError is raised since DIGIT is defined twice in both g1 and core-rules.abnf
+        self.assertRaises(GrammarError, Lark, g1, syntax='abnf')
+
+        g2 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'dIGIT = %x30-39\n'
+              '%import core-rules\n')
+        # also GrammarError for multiple rule definition, since rule names are case insensitive
+        self.assertRaises(GrammarError, Lark, g2, syntax='abnf')
+
+        g3 = ('start = LALPHA UALPHA 1*DIGIT CRLF\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'DIGIT = %x30-39\n'
+              '%import core-rules ( CRLF )\n')
+        # g3 is okay since only rule 'CRLF' is imported but 'DIGIT' is not
+        p = Lark(g3, syntax='abnf')
+        self.assertEqual(p.parse('aA1\r\n'),
+                         Tree('start', [Tree('LALPHA', ['a']),
+                                        Tree('UALPHA', ['A']),
+                                        Tree('DIGIT', ['1']),
+                                        Tree('CRLF', [Tree('CR', ['\r']), Tree('LF', ['\n'])])]))
+
+    def test_rule_duplication_casefold(self):
+        g1 = ('start = LALPHA UALPHA 1*DIGIT\n'
+              'UALPHA = %x41-5A \n'
+              'LALPHA = %x61-7A \n'
+              'LaLPHA = %x61-7A \n'
+              'DIGIT = %x30-39\n')
+        # GrammarError is expected for multiple rule definition, since rule names are case insensitive
+        self.assertRaises(GrammarError, Lark, g1, syntax='abnf')
+
+
+if __name__ == '__main__':
+    main()