Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ABNF grammar support V2 #1022

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions examples/abnf/url_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""
Creating URL Parser from ABNF grammar in internet standards (RFC3986)
==================================================================

Usage:
python3 -m examples.abnf.url_parser https://github.com/lark-parser/lark#readme
python3 -m examples.abnf.url_parser http://localhost:8000/search?q=lark%2dparser?user=me

It outputs the parse tree for the URI passed as the first argument.

"""
import sys

from lark import Lark, Transformer, v_args, Token, Visitor, Tree
from lark.load_grammar import FromPackageLoader

# ABNF source handed to Lark(..., syntax='abnf') in main(): it imports the
# RFC 3986 rules and the ABNF core rules, then marks selected rules as
# terminals via the non-standard %terminal directive.
grammar_in_abnf ="""

%import rfc3986 ; import from examples/grammars/rfc3986.abnf using custom loader
%import core-rules ; import from the standard library: ../lark/grammars/core-rules.abnf

; Terminals need to be specified via %terminal directive to control
; automatic parse-tree construction by lark.
%terminal ALPHA, DIGIT
%terminal HEXDIG
%terminal unreserved
"""


class SimplifyABNFTree_Visitor(Visitor):
    """Top-down visitor that flattens selected subtrees and merges adjacent tokens.

    Every visited tree has its ``children`` list rewritten in place: runs of
    consecutive tokens are joined into one token, and for rules listed in
    ``unwrap_children`` the nested trees are replaced by their (recursively
    flattened) children.

    Args:
        unwrap_children: rule names (``tree.data`` values) whose nested trees
            are dissolved into the parent's children.
        keep: rule names that stay as tree nodes even when they occur below an
            unwrapped rule; their own contents are still flattened and merged.
        *args, **kwargs: forwarded unchanged to ``Visitor.__init__``.
    """

    def __init__(self, unwrap_children=(), keep=(), *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.unwrap = unwrap_children
        self.keep = keep

    def visit(self, tree: Tree) -> Tree:
        """Visit ``tree`` top-down and return the result of ``visit_topdown``.

        The default visiting order is overridden because
        ``_unwrap_and_flatten()`` assumes a parent is processed before its
        children.
        """
        # The original dropped this result and returned None, contradicting
        # the declared ``-> Tree`` return type.
        return self.visit_topdown(tree)

    def _unwrap_and_flatten(self, tree, unwrap_recursive=False):
        """Yield the children of ``tree``, dissolving nested trees when unwrapping.

        Unwrapping applies when ``tree.data`` is listed in ``self.unwrap`` or
        when ``unwrap_recursive`` is true (i.e. an ancestor was unwrapped).
        Nested trees named in ``self.keep`` are yielded as trees, after their
        own contents have been flattened and merged.
        """
        do_unwrap = bool(tree.data in self.unwrap or unwrap_recursive)

        for child in tree.children:
            if do_unwrap and isinstance(child, Tree):
                if child.data in self.keep:
                    yield self._concat_tokens(child, unwrap_recursive=True)
                else:
                    yield from self._unwrap_and_flatten(child, unwrap_recursive=True)
            else:
                # Tokens, and trees that are not being unwrapped, pass through.
                yield child

    @staticmethod
    def _merge_run(run):
        """Collapse a non-empty run of tokens into one token.

        A single token is returned untouched; for longer runs the first token
        is updated to carry the concatenated values of the whole run.
        """
        first = run[0]
        if len(run) == 1:
            return first
        return first.update(value=''.join(tok.value for tok in run))

    def _concat_tokens(self, tree, unwrap_recursive=False):
        """Merge each run of consecutive tokens in ``tree.children`` into one token.

        Non-token children are kept in place and act as separators between
        runs. ``tree`` is modified in place and returned.
        """
        merged = []
        run = []  # consecutive tokens waiting to be merged

        for child in self._unwrap_and_flatten(tree, unwrap_recursive=unwrap_recursive):
            if isinstance(child, Token):
                run.append(child)
                continue
            if run:
                merged.append(self._merge_run(run))
                run = []
            merged.append(child)

        if run:
            merged.append(self._merge_run(run))

        tree.children = merged
        return tree

    def __default__(self, tree):
        """Simplify every tree that has no dedicated callback."""
        return self._concat_tokens(tree)


class pct_encoded_conv(Transformer):
    """Transformer that replaces percent-encoded ASCII octets by their character."""

    def pct_encoded(self, items):  # alias for pct-encoded
        """Collapse ``"%" HEXDIG HEXDIG`` into a single token.

        Args:
            items: the three matched tokens: "%" followed by two hex digits.

        Returns:
            The "%" token updated to hold the decoded character when the octet
            is ASCII (below 0x80). Other octets keep their escaped text (e.g.
            ``%E3``) in that single token.
        """
        hex_digits = ''.join(items[1:])
        octet = int(hex_digits, 16)

        if octet < 0x80:
            text = chr(octet)
        else:
            # A non-ASCII octet is only one byte of a (possibly multi-byte)
            # character, so decoding it alone raises UnicodeDecodeError.
            # Keep the escape as written instead of crashing.
            text = ''.join(items)

        return items[0].update(value=text)

def main():
    """Parse the URI given as the first command-line argument and print its tree.

    Exits with a usage message when no URI argument is supplied.
    """
    if len(sys.argv) < 2:
        # Give a readable hint instead of an IndexError traceback.
        sys.exit('usage: python3 -m examples.abnf.url_parser <URI>')
    url = sys.argv[1]

    custom_loader = FromPackageLoader('examples', ('grammars', ))
    url_parser = Lark(
        grammar_in_abnf,
        # using ABNF grammar
        syntax='abnf',
        start='URI',
        # use earley parser since RFC3986 is too complex for LALR.
        parser='earley',
        # often needed to set keep_all_tokens=True when ABNF grammar is used.
        keep_all_tokens=True,
        import_paths=[custom_loader],
    )
    tree = url_parser.parse(url)

    # Convert pct-encoded (e.g. '%2D' in given URL) to ascii characters
    transformer = pct_encoded_conv()
    tree = transformer.transform(tree)

    # We need some post-processing to unwrap unwanted tree nodes and
    # concatenate ABNF tokens into the tokens we actually want, since many
    # ABNF grammars in RFCs split the input into very small units such as
    # single characters.
    #
    # NOTE(review): 'reg-name' is the only hyphenated entry here; the other
    # names (and the pct_encoded callback) use underscores. Confirm which
    # spelling tree.data actually carries.
    unwrap = ('scheme', 'userinfo', 'IPv4address', 'IPv6address', 'reg-name',
              'segment', 'query', 'fragment',
              'path_abempty', 'path_absolute', 'path_noscheme', 'path_rootless')
    simplifier = SimplifyABNFTree_Visitor(unwrap_children=unwrap)
    simplifier.visit(tree)

    print(tree.pretty())


# Run the example only when executed as a script/module, not on import.
if __name__ == '__main__':
    main()
87 changes: 87 additions & 0 deletions examples/grammars/rfc3986.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; ABNF grammar from RFC3986
; Uniform Resource Identifier (URI): Generic Syntax
;
; Some terminals (e.g. DIGIT, ALPHA, ...) are defined in the ABNF core rules of RFC 5234.
;

URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

hier-part = "//" authority path-abempty
/ path-absolute
/ path-rootless
/ path-empty

URI-reference = URI / relative-ref

absolute-URI = scheme ":" hier-part [ "?" query ]

relative-ref = relative-part [ "?" query ] [ "#" fragment ]

relative-part = "//" authority path-abempty
/ path-absolute
/ path-noscheme
/ path-empty

scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )

authority = [ userinfo "@" ] host [ ":" port ]
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
host = IP-literal / IPv4address / reg-name
port = *DIGIT

IP-literal = "[" ( IPv6address / IPvFuture ) "]"
IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )

IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"

h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet

dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255

reg-name = *( unreserved / pct-encoded / sub-delims )

path = path-abempty ; begins with "/" or is empty
/ path-absolute ; begins with "/" but not "//"
/ path-noscheme ; begins with a non-colon segment
/ path-rootless ; begins with a segment
/ path-empty ; zero characters

path-abempty = *( "/" segment )
path-absolute = "/" [ segment-nz *( "/" segment ) ]
path-noscheme = segment-nz-nc *( "/" segment )
path-rootless = segment-nz *( "/" segment )
path-empty = 0<pchar>


segment = *pchar
segment-nz = 1*pchar
segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
; non-zero-length segment without any colon ":"

pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )

pct-encoded = "%" HEXDIG HEXDIG

unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="

84 changes: 84 additions & 0 deletions lark/grammars/abnf.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//
// Lark's EBNF grammar to parse ABNF grammar (RFC5234)
//


_LPAR: "("
_RPAR: ")"
_LBRA: "["
_RBRA: "]"
_STAR: "*"
_SLASH: "/"
EQ: "="
EQ_ALT: "=/"
_IGNORE_CASE: "%i"
_CASE_SENSITIVE: "%s"

RULE: /[a-zA-Z][a-zA-Z0-9\-]*/

QSTRING: /"[ !#$%&\'\(\)\*\+,\-\.\/0-9:;<=>\?@A-Z\[\\\]\^_`a-z\{|\}~]*"/
PROSE_VAL: /<[ !"#$%&\'\(\)\*\+,\-\.\/0-9:;<=\?@A-Z\[\\\]\^_`a-z\{|\}~]*>/

// Repeat counts and numeric terminal values (num-val in RFC 5234).
// Each *_VAL pattern accepts a dotted series (a.b.c), a range (a-b) or a
// single value after its base prefix.
// NOTE(review): the patterns are case-sensitive (lowercase %d/%x/%b prefix,
// uppercase A-F only), whereas RFC 5234 literals are case-insensitive, so
// inputs such as %X41 or %x6a are not matched. Confirm whether that is intended.
NUMBER: /[0-9]+/
DEC_VAL: /%d([0-9]+(\.[0-9]+)+|[0-9]+\-[0-9]+|[0-9]+)/
HEX_VAL: /%x([0-9A-F]+(\.[0-9A-F]+)+|[0-9A-F]+\-[0-9A-F]+|[0-9A-F]+)/
BIN_VAL: /%b([01]+(\.[01]+)+|[01]+\-[01]+|[01]+)/

_C_NL: /(;[^\n]*)*\r?\n/
_C_WSP: /((;[^\n]*)*\r?\n)?[ \t]+/

// terminals for nonstandard extensions
_IMPORT: "%import"
_DOT: "."
_COMMA: ","


start: _rulelist
_rulelist: (rule | abnf_import | terminal_def | (_C_WSP* _C_NL))+

rule: RULE _defined_as _elements _C_NL

_defined_as: _C_WSP* (EQ|EQ_ALT) _C_WSP*
_elements: alternation _C_WSP*
alternation: concatenation (_C_WSP* _SLASH _C_WSP* concatenation)*
concatenation: repetition (_C_WSP+ repetition)*
repetition: repeat? _element

// repeat = 1*DIGIT / (*DIGIT "*" *DIGIT)
repeat: (repeat_min _STAR repeat_max)|(repeat_min _STAR)|(_STAR repeat_max)|_STAR|repeat_n
repeat_n: NUMBER
repeat_min: NUMBER
repeat_max: NUMBER

_element: rule_ref|_group|option|char_val|num_val|prose_val
rule_ref: RULE
// 'group' is inlined intentionally.
_group: _LPAR _C_WSP* alternation _C_WSP* _RPAR
option: _LBRA _C_WSP* alternation _C_WSP* _RBRA

char_val: case_insensitive_string|case_sensitive_string
case_insensitive_string: _IGNORE_CASE? QSTRING
case_sensitive_string: _CASE_SENSITIVE QSTRING

num_val: dec_val|bin_val|hex_val
dec_val: DEC_VAL
hex_val: HEX_VAL
bin_val: BIN_VAL

prose_val: PROSE_VAL

// nonstandard extensions to ABNF grammar
// (%import)
abnf_import: _import1
_import1: _IMPORT _C_WSP+ _import_path _C_WSP* name_list? _C_WSP* _C_NL
_import_path: import_from_lib|import_relpath
import_from_lib: _import_args
import_relpath: _DOT _import_args
_import_args: PATHNAME (_DOT PATHNAME)*
name_list: _LPAR _C_WSP* RULE (_C_WSP* _COMMA _C_WSP* RULE)* _C_WSP* _RPAR

PATHNAME: /[!#$%&\'\+,\-0-9;=@A-Z\[\]\^_a-z`\{\}~]+/

// (%terminal)
terminal_def: _TERMINAL _C_WSP+ RULE (_C_WSP* _COMMA _C_WSP* RULE)*
_TERMINAL: "%terminal"
39 changes: 39 additions & 0 deletions lark/grammars/core-rules.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; ABNF Core Rules (RFC5234 Appendix.B)

ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
BIT = "0" / "1"
CHAR = %x01-7F
; any 7-bit US-ASCII character,
; excluding NUL
CR = %x0D
; carriage return
CRLF = CR LF
; Internet standard newline
CTL = %x00-1F / %x7F
; controls
DIGIT = %x30-39
; 0-9
DQUOTE = %x22
; " (Double Quote)
HEXDIG = DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
HTAB = %x09
; horizontal tab
LF = %x0A
; linefeed
LWSP = *(WSP / CRLF WSP)
; Use of this linear-white-space rule
; permits lines containing only white
; space that are no longer legal in
; mail headers and have caused
; interoperability problems in other
; contexts.
; Do not use when defining mail
; headers and use with caution in
; other contexts.
OCTET = %x00-FF
; 8 bits of data
SP = %x20
VCHAR = %x21-7E
; visible (printing) characters
WSP = SP / HTAB
; white space
8 changes: 7 additions & 1 deletion lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@ class LarkOptions(Serialize):
Prevent the tree builder from automagically removing "punctuation" tokens (Default: ``False``)
tree_class
Lark will produce trees comprised of instances of this class instead of the default ``lark.Tree``.
syntax
Syntax for grammar specification.

- "lark" (default): Lark's EBNF based syntax
- "abnf" : ABNF syntax, described in RFC5234. Various extensions in Lark's EBNF syntax are not supported.

**=== Algorithm Options ===**

Expand Down Expand Up @@ -169,6 +174,7 @@ class LarkOptions(Serialize):
'use_bytes': False,
'import_paths': [],
'source_path': None,
'syntax': 'lark',
}

def __init__(self, options_dict):
Expand Down Expand Up @@ -328,7 +334,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:


# Parse the grammar file and compose the grammars
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens, self.options.syntax)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar
Expand Down
Loading