为什么这个ANTLR语法会产生"错误"结果?

时间:2017-05-17 06:11:15

标签: java antlr antlr4

对于这个语法:(我将其余部分包含在下面。)

defaultBooleanExpression
  : nested += maybeAndExpression (nested += maybeAndExpression)+
  ;

fuzzyQuery
  : text =
      ( UNQUOTED
      | UNSIGNED_NUMBER
      | SIGNED_NUMBER
      )
    TILDE
    (similarity = UNSIGNED_NUMBER)?
  ;

如果我输入:

abc~0.5

我希望得到一个类似的结构:

{ fuzzyQuery text=abc similarity=0.5 }

但实际上我得到的是:

{ defaultBooleanQuery
  { fuzzyQuery text=abc similarity=null }
  { unquoted text=0.5 }
}

我用来运行解析器的代码如下。我们正在按照常见问题解答中的建议应用性能黑客...

        QueryLexer lexer = new QueryLexer(input);
        CommonTokenStream tokens = new CommonTokenStream(lexer);
        QueryParser parser = new QueryParser(tokens);
        parser.getInterpreter().setPredictionMode(PredictionMode.SLL);
        ParseTree expression;
        try
        {
            expression = parser.startExpression();
        }
        catch (Exception e)
        {
            parser.reset();
            parser.getInterpreter()
                  .setPredictionMode(PredictionMode.LL);
            expression = parser.startExpression();
        }

调试它:

  • 第一轮,它进入defaultBooleanExpression(),然后进入fuzzyQuery(),识别所有三个标记作为fuzzyQuery()的一部分,然后退出fuzzyQuery()。然后没有令牌,所以defaultBooleanExpression()失败,因为没有足够的子句,这导致解析失败。
  • 我们捕获异常,然后使用LL(*)重试。
  • 现在,它进入了fuzzyQuery(),但由于某种原因,预测无法看到第三个标记,因此它停在'〜'。然后它完成而不抛出异常,但返回错误的结果。

我尝试使用ANTLRWorks 2和IDEA插件按规则尝试调试语法规则,但这些工具似乎都没有工作 - 他们都拒绝解析语法而不打印任何类型的错误来解释这是为什么。

完整的语法如下。

/**
 * Grammar specification for our query syntax.
 */
grammar Query;

@header {
package com.nuix.storage.textindex.search.queryparser.antlr;
}

///////////////////////////////////////////////////////////////////////////
// Parser Rules

startExpression
  : expression EOF
  ;

expression
  : maybeOrExpression
  ;

maybeOrExpression
  : orExpression
  | maybeDefaultBooleanExpression
  ;

/**
 * e.g., a OR b
 */
orExpression
  : nested += maybeDefaultBooleanExpression (OR nested += maybeDefaultBooleanExpression)+
  ;

maybeDefaultBooleanExpression
  : defaultBooleanExpression
  | maybeAndExpression
  ;

/**
 * e.g., a b
 */
defaultBooleanExpression
  : nested += maybeAndExpression (nested += maybeAndExpression)+
  ;

maybeAndExpression
  : andExpression
  | maybeProximityExpression
  ;

/**
 * e.g., a AND b
 */
andExpression
  : nested += maybeProximityExpression (AND nested += maybeProximityExpression)+
  ;

maybeProximityExpression
  : withinExpression
  | notWithinExpression
  | precedingExpression
  | notPrecedingExpression
  | maybeUnaryExpression
  ;

/**
 * e.g., a W/4 b
 */
withinExpression
  : left = maybeUnaryExpression op = W_SLASH_N right = maybeUnaryExpression
  ;

/**
 * e.g., a NOT W/4 b
 */
notWithinExpression
  : left = maybeUnaryExpression NOT op = W_SLASH_N right = maybeUnaryExpression
  ;

/**
 * e.g., a PRE/4 b
 */
precedingExpression
  : left = maybeUnaryExpression op = PRE_SLASH_N right = maybeUnaryExpression
  ;

/**
 * e.g., a NOT PRE/4 b
 */
notPrecedingExpression
  : left = maybeUnaryExpression NOT op = PRE_SLASH_N right = maybeUnaryExpression
  ;

maybeUnaryExpression
  : notExpression
  | plusExpression
  | minusExpression
  | maybeBoostedQueryFragment
  ;

/**
 * e.g., NOT a
 */
notExpression
  : NOT nested = maybeBoostedQueryFragment
  ;

/**
 * e.g., +a
 */
plusExpression
  : PLUS nested = maybeBoostedQueryFragment
  ;

/**
 * e.g., -a
 */
minusExpression
  : MINUS nested = maybeBoostedQueryFragment
  ;

maybeBoostedQueryFragment
  : boostedQueryFragment
  | maybeFieldedQueryFragment
  ;

/**
 * e.g., a^2.0
 */
boostedQueryFragment
  : nested = maybeFieldedQueryFragment CARET boost = UNSIGNED_NUMBER
  ;

maybeFieldedQueryFragment
  : plainFieldedQueryFragment
  | subFieldedQueryFragment
  | wildcardSubFieldedQueryFragment
  | queryFragment
  ;

/**
 * e.g., properties:a
 */
plainFieldedQueryFragment
  : fieldName =
      ( UNQUOTED
      | LONE_WILDCARD
      | TO
      )
    COLON
    nested = queryFragment
  ;

/**
 * e.g., integer-properties:"File Size":3
 */
subFieldedQueryFragment
  : fieldName =
      ( UNQUOTED
      | LONE_WILDCARD
      | TO
      )
    COLON
    subFieldName =
      ( QUOTED
      | UNQUOTED
      )
    COLON
    nested = queryFragment
  ;

/**
 * e.g., date-properties:*:20010101
 */
wildcardSubFieldedQueryFragment
  : fieldName =
      ( UNQUOTED
      | LONE_WILDCARD
      | TO
      )
    COLON
    subFieldName =
      ( UNQUOTED_WILDCARD
      | QUOTED_WILDCARD
      | LONE_WILDCARD
      )
    COLON
    nested = queryFragment
  ;

queryFragment
  : fuzzyQuery
  | unquotedQuery
  | dateOffsetQuery
  | unquotedWildcardQuery
  | loneWildcardQuery
  | slopQuery
  | rangeQuery
  | groupQuery
  | unquotedMacro
  | quotedMacro
  | geoDistanceQuery
  ;

/**
 * e.g., GEODISTANCE((40N 50E) 60km)
 */
geoDistanceQuery
  : GEODISTANCE
    LPAREN
    LPAREN
    latitude =
      ( UNQUOTED
      | UNSIGNED_NUMBER
      | SIGNED_NUMBER
      )
    longitude =
      ( UNQUOTED
      | UNSIGNED_NUMBER
      | SIGNED_NUMBER
      )
    RPAREN
    distance =
      ( UNQUOTED
      | UNSIGNED_NUMBER
      )
    RPAREN
  ;

/**
 * e.g., "some query"~2
 */
slopQuery
  : nested = slopCapableQuery
    ( TILDE slop = UNSIGNED_NUMBER )?
  ;

slopCapableQuery
  : quotedQuery
  | quotedWildcardQuery
  | exactQuery
  | regexQuery
  ;

/**
 * e.g., a
 */
unquotedQuery
  : UNQUOTED
  | UNSIGNED_NUMBER
  | SIGNED_NUMBER
  ;

/**
 * e.g., +2Y
 */
dateOffsetQuery
  : DATE_OFFSET
  | TODAY
  ;

/**
 * e.g., query~0.8
 */
fuzzyQuery
  : text =
      ( UNQUOTED
      | UNSIGNED_NUMBER
      | SIGNED_NUMBER
      )
    TILDE
    (similarity = UNSIGNED_NUMBER)?
  ;

/**
 * e.g., a*
 */
unquotedWildcardQuery
  : UNQUOTED_WILDCARD
  ;

/**
 * e.g., *
 */
loneWildcardQuery
  : LONE_WILDCARD
  ;

/**
 * e.g., "a"
 */
quotedQuery
  : QUOTED
  ;

/**
 * e.g., "a*"
 */
quotedWildcardQuery
  : QUOTED_WILDCARD
  ;

/**
 * e.g., 'a'
 */
exactQuery
  : QUOTED_EXACT
  ;

/**
 * e.g., /a+/
 */
regexQuery
  : QUOTED_REGEX
  ;

/**
 * e.g., $a
 */
unquotedMacro
  : DOLLAR
    name = UNQUOTED
  ;

/**
 * e.g., $"a"
 */
quotedMacro
  : DOLLAR
    name = QUOTED
  ;

/**
 * e.g., [a TO b}
 */
rangeQuery
  : lowerBoundSymbol = ( LBRACE | LBRACKET )
    lowerBound = rangeQueryBound
    TO?
    upperBound = rangeQueryBound
    upperBoundSymbol = ( RBRACE | RBRACKET )
  ;

rangeQueryBound
  : unquotedQuery
  | dateOffsetQuery
  | quotedQuery
  | loneWildcardQuery
  ;

/**
 * <p>e.g., (a)</p>
 *
 * <p>If a ~N style suffix is present then the thing inside can only be an OR query. TODO: Not enforced yet though</p>
 */
groupQuery
  : LPAREN nested = expression RPAREN
    ( TILDE minimumMatches = UNSIGNED_NUMBER )?
  ;



///////////////////////////////////////////////////////////////////////////
// Lexer Rules

// Most specific rules go first, otherwise the more general ones will blot them out.

AND         : ('A'|'a')('N'|'n')('D'|'d') | '&' '&' ;
OR          : ('O'|'o')('R'|'r')          | '|' '|' ;
NOT         : ('N'|'n')('O'|'o')('T'|'t') | '!' ;
TO          : ('T'|'t')('O'|'o') ;

UNSIGNED_NUMBER : Digit+ ('.' Digit+)?
                | '.' Digit+
                ;

SIGNED_NUMBER : ( '+' | '-' ) UNSIGNED_NUMBER ;

DATE_OFFSET : ( '+' | '-' ) UNSIGNED_NUMBER ('D'|'d'|'W'|'w'|'M'|'m'|'Y'|'y') ;

GEODISTANCE
  : ('G'|'g')('E'|'e')('O'|'o')('D'|'d')('I'|'i')('S'|'s')('T'|'t')('A'|'a')('N'|'n')('C'|'c')('E'|'e')
  ;

PRE_SLASH_N
  : ('P'|'p')('R'|'r')('E'|'e') '/' UNSIGNED_NUMBER
  ;

W_SLASH_N
  : ('W'|'w') '/' UNSIGNED_NUMBER
  ;

TODAY
  : ('T'|'t')('O'|'o')('D'|'d')('A'|'a')('Y'|'y')
  ;

UNQUOTED
  : UnquotedStartChar
    UnquotedChar*
  ;

LONE_WILDCARD
  : '*'
  ;

UNQUOTED_WILDCARD
  : ( UnquotedStartChar
      UnquotedChar*
    )?
    ( WildcardChar
      UnquotedChar*
    )+
  ;

fragment
UnquotedStartChar
  : EscapeSequence
  | ~( ' ' | '\r' | '\t' | '\u000C' | '\n' | '\\' | ':'
     | '"' | '\u201C' | '\u201D'    // DoubleQuote
     | '\'' | '\u2018' | '\u2019'   // SingleQuote
     | '(' | ')' | '[' | ']' | '{' | '}' | '~'
     | '&' | '|' | '!' | '^' | '?' | '*' | '/' | '+' | '-' | '$' )
  ;

fragment
UnquotedChar
  : EscapeSequence
  | ~( ' ' | '\r' | '\t' | '\u000C' | '\n' | '\\' | ':'
     | '"' | '\u201C' | '\u201D'    // DoubleQuote
     | '\'' | '\u2018' | '\u2019'   // SingleQuote
     | '(' | ')' | '[' | ']' | '{' | '}' | '~'
     | '&' | '|' | '!' | '^' | '?' | '*' )
  ;

QUOTED
  : DoubleQuote
    QuotedChar*
    DoubleQuote
  ;

QUOTED_WILDCARD
  : DoubleQuote
    QuotedChar*
    ( WildcardChar
      QuotedChar*
    )+
    DoubleQuote
  ;

fragment
QuotedChar
  : EscapeSequence
  | ~( '\\'
     | '"' | '\u201C' | '\u201D'      // DoubleQuote
     | '\r' | '\n' | '?' | '*' )
  ;

fragment
WildcardChar
  : ( '?' | '*' )
  ;

QUOTED_EXACT
  : SingleQuote
    ( EscapeSequence
    | ~( '\\' | '\'' | '\r' | '\n' )
    )*
    SingleQuote
  ;

QUOTED_REGEX
  : '/'
    ( EscapeSequence
    | ~( '\\' | '/' | '\r' | '\n' )
    )*
    '/'
  ;

fragment
EscapeSequence
  : '\\'
    ( 'u' HexDigit HexDigit HexDigit HexDigit
    | ~( 'u' )
    )
  ;

fragment
Digit
  : ('0'..'9')
  ;

fragment
HexDigit
  : ('0'..'9' | 'a'..'f' | 'A'..'F')
  ;

// Single character fragments (not tokens, but become part of tokens)
// U+2018  LEFT SINGLE QUOTATION MARK
// U+2019  RIGHT SINGLE QUOTATION MARK
fragment SingleQuote      : '\'' | '\u2018' | '\u2019';
// U+201C  LEFT DOUBLE QUOTATION MARK
// U+201D  RIGHT DOUBLE QUOTATION MARK
fragment DoubleQuote      : '"' | '\u201C' | '\u201D';

COLON     : ':' ;
PLUS      : '+' ;
MINUS     : '-' ;
TILDE     : '~' ;
CARET     : '^' ;
DOLLAR    : '$' ;
LPAREN    : '(' ;
RPAREN    : ')' ;
LBRACKET  : '[' ;
RBRACKET  : ']' ;
LBRACE    : '{' ;
RBRACE    : '}' ;

WHITESPACE : ( ' ' | '\r' | '\t' | '\u000C' | '\n' ) -> skip;

0 个答案:

没有答案