对于这个语法:(我将其余部分包含在下面。)
defaultBooleanExpression
: nested += maybeAndExpression (nested += maybeAndExpression)+
;
fuzzyQuery
: text =
( UNQUOTED
| UNSIGNED_NUMBER
| SIGNED_NUMBER
)
TILDE
(similarity = UNSIGNED_NUMBER)?
;
如果我输入:
abc~0.5
我希望得到一个类似的结构:
{ fuzzyQuery text=abc similarity=0.5 }
但实际上我得到的是:
{ defaultBooleanQuery
{ fuzzyQuery text=abc similarity=null }
{ unquoted text=0.5 }
}
我用来运行解析器的代码如下。我们正在按照常见问题解答中的建议应用性能黑客...
QueryLexer lexer = new QueryLexer(input);
CommonTokenStream tokens = new CommonTokenStream(lexer);
QueryParser parser = new QueryParser(tokens);
parser.getInterpreter().setPredictionMode(PredictionMode.SLL);
ParseTree expression;
try
{
expression = parser.startExpression();
}
catch (Exception e)
{
parser.reset();
parser.getInterpreter()
.setPredictionMode(PredictionMode.LL);
expression = parser.startExpression();
}
调试它:
我尝试使用ANTLRWorks 2和IDEA插件按规则尝试调试语法规则,但这些工具似乎都没有工作 - 他们都拒绝解析语法而不打印任何类型的错误来解释这是为什么。
完整的语法如下。
/**
* Grammar specification for our query syntax.
*/
grammar Query;
@header {
package com.nuix.storage.textindex.search.queryparser.antlr;
}
///////////////////////////////////////////////////////////////////////////
// Parser Rules
startExpression
: expression EOF
;
expression
: maybeOrExpression
;
maybeOrExpression
: orExpression
| maybeDefaultBooleanExpression
;
/**
* e.g., a OR b
*/
orExpression
: nested += maybeDefaultBooleanExpression (OR nested += maybeDefaultBooleanExpression)+
;
maybeDefaultBooleanExpression
: defaultBooleanExpression
| maybeAndExpression
;
/**
* e.g., a b
*/
defaultBooleanExpression
: nested += maybeAndExpression (nested += maybeAndExpression)+
;
maybeAndExpression
: andExpression
| maybeProximityExpression
;
/**
* e.g., a AND b
*/
andExpression
: nested += maybeProximityExpression (AND nested += maybeProximityExpression)+
;
maybeProximityExpression
: withinExpression
| notWithinExpression
| precedingExpression
| notPrecedingExpression
| maybeUnaryExpression
;
/**
* e.g., a W/4 b
*/
withinExpression
: left = maybeUnaryExpression op = W_SLASH_N right = maybeUnaryExpression
;
/**
* e.g., a NOT W/4 b
*/
notWithinExpression
: left = maybeUnaryExpression NOT op = W_SLASH_N right = maybeUnaryExpression
;
/**
* e.g., a PRE/4 b
*/
precedingExpression
: left = maybeUnaryExpression op = PRE_SLASH_N right = maybeUnaryExpression
;
/**
* e.g., a NOT PRE/4 b
*/
notPrecedingExpression
: left = maybeUnaryExpression NOT op = PRE_SLASH_N right = maybeUnaryExpression
;
maybeUnaryExpression
: notExpression
| plusExpression
| minusExpression
| maybeBoostedQueryFragment
;
/**
* e.g., NOT a
*/
notExpression
: NOT nested = maybeBoostedQueryFragment
;
/**
* e.g., +a
*/
plusExpression
: PLUS nested = maybeBoostedQueryFragment
;
/**
* e.g., -a
*/
minusExpression
: MINUS nested = maybeBoostedQueryFragment
;
maybeBoostedQueryFragment
: boostedQueryFragment
| maybeFieldedQueryFragment
;
/**
* e.g., a^2.0
*/
boostedQueryFragment
: nested = maybeFieldedQueryFragment CARET boost = UNSIGNED_NUMBER
;
maybeFieldedQueryFragment
: plainFieldedQueryFragment
| subFieldedQueryFragment
| wildcardSubFieldedQueryFragment
| queryFragment
;
/**
* e.g., properties:a
*/
plainFieldedQueryFragment
: fieldName =
( UNQUOTED
| LONE_WILDCARD
| TO
)
COLON
nested = queryFragment
;
/**
* e.g., integer-properties:"File Size":3
*/
subFieldedQueryFragment
: fieldName =
( UNQUOTED
| LONE_WILDCARD
| TO
)
COLON
subFieldName =
( QUOTED
| UNQUOTED
)
COLON
nested = queryFragment
;
/**
* e.g., date-properties:*:20010101
*/
wildcardSubFieldedQueryFragment
: fieldName =
( UNQUOTED
| LONE_WILDCARD
| TO
)
COLON
subFieldName =
( UNQUOTED_WILDCARD
| QUOTED_WILDCARD
| LONE_WILDCARD
)
COLON
nested = queryFragment
;
queryFragment
: fuzzyQuery
| unquotedQuery
| dateOffsetQuery
| unquotedWildcardQuery
| loneWildcardQuery
| slopQuery
| rangeQuery
| groupQuery
| unquotedMacro
| quotedMacro
| geoDistanceQuery
;
/**
* e.g., GEODISTANCE((40N 50E) 60km)
*/
geoDistanceQuery
: GEODISTANCE
LPAREN
LPAREN
latitude =
( UNQUOTED
| UNSIGNED_NUMBER
| SIGNED_NUMBER
)
longitude =
( UNQUOTED
| UNSIGNED_NUMBER
| SIGNED_NUMBER
)
RPAREN
distance =
( UNQUOTED
| UNSIGNED_NUMBER
)
RPAREN
;
/**
* e.g., "some query"~2
*/
slopQuery
: nested = slopCapableQuery
( TILDE slop = UNSIGNED_NUMBER )?
;
slopCapableQuery
: quotedQuery
| quotedWildcardQuery
| exactQuery
| regexQuery
;
/**
* e.g., a
*/
unquotedQuery
: UNQUOTED
| UNSIGNED_NUMBER
| SIGNED_NUMBER
;
/**
* e.g., +2Y
*/
dateOffsetQuery
: DATE_OFFSET
| TODAY
;
/**
* e.g., query~0.8
*/
fuzzyQuery
: text =
( UNQUOTED
| UNSIGNED_NUMBER
| SIGNED_NUMBER
)
TILDE
(similarity = UNSIGNED_NUMBER)?
;
/**
* e.g., a*
*/
unquotedWildcardQuery
: UNQUOTED_WILDCARD
;
/**
* e.g., *
*/
loneWildcardQuery
: LONE_WILDCARD
;
/**
* e.g., "a"
*/
quotedQuery
: QUOTED
;
/**
* e.g., "a*"
*/
quotedWildcardQuery
: QUOTED_WILDCARD
;
/**
* e.g., 'a'
*/
exactQuery
: QUOTED_EXACT
;
/**
* e.g., /a+/
*/
regexQuery
: QUOTED_REGEX
;
/**
* e.g., $a
*/
unquotedMacro
: DOLLAR
name = UNQUOTED
;
/**
* e.g., $"a"
*/
quotedMacro
: DOLLAR
name = QUOTED
;
/**
* e.g., [a TO b}
*/
rangeQuery
: lowerBoundSymbol = ( LBRACE | LBRACKET )
lowerBound = rangeQueryBound
TO?
upperBound = rangeQueryBound
upperBoundSymbol = ( RBRACE | RBRACKET )
;
rangeQueryBound
: unquotedQuery
| dateOffsetQuery
| quotedQuery
| loneWildcardQuery
;
/**
* <p>e.g., (a)</p>
*
* <p>If a ~N style suffix is present then the thing inside can only be an OR query. TODO: Not enforced yet though</p>
*/
groupQuery
: LPAREN nested = expression RPAREN
( TILDE minimumMatches = UNSIGNED_NUMBER )?
;
///////////////////////////////////////////////////////////////////////////
// Lexer Rules
// Most specific rules go first, otherwise the more general ones will blot them out.
AND : ('A'|'a')('N'|'n')('D'|'d') | '&' '&' ;
OR : ('O'|'o')('R'|'r') | '|' '|' ;
NOT : ('N'|'n')('O'|'o')('T'|'t') | '!' ;
TO : ('T'|'t')('O'|'o') ;
UNSIGNED_NUMBER : Digit+ ('.' Digit+)?
| '.' Digit+
;
SIGNED_NUMBER : ( '+' | '-' ) UNSIGNED_NUMBER ;
DATE_OFFSET : ( '+' | '-' ) UNSIGNED_NUMBER ('D'|'d'|'W'|'w'|'M'|'m'|'Y'|'y') ;
GEODISTANCE
: ('G'|'g')('E'|'e')('O'|'o')('D'|'d')('I'|'i')('S'|'s')('T'|'t')('A'|'a')('N'|'n')('C'|'c')('E'|'e')
;
PRE_SLASH_N
: ('P'|'p')('R'|'r')('E'|'e') '/' UNSIGNED_NUMBER
;
W_SLASH_N
: ('W'|'w') '/' UNSIGNED_NUMBER
;
TODAY
: ('T'|'t')('O'|'o')('D'|'d')('A'|'a')('Y'|'y')
;
UNQUOTED
: UnquotedStartChar
UnquotedChar*
;
LONE_WILDCARD
: '*'
;
UNQUOTED_WILDCARD
: ( UnquotedStartChar
UnquotedChar*
)?
( WildcardChar
UnquotedChar*
)+
;
fragment
UnquotedStartChar
: EscapeSequence
| ~( ' ' | '\r' | '\t' | '\u000C' | '\n' | '\\' | ':'
| '"' | '\u201C' | '\u201D' // DoubleQuote
| '\'' | '\u2018' | '\u2019' // SingleQuote
| '(' | ')' | '[' | ']' | '{' | '}' | '~'
| '&' | '|' | '!' | '^' | '?' | '*' | '/' | '+' | '-' | '$' )
;
fragment
UnquotedChar
: EscapeSequence
| ~( ' ' | '\r' | '\t' | '\u000C' | '\n' | '\\' | ':'
| '"' | '\u201C' | '\u201D' // DoubleQuote
| '\'' | '\u2018' | '\u2019' // SingleQuote
| '(' | ')' | '[' | ']' | '{' | '}' | '~'
| '&' | '|' | '!' | '^' | '?' | '*' )
;
QUOTED
: DoubleQuote
QuotedChar*
DoubleQuote
;
QUOTED_WILDCARD
: DoubleQuote
QuotedChar*
( WildcardChar
QuotedChar*
)+
DoubleQuote
;
fragment
QuotedChar
: EscapeSequence
| ~( '\\'
| '"' | '\u201C' | '\u201D' // DoubleQuote
| '\r' | '\n' | '?' | '*' )
;
fragment
WildcardChar
: ( '?' | '*' )
;
QUOTED_EXACT
: SingleQuote
( EscapeSequence
| ~( '\\' | '\'' | '\r' | '\n' )
)*
SingleQuote
;
QUOTED_REGEX
: '/'
( EscapeSequence
| ~( '\\' | '/' | '\r' | '\n' )
)*
'/'
;
fragment
EscapeSequence
: '\\'
( 'u' HexDigit HexDigit HexDigit HexDigit
| ~( 'u' )
)
;
fragment
Digit
: ('0'..'9')
;
fragment
HexDigit
: ('0'..'9' | 'a'..'f' | 'A'..'F')
;
// Single character fragments (not tokens, but become part of tokens)
// U+2018 LEFT SINGLE QUOTATION MARK
// U+2019 RIGHT SINGLE QUOTATION MARK
fragment SingleQuote : '\'' | '\u2018' | '\u2019';
// U+201C LEFT DOUBLE QUOTATION MARK
// U+201D RIGHT DOUBLE QUOTATION MARK
fragment DoubleQuote : '"' | '\u201C' | '\u201D';
COLON : ':' ;
PLUS : '+' ;
MINUS : '-' ;
TILDE : '~' ;
CARET : '^' ;
DOLLAR : '$' ;
LPAREN : '(' ;
RPAREN : ')' ;
LBRACKET : '[' ;
RBRACKET : ']' ;
LBRACE : '{' ;
RBRACE : '}' ;
WHITESPACE : ( ' ' | '\r' | '\t' | '\u000C' | '\n' ) -> skip;