提高FParsec解析器的可读性

时间:2011-09-21 12:52:56

标签: parsing f# fparsec

我用C#手写了一个CSS解析器,但它已经变得难以维护,因此我正在尝试改用FParsec,使其更易于维护。下面是用正则表达式解析CSS选择器元素的代码片段:

// Regex building blocks for a single CSS selector element. Each piece captures
// into a named group that is read back after matching:
//   Tag         - an element name, or the universal selector "*"
//   Id          - "#name"; the group holds the name without the '#'
//   Classes     - one or more ".name" parts, captured as one string including the dots
//   PseudoClass - ":keyword"; the group holds one of the listed keywords without the ':'
var tagRegex = @"(?<Tag>(?:[a-zA-Z][_\-0-9a-zA-Z]*|\*))";
var idRegex = @"(?:#(?<Id>[a-zA-Z][_\-0-9a-zA-Z]*))";
var classesRegex = @"(?<Classes>(?:\.[a-zA-Z][_\-0-9a-zA-Z]*)+)";
var pseudoClassRegex = @"(?::(?<PseudoClass>link|visited|hover|active|before|after|first-line|first-letter))";
// Overall shape: (tag? id | tag? classes | tag) pseudoClass?
var selectorRegex = new Regex("(?:(?:" + tagRegex + "?" + idRegex + ")|" +
                                 "(?:" + tagRegex + "?" + classesRegex + ")|" +
                                  tagRegex + ")" +
                               pseudoClassRegex + "?");

var m = selectorRegex.Match(str);

// The pattern is not anchored, so the selector is accepted only when the match
// spans the whole input. A failed match also reports Length == 0, which would
// let an empty input pass the length comparison, so Success is checked as well.
if (!m.Success || m.Length != str.Length) {
    cssParserTraceSwitch.WriteLine("Unrecognized selector: " + str);
    return null;
}

string tagName = m.Groups["Tag"].Value;

// Map the captured pseudo-class keyword (empty when the optional group did not
// participate in the match) to the corresponding enum value.
string pseudoClassString = m.Groups["PseudoClass"].Value;
CssPseudoClass pseudoClass;
if (pseudoClassString.IsEmpty()) {
    pseudoClass = CssPseudoClass.None;
} else {
    // Invariant lower-casing keeps the keyword lookup independent of the
    // current thread culture.
    switch (pseudoClassString.ToLowerInvariant()) {
        case "link":
            pseudoClass = CssPseudoClass.Link;
            break;
        case "visited":
            pseudoClass = CssPseudoClass.Visited;
            break;
        case "hover":
            pseudoClass = CssPseudoClass.Hover;
            break;
        case "active":
            pseudoClass = CssPseudoClass.Active;
            break;
        case "before":
            pseudoClass = CssPseudoClass.Before;
            break;
        case "after":
            pseudoClass = CssPseudoClass.After;
            break;
        case "first-line":
            pseudoClass = CssPseudoClass.FirstLine;
            break;
        case "first-letter":
            pseudoClass = CssPseudoClass.FirstLetter;
            break;
        default:
            cssParserTraceSwitch.WriteLine("Unrecognized selector: " + str);
            return null;
    }
}

// "Classes" is captured as ".a.b.c": drop the leading '.' and split on the rest.
string cssClassesString = m.Groups["Classes"].Value;
string[] cssClasses = cssClassesString.IsEmpty() ? EmptyArray<string>.Instance : cssClassesString.Substring(1).Split('.');
allCssClasses.AddRange(cssClasses);

// The tag regex accepts upper-case letters, so the tag is lower-cased here.
// ToLowerInvariant avoids culture-specific mappings (for example the Turkish
// dotless i) that ToLower() would apply to ASCII tag names.
return new CssSelectorElement(
    tagName.ToLowerInvariant(),
    cssClasses,
    m.Groups["Id"].Value,
    pseudoClass);

我的第一次尝试产生了这个:

/// Pseudo-class suffix of a selector element, declared as an enum with
/// explicit integer values. `None` is the value used when a selector element
/// has no pseudo-class suffix.
type CssPseudoClass =
    | None = 0
    | Link = 1
    | Visited = 2
    | Hover = 3
    | Active = 4
    | Before = 5
    | After = 6
    | FirstLine = 7
    | FirstLetter = 8

/// One parsed selector element: an optional tag, an optional id, any number of
/// class names and an optional pseudo-class.
type CssSelectorElement =
    { /// Tag name; the empty string in the default value.
      Tag : string
      /// Id; the empty string in the default value.
      Id : string
      /// Class names; the empty list in the default value.
      Classes : string list
      /// Pseudo-class; CssPseudoClass.None in the default value.
      PseudoClass : CssPseudoClass }

    /// Element with every field "empty"; used as the base for
    /// `{ CssSelectorElement.Default with ... }` copy-and-update expressions.
    static member Default =
        { Tag = ""
          Id = ""
          Classes = []
          PseudoClass = CssPseudoClass.None }

open FParsec

/// Alias for FParsec's `spaces` parser.
let ws = spaces

/// Alias for FParsec's `skipString`: matches a literal string and returns unit.
let str = skipString

/// Matches the literal `text` and returns `result`.
/// Uses FParsec's built-in `stringReturn` rather than composing `skipString`
/// with `preturn`; the parameter is named `text` so it no longer shadows the
/// `str` alias defined above.
let strWithResult text result = stringReturn text result

/// Parses an identifier and returns the complete text that was consumed.
///
/// Accepted input: an optional leading '-', then one character that is a
/// letter or '-', then any number of letters, digits, '_' or '-'.
/// A failure of the character run is reported with the label "identifier".
let identifier =
    let isIdentifierFirstChar c = isLetter c || c = '-'
    let isIdentifierChar c = isLetter c || isDigit c || c = '_' || c = '-'
    // `skipped` returns every char the inner parser consumed. Sequencing with
    // `>>.` alone would discard the optional leading '-' ("-foo" would come
    // back as "foo"), and the result length would no longer equal the number
    // of chars consumed.
    skipped (optional (str "-")
             >>. skipMany1Satisfy2L isIdentifierFirstChar isIdentifierChar "identifier")

/// Unwraps an optional string, substituting the empty string for `None`.
let stringFromOptional strOption =
    defaultArg strOption ""

/// Unwraps an optional pseudo-class, substituting CssPseudoClass.None for `None`.
let pseudoClassFromOptional pseudoClassOption =
    defaultArg pseudoClassOption CssPseudoClass.None

/// Parser for a single CSS selector element. Grammar:
///     (tag? id | tag? classes | tag) pseudoClass?
/// The result is a CssSelectorElement built from CssSelectorElement.Default.
let parseCssSelectorElement =
    let tag = identifier <?> "tagName"
    let id = str "#" >>. identifier <?> "#id"
    let classes = many1 (str "." >>. identifier) <?> ".className"

    // Pseudo-class keywords, tried in the order listed.
    let parseCssPseudoClass =
        let keywords =
            [ "link", CssPseudoClass.Link
              "visited", CssPseudoClass.Visited
              "hover", CssPseudoClass.Hover
              "active", CssPseudoClass.Active
              "before", CssPseudoClass.Before
              "after", CssPseudoClass.After
              "first-line", CssPseudoClass.FirstLine
              "first-letter", CssPseudoClass.FirstLetter ]
        choiceL (keywords |> List.map (fun (keyword, value) -> strWithResult keyword value))
                "pseudo-class"

    // tag? id
    let tagWithId =
        pipe2 (opt tag) id (fun tagOpt idText ->
            { CssSelectorElement.Default with
                Tag = stringFromOptional tagOpt
                Id = idText })

    // tag? classes
    let tagWithClasses =
        pipe2 (opt tag) classes (fun tagOpt classNames ->
            { CssSelectorElement.Default with
                Tag = stringFromOptional tagOpt
                Classes = classNames })

    // tag
    let tagOnly =
        tag |>> (fun tagText -> { CssSelectorElement.Default with Tag = tagText })

    // Alternatives are tried in order. The `attempt` wrappers backtrack when an
    // alternative fails after consuming input (e.g. after the optional tag), so
    // the next alternative starts from the same position. The nesting spells out
    // a left-to-right chain of `|> attempt` and `<|>` at equal precedence.
    let selectorCore = attempt (attempt tagWithId <|> tagWithClasses) <|> tagOnly

    // Optional ":pseudo-class" suffix.
    let pseudoSuffix = attempt (opt (str ":" >>. parseCssPseudoClass))

    pipe2 selectorCore pseudoSuffix (fun selectorElem pseudoClass ->
        { selectorElem with PseudoClass = pseudoClassFromOptional pseudoClass })

但我并不喜欢它最终的样子。我原本期望得到更容易理解的代码,但解析 (tag?id | tag?classes | tag)pseudoClass? 的那一部分用了好几个pipe2和attempt,实在很难看。

有更多FParsec经验的人能否指点一下更好的实现方法?我也在考虑改用FsLex/FsYacc或Boost.Spirit来代替FParsec,看看能否用它们写出更好的代码。

2 个答案:

答案 0 :(得分:5)

您可以将该复杂解析器的某些部分提取到变量中,例如:

/// Parses `tag? id` and builds a selector element carrying the tag (the empty
/// string when the tag is absent) and the id.
let tagid =
    let build tagOpt idText =
        { CssSelectorElement.Default with
            Tag = stringFromOptional tagOpt
            Id = idText }
    pipe2 (opt tag) id build

您也可以尝试使用applicative interface,我个人觉得它比pipe2更容易使用和思考:

// Applicative-style formulation of the `tag? id` parser: the record-building
// function comes first, followed by the parsers that supply its arguments.
// NOTE(review): `<!>` and `<*>` are not defined in this snippet; presumably
// they come from the applicative definitions the answer refers to, with `<!>`
// mapping the function over the first parser and `<*>` applying the result to
// the next parser. Confirm they are in scope and check their definitions.
let tagid =
    (fun tag id -> 
          { CssSelectorElement.Default with 
              Tag = stringFromOptional tag
              Id = id })
    <!> opt tag
    <*> id

答案 1 :(得分:4)

正如Mauricio所说,如果你发现自己在FParsec解析器中重复代码,你总是可以将公共部分分解为变量或自定义组合器。这是组合子库的一大优势。

但是,在这种情况下,您还可以通过重新组织语法来简化和优化解析器。例如,您可以将parseCssSelectorElement解析器的下半部分替换为:

// Base value that every alternative below updates.
let defSel = CssSelectorElement.Default

// "#id" on its own.
let pIdSelector = id |>> (fun idText -> { defSel with Id = idText })
// ".class1.class2..." on its own.
let pClassesSelector = classes |>> (fun classNames -> { defSel with Classes = classNames })

// Reorganised grammar: id | classes | tag (id | classes)?
// Each alternative starts with a different token ('#', '.', or a tag),
// so no `attempt` is used here.
let pSelectorMain =
    // What may follow a tag: an id, classes, or nothing (falls back to defSel).
    let pAfterTag = pIdSelector <|> pClassesSelector <|>% defSel
    let pTagged = pipe2 tag pAfterTag (fun tagStr sel -> { sel with Tag = tagStr })
    choice [ pIdSelector
             pClassesSelector
             pTagged ]

// Optional ":pseudo-class" suffix; the element is returned unchanged when absent.
let pPseudoSuffix = opt (str ":" >>. parseCssPseudoClass)

pipe2 pSelectorMain pPseudoSuffix (fun sel optPseudo ->
    match optPseudo with
    | Some pseudo -> { sel with PseudoClass = pseudo }
    | None -> sel)

顺便说一句,如果你想解析大量的字符串常量,那么使用基于字典的解析器会更有效,比如

/// Dictionary-based pseudo-class parser: reads one identifier and looks it up
/// in a keyword table instead of trying each keyword parser in turn.
///
/// If `identifier` fails, its status and error are passed through. If the
/// identifier is not a known keyword, the stream is moved back by the length of
/// the identifier and an "unknown pseudo class" error is returned.
let pCssPseudoClass : Parser<CssPseudoClass,unit> =
    let pseudoDict =
        dict [ "link", CssPseudoClass.Link
               "visited", CssPseudoClass.Visited
               "hover", CssPseudoClass.Hover
               "active", CssPseudoClass.Active
               "before", CssPseudoClass.Before
               "after", CssPseudoClass.After
               "first-line", CssPseudoClass.FirstLine
               "first-letter", CssPseudoClass.FirstLetter ]
    fun stream ->
        let identReply = identifier stream
        if identReply.Status = Ok then
            match pseudoDict.TryGetValue(identReply.Result) with
            | true, pseudo -> Reply(pseudo)
            | _ ->
                // Move back to the start of the unrecognised name.
                // Assumes `identifier` returns exactly the characters it
                // consumed, so the result length is the distance to rewind -
                // verify against the `identifier` definition in use.
                stream.Skip(-identReply.Result.Length)
                Reply(Error, messageError "unknown pseudo class")
        else
            Reply(identReply.Status, identReply.Error)