在scala中解析wiki样式标记的正确方法是什么?

时间:2013-12-26 11:11:30

标签: scala parsing

我正在尝试解析wiki标记,例如:

*bold text*, /italics/, [[www.example.com][Title]] etc,

创建

粗体文字、斜体、Title(链接)等,

并试图找到解决这个问题的最佳方法。我一开始把字符串转换为列表,然后逐个字符地递归解析,这对基本格式是可行的,但我现在想扩展语法以支持段落和表格,单遍扫描似乎已经不够用了。

我看过一些关于解析器(Parsers)和组合子(combinators)的资料,但还没有找到能说明如何得到格式化文本的资源,尤其是在标记相互嵌套的情况下(wiki标记往往如此)。正则表达式似乎效率很低,而先逐行解析、再逐字符解析,似乎比我现在已经错综复杂的尝试还要糟糕。我知道“我该怎么做?”这样的提问不太好,但我实在没有思路了。哪怕只是一个简单的例子,我也将不胜感激。

编辑:好吧,这是我能做到的最好程度了——我觉得正则表达式一开始就太别扭,最终写成了这样......

/**
 * Converts one line of wiki markup into an HTML fragment by consuming it character by
 * character.
 *
 * The function is a small recursive state machine driven by `(matching, toGo)`:
 *
 *  - `matching == Some('$')` is the start-of-line state (this is what `toHTML` passes in).
 *    Whole-line constructs are recognised here: indented list items (`*`, `#`), indented
 *    headings (`h1`..`h6`) and table rows (a line starting with `|`).
 *  - `matching == Some(m)` means the opening marker `m` has been seen and `current` holds the
 *    text collected since then, waiting for the closing marker.
 *  - `matching == None` means plain text is being collected.
 *
 * NOTE(review): text is copied into the HTML without escaping; if the markup comes from
 * untrusted users this allows HTML injection. Confirm whether escaping happens elsewhere.
 *
 * @param toGo     characters of the line that have not been consumed yet
 * @param past     output produced so far, stored in reverse order
 * @param current  characters collected since the last marker, stored in reverse order
 * @param matching the marker currently awaiting its closing counterpart, `Some('$')` at the
 *                 start of a line, or `None` when no marker is open
 * @return the HTML for the line and the name of the container element it belongs to
 *         (`"ul"`, `"ol"`, `"table"`, `"p"`, or `""` for headings, which need no container)
 */
private def parse(toGo: List[Char], past: List[Char], current: List[Char], matching: Option[Char]) : (String, String) = {
  (matching, toGo) match {
    // ---- Whole-line constructs: only considered in the start-of-line state ----

    // A line indented by at least three spaces may be a list item or a heading.
    case (Some('$'), line) if line.startsWith(List(' ', ' ', ' ')) =>
      line.dropWhile(_ == ' ') match {
        case '*' :: item => ("<li>" + parse(item, Nil, Nil, None)._1 + "</li>", "ul")
        case '#' :: item => ("<li>" + parse(item, Nil, Nil, None)._1 + "</li>", "ol")
        case 'h' :: level :: heading if level >= '1' && level <= '6' =>
          val tag = "h" + level
          ("<" + tag + ">" + parse(heading, Nil, Nil, None)._1 + "</" + tag + ">", "")
        // Not a recognised construct: parse the untouched line (indentation included) as text.
        case _ => parse(line, Nil, Nil, None)
      }

    // A line starting with '|' is a table row; every further '|' starts a new cell, so a
    // trailing '|' yields a final empty cell.
    case (Some('$'), '|' :: cells) =>
      // Built right-to-left with prepends only, so splitting is linear in the line length.
      val cols = cells.foldRight(List(List.empty[Char])) {
        case ('|', acc) => Nil :: acc
        case (c, cell :: done) => (c :: cell) :: done
        case (c, Nil) => List(List(c)) // unreachable: the accumulator is never empty
      }
      val row = cols.map(col => "<td>" + parse(col, Nil, Nil, None)._1 + "</td>").mkString
      ("<tr>" + row + "</tr>", "table")

    // Any other line: leave the start-of-line state and parse it as inline text.
    case (Some('$'), _) => parse(toGo, Nil, Nil, None)

    // ---- Article links, opened by ':' ----

    // ":<name>:<label>" - the second ':' separates the article name from the link label,
    // which extends for as long as `wikiTitle` accepts the characters.
    case (Some(':'), ':' :: t) =>
      val foundArticle = Article.getArticleByName(current.reverse.mkString)
      val (title, rest) = t.span(c => wikiTitle(c))
      // NOTE(review): an id of -1 presumably marks an article that does not exist yet, hence
      // the link to its edit page - confirm against Article.getArticleByName.
      val link: String = foundArticle.id match {
        case -1 => "<a href=\"edit/" + foundArticle.title + "\"> " + title.mkString + "</a>"
        case _ => "<a href=\"" + foundArticle.title + "\"> " + title.mkString + "</a>"
      }
      parse(rest, link.toList.reverse ::: past, Nil, None)

    // ":<name>" ended by a character `wikiTitle` rejects: the article title is the label and
    // the terminating character is kept right after the link.
    case (Some(':'), c :: t) if !wikiTitle(c) =>
      val foundArticle = Article.getArticleByName(current.reverse.mkString)
      val link: String = foundArticle.id match {
        case -1 => "<a href=\"edit/" + foundArticle.title + "\"> " + foundArticle.title + "</a>" + c
        case _ => "<a href=\"" + foundArticle.title + "\"> " + foundArticle.title + "</a>" + c
      }
      parse(t, link.toList.reverse ::: past, Nil, None)

    // ---- Paired inline markers ----

    // Closing marker found: wrap the collected text according to the marker.
    case (Some(x), y :: t) if x == y =>
      // Evaluated only by the branches that use it, so '~' (verbatim) content is not parsed.
      def inner: String = parse(current.reverse, Nil, Nil, None)._1
      val wrapped = x match {
        case '*' => "<span style=\"font-weight:bold;\">" + inner + "</span>"
        case '/' => "<span style=\"font-style:italic\">" + inner + "</span>"
        case '_' => "<span style=\"text-decoration:underline\">" + inner + "</span>"
        case '-' => "<span style=\"text-decoration:line-through\">" + inner + "</span>"
        case '~' => "<pre>" + current.reverse.mkString + "</pre>"
        case _ => inner
      }
      parse(t, wrapped.toList.reverse ::: past, Nil, None)

    // Still inside an open marker: keep collecting.
    case (Some(x), y :: t) => parse(t, past, y :: current, Some(x))

    // Input ended before the marker was closed: emit the marker literally and re-parse the
    // collected text as ordinary content.
    case (Some(x), Nil) => parse(current.reverse, x :: past, Nil, None)

    // ---- Plain text ----

    // A markup character opens a new marker; the text collected so far is flushed to `past`.
    case (None, w :: t) if wikiMarkup(w) => parse(t, current ::: past, Nil, Some(w))
    case (None, h :: t) => parse(t, past, h :: current, None)

    // End of input: both buffers are stored reversed, so restore reading order.
    case (None, Nil) => ((past.reverse ::: current.reverse).mkString, "p")
  }
}

/**
 * Joins the per-line results of `parse` into one HTML string, opening and closing the
 * container element (`ul`, `ol`, `table`, `p`) whenever the tag changes between lines.
 *
 * Rules applied to each `(html, tag)` pair, where `open` is the tag of the previous line:
 *
 *  - if `tag` differs from `open`, the previous container (if any) is closed and the new one
 *    (if any) is opened; an empty tag means the line needs no container;
 *  - the line's HTML is appended;
 *  - lines inside a `p` container are followed by `<br/>`.
 *
 * After the last line the container that is still open, if any, is closed as well.
 *
 * @param parsedInfo `(html, tag)` pairs in document order; consumed exactly once
 * @return the concatenated HTML with balanced container tags
 */
private def parsedStringBuilder(parsedInfo: Iterator[(String, String)]): String = {
  val out = new StringBuilder
  val lastOpen = parsedInfo.foldLeft("") { case (open, (html, tag)) =>
    if (tag != open) {
      // Close using the PREVIOUS tag; the state is only advanced once the line is emitted.
      if (open.nonEmpty) out.append("</").append(open).append(">")
      if (tag.nonEmpty) out.append("<").append(tag).append(">")
    }
    out.append(html)
    if (tag == "p") out.append("<br/>")
    tag
  }
  // Without this the final list/table/paragraph would be left unclosed.
  if (lastOpen.nonEmpty) out.append("</").append(lastOpen).append(">")
  out.toString
}

/**
 * Renders the wiki markup held in `content` as HTML.
 *
 * Each line is parsed on its own, starting in the start-of-line state (`Some('$')`), and the
 * per-line results are then stitched together by `parsedStringBuilder`.
 *
 * @return the HTML for the whole of `content`
 */
def toHTML: String = {
  // `linesIterator` instead of `lines`: on JDK 11+ `lines` resolves to java.lang.String#lines,
  // which returns a java.util.stream.Stream rather than a Scala Iterator.
  // NOTE(review): assumes `content` is a String - confirm against its declaration.
  val parsedLines = content.linesIterator.map(line => parse(line.toList, List(), List(), Some('$')))
  parsedStringBuilder(parsedLines)
}

代码也许还可以再整理一下,有些地方可能不够清晰,但无论如何——肯定有更简洁的写法吧?

0 个答案:

没有答案