如何使RegEx更高效?

时间:2014-03-05 21:02:17

标签: java regex performance scala

在我的代码中，我用 Scala 的 match/case 对每一行做正则表达式匹配：

// Destructure `line` with the `regexp` extractor: the ten capture groups are bound in
// order, and the first one is passed through getHumanDate before LogLine is built.
line match {
    case regexp(unixTime, elapsedMs, remoteHost, codeStatus, byteCount, httpMethod, requestUrl, ident, peerInfo, contentType) =>
        LogLine(
            getHumanDate(unixTime),
            elapsedMs,
            remoteHost,
            codeStatus,
            byteCount,
            httpMethod,
            requestUrl,
            ident,
            peerInfo,
            contentType
        )
    // Any line the pattern does not match is rejected outright.
    case _ =>
        throw new IllegalArgumentException("Could not parse row: " + line)
}

我正在使用这种正则表达式模式。

// Log-line pattern with ten capture groups. The first group may be followed by any
// amount of whitespace; every later group is separated by exactly one space.
// The assembled pattern string is identical to the single-literal form.
val regexp = new scala.util.matching.Regex(
  """(\d{9,10}\.\d{3})""" + """\s*""" + Seq(     // unix_time: 9-10 digits, '.', 3 digits
    """(\d+)""",                                 // elapsed
    """(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})""",  // remotehost: dotted quad
    """(\w+\/\d+)""",                            // code_status: word '/' digits
    """(\d+)""",                                 // bytes
    """(\w+)""",                                 // method
    """(\S+)""",                                 // url
    """(\-)""",                                  // rfc931: a literal '-'
    """(\w+\/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|\w+\/\-)""", // peerstatus_peerhost
    """(\S+)"""                                  // file_type
  ).mkString(" ")
)

我想让它更高效。它能正常工作，但速度不够快，我认为还有改进的空间。

以下是一些需要匹配的示例日志行:

1393930710.739 278 192.168.1.20 TCP_MISS/200 5848 GET http://www.coderanch.com/templates/default/images/quote.gif - HIER_DIRECT/145.20.133.81 text/plain

1393930719.989 73 192.168.178.27 TCP_MEM_HIT/200 268805 GET http://sunny:8080/viewapp/classpath/jquery.js - HIER_NONE/- application/x-javascript

1393997284.209  59287 192.168.1.2 TCP_MISS/503 0 CONNECT 172.104.89.123:5228 - HIER_NONE/- -

3 个答案:

答案 0 :(得分:0)

由于你的目标是匹配整行，第一个可以做的改进是加上行首锚点 ^（这样在不匹配时模式会更快失败）：

^(\d{9,10}\.\d{3})\s*(\d+) (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) (\w+/\d+) (\d+) (\w+) (\S+) (-) (\w+/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|\w+/-) (\S+)

（我已经去掉了所有不必要的转义。）你还可以去掉不需要的捕获组（我不确定 (-) 这个组是否真的有用），并尝试在末尾加上行尾锚点 $。

答案 1 :(得分:0)

不确定你是否真的可以获得独特的正则表达式。如果你并不需要检查每个字段的格式，那么先用 .split(" ") 进行解析，再检查结果的长度，并且只校验那些可能格式错误的字段，可能会更快……

答案 2 :(得分:0)

用正则表达式匹配的话，你不太可能再快多少，因为即使只是按空白字符分割：

/** Splits a log line on runs of whitespace into its ten fields, without validating them.
  *
  * @param line one log line
  * @return the ten fields, in order, as a tuple of strings
  * @throws IllegalArgumentException if the split does not yield exactly ten elements;
  *                                  the message includes the offending line
  */
def split_apart(line: String) = line.split("""\s+""") match {
  case Array(unix_time, elapsed, remotehost, code_status, bytes, method, url, rfc931, peerstatus_peerhost, file_type) =>
    (unix_time, elapsed, remotehost, code_status, bytes, method, url, rfc931, peerstatus_peerhost, file_type)
  // Report which row was rejected instead of an anonymous failure.
  case _ => throw new IllegalArgumentException("Could not parse row: " + line)
}

占完整正则表达式匹配的60%的时间。

如果你完全确定必须在意这点性能，那就需要手写解析。像下面这样的代码大约快 6 倍（在 Java 6 上，substring 实际上不会复制字符串数据；Java 7 我没有测试过）：

/** Hand-written parser for one access-log line: splits the line into ten
  * whitespace-delimited fields and validates each one by scanning characters.
  *
  * Field layout: the first field starts at index 0; the second may be preceded by a run
  * of whitespace; every later field starts exactly one character after the previous one
  * ends. Anything after the tenth field is ignored.
  *
  * @param line one log line
  * @return the ten validated fields, in order, as a tuple of strings
  * @throws Exception with message "Could not parse '<field>' in <line>" when a field is
  *                   malformed or missing (a missing field is reported as '')
  */
def parse(line: String) = {
  def fail(s: String) = throw new Exception("Could not parse '"+s+"' in "+line)

  def isWordChar(c: Char) = c.isLetterOrDigit || c == '_'

  // Index just past the run of digits that starts at `from`.
  def skipDigits(s: String, from: Int): Int = {
    var i = from
    while (i < s.length && s(i).isDigit) i += 1
    i
  }

  // Index just past the run of word characters that starts at `from`.
  def skipWord(s: String, from: Int): Int = {
    var i = from
    while (i < s.length && isWordChar(s(i))) i += 1
    i
  }

  // True when s, from `from` to its end, is four groups of 1-3 digits joined by '.'.
  // The last group is length-checked like the others and must end the string.
  def isDottedQuad(s: String, from: Int): Boolean = {
    var pos = from
    var group = 0
    var ok = true
    while (ok && group < 4) {
      val end = skipDigits(s, pos)
      val digits = end - pos
      val isLast = group == 3
      ok = digits >= 1 && digits <= 3 &&
        (if (isLast) end == s.length else end < s.length && s(end) == '.')
      pos = end + 1
      group += 1
    }
    ok
  }

  // Field 1: 9-10 digits, '.', 3 digits (13 or 14 characters in total).
  def checkA(s: String) = {
    val dot = s.length - 4
    if (s.length < 13 || s.length > 14 || s(dot) != '.') fail(s)
    if (skipDigits(s, 0) != dot || skipDigits(s, dot + 1) != s.length) fail(s)
    s
  }
  // Fields 2 and 5: one or more digits.
  def checkB(s: String) = {
    if (s.isEmpty || skipDigits(s, 0) != s.length) fail(s)
    s
  }
  // Field 3: dotted quad.
  def checkC(s: String) = {
    if (!isDottedQuad(s, 0)) fail(s)
    s
  }
  // Field 4: one or more word characters, '/', one or more digits.
  def checkD(s: String) = {
    val slash = skipWord(s, 0)
    // The bounds test comes first so s(slash) is only read when it exists.
    if (slash < 1 || slash + 1 >= s.length || s(slash) != '/') fail(s)
    if (skipDigits(s, slash + 1) != s.length) fail(s)
    s
  }
  def checkE(s: String) = checkB(s)
  // Field 6: one or more word characters.
  def checkF(s: String) = {
    if (s.isEmpty || skipWord(s, 0) != s.length) fail(s)
    s
  }
  // Fields 7 and 10: any non-whitespace text. A field can only contain whitespace in its
  // first position (the scanner starts looking for the end one character in), so that is
  // the only position that needs checking.
  def checkG(s: String) = {
    if (s.isEmpty || s(0).isWhitespace) fail(s)
    s
  }
  // Field 8: exactly "-".
  def checkH(s: String) = { if (s != "-") fail(s); s }
  // Field 9: one or more word characters, '/', then either "-" or a dotted quad.
  def checkI(s: String): String = {
    val slash = skipWord(s, 0)
    if (slash < 1 || slash + 1 >= s.length || s(slash) != '/') fail(s)
    val rest = slash + 1
    val isDash = s(rest) == '-' && rest + 1 == s.length
    if (!isDash && !isDottedQuad(s, rest)) fail(s)
    s
  }
  def checkJ(s: String) = checkG(s)

  // Exclusive end of the field starting at `start`. The character at `start` is taken as
  // part of the field unconditionally; a start at or past the end of the line means the
  // field is missing.
  def fieldEnd(start: Int): Int = {
    if (start >= line.length) fail("")
    var end = start + 1
    while (end < line.length && !line(end).isWhitespace) end += 1
    end
  }

  // Each field is validated as soon as it is delimited, so the first bad field is the one
  // that gets reported.
  var a1 = 0
  while (a1 < line.length && !line(a1).isWhitespace) a1 += 1
  val a = checkA(line.substring(0, a1))

  // Only the gap between the first two fields may be wider than one character.
  var b0 = a1 + 1
  while (b0 < line.length && line(b0).isWhitespace) b0 += 1
  val b1 = fieldEnd(b0)
  val b = checkB(line.substring(b0, b1))

  val c0 = b1 + 1
  val c1 = fieldEnd(c0)
  val c = checkC(line.substring(c0, c1))

  val d0 = c1 + 1
  val d1 = fieldEnd(d0)
  val d = checkD(line.substring(d0, d1))

  val e0 = d1 + 1
  val e1 = fieldEnd(e0)
  val e = checkE(line.substring(e0, e1))

  val f0 = e1 + 1
  val f1 = fieldEnd(f0)
  val f = checkF(line.substring(f0, f1))

  val g0 = f1 + 1
  val g1 = fieldEnd(g0)
  val g = checkG(line.substring(g0, g1))

  val h0 = g1 + 1
  val h1 = fieldEnd(h0)
  val h = checkH(line.substring(h0, h1))

  val i0 = h1 + 1
  val i1 = fieldEnd(i0)
  val i = checkI(line.substring(i0, i1))

  val j0 = i1 + 1
  val j1 = fieldEnd(j0)
  val j = checkJ(line.substring(j0, j1))

  (a, b, c, d, e, f, g, h, i, j)
}

但除非你真的非常在意这 6 倍的加速，否则不值得这样做：这样的代码维护起来就是一场噩梦。