运行时间非常长的查询

时间:2015-01-27 12:48:55

标签: arangodb aql

这个查询是否可能没有被查询优化器很好地优化?有人能提示我如何改进它吗?目前它会一直运行数小时,直到我手动将其终止。

统计数据:

  • 集合中约有200万条小记录(需要2 GB RAM)
  • 有16个索引(需要1.2 GB RAM)

arangosh [SES]> db.AuditsSearch.figures()

{
  "alive" : {
    "count" : 1940004,
    "size" : 2052009624
  },
  "dead" : {
    "count" : 397017,
"size" : 431456792,
"deletion" : 52950
  },
  "datafiles" : {
    "count" : 20,
    "fileSize" : 2132549880
  },
  "journals" : {
    "count" : 1,
    "fileSize" : 67108864
  },
  "compactors" : {
    "count" : 1,
    "fileSize" : 256528080
  },
  "shapefiles" : {
    "count" : 0,
    "fileSize" : 0
  },
  "shapes" : {
    "count" : 1004,
    "size" : 1310704
  },
  "attributes" : {
    "count" : 65,
    "size" : 3408
  },
  "indexes" : {
    "count" : 16,
    "size" : 1198718256
  },
  "lastTick" : "14686717826252",
  "uncollectedLogfileEntries" : 0
}

查询:

// Backfill query: for documents in AuditsSearch whose `analytics` attribute
// is null, derive four values from existing attributes and store them in a
// new `analytics` sub-object on the same document.
FOR a IN AuditsSearch

  // split the task as it currently is not possible to execute at once:
  // Only documents for which IS_NULL(a.analytics) is true are processed.
  FILTER IS_NULL(a.analytics)
  // LIMIT offset, count: skip the first 200000 matches, take the next 200000.
  LIMIT 200000, 200000
  // end of split ... which also does not work

    // Values derived from the document's own attributes:
    // timestamp passed through DATE_TIMESTAMP
    LET utcTimestamp = DATE_TIMESTAMP(a.timestamp)
    // resultcount and duration passed through TO_NUMBER
    LET intNumResults = TO_NUMBER(a.resultcount)
    LET intDuration = TO_NUMBER(a.duration)
    // element at index 1 of docid split on "|"
    LET url = SPLIT(a.docid, "|")[1]

// Write the derived values back to the same collection that is being iterated.
UPDATE a WITH { "analytics": { "utcTimestamp": utcTimestamp, "duration": intDuration, "numResults": intNumResults, "url": url } } IN AuditsSearch

索引:

[
{"id":"AuditsSearch/0","type":"primary","unique":true,"fields":["_key"]},
{"id":"AuditsSearch/13943073289094","type":"hash","unique":false,"fields":["eventtype"]},
{"id":"AuditsSearch/13943144067974","type":"hash","unique":false,"fields":["profile"]},
{"id":"AuditsSearch/13943163138950","type":"hash","unique":false,"fields":["sessionid"]},
{"id":"AuditsSearch/13943169299334","type":"hash","unique":false,"fields":["resultid"]},
{"id":"AuditsSearch/13943195644806","type":"skiplist","unique":false,"fields":["duration"]},
{"id":"AuditsSearch/13947101328262","type":"skiplist","unique":false,"fields":["timestamp"]},
{"id":"AuditsSearch/14023678636934","type":"skiplist","unique":false,"fields":["analytics.utcTimestamp"]},
{"id":"AuditsSearch/14064254132425","type":"skiplist","unique":false,"fields":["resultcount"]},
{"id":"AuditsSearch/14101960466633","type":"skiplist","unique":false,"fields":["analytics.duration"]},
{"id":"AuditsSearch/14101968134345","type":"skiplist","unique":false,"fields":["analytics.numResults"]},
{"id":"AuditsSearch/14140104909001","type":"hash","unique":false,"fields":["analytics.url"]},
{"id":"AuditsSearch/14168504672457","type":"skiplist","unique":false,"fields":["sessionid"]},
{"id":"AuditsSearch/14168754823369","type":"skiplist","unique":false,"fields":["eventtype"]},
{"id":"AuditsSearch/14169726263497","type":"hash","unique":false,"fields":["isadmin"]},
{"id":"AuditsSearch/14169732554953","type":"hash","unique":false,"fields":["isdelegatedadmin"]}
]

执行计划:

{
  "plan": {
    "nodes": [{
      "type": "SingletonNode",
      "dependencies": [],
      "id": 1,
      "estimatedCost": 1,
      "estimatedNrItems": 1
    },
    {
      "type": "EnumerateCollectionNode",
      "dependencies": [1],
      "id": 2,
      "estimatedCost": 1704564,
      "estimatedNrItems": 1704563,
      "database": "SES",
      "collection": "AuditsSearch",
      "outVariable": {
        "id": 0,
        "name": "a"
      },
      "random": false
    },
    {
      "type": "CalculationNode",
      "dependencies": [2],
      "id": 3,
      "estimatedCost": 3409127,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "function call",
        "name": "IS_NULL",
        "subNodes": [{
          "type": "array",
          "subNodes": [{
            "type": "attribute access",
            "name": "analytics",
            "subNodes": [{
              "type": "reference",
              "name": "a",
              "id": 0
            }]
          }]
        }]
      },
      "outVariable": {
        "id": 5,
        "name": "5"
      },
      "canThrow": false
    },
    {
      "type": "FilterNode",
      "dependencies": [3],
      "id": 4,
      "estimatedCost": 5113690,
      "estimatedNrItems": 1704563,
      "inVariable": {
        "id": 5,
        "name": "5"
      }
    },
    {
      "type": "CalculationNode",
      "dependencies": [4],
      "id": 6,
      "estimatedCost": 6818253,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "function call",
        "name": "DATE_TIMESTAMP",
        "subNodes": [{
          "type": "array",
          "subNodes": [{
            "type": "attribute access",
            "name": "timestamp",
            "subNodes": [{
              "type": "reference",
              "name": "a",
              "id": 0
            }]
          }]
        }]
      },
      "outVariable": {
        "id": 1,
        "name": "utcTimestamp"
      },
      "canThrow": false
    },
    {
      "type": "CalculationNode",
      "dependencies": [6],
      "id": 7,
      "estimatedCost": 8522816,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "function call",
        "name": "TO_NUMBER",
        "subNodes": [{
          "type": "array",
          "subNodes": [{
            "type": "attribute access",
            "name": "resultcount",
            "subNodes": [{
              "type": "reference",
              "name": "a",
              "id": 0
            }]
          }]
        }]
      },
      "outVariable": {
        "id": 2,
        "name": "intNumResults"
      },
      "canThrow": false
    },
    {
      "type": "CalculationNode",
      "dependencies": [7],
      "id": 8,
      "estimatedCost": 10227379,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "function call",
        "name": "TO_NUMBER",
        "subNodes": [{
          "type": "array",
          "subNodes": [{
            "type": "attribute access",
            "name": "duration",
            "subNodes": [{
              "type": "reference",
              "name": "a",
              "id": 0
            }]
          }]
        }]
      },
      "outVariable": {
        "id": 3,
        "name": "intDuration"
      },
      "canThrow": false
    },
    {
      "type": "CalculationNode",
      "dependencies": [8],
      "id": 9,
      "estimatedCost": 11931942,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "indexed access",
        "subNodes": [{
          "type": "function call",
          "name": "SPLIT",
          "subNodes": [{
            "type": "array",
            "subNodes": [{
              "type": "attribute access",
              "name": "docid",
              "subNodes": [{
                "type": "reference",
                "name": "a",
                "id": 0
              }]
            },
            {
              "type": "value",
              "value": "|"
            }]
          }]
        },
        {
          "type": "value",
          "value": 1
        }]
      },
      "outVariable": {
        "id": 4,
        "name": "url"
      },
      "canThrow": false
    },
    {
      "type": "CalculationNode",
      "dependencies": [9],
      "id": 10,
      "estimatedCost": 13636505,
      "estimatedNrItems": 1704563,
      "expression": {
        "type": "object",
        "subNodes": [{
          "type": "object element",
          "name": "analytics",
          "subNodes": [{
            "type": "object",
            "subNodes": [{
              "type": "object element",
              "name": "utcTimestamp",
              "subNodes": [{
                "type": "reference",
                "name": "utcTimestamp",
                "id": 1
              }]
            },
            {
              "type": "object element",
              "name": "duration",
              "subNodes": [{
                "type": "reference",
                "name": "intDuration",
                "id": 3
              }]
            },
            {
              "type": "object element",
              "name": "numResults",
              "subNodes": [{
                "type": "reference",
                "name": "intNumResults",
                "id": 2
              }]
            },
            {
              "type": "object element",
              "name": "url",
              "subNodes": [{
                "type": "reference",
                "name": "url",
                "id": 4
              }]
            }]
          }]
        }]
      },
      "outVariable": {
        "id": 6,
        "name": "6"
      },
      "canThrow": false
    },
    {
      "type": "LimitNode",
      "dependencies": [10],
      "id": 5,
      "estimatedCost": 13836505,
      "estimatedNrItems": 200000,
      "offset": 0,
      "limit": 200000,
      "fullCount": false
    },
    {
      "type": "UpdateNode",
      "dependencies": [5],
      "id": 11,
      "estimatedCost": 14036505,
      "estimatedNrItems": 0,
      "inDocVariable": {
        "id": 6,
        "name": "6"
      },
      "database": "SES",
      "collection": "AuditsSearch",
      "modificationFlags": {
        "ignoreErrors": false,
        "waitForSync": false,
        "nullMeansRemove": false,
        "mergeObjects": true,
        "ignoreDocumentNotFound": false
      },
      "inKeyVariable": {
        "id": 0,
        "name": "a"
      }
    }],
    "rules": ["move-calculations-up",
    "move-filters-up",
    "move-calculations-up-2",
    "move-filters-up-2"],
    "collections": [{
      "name": "AuditsSearch",
      "type": "write"
    }],
    "variables": [{
      "id": 0,
      "name": "a"
    },
    {
      "id": 1,
      "name": "utcTimestamp"
    },
    {
      "id": 4,
      "name": "url"
    },
    {
      "id": 2,
      "name": "intNumResults"
    },
    {
      "id": 3,
      "name": "intDuration"
    },
    {
      "id": 6,
      "name": "6"
    },
    {
      "id": 5,
      "name": "5"
    }],
    "estimatedCost": 14036505,
    "estimatedNrItems": 0
  },
  "warnings": [],
  "stats": {
    "rulesExecuted": 19,
    "rulesSkipped": 0,
    "plansCreated": 1
  }
}

1 个答案:

答案 0 :(得分:1)

总结上述讨论的主要内容:

现在您应该使用 db._explain(<your aql query goes here>),而不是获取原始执行计划。在较新的 ArangoDB 版本中,这里所遇到的一些行为已经得到了改进。

  • 优化器的决策并不总是完美的,但它在不断改进。这里遇到的"LET 计算被提前到过滤器之前执行"的情况,现在已能被优化器检测并避免。因此,ArangoDB 团队始终乐于了解您的查询。
  • 如果您想知道复杂查询的哪些部分导致了过高的资源使用,那么将其拆分成若干部分并分别测量各部分的耗时,可能会很有用。
  • 有时使用临时volatile集合是有意义的。
  • 索引维护成本高昂。如果您计划移动大量文档,则删除索引并在之后重新创建它们可能是有意义的。
  • 非唯一(non-unique)哈希索引在选择性变差时可能会消耗大量性能;也就是说,如果存在大量哈希冲突或重复键,使用索引的开销在插入和查询时都会上升。
  • 非唯一哈希索引的插入成本为 O(1)(因此索引创建仍然很快),但对于更新/删除,它需要在具有相同键的那些元素中找到"正确"的一个。当有许多索引条目具有相同的键时,找到"正确"的条目需要进行 n 次比较(n 为具有相同键的条目数)。当然,唯一哈希索引没有这个问题,但含有许多相同键的非唯一哈希索引就会出现这个问题。
  • 索引数量较多时,上述索引维护的开销会随之累加。
  • truncate()涉及索引维护。您可能希望删除索引并重新创建它。