全文检索耗时太长

时间:2014-12-22 11:03:36

标签: mongodb

到目前为止,我的数据库有2583000个文档。它是静态的(我们不经常插入新文档)。

这是一份示例文档:

{
    "_id" : ObjectId("5492d118426f72116a436a0c"),
    "dst" : "string",
    "n" : "All the Light We Cannot See",
    "su" : "some string...",
    "detail" : {
        "_id" : ObjectId("982736408170649823746"),
        "au" : [ 
            "string"
        ],
        "auu" : [ 
            "some string..."
        ],
        "be" : null,
        "bf" : "some string...",
        "bp" : 531,
        "ch" : [],
        "ela" : "English",
        "fp" : null,
        "g" : [ 
            "Historical Fiction", 
            "Fiction", 
            "Book Club", 
            "History", 
            "World War II", 
            "War", 
            "Cultural", 
            "France", 
            "Adult", 
            "Adult Fiction", 
            "Literary Fiction", 
            "Cultural", 
            "Germany"
        ],
        "i13" : "some string...",
        "ior" : "some string...",
        "isb" : "some string...",
        "ism" : "some string...",
        "ist" : "some string...",
        "k" : "book",
        "law" : [ 
            "some string here"
        ],
        "oec" : 17,
        "oeu" : "some string",
        "ot" : "All The Light We Cannot See",
        "ott" : {
            "English" : "All the Light We Cannot See",
            "German" : "Alles Licht, das wir nicht sehen"
        },
        "pu" : "Published May 6th 2014 by Scribner",
        "r" : 4.18,
        "rc" : 27738,
        "rvc" : 4220,
        "stt" : [ 
            "Paris"
        ],
        "sy" : "sime text here...",
        "tad" : null,
        "tau" : null,
        "updated_at" : ISODate("2014-11-03T11:36:15.436Z")
    },
    "sv" : "1",
    "updated_at" : ISODate("2014-12-18T13:05:28.190Z"),
    "created_at" : ISODate("2014-12-18T13:05:28.190Z")
}

我创建的文本(text)索引:

db.items.ensureIndex(

{"n": "text", "detail.est" : "text", "detail.ot" : "text"}, {default_language: "none"}

)

请注意,并非所有文档都包含创建索引时用到的全部字段。另外,我使用 default_language: "none",因为文本搜索需要同时覆盖多种不同语言的内容。

几天前我的文档数量只有现在的一半,那时搜索速度很快。而现在,下面这些结果在我看来很奇怪,有些地方我不太明白:

AND 搜索

  • 时间:138秒

  • 查询:db.items.find({$text: {$search: "\"all\" \"the\" \"light\" \"we\" \"cannot\" \"see\""}})

解释:

{
    "cursor" : "TextCursor",
    "n" : 2,
    "nscannedObjects" : 720806,
    "nscanned" : 733110,
    "nscannedObjectsAllPlans" : 720806,
    "nscannedAllPlans" : 733110,
    "scanAndOrder" : false,
    "nYields" : 13395,
    "nChunkSkips" : 0,
    "millis" : 161997,
    "server" : "myserver.com",
    "filterSet" : false,
    "stats" : {
        "type" : "TEXT",
        "works" : 1453924,
        "yields" : 13395,
        "unyields" : 13395,
        "invalidates" : 0,
        "advanced" : 2,
        "needTime" : 1453921,
        "needFetch" : 0,
        "isEOF" : 1,
        "keysExamined" : 733110,
        "fetches" : 720806,
        "parsedTextQuery" : {
            "terms" : [ 
                "all", 
                "the", 
                "light", 
                "we", 
                "cannot", 
                "see"
            ],
            "negatedTerms" : [],
            "phrases" : [ 
                "all", 
                "the", 
                "light", 
                "we", 
                "cannot", 
                "see"
            ],
            "negatedPhrases" : []
        },
        "children" : []
    }
}

OR 搜索

  • 时间:4.4秒

  • 查询:db.items.find({$text: {$search: "all the light we cannot see"}})

解释:

{
    "cursor" : "TextCursor",
    "n" : 720806,
    "nscannedObjects" : 720806,
    "nscanned" : 733110,
    "nscannedObjectsAllPlans" : 720806,
    "nscannedAllPlans" : 733110,
    "scanAndOrder" : false,
    "nYields" : 12317,
    "nChunkSkips" : 0,
    "millis" : 127775,
    "server" : "myserver.com",
    "filterSet" : false,
    "stats" : {
        "type" : "TEXT",
        "works" : 1453924,
        "yields" : 12317,
        "unyields" : 12317,
        "invalidates" : 0,
        "advanced" : 720806,
        "needTime" : 733117,
        "needFetch" : 0,
        "isEOF" : 1,
        "keysExamined" : 733110,
        "fetches" : 720806,
        "parsedTextQuery" : {
            "terms" : [ 
                "all", 
                "the", 
                "light", 
                "we", 
                "cannot", 
                "see"
            ],
            "negatedTerms" : [],
            "phrases" : [],
            "negatedPhrases" : []
        },
        "children" : []
    }
}

精确搜索

  • 时间:0.06秒

  • 查询:db.items.find({"n": "All the Light We Cannot See"})

解释:

{
    "cursor" : "BtreeCursor n_1",
    "isMultiKey" : false,
    "n" : 2,
    "nscannedObjects" : 2,
    "nscanned" : 2583249,
    "nscannedObjectsAllPlans" : 2,
    "nscannedAllPlans" : 2583249,
    "scanAndOrder" : false,
    "indexOnly" : false,
    "nYields" : 20183,
    "nChunkSkips" : 0,
    "millis" : 9056,
    "indexBounds" : {
        "n" : [ 
            [ 
                "", 
                {}
            ], 
            [ 
                /All the Light We Cannot See/i, 
                /All the Light We Cannot See/i
            ]
        ]
    },
    "server" : "myserver.com",
    "filterSet" : false,
    "stats" : {
        "type" : "FETCH",
        "works" : 2583250,
        "yields" : 20183,
        "unyields" : 20183,
        "invalidates" : 0,
        "advanced" : 2,
        "needTime" : 2583247,
        "needFetch" : 0,
        "isEOF" : 1,
        "alreadyHasObj" : 0,
        "forcedFetches" : 0,
        "matchTested" : 0,
        "children" : [ 
            {
                "type" : "IXSCAN",
                "works" : 2583249,
                "yields" : 20183,
                "unyields" : 20183,
                "invalidates" : 0,
                "advanced" : 2,
                "needTime" : 2583247,
                "needFetch" : 0,
                "isEOF" : 1,
                "keyPattern" : "{ n: 1.0 }",
                "isMultiKey" : 0,
                "boundsVerbose" : "field #0['n']: [\"\", {}), [/All the Light We Cannot See/i, /All the Light We Cannot See/i]",
                "yieldMovedCursor" : 0,
                "dupsTested" : 0,
                "dupsDropped" : 0,
                "seenInvalidated" : 0,
                "matchTested" : 2,
                "keysExamined" : 2583249,
                "children" : []
            }
        ]
    }
}

正则表达式搜索

  • 时间:9.64秒

  • 查询:db.items.find({"n": /All the Light We Cannot See/i})

解释:

{
    "cursor" : "BtreeCursor n_1",
    "isMultiKey" : false,
    "n" : 2,
    "nscannedObjects" : 2,
    "nscanned" : 2583249,
    "nscannedObjectsAllPlans" : 2,
    "nscannedAllPlans" : 2583249,
    "scanAndOrder" : false,
    "indexOnly" : false,
    "nYields" : 20183,
    "nChunkSkips" : 0,
    "millis" : 9056,
    "indexBounds" : {
        "n" : [ 
            [ 
                "", 
                {}
            ], 
            [ 
                /All the Light We Cannot See/i, 
                /All the Light We Cannot See/i
            ]
        ]
    },
    "server" : "myserver.com0",
    "filterSet" : false,
    "stats" : {
        "type" : "FETCH",
        "works" : 2583250,
        "yields" : 20183,
        "unyields" : 20183,
        "invalidates" : 0,
        "advanced" : 2,
        "needTime" : 2583247,
        "needFetch" : 0,
        "isEOF" : 1,
        "alreadyHasObj" : 0,
        "forcedFetches" : 0,
        "matchTested" : 0,
        "children" : [ 
            {
                "type" : "IXSCAN",
                "works" : 2583249,
                "yields" : 20183,
                "unyields" : 20183,
                "invalidates" : 0,
                "advanced" : 2,
                "needTime" : 2583247,
                "needFetch" : 0,
                "isEOF" : 1,
                "keyPattern" : "{ n: 1.0 }",
                "isMultiKey" : 0,
                "boundsVerbose" : "field #0['n']: [\"\", {}), [/All the Light We Cannot See/i, /All the Light We Cannot See/i]",
                "yieldMovedCursor" : 0,
                "dupsTested" : 0,
                "dupsDropped" : 0,
                "seenInvalidated" : 0,
                "matchTested" : 2,
                "keysExamined" : 2583249,
                "children" : []
            }
        ]
    }
}

请注意,精确搜索是最快的。我不明白的是:既然搜索涉及的字段都已经建立了索引,为什么 AND 搜索反而要花更多时间。我的猜测是 MongoDB 文本索引没有针对 AND 搜索进行优化,而且即使是最简单的形式(OR 搜索),它也比常规的单字段索引慢得多。

这里有两个问题:

  • 有没有办法使用Mongodb Text Search改善这些搜索?
  • 我在创建索引时是否遗漏了什么,导致文本搜索无法按预期执行?

0 个答案:

没有答案