使用$ first删除重复的文档

时间:2018-12-17 08:21:48

标签: mongodb

我以前写过一个问题,但是我认为我不能很好地解释它,因此没有回答。我正在再次尝试,这样我就可以尝试并解决问题。

我有一个可以运行的聚合脚本,但它不能解决重复项。还有一个问题,我想计算文档数量,但是由于$ unwind,我得到的计数远远高于真实值。

我正在共享脚本,并注释掉有问题的部分,这些示例文档是相同的事务,但是在不同的时间和日期组成,到目前为止,最终结果与脚本处于当前状态有关,我希望得到。

首先聚合脚本:

db.getCollection("9SP_Data").aggregate([



// find documents of a type and within a number range

{"$match" : {"_id.object_category" : "revenue-transaction"

        ,"_id.transaction_date": {

            $gte: 20160101000000,

            $lt: 20170101000000

            },

}},



/*

// sort into order so that if duplicates, the new document listed at top

{$sort : { "_id.connection":1,

       "_id.company":1,

       "_id.transaction_reference":1, 

       "object_creation_date": -1 }},



// only use first record to avoid duplicate records

{$group : { "_id" : ["$_id.transaction_refererence", "$_id.connection","$_id.object_origin","$_id.object_origin_category", "$_id.transaction_date"],

        "transaction_status" : {$first: "$_id.transaction_status"},

        "line_items" : {$first: "$line_items"}}},

*/



    {"$unwind" :  "$line_items"},

    {"$match"  :  {"line_items.item_category":"sales-revenue"}},

    {"$group" : {

       "_id":

           {

            "company" : "$_id.connection",

            "sum_by_date":  {$trunc:{ $divide: ["$_id.transaction_date", 100000000 ]}},

            //  10000000000 - by year

            //  100000000 - by month

            //  1000000 - by date

            //  10000 - by hour

            //  100 - by minute

            "category" : "$line_items.item_category",

            "origin_category" : "$_id.object_origin_category",

            "object_origin_type" : "$_id.object_origin_type",

            "object_origin" : "$_id.object_origin"

           },

        "metric_value"  : { $sum: "$line_items.item_net_total_value" },



        // count number of documents (I think this is counting line_items but I need number of distinct documents by _id.transaction_reference)

        "metric_volume":{$sum:1}}

},



// format the output to include the following values

{$project : {

"_id.company"               : "$_id.company",

"_id.metric_name"           : {$literal : "revenue"},

"_id.metric_category"       : {$literal : "sales"},

"_id.metric_type"           : {$literal : "month"},

"_id.metric_lookup"         : "$_id.sum_by_date",

"_id.object_origin_category": "$_id.origin_category",

"_id.object_origin_type"    : "$_id.object_origin_type",

"_id.object_origin"         : "$_id.object_origin",

"metric_value"              : "$metric_value",

"metric_volume"             : "$metric_volume"

}}

])

目前的结果如下:

/* 1 */
{
"_id" : {
    "company" : "2f758916-2eb1-4d95-a3bb-7d6258bc2143",
    "metric_name" : "revenue",
    "metric_category" : "sales",
    "metric_type" : "month",
    "metric_lookup" : 201602,
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend"
},
"metric_value" : 403.3333,
"metric_volume" : 6
},

/* 2 */
{
"_id" : {
    "company" : "2f758916-2eb1-4d95-a3bb-7d6258bc2143",
    "metric_name" : "revenue",
    "metric_category" : "sales",
    "metric_type" : "month",
    "metric_lookup" : 201609,
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend"
},
"metric_value" : 370,
"metric_volume" : 5
},

/* 3 */
{
"_id" : {
    "company" : "2f758916-2eb1-4d95-a3bb-7d6258bc2143",
    "metric_name" : "revenue",
    "metric_category" : "sales",
    "metric_type" : "month",
    "metric_lookup" : 201601,
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend"
},
"metric_value" : 1140.0001,
"metric_volume" : 18
},

/* 4 */
{
"_id" : {
    "company" : "cb1c4a56-1544-4e9d-a433-abb33429a300",
    "metric_name" : "revenue",
    "metric_category" : "sales",
    "metric_type" : "month",
    "metric_lookup" : 201605,
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend"
},
"metric_value" : 2618.1821,
"metric_volume" : 20
}

一旦我通过$ sort和$ group获得重复句柄,我希望在以下两个文档中,如上所述的$ project可能仅使用通过$ first函数使用的最新记录。

{
"_id" : {
    "connection" : "cb1c4a56-1544-4e9d-a433-abb33429a300",
    "transaction_date" : 20171129170558,
    "transaction_date_utc" : "2017-11-29 17:05:58",
    "object_class" : "goods-service-transaction",
    "object_category" : "revenue-transaction",
    "object_type" : "receipt",
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend",
    "transaction_status" : "OPEN",
    "related_reference" : "85"
},
"object_creation_date" : "20181210010904",
"party_identifier" : "WALKIN",
"staff_identifier" : "02dcd191-ae2b-11e6-f485-7967ed9c6343",
"staff_name" : "uat1@9spokes.com",
"line_items" : [
    {
    "item_name" : "Summer Dress / 10",
    "item_system_id" : "02dcd191-ae20-11e6-f485-7967ee5a21ee",
    "item_identifier" : "10017",
    "item_category" : "sales-revenue",
    "item_type" : "goods-service",
    "item_quantity" : 1,
    "item_net_unit_sale_value" : 102.2727,
    "item_net_unit_discount_value" : 0,
    "item_unit_tax_value" : 11.3636,
    "item_net_total_value" : 102.2727,
    "item_total_tax_value" : 11.36364,
    "item_total_gross_value" : 113.63636
},
    {
    "item_name" : "Dress Shirt / Polyester / Medium",
    "item_system_id" : "02dcd191-ae20-11e6-f485-7967eee35001",
    "item_identifier" : "10023",
    "item_category" : "sales-revenue",
    "item_type" : "goods-service",
    "item_quantity" : 1,
    "item_net_unit_sale_value" : 61.3636,
    "item_net_unit_discount_value" : 0,
    "item_unit_tax_value" : 6.8182,
    "item_net_total_value" : 61.3636,
    "item_total_tax_value" : 6.81818,
    "item_total_gross_value" : 68.18182
}
]
},
{
"_id" : {
    "connection" : "cb1c4a56-1544-4e9d-a433-abb33429a300",
    "transaction_date" : 20171129170558,
    "transaction_date_utc" : "2017-11-29 17:05:58",
    "object_class" : "goods-service-transaction",
    "object_category" : "revenue-transaction",
    "object_type" : "receipt",
    "object_origin_category" : "point-of-sale",
    "object_origin_type" : "offline",
    "object_origin" : "vend",
    "transaction_status" : "CLOSED",
    "related_reference" : "85"
},
"object_creation_date" : "20181210120904",
"party_identifier" : "WALKIN",
"staff_identifier" : "02dcd191-ae2b-11e6-f485-7967ed9c6343",
"staff_name" : "uat1@9spokes.com",
"line_items" : [
    {
    "item_name" : "Summer Dress / 10",
    "item_system_id" : "02dcd191-ae20-11e6-f485-7967ee5a21ee",
    "item_identifier" : "10017",
    "item_category" : "sales-revenue",
    "item_type" : "goods-service",
    "item_quantity" : 1,
    "item_net_unit_sale_value" : 102.2727,
    "item_net_unit_discount_value" : 0,
    "item_unit_tax_value" : 11.3636,
    "item_net_total_value" : 102.2727,
    "item_total_tax_value" : 11.36364,
    "item_total_gross_value" : 113.63636
},
    {
    "item_name" : "Dress Shirt / Polyester / Medium",
    "item_system_id" : "02dcd191-ae20-11e6-f485-7967eee35001",
    "item_identifier" : "10023",
    "item_category" : "sales-revenue",
    "item_type" : "goods-service",
    "item_quantity" : 1,
    "item_net_unit_sale_value" : 61.3636,
    "item_net_unit_discount_value" : 0,
    "item_unit_tax_value" : 6.8182,
    "item_net_total_value" : 61.3636,
    "item_total_tax_value" : 6.81818,
    "item_total_gross_value" : 68.18182
}
]
}

我希望从201812的transaction_date起,该组的总和中仅使用最后一个示例文档(基于object_creation_date)。

我还希望metric_volume能够计数1个文档而不是2个,这是每个$ unwind的line_items的数量。

感谢您的帮助。马特

0 个答案:

没有答案