Aggregate duplicate documents with values in array in Mongo

Question

I have a large collection of documents that look as follows:

{ "_id": "5a760191813a54000b8475f1", "orders": [{ "row": "3", "seat": "11" }, { "row": "3", "seat": "12" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75f6f17abe45000a3ba05e", "orders": [{ "row": "3", "seat": "12" }, { "row": "3", "seat": "13" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75ebdf813a54000b8475e7", "orders": [{ "row": "5", "seat": "16" }, { "row": "5", "seat": "15" }], "product_id": "5a75f711dbcc42000c459efc" }

I need to be able to find any documents where the product_id and items in the orders array are duplicates. I can't quite seem to wrap my head around accomplishing this. Any pointers?

By duplicates, do you mean documents with the same product_id that have at least one order in common? For example, above there is a duplicate because of { "row": "3", "seat": "12" }? And what output data would you like to get from the query? For example, would only the "_id" of the duplicates be enough, or do you want to keep all the document information? — Takis
– Takis, Commented Sep 8, 2021 at 11:22

Takis · Accepted Answer · 2021-09-08 15:18:46Z

I don't know what output you want, but this has the information about the duplicates, maybe you want to add unwind on duplicates also.

Result documents

product_id
order (that found duplicated)
duplicates (the documents that had that order as duplicate)

For your data would print

[{
  "duplicates": [
    "5a760191813a54000b8475f1",
    "5a75f6f17abe45000a3ba05e"
  ],
  "order": {
    "row": "3",
    "seat": "12"
  },
  "product_id": "5a7628bedbcc42000aa7f614"
}]

Query
(run it on your driver, MongoPlayground doesn't keep the order of fields and can show wrong results)

aggregate([
  // Emit one document per element of the "orders" array.
  { "$unwind": "$orders" },

  // Bucket by (order, product_id) and collect the _id of every unwound
  // document that falls into the bucket.
  {
    "$group": {
      "_id": { "orders": "$orders", "product_id": "$product_id" },
      "duplicates": { "$push": "$_id" }
    }
  },

  // Keep only the buckets that collected more than one _id.
  {
    "$match": {
      "$expr": { "$gt": [{ "$size": "$duplicates" }, 1] }
    }
  },

  // Lift the parts of the group key to top-level fields and drop the key itself.
  {
    "$project": {
      "_id": 0,
      "order": "$_id.orders",
      "product_id": "$_id.product_id",
      "duplicates": 1
    }
  }
])

Data (i added some more data)

[
  {
    "_id": "5a760191813a54000b8475f1",
    "orders": [
      {
        "row": "3",
        "seat": "11"
      },
      {
        "row": "3",
        "seat": "12"
      }
    ],
    "product_id": "5a7628bedbcc42000aa7f614"
  },
  {
    "_id": "5a75f6f17abe45000a3ba05g",
    "orders": [
      {
        "row": "3",
        "seat": "12"
      },
      {
        "row": "3",
        "seat": "13"
      }
    ],
    "product_id": "5a7628bedbcc42000aa7f614"
  },
  {
    "_id": "5a75f6f17abe45000a3ba05e",
    "orders": [
      {
        "row": "3",
        "seat": "12"
      },
      {
        "row": "3",
        "seat": "13"
      }
    ],
    "product_id": "5a7628bedbcc42000aa7f614"
  },
  {
    "_id": "5a75ebdf813a54000b8475e7",
    "orders": [
      {
        "row": "5",
        "seat": "16"
      },
      {
        "row": "5",
        "seat": "15"
      }
    ],
    "product_id": "5a75f711dbcc42000c459efc"
  }
]

Results

[{
  "duplicates": [
    "5a75f6f17abe45000a3ba05g",
    "5a75f6f17abe45000a3ba05e"
  ],
  "order": {
    "row": "3",
    "seat": "13"
  },
  "product_id": "5a7628bedbcc42000aa7f614"
},
{
  "duplicates": [
    "5a760191813a54000b8475f1",
    "5a75f6f17abe45000a3ba05g",
    "5a75f6f17abe45000a3ba05e"
  ],
  "order": {
    "row": "3",
    "seat": "12"
  },
  "product_id": "5a7628bedbcc42000aa7f614"
}]

s7vr · Accepted Answer · 2021-09-08 09:30:57Z

You could use the query below. $unwind the orders array, $group by order entry and product, and collect the matching ids and a count. Keep the groups where the count is greater than 1. Then $lookup to pull in the matching documents by id and $replaceRoot to flatten the documents.

db.collection.aggregate([
  // Emit one document per element of the "orders" array.
  { "$unwind": { "path": "$orders" } },

  // Bucket by (order, product_id); count the members and remember their ids.
  {
    "$group": {
      "_id": { "order": "$orders", "product_id": "$product_id" },
      "count": { "$sum": 1 },
      "doc_ids": { "$push": "$_id" }
    }
  },

  // Keep only the buckets with more than one member.
  { "$match": { "count": { "$gt": 1 } } },

  // Fetch the documents whose _id is listed in doc_ids.
  {
    "$lookup": {
      "from": "collection",
      "localField": "doc_ids",
      "foreignField": "_id",
      "as": "documents"
    }
  },

  // Emit each fetched document as a top-level result.
  { "$unwind": { "path": "$documents" } },
  { "$replaceRoot": { "newRoot": "$documents" } }
])

https://mongoplayground.net/p/YbztEGttUMx

Tom Slabbaert · Accepted Answer · 2021-09-13 15:20:01Z

While this can be done purely in Mongo, I do not recommend it, as it's very, very, very memory inefficient: you basically have to hold the entire collection in memory the whole time while you do certain manipulations on it.

I will, however, show the pipeline for this because we will use it with the second, more scalable approach.

We want to $group based on orders and product_id, however there are 2 issues standing in our way.

The orders field might not be sorted the same way in all documents. Because Mongo does not support "nested" sorting, we have to $unwind the array, $sort it, and restore the original structure (mind you, you're sorting the entire collection in memory here). This step, which is one of the pain points of this pipeline, can be skipped if you can ensure sort order is maintained in the orders array.
Mongo is inconsistent when $grouping an array of objects. Full disclosure: I'm not entirely sure what's going on in there, but I'm guessing some "shortcuts" taken for efficiency affect the stability somehow. So our approach will be to convert these objects into strings (concatenating the "row" and "seat" together).

db.collection.aggregate([
  // Emit one document per element of the "orders" array so the orders can be sorted.
  { $unwind: "$orders" },

  // Bring the orders into one canonical sequence, so that two documents holding
  // the same orders in a different array order end up with the same key below.
  { $sort: { "orders.row": 1, "orders.seat": 1 } },

  // Rebuild one entry per original document, with each order encoded as a string.
  // The "|" separator keeps e.g. row "1"/seat "11" distinct from row "11"/seat "1",
  // which would both become "111" without it.
  // NOTE(review): assumes "|" never occurs inside a row or seat value - confirm.
  {
    $group: {
      _id: "$_id",
      tmpOrders: {
        $push: { $concat: ["$orders.row", "|", "$orders.seat"] }
      },
      product_id: { $first: "$product_id" }
    }
  },

  // Bucket documents that share both the encoded order list and the product.
  {
    $group: {
      _id: { orders: "$tmpOrders", product: "$product_id" },
      dupIds: { $push: "$_id" }
    }
  },

  // A bucket is a duplicate set only if it has a second member; testing index 0
  // would match every bucket because each one holds at least one id.
  { $match: { "dupIds.1": { $exists: true } } },

  // Return just the ids of each duplicate set.
  { $project: { _id: 0, dups: "$dupIds" } }
])

Mongo Playground

Now, as I said, this approach is not scalable, and on large collections it will take a very long time to run. So I recommend utilizing indexes, iterating over the product_ids, and executing the pipeline separately for each one.

// wraps the native Promise, not required.
import Bluebird = require('bluebird');

// very fast with index.
const productIds = await collection.distinct('product_id');

// Run the duplicate-detection pipeline once per product, a few products at a time.
await Bluebird.map(
    productIds,
    async (productId) => {
        // aggregate() hands back a cursor, not an array; toArray() materializes
        // the results so that the length check below is meaningful.
        const dups = await collection
            .aggregate([
                // Restrict the pipeline to a single product.
                { $match: { product_id: productId } },
                // ... same pipeline stages as the full-collection version ...
            ])
            .toArray();

        if (dups.length) {
            // logic required.
        }
    },
    // can control concurrency based on db workload.
    { concurrency: 5 },
);

Make sure with this approach you have an index built on product_id so it will work efficiently.

Collectives™ on Stack Overflow

Aggregate duplicate documents with values in array in Mongo

3 Answers 3

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

3 Answers 3

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related