Search code examples
pythonmongodbmongodb-queryaggregation-frameworkpymongo

'join' two MongoDB collections on a key, without creating a nested structure


I'd like to 'join' two MongoDB collections on a key, without creating a nested structure.

Code:

# Init client
client_mongo = pymongo.MongoClient(
    host="mongo", port=27017, username="user", password="pass"
)

# Use db gen
db = client_mongo.gen

# Create collections
db.create_collection("wikipedia")
db.create_collection("twitter")

# Add sample record to collection twitter
db.twitter.insert_one({'_id': "1527275059001274368",
 'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?'})

# Add sample record to collection wikipedia
db.wikipedia.insert_one({'_id': '6286417074ef92666893148a',
 'name': 'Nike',
 'page_wikipedia': 'Nike,_Inc.',
 'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'})

I'd like to (do the equivalent of) join on key "name" and create a new collection named "twitter_wikipedia", without a nested structure.

Current solution with nested structure:

db.twitter.aggregate(
    [
        {
            "$lookup": {
                "from": "wikipedia",  # other table name
                "localField": "name",  # key field in collection 2
                "foreignField": "name",  # key field in collection 1
                "as": "linked_collections",  # alias for resulting table
            }
            
        },
        {
        '$out': 'twitter_wikipedia'
    }
    ])

Current output

db.twitter_wikipedia.find_one({})
{'_id': '1527275059001274368',
 'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?',
 'linked_collections': [{'_id': '6286417074ef92666893148a',
   'name': 'Nike',
   'page_wikipedia': 'Nike,_Inc.',
   'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'}]}

# Desired output
{'name': 'Nike',
 'username_twitter': 'nikestore',
 'user_id_twitter': 17351972,
 'text_twitter': '@sillspb Hey, Paul. What error message are you seeing on your end?',
 'page_wikipedia': 'Nike,_Inc.',
 'text_wikipedia': 'Nike, Inc. ( or ) is an American multinational corporation that is engaged in the design, development, manufacturing, and worldwide marketing and sales of footwear, apparel, equipment, accessories, and services.'}

Solution

  • Just perform simple $unwind and $project after you do the $lookup

    db.twitter.aggregate([
      {
        "$lookup": {
          "from": "wikipedia",
          "localField": "name",
          "foreignField": "name",
          "as": "linked_collections"
        }
      },
      {
        "$unwind": "$linked_collections"
      },
      {
        "$project": {
          "name": 1,
          "username_twitter": 1,
          "user_id_twitter": 1,
          "text_twitter": 1,
          "page_wikipedia": "$linked_collections.page_wikipedia",
          "text_wikipedia": "$linked_collections.text_wikipedia"
        }
      },
      {
        $out: "twitter_wikipedia"
      }
    ])
    

    Here is the Mongo Playground for your reference.