Search code examples
arangodbarangodb-phparangojs

How to do Full Text indexing and search on the below json Document in ArangoDb?


{
"batters":
    {
    "batter":[
            { "id": "1001", "type": "Regular" },
            { "id": "1002", "type": "Chocolate" },
            { "id": "1003", "type": "Blueberry" },
            { "id": "1004", "type": "Devil's Food" }
    ]
    },
    "topping":[
            { "id": "5001", "type": "None" },
            { "id": "5002", "type": "Glazed" },
            { "id": "5005", "type": "Sugar" },
            { "id": "5007", "type": "Powdered Sugar" },
            { "id": "5006", "type": "Chocolate with Sprinkles" },
            { "id": "5003", "type": "Chocolate" },
            { "id": "5004", "type": "Maple" }
     ]
}

Basically to have full-text search here I would need to do indexing on "batters.batter" and also on "batters.topping" i.e. on two attributes. How to handle this kind of full text searching. Please explain on the method and I would to implement my search through REST API. Thanking You in advance.


Solution

  • The best way to solve this is to change the data layout a little, since fulltext indices can only work on one attribute, and requesting the index twice won't be fast by any means. Therefore we use an anonymous graph to connect the strings to their object.

    So, we create two (vertex)collections, one edge collection, one vertex collection with the fultext index:

    db._create("dishStrings")
    db._createEdgeCollection("dishEdges")
    db._create("dish")
    
    db.dishStrings.ensureIndex({type: "fulltext", fields: [ "name" ]});
    

    And save the documents to them with the relations tying them together. We use the _key attribute which is used to reference vertices in the _from and _to edge relations:

    db.dishStrings.save({"_key": "1001", "name": "Regular" , type: "Batter"});
    db.dishStrings.save({"_key": "1002", "name": "Chocolate", type: "Batter" });
    db.dishStrings.save({"_key": "1003", "name": "Blueberry", type: "Batter"});
    db.dishStrings.save({"_key": "1004", "name": "Devil's Food", type: "Batter"});
    db.dishStrings.save({"_key": "5001", "name": "None", type: "Topping"});
    db.dishStrings.save({"_key": "5002", "name": "Glazed", type: "Topping"});
    db.dishStrings.save({"_key": "5005", "name": "Sugar", type: "Topping"});
    db.dishStrings.save({"_key": "5007", "name": "Powdered Sugar", type: "Topping"});
    db.dishStrings.save({"_key": "5006", "name": "Chocolate with Sprinkles", type: "Topping"});
    db.dishStrings.save({"_key": "5003", "name": "Chocolate", type: "Topping"});
    db.dishStrings.save({"_key": "5004", "name": "Maple", type: "Topping"});
    
    db.dishEdges.save("dishStrings/1001", "dish/batter", {tasty: true, type: "Batter"})
    db.dishEdges.save("dishStrings/1002", "dish/batter", {tasty: true, type: "Batter"})
    db.dishEdges.save("dishStrings/1003", "dish/batter", {tasty: true, type: "Batter"})
    db.dishEdges.save("dishStrings/1004", "dish/batter", {tasty: true, type: "Batter"})
    db.dishEdges.save("dishStrings/5001", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5002", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5003", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5004", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5005", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5006", "dish/batter", {tasty: true, type: "Topping"})
    db.dishEdges.save("dishStrings/5007", "dish/batter", {tasty: true, type: "Topping"})
    
    db.dish.save({_key: "batter", tasty: true})
    

    We revalidate that the fulltext index will work:

    db._query("FOR oneDishStr IN FULLTEXT(dishStrings, 'name', 'Chocolate')" +
              " RETURN oneDishStr").toArray()
    

    ( .toArray() will print us the result on the console) We get 3 hits, one batter, two toppings. Since search strings may contain unvalidated strings, we rather use bind variables to circumvent injections:

    db._query("FOR oneDishStr IN FULLTEXT(dishStrings, 'name', @searchString) " + 
              " RETURN oneDishStr", 
              {searchString: "Chocolate"});
    

    Now lets use the edge relation to find the connected dish:

    db._query("FOR oneDishStr IN FULLTEXT(dishStrings, 'name', @searchString) "+ 
              "RETURN {str: oneDishStr, " + 
                      "dishes: NEIGHBORS(dishStrings, dishEdges, oneDishStr," + 
                                         " 'outbound')}",
               {searchString: "Chocolate"})
    

    This was the old (up to 2.7) way to use graphs, since we want to use fast filters, lets translate this to the new 2.8 syntax:

    db._query("FOR oneDishStr IN FULLTEXT(dishStrings, 'name', @searchString) " + 
              "  FOR v IN 1..1 OUTBOUND oneDishStr dishEdges RETURN " + 
              "    {str: oneDishStr, dish: v}",
             {searchString: "Chocolate"})
    

    We can see in both cases that we get one traversal for each of the 3 fulltext search hits for Chocolate. Now we are just interested in hits that are Toppings, so we will filter all those edges that aren't of type Topping:

    db._query("FOR oneDishStr IN FULLTEXT(dishStrings, 'name', @searchString) "+
              "   FOR v, e IN 1..1 OUTBOUND oneDishStr dishEdges " + 
              "      FILTER e.type == 'Topping' " +
              "         RETURN {str: oneDishStr, dish: v}", 
              {searchString: "Chocolate"})