Search code examples
search lucene elasticsearch tire

Multi-field, multi-word, match without query_string


I would like to match a multi-word search against multiple fields, where every searched word must be contained in at least one of the fields, in any combination. The catch is that I would like to avoid using query_string.

curl -X POST "http://localhost:9200/index/document/1" -d '{"id":1,"firstname":"john","middlename":"clark","lastname":"smith"}'
curl -X POST "http://localhost:9200/index/document/2" -d '{"id":2,"firstname":"john","middlename":"paladini","lastname":"miranda"}'

I would like the search for 'John Smith' to match only document 1. The following query does what I need, but I would rather avoid using query_string in case the user passes "OR", "AND", or any of the other advanced query-syntax operators.

curl -X GET 'http://localhost:9200/index/_search?per_page=10&pretty' -d '{
  "query": {
    "query_string": {
      "query": "john smith",
      "default_operator": "AND",
      "fields": [
        "firstname",
        "lastname",
        "middlename"
      ]
    }
  }
}'

Solution

  • What you are looking for is the multi_match query, but it doesn't behave in quite the way you would like.

    Compare the output of validate for multi_match vs query_string.

    multi_match (with operator and) will make sure that ALL terms exist together in the same field, for at least one of the listed fields:

    curl -XGET 'http://127.0.0.1:9200/_validate/query?pretty=1&explain=true'  -d '
    {
       "multi_match" : {
          "operator" : "and",
          "fields" : [
             "firstname",
             "lastname"
          ],
          "query" : "john smith"
       }
    }
    '
    
    # {
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 1,
    #       "total" : 1
    #    },
    #    "explanations" : [
    #       {
    #          "index" : "test",
    #          "explanation" : "((+lastname:john +lastname:smith) | (+firstname:john +firstname:smith))",
    #          "valid" : true
    #       }
    #    ],
    #    "valid" : true
    # }
    

    Whereas query_string (with default_operator AND) will check that EACH term exists in at least one field, so different terms may match in different fields:

    curl -XGET 'http://127.0.0.1:9200/_validate/query?pretty=1&explain=true'  -d '
    {
       "query_string" : {
          "fields" : [
             "firstname",
             "lastname"
          ],
          "query" : "john smith",
          "default_operator" : "AND"
       }
    }
    '
    
    # {
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 1,
    #       "total" : 1
    #    },
    #    "explanations" : [
    #       {
    #          "index" : "test",
    #          "explanation" : "+(firstname:john | lastname:john) +(firstname:smith | lastname:smith)",
    #          "valid" : true
    #       }
    #    ],
    #    "valid" : true
    # }
    

    So you have a few choices to achieve what you are after:

    1. Preparse the search terms, to remove things like wildcards, etc, before using the query_string

    2. Preparse the search terms to extract each word, then generate a multi_match query per word and combine them as must clauses of a bool query, so that every word is required

    3. Use index_name in your mapping for the name fields to index their data into a single field, which you can then use for search (like your own custom _all field):

    As follows:

    curl -XPUT 'http://127.0.0.1:9200/test/?pretty=1'  -d '
    {
       "mappings" : {
          "test" : {
             "properties" : {
                "firstname" : {
                   "index_name" : "name",
                   "type" : "string"
                },
                "lastname" : {
                   "index_name" : "name",
                   "type" : "string"
                }
             }
          }
       }
    }
    '
    
    curl -XPOST 'http://127.0.0.1:9200/test/test?pretty=1'  -d '
    {
       "firstname" : "john",
       "lastname" : "smith"
    }
    '
    
    curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1'  -d '
    {
       "query" : {
          "match" : {
             "name" : {
                "operator" : "and",
                "query" : "john smith"
             }
          }
       }
    }
    '
    
    # {
    #    "hits" : {
    #       "hits" : [
    #          {
    #             "_source" : {
    #                "firstname" : "john",
    #                "lastname" : "smith"
    #             },
    #             "_score" : 0.2712221,
    #             "_index" : "test",
    #             "_id" : "VJFU_RWbRNaeHF9wNM8fRA",
    #             "_type" : "test"
    #          }
    #       ],
    #       "max_score" : 0.2712221,
    #       "total" : 1
    #    },
    #    "timed_out" : false,
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 5,
    #       "total" : 5
    #    },
    #    "took" : 33
    # }
    

    Note, however, that firstname and lastname are no longer searchable independently. The data for both fields has been indexed into name.

    You could use multi-fields with the path parameter to make them searchable both independently and together, as follows:

    curl -XPUT 'http://127.0.0.1:9200/test/?pretty=1'  -d '
    {
       "mappings" : {
          "test" : {
             "properties" : {
                "firstname" : {
                   "fields" : {
                      "firstname" : {
                         "type" : "string"
                      },
                      "any_name" : {
                         "type" : "string"
                      }
                   },
                   "path" : "just_name",
                   "type" : "multi_field"
                },
                "lastname" : {
                   "fields" : {
                      "any_name" : {
                         "type" : "string"
                      },
                      "lastname" : {
                         "type" : "string"
                      }
                   },
                   "path" : "just_name",
                   "type" : "multi_field"
                }
             }
          }
       }
    }
    '
    
    curl -XPOST 'http://127.0.0.1:9200/test/test?pretty=1'  -d '
    {
       "firstname" : "john",
       "lastname" : "smith"
    }
    '
    

    Searching the any_name field works:

    curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1'  -d '
    {
       "query" : {
          "match" : {
             "any_name" : {
                "operator" : "and",
                "query" : "john smith"
             }
          }
       }
    }
    '
    
    # {
    #    "hits" : {
    #       "hits" : [
    #          {
    #             "_source" : {
    #                "firstname" : "john",
    #                "lastname" : "smith"
    #             },
    #             "_score" : 0.2712221,
    #             "_index" : "test",
    #             "_id" : "Xf9qqKt0TpCuyLWioNh-iQ",
    #             "_type" : "test"
    #          }
    #       ],
    #       "max_score" : 0.2712221,
    #       "total" : 1
    #    },
    #    "timed_out" : false,
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 5,
    #       "total" : 5
    #    },
    #    "took" : 11
    # }
    

    Searching firstname for john AND smith returns no hits, as expected, because firstname contains only john:

    curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1'  -d '
    {
       "query" : {
          "match" : {
             "firstname" : {
                "operator" : "and",
                "query" : "john smith"
             }
          }
       }
    }
    '
    
    # {
    #    "hits" : {
    #       "hits" : [],
    #       "max_score" : null,
    #       "total" : 0
    #    },
    #    "timed_out" : false,
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 5,
    #       "total" : 5
    #    },
    #    "took" : 2
    # }
    

    But searching firstname for just john works correctly:

    curl -XGET 'http://127.0.0.1:9200/test/test/_search?pretty=1'  -d '
    {
       "query" : {
          "match" : {
             "firstname" : {
                "operator" : "and",
                "query" : "john"
             }
          }
       }
    }
    '
    
    # {
    #    "hits" : {
    #       "hits" : [
    #          {
    #             "_source" : {
    #                "firstname" : "john",
    #                "lastname" : "smith"
    #             },
    #             "_score" : 0.30685282,
    #             "_index" : "test",
    #             "_id" : "Xf9qqKt0TpCuyLWioNh-iQ",
    #             "_type" : "test"
    #          }
    #       ],
    #       "max_score" : 0.30685282,
    #       "total" : 1
    #    },
    #    "timed_out" : false,
    #    "_shards" : {
    #       "failed" : 0,
    #       "successful" : 5,
    #       "total" : 5
    #    },
    #    "took" : 3
    # }