Search code examples
elasticsearch, analyzer, camelcasing

Elasticsearch : search results on clicking on Hashtag


I have hashtags written in camel case, like #teamIndia. When such a hashtag is clicked, the search should fetch all results related to "#teamIndia": first the results containing "#teamIndia", then those containing "teamIndia", then "team India", then "team" or "India", and so on.

What I am doing:

Search text: "#teamIndia", "#NEWYORK", "#profession", "#2016"

POST /clip
{
    "settings": {
        "analysis": {
            "char_filter" : {
                "space_hashtags" : {
                    "type" : "mapping",
                    "mappings" : ["#=>|#"]
                }
            },
            "filter": {
                "substring": {
                    "max_gram": "20",
                    "type": "nGram",
                    "min_gram": "1",
                    "token_chars": [
                        "whitespace"
                    ]
                },
                "camelcase": {
                    "type": "word_delimiter",
                    "type_table": ["# => ALPHANUM", "@ => ALPHANUM"]
                },
                "stopword": {
                    "type":       "stop",
                    "stopwords": ["and", "is", "the"]
                }
            },
            "analyzer": {
                "substring_analyzer": {
                    "filter": [
                        "lowercase",
                        "substring"
                    ],
                    "tokenizer": "standard"
                },
                "camelcase_analyzer": {
                    "type" : "custom",
                    "char_filter" : "space_hashtags",
                    "tokenizer" : "whitespace",
                    "filter": [
                        "camelcase",
                        "lowercase",
                        "stopword"
                    ]
                }
            }
        }
    },
    "mappings": {
        "Clip": {
            "properties": {
                "description": {
                    "type": "multi_field",
                    "fields": {
                        "description": {
                            "type": "string",
                            "analyzer": "substring_analyzer",
                            "search_analyzer": "standard"
                        },
                        "raw": {
                            "type": "string",
                            "index": "not_analyzed"
                        },
                        "hashtag": {
                            "type": "string",
                            "index": "analyzed",
                            "analyzer": "camelcase_analyzer"
                        }
                    }
                },
                ....
            }
        }
    }
}

Docs example :-

POST /clip/Clip/2 {"id" : 1, "description" : "TheBestAndTheBeast"}

POST /clip/Clip/3 {"id" : 2, "description" : "bikes in DUBAI TheBestAndTheBeast profession"}

POST /clip/Clip/3 {"id" : 2, "description" : "Know how a software engineer surprised his wife!    <a href="search/clips?q=%23theProvider&source=hashtag" ng-click="handleModalClick()"> #theProvider </a>     rioOlympic   <a href="search/clips?q=%23DUBAI&source=hashtag" ng-click="handleModalClick()"> #DUBAI </a>    <a href="search/clips?q=%23TheBestAndTheBeast&source=hashtag" ng-click="handleModalClick()"> #TheBestAndTheBeast </a>   <a href="search/clips?q=%23rioOlympic&source=hashtag" ng-click="handleModalClick()"> #rioOlympic </a>"}

** Search Query **

GET clip/_search
{
"size": 100,
"query": {
    "filtered": {
        "query": {
            "bool": {
                "must":
                   {
                    "query_string": {
                        "fields": [
                           "description.hashtag"
                        ],
                        "query": "teamIndia"
                    }                         
                },
                "should": { 
                    "match": 
                        { "description.raw": "#teamIndia"}   
                }
            }
        }
    }
}

}

Expected Result: "#teamIndia", "teamIndia", "team India", "team", "India",

and similar for other test keywords.


Solution

  • One of the reasons the query in the original post does not work as intended is that description.raw is not_analyzed. As a result, "#teamIndia" would never match a document with description: "Animals and Pets and #teamIndia", because description.raw would contain the single non-analyzed term "Animals and Pets and #teamIndia" rather than the term "#teamIndia".

    The following assumes that your documents look like the second example in the original post.

    Example:

    {"id" : 2, "description" : "Animals and Pets and #teamIndia"}
    

    OR

    {"id":7,"description":"This <a href="search/clips?q=%23teamIndia&source=hashtag">#teamIndia</a>"}
    

    Then you should be able to rank documents in the following order :

    1) description containing "#teamIndia",
    2) description containing "teamIndia"
    3) description containing "team India"
    4) description containing "India"

    by enabling preserve_original and catenate_all in the word_delimiter filter, as shown in the example below.

    Example:

    Index Documents

     PUT clip
    {
       "settings": {
          "analysis": {
             "char_filter": {
                "zwsp_normalize": {
                   "type": "mapping",
                   "mappings": [
                      "\\u200B=>",
                      "\\u200C=>",
                      "\\u200D=>"
                   ]
                },
                "html_decoder": {
                   "type": "mapping",
                   "mappings": [
                      "&lt;=> <",
                      "&gt;=> >"
                   ]
                }
             },
             "filter": {
                "camelcase": {
                   "type": "word_delimiter",
                   "preserve_original": "true",
                   "catenate_all": "true"
                },
                "stopword": {
                   "type": "stop",
                   "stopwords": [
                      "and",
                      "is",
                      "the"
                   ]
                }
             },
             "analyzer": {
                "camelcase_analyzer": {
                   "type": "custom",
                   "tokenizer": "whitespace",
                   "filter": [
                      "camelcase",
                      "lowercase",
                      "stopword"
                   ],
                   "char_filter": [
                      "zwsp_normalize",
                      "html_decoder",
                      "html_strip"
                   ]
                }
             }
          }
       },
       "mappings": {
          "Clip": {
             "properties": {
                "description": {
                   "type": "multi_field",
                   "fields": {
                      "hashtag": {
                         "type": "string",
                         "index": "analyzed",
                         "analyzer": "camelcase_analyzer",
                         "norms": {
                             "enabled": false
                         }
                      }
                   }
                }
             }
          }
       }
    }
    
    
    
    POST /clip/Clip/1
    {
       "id": 1,
       "description": "Animals and Pets and #teamIndia"
    }
    
    POST /clip/Clip/2 
    {
       "id": 2,
       "description": "Animals and Pets and teamIndia"
    }
    
    
    POST /clip/Clip/3
    {
       "id": 3,
       "description": "Animals and Pets and team India"
    }
    
    
    POST /clip/Clip/4 
    {
       "id": 4,
       "description": "Animals and Pets and India"
    }
    
    
    
      POST /clip/Clip/7
        {
           "id": 7,
           "description": "This &lt;a href=&quot;search/clips?q=%23teamIndia&amp;source=hashtag&quot;&gt;#teamIndia&lt;/a&gt;"
        }
    

    Query:

    POST clip/_search?search_type=dfs_query_then_fetch
    {
       "size": 100,
       "query": {
          "filtered": {
             "query": {
                "bool": {
                   "must": [
                      {
                         "query_string": {
                            "fields": [
                               "description.hashtag"
                            ],
                            "query": "#teamIndia"
                         }
                      }
                   ]
                }
             }
          }
       }
    }
    

    Results:

          "hits": {
          "total": 5,
          "max_score": 1.4969246,
          "hits": [
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "7",
                "_score": 1.4969246,
                "_source": {
                   "id": 7,
                   "description": "This &lt;a href=&quot;search/clips?q=%23teamIndia&amp;source=hashtag&quot;&gt;#teamIndia&lt;/a&gt;"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "1",
                "_score": 1.4969246,
                "_source": {
                   "id": 1,
                   "description": "Animals and Pets and #teamIndia"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "2",
                "_score": 1.0952718,
                "_source": {
                   "id": 2,
                   "description": "Animals and Pets and teamIndia"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "3",
                "_score": 0.5207714,
                "_source": {
                   "id": 3,
                   "description": "Animals and Pets and team India"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "4",
                "_score": 0.11123338,
                "_source": {
                   "id": 4,
                   "description": "Animals and Pets and India"
                }
             }
          ]
       }
    

    Example #dubai:

    POST /clip/Clip/5
    {
       "id": 5,
       "description": "#dubai is hot"
    }
    
    POST /clip/Clip/6
    {
       "id": 6,
       "description": "dubai airport is huge"
    }
    
    POST clip/_search?search_type=dfs_query_then_fetch
    {
       "size": 100,
       "query": {
          "filtered": {
             "query": {
                "bool": {
                   "must": [
                      {
                         "query_string": {
                            "fields": [
                               "description.hashtag"
                            ],
                            "query": "#dubai"
                         }
                      }
                   ]
                }
             }
          }
       }
    }
    
       "hits": {
          "total": 2,
          "max_score": 1.820827,
          "hits": [
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "5",
                "_score": 1.820827,
                "_source": {
                   "id": 5,
                   "description": "#dubai is hot"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "6",
                "_score": 0.5856731,
                "_source": {
                   "id": 6,
                   "description": "dubai airport is huge"
                }
             }
          ]
       }
    

    Example #professionalAndPunctual :

    POST /clip/Clip/7
    {
       "id": 7,
       "description": "professionalAndPunctual"
    }
    POST clip/_search?search_type=dfs_query_then_fetch
    {
       "size": 100,
       "query": {
          "filtered": {
             "query": {
                "bool": {
                   "must": [
                      {
                         "query_string": {
                            "fields": [
                               "description.hashtag"
                            ],
                            "query": "#professionalAndPunctual"
                         }
                      }
                   ]
                }
             }
          }
       }
    } 
    
     "hits": [
         {
            "_index": "clip",
            "_type": "Clip",
            "_id": "7",
            "_score": 2.2149992,
            "_source": {
               "id": 7,
               "description": "professionalAndPunctual"
            }
         }
      ]
    

    Edited Example

    Example: #TheBestAndTheBeast

       POST /clip/Clip/10
    {"id" : 10, "description" : "TheBestAndTheBeast"}
    
    POST /clip/Clip/11
    {"id" :11, "description" : "bikes in DUBAI TheBestAndTheBeast profession"}
    
    POST /clip/Clip/12
    {"id" : 12, "description" : "Know how a software engineer surprised his wife! <a href=\"search/clips?q=%23theProvider&source=hashtag\" ng-click=\"handleModalClick()\"> #theProvider </a> rioOlympic <a href=\"search/clips?q=%23DUBAI&source=hashtag\" ng-click=\"handleModalClick()\"> #DUBAI </a> <a href=\"search/clips?q=%23TheBestAndTheBeast&source=hashtag\" ng-click=\"handleModalClick()\"> #TheBestAndTheBeast </a> <a href=\"search/clips?q=%23rioOlympic&source=hashtag\" ng-click=\"handleModalClick()\"> #rioOlympic </a>"}
    
    POST clip/_search?search_type=dfs_query_then_fetch
    {
       "size": 100,
       "query": {
          "filtered": {
             "query": {
                "bool": {
                   "must": [
                      {
                         "query_string": {
                            "fields": [
                               "description.hashtag"
                            ],
                            "query": "#TheBestAndTheBeast"
                         }
                      }
                   ]
                }
             }
          }
       }
    }
    

    #Results

     "hits": [
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "12",
                "_score": 1.8701664,
                "_source": {
                   "id": 12,
                   "description": "Know how a software engineer surprised his wife! <a href=\"search/clips?q=%23theProvider&source=hashtag\" ng-click=\"handleModalClick()\"> #theProvider </a> rioOlympic <a href=\"search/clips?q=%23DUBAI&source=hashtag\" ng-click=\"handleModalClick()\"> #DUBAI </a> <a href=\"search/clips?q=%23TheBestAndTheBeast&source=hashtag\" ng-click=\"handleModalClick()\"> #TheBestAndTheBeast </a> <a href=\"search/clips?q=%23rioOlympic&source=hashtag\" ng-click=\"handleModalClick()\"> #rioOlympic </a>"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "10",
                "_score": 0.9263139,
                "_source": {
                   "id": 10,
                   "description": "TheBestAndTheBeast"
                }
             },
             {
                "_index": "clip",
                "_type": "Clip",
                "_id": "11",
                "_score": 0.9263139,
                "_source": {
                   "id": 11,
                   "description": "bikes in DUBAI TheBestAndTheBeast profession"
                }
             }
          ]
    

    Analyzer Example :

    get clip/_analyze?analyzer=camelcase_analyzer&text=%23DUBAI
    
       {
       "tokens": [
          {
             "token": "#dubai",
             "start_offset": 0,
             "end_offset": 6,
             "type": "word",
             "position": 0
          },
          {
             "token": "dubai",
             "start_offset": 1,
             "end_offset": 6,
             "type": "word",
             "position": 0
          }
       ]
    }
    
    get clip/_analyze?analyzer=camelcase_analyzer&text=This%20%26lt%3Ba%20href%3D%26quot%3Bsearch%2Fclips%3Fq%3D%2523teamIndia%26amp%3Bsource%3Dhashtag%26quot%3B%26gt%3B%23teamIndia%26lt%3B%2Fa%26gt%3B
    
      {
       "tokens": [
          {
             "token": "this",
             "start_offset": 0,
             "end_offset": 4,
             "type": "word",
             "position": 0
          },
          {
             "token": "#teamindia",
             "start_offset": 78,
             "end_offset": 98,
             "type": "word",
             "position": 1
          },
          {
             "token": "india",
             "start_offset": 78,
             "end_offset": 98,
             "type": "word",
             "position": 2
          },
          {
             "token": "team",
             "start_offset": 78,
             "end_offset": 98,
             "type": "word",
             "position": 2
          },
          {
             "token": "teamindia",
             "start_offset": 78,
             "end_offset": 98,
             "type": "word",
             "position": 2
          }
       ]
    }