
Elasticsearch: custom tokenizer split by words and dots


I'm trying to create a tokenizer that works like this:

POST dev_threats/_analyze
{
  "tokenizer": "my_tokenizer",
  "text": "some.test.domain.com"
}

and get tokens like:

[some, some.test, some.test.domain, some.test.domain.com, test, test.domain, test.domain.com, domain, domain.com]

I tried the ngram tokenizer:

    "ngram_domain_tokenizer": {
      "type": "ngram",
      "min_gram": 1,
      "max_gram": 63,
      "token_chars": [
        "letter",
        "digit",
        "punctuation"
      ]
    },

But for long values it generates far too many tokens: with min_gram: 1 and max_gram: 63, the number of n-grams grows quadratically with the length of the input.
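
To illustrate the problem (this request is just a demonstration, not part of my setup): a 20-character value like some.test.domain.com already yields 20 + 19 + … + 1 = 210 n-grams, since every substring up to 63 characters becomes a token. You can verify with an inline tokenizer definition:

POST _analyze
{
  "tokenizer": {
    "type": "ngram",
    "min_gram": 1,
    "max_gram": 63,
    "token_chars": [
      "letter",
      "digit",
      "punctuation"
    ]
  },
  "text": "some.test.domain.com"
}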

Any idea how to get such a result?


Solution

  • You don't need two different analyzers for this. There's a solution using shingles, and it goes like this:

    First you need to create an index with the proper analyzer, which I called domain_shingler:

    PUT dev_threats
    {
      "settings": {
        "analysis": {
          "analyzer": {
            "domain_shingler": {
              "type": "custom",
              "tokenizer": "dot_tokenizer",
              "filter": [
                "shingles",
                "joiner"
              ]
            }
          },
          "tokenizer": {
            "dot_tokenizer": {
              "type": "char_group",
              "tokenize_on_chars": [
                "punctuation"
              ]
            }
          },
          "filter": {
            "shingles": {
              "type": "shingle",
              "min_shingle_size": 2,
              "max_shingle_size": 4,
              "output_unigrams": true
            },
            "joiner": {
              "type": "pattern_replace",
              "pattern": """\s""",
              "replacement": "."
            }
          }
        }
      },
      "mappings": {
        "properties": {
          "domain": {
            "type": "text",
            "analyzer": "domain_shingler",
            "search_analyzer": "standard"
          }
        }
      }
    }
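
    A note on how the chain works (my reading of the docs, worth double-checking on your version): dot_tokenizer splits the input on punctuation into [some, test, domain, com]; the shingle filter then emits groups of 2 to 4 adjacent tokens joined by its default token_separator, a single space (e.g. some test domain); finally the pattern_replace filter rewrites each whitespace character back into a dot. You can inspect the intermediate stage by running the tokenizer and shingle filter without the joiner:

    POST dev_threats/_analyze
    {
      "tokenizer": "dot_tokenizer",
      "filter": [
        "shingles"
      ],
      "text": "some.test.domain.com"
    }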
    

    If you try to analyze some.test.domain.com with that analyzer, you'll get the following tokens:

    POST dev_threats/_analyze
    {
      "analyzer": "domain_shingler",
      "text": "some.test.domain.com"
    }
    

    Results:

    {
      "tokens" : [
        {
          "token" : "some",
          "start_offset" : 0,
          "end_offset" : 4,
          "type" : "word",
          "position" : 0
        },
        {
          "token" : "some.test",
          "start_offset" : 0,
          "end_offset" : 9,
          "type" : "shingle",
          "position" : 0,
          "positionLength" : 2
        },
        {
          "token" : "some.test.domain",
          "start_offset" : 0,
          "end_offset" : 16,
          "type" : "shingle",
          "position" : 0,
          "positionLength" : 3
        },
        {
          "token" : "some.test.domain.com",
          "start_offset" : 0,
          "end_offset" : 20,
          "type" : "shingle",
          "position" : 0,
          "positionLength" : 4
        },
        {
          "token" : "test",
          "start_offset" : 5,
          "end_offset" : 9,
          "type" : "word",
          "position" : 1
        },
        {
          "token" : "test.domain",
          "start_offset" : 5,
          "end_offset" : 16,
          "type" : "shingle",
          "position" : 1,
          "positionLength" : 2
        },
        {
          "token" : "test.domain.com",
          "start_offset" : 5,
          "end_offset" : 20,
          "type" : "shingle",
          "position" : 1,
          "positionLength" : 3
        },
        {
          "token" : "domain",
          "start_offset" : 10,
          "end_offset" : 16,
          "type" : "word",
          "position" : 2
        },
        {
          "token" : "domain.com",
          "start_offset" : 10,
          "end_offset" : 20,
          "type" : "shingle",
          "position" : 2,
          "positionLength" : 2
        },
        {
          "token" : "com",
          "start_offset" : 17,
          "end_offset" : 20,
          "type" : "word",
          "position" : 3
        }
      ]
    }
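
    Since the mapping sets search_analyzer to standard, query text is not shingled again at search time; the standard tokenizer keeps a value like test.domain.com as a single token, which then matches the indexed shingles directly. As a sketch (the index and field names are just the ones from this example), a search could look like:

    GET dev_threats/_search
    {
      "query": {
        "match": {
          "domain": "test.domain.com"
        }
      }
    }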