Search code examples
elasticsearch lucene elasticsearch-query

How to use elasticsearch to query email from text with regex


I want to find all email addresses in text that is stored in Elasticsearch. So far I have used the following query, and it returns results:

{
"query": {
    "regexp": {
        "sys_content": {
            "value": "[-a-zA-Z0-9_]+(\\.[-a-zA-Z0-9_]+)*@[-a-zA-Z0-9_]+(\\.[-a-zA-Z0-9_]+)+",
            "flags_value": 65535,
            "max_determinized_states": 10000,
            "boost": 1.0
        }
    }
},
"highlight": {
    "pre_tags": [
        "<span style='color:red'>"
    ],
    "post_tags": [
        "</span>"
    ],
    "fragment_size": 100,
    "require_field_match": true,
    "fields": {
        "sys_content": {}
    }
}

}

Then I tried to query "\@" and got nothing.


Solution

  • Here is a solution using the uax_url_email tokenizer. It does most of the work at index time, making your search much faster.

    Create an index with a custom analyzer that emits <EMAIL> tokens, plus a filter that keeps only those <EMAIL> tokens:

    PUT test-index
    {
      "settings": {
        "analysis": {
          "analyzer": {
            "my_analyzer": {
              "tokenizer": "my_tokenizer",
              "filter": ["extract_email"]
            }
          },
          "tokenizer": {
            "my_tokenizer": {
              "type": "uax_url_email",
              "max_token_length": 50
            }
          },
          "filter": {
            "extract_email": {
              "type": "keep_types",
              "types": [ "<EMAIL>" ]
            }
          }
        }
      },
      "mappings" : {
          "properties" : {
            "sys_content" : {
              "type" : "text",
              "fields": {
                "email": {
                  "type": "text",
                  "analyzer": "my_analyzer"
                }
              }
            }
          }
        }
    }
    

    Then add a document:

    POST test-index/_doc
    {
      "sys_content": "test [email protected] not@ a@a email [email protected]"
    }
    

    Finally, search and highlight the emails. Finding the emails has already been done at index time thanks to the uax_url_email tokenizer, so at search time you just have to match any token from the sys_content.email field:

    GET test-index/_search
    {
      "query": {
        "regexp": {
          "sys_content.email": {
            "value": ".*",
            "flags": "ALL",
            "case_insensitive": true,
            "max_determinized_states": 10000,
            "rewrite": "constant_score"
          }
        }
      },
      "highlight": {
        "pre_tags": [
            "<span style='color:red'>"
        ],
        "post_tags": [
            "</span>"
        ],
        "fragment_size": 100,
        "require_field_match": true,
        "fields": {
            "sys_content.email": {}
        }
      }
    }
    

    This yields the following result:

    {
      "took" : 3,
      "timed_out" : false,
      "_shards" : {
        "total" : 1,
        "successful" : 1,
        "skipped" : 0,
        "failed" : 0
      },
      "hits" : {
        "total" : {
          "value" : 1,
          "relation" : "eq"
        },
        "max_score" : 1.0,
        "hits" : [
          {
            "_index" : "test-index",
            "_type" : "_doc",
            "_id" : "GxSbM3oBJxdf7EzzH4jM",
            "_score" : 1.0,
            "_source" : {
              "sys_content" : "test [email protected] not@ a@a email [email protected]"
            },
            "highlight" : {
              "sys_content.email" : [
                "test <span style='color:red'>[email protected]</span> not@ a@a email <span style='color:red'>[email protected]</span>"
              ]
            }
          }
        ]
      }
    }
    

    Note: There must be a better way to match any token from a field without using a regex search but I couldn't find it. Anyway this works and the regex is quite simple so it should be fast.