I am trying the terms aggregation for the first time, and there seems to be an issue with the custom pattern tokenizer I am using.
Here is the Mapping:
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
},
"theme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
When I search with the aggregation API (http://my_server/index_name/_search), here is the result:
{
"aggregations": {
"group_by_contentItemType": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Correspondence; Reports",
"doc_count": 3
},
{
"key": "Correspondence",
"doc_count": 2
},
{
"key": "Meeting Minutes; Administrative Records; Reports",
"doc_count": 2
},
{
"key": "Correspondence; Legal and Treaty Material; Reports",
"doc_count": 1
},
{
"key": "Correspondence; Memoranda",
"doc_count": 1
},
{
"key": "Memoranda",
"doc_count": 1
},
{
"key": "Reports",
"doc_count": 1
}
]
},
"group_by_theme": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "International Relations",
"doc_count": 2
},
{
"key": "Key Events; Dissent; Dissent; Resistance; Human Rights",
"doc_count": 2
},
{
"key": "Border Security and Migration; Key Events",
"doc_count": 1
},
{
"key": "Border Security and Migration; Second World War Aftermath",
"doc_count": 1
},
{
"key": "Domestic Politics",
"doc_count": 1
},
{
"key": "Domestic Politics; Border Security and Migration",
"doc_count": 1
},
{
"key": "Economics and Trade; International Relations",
"doc_count": 1
},
{
"key": "Embassy and Consulate Administration; Industry and Agriculture; International Relations",
"doc_count": 1
},
{
"key": "Populations and Social Policy; Second World War Aftermath; International Relations",
"doc_count": 1
}
]
}
}
}
As you can see, the buckets are keyed by the entire semicolon-separated string (for example "Correspondence; Reports") instead of by each individual value. I have been stuck on this problem for quite a few days. I have looked at many examples but am still not able to solve this issue. Please help. Thanks in advance!
EDIT: Here is the full mapping after @CatalinM's answer:
{
"local_cwee": {
"mappings": {
"dynamic": "false",
"properties": {
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"commentaries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentDateEndMonth": {
"type": "integer"
},
"contentDateEndSpecified": {
"type": "boolean"
},
"contentDateEndYear": {
"type": "integer"
},
"contentDateMonth": {
"type": "integer"
},
"contentDateMonthSpecified": {
"type": "boolean"
},
"contentDateStartMonth": {
"type": "integer"
},
"contentDateStartSpecified": {
"type": "boolean"
},
"contentDateStartYear": {
"type": "integer"
},
"contentDateYear": {
"type": "integer"
},
"contentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"contentItemType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentItemTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"contentTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"copyrightNotices": {
"type": "nested",
"properties": {
"imageName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"text": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"countries": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"country": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"coverDateEndMonth": {
"type": "integer"
},
"coverDateEndSpecified": {
"type": "boolean"
},
"coverDateEndYear": {
"type": "integer"
},
"coverDateMonth": {
"type": "integer"
},
"coverDateMonthSpecified": {
"type": "boolean"
},
"coverDateStartMonth": {
"type": "integer"
},
"coverDateStartSpecified": {
"type": "boolean"
},
"coverDateStartYear": {
"type": "integer"
},
"coverDateYear": {
"type": "integer"
},
"displayName": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"documentLevel": {
"type": "integer"
},
"keyEvents": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"language": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languageFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languages": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"languagesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"moduleNumber": {
"type": "integer"
},
"notes": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"pageTranscript": {
"type": "text",
"term_vector": "with_positions",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
},
"analyzer": "whiteSpaceAnalyzer"
},
"people": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationDate": {
"type": "integer"
},
"publicationDateEndMonth": {
"type": "integer"
},
"publicationDateEndSpecified": {
"type": "boolean"
},
"publicationDateEndYear": {
"type": "integer"
},
"publicationDateMonth": {
"type": "integer"
},
"publicationDateMonthSpecified": {
"type": "boolean"
},
"publicationDateStartMonth": {
"type": "integer"
},
"publicationDateStartSpecified": {
"type": "boolean"
},
"publicationDateStartYear": {
"type": "integer"
},
"publicationDateYear": {
"type": "integer"
},
"publicationDoi": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationId": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationIdFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTitle": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"publicationType": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationTypeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publicationYear": {
"type": "integer"
},
"publisherName": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"publisherNameFacet": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subject": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreas": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectAreasFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountries": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectCountriesFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeyword": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subjectKeywordFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"subthemes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"theme": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themeFacets": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
},
"themes": {
"type": "text",
"analyzer": "patternAnalyzer",
"fielddata": true
}
}
}
}
}
Using your custom tokenizer, the tokens in the text field are "Correspondence", "Meeting Minutes", "Administrative Records", etc., so I don't think you need the keyword field.
To make aggregations work on the text field, you'll have to add "fielddata": true to the field's mapping.
Fielddata is disabled by default on text fields because aggregating on large analyzed text fields is expensive and usually not what you want, but in your case the tokens are exactly the values you want to aggregate on.
Here's the simplified configuration:
{
"mappings": {
"properties": {
"contentItemType": {
"type": "text",
"fielddata": true,
"analyzer": "patternAnalyzer"
}
}
},
"settings": {
"analysis": {
"analyzer": {
"patternAnalyzer": {
"tokenizer": "patternTokenizer"
}
},
"tokenizer": {
"patternTokenizer": {
"type": "pattern",
"pattern": ";"
}
}
}
}
}
The query:
{
"aggregations" : {
"test" : {
"terms" : { "field" : "contentItemType" }
}
}
}
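And the result (note that some keys keep a leading space, because the tokenizer splits on ";" only; use a pattern such as ";\\s*" or add a "trim" token filter to the analyzer if you want clean keys):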
"aggregations": {
"test": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": " Administrative Records",
"doc_count": 1
},
{
"key": "Meeting Minutes",
"doc_count": 1
},
{
"key": " Reports",
"doc_count": 1
}
]
}
}