По сути, я пытаюсь воспроизвести функциональность автозаполнения Google Places с помощью ElasticSearch.
У меня все места проиндексированы в одном поле, например «Columbia, South Carolina 29044». Цель состоит в том, чтобы реализовать автозаполнение (typeahead): если пользователь вводит «Columbia, SC», «2904» или «Columbia, South Carolina», ему предлагается вышеупомянутый вариант (при условии, что совпадающих вариантов достаточно мало).
Наиболее очевидная проблема, с которой я сейчас сталкиваюсь, заключается в том, что синонимы из synonym filter сами токенизируются и порождают ошибочные сочетания токенов.
Мой индекс:
{
"settings": {
"analysis": {
"analyzer": {
"stateAnalyzer": {
"tokenizer": "autocomplete",
"filter": [
"lowercase",
"asciifolding",
"synonymFilter"
]
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 30,
"token_chars": ["letter", "digit"]
}
},
"filter": {
"synonymFilter": {
"type": "synonym",
"synonyms": [
"Florida,FL",
"United States Virgin Islands,VI",
"Montana,MT",
"Minnesota,MN",
"Maryland,MD",
"South Carolina,SC",
"Maine,ME",
"Hawaii,HI",
"District of Columbia,DC",
"Commonwealth of the Northern Mariana Islands,MP",
"Rhode Island,RI",
"Nebraska,NE",
"Washington,WA",
"New Mexico,NM",
"Puerto Rico,PR",
"South Dakota,SD",
"Texas,TX",
"California,CA",
"Alabama,AL",
"Georgia,GA",
"Arkansas,AR",
"Pennsylvania,PA",
"Missouri,MO",
"Utah,UT",
"Oklahoma,OK",
"Tennessee,TN",
"Wyoming,WY",
"Indiana,IN",
"Kansas,KS",
"Idaho,ID",
"Alaska,AK",
"Nevada,NV",
"Illinois,IL",
"Vermont,VT",
"Connecticut,CT",
"New Jersey,NJ",
"North Dakota,ND",
"Iowa,IA",
"New Hampshire,NH",
"Arizona,AZ",
"Delaware,DE",
"Guam,GU",
"American Samoa,AS",
"Kentucky,KY",
"Ohio,OH",
"Wisconsin,WI",
"Oregon,OR",
"Mississippi,MS",
"Colorado,CO",
"North Carolina,NC",
"Virginia,VA",
"West Virginia,WV",
"Louisiana,LA",
"New York,NY",
"Michigan,MI",
"Massachusetts,MA"
],
"expand": true
}
}
}
},
"mappings": {
"properties": {
"fullName": {
"type": "text",
"analyzer": "stateAnalyzer",
"search_analyzer": "stateAnalyzer"
},
"route": {
"type": "text"
}
}
}
}
Если я проанализирую это следующим образом:
{
"analyzer": "stateAnalyzer",
"text": "columbia SC"
}
Среди прочего, он выдаст:
{
"tokens" : [
{
"token" : "co",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 0
},
{
"token" : "co",
"start_offset" : 0,
"end_offset" : 2,
"type" : "SYNONYM",
"position" : 0
},
{
"token" : "col",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 1
},
{
"token" : "col",
"start_offset" : 0,
"end_offset" : 3,
"type" : "SYNONYM",
"position" : 1
},
{
"token" : "colu",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 2
},
{
"token" : "colo",
"start_offset" : 0,
"end_offset" : 4,
"type" : "SYNONYM",
"position" : 2
},
{
"token" : "colum",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 3
},
{
"token" : "color",
"start_offset" : 0,
"end_offset" : 5,
"type" : "SYNONYM",
"position" : 3
},
{
"token" : "columb",
"start_offset" : 0,
"end_offset" : 6,
"type" : "word",
"position" : 4
},
{
"token" : "colora",
"start_offset" : 0,
"end_offset" : 6,
"type" : "SYNONYM",
"position" : 4
},
{
"token" : "columbi",
"start_offset" : 0,
"end_offset" : 7,
"type" : "word",
"position" : 5
},
{
"token" : "colorad",
"start_offset" : 0,
"end_offset" : 7,
"type" : "SYNONYM",
"position" : 5
},
{
"token" : "columbia",
"start_offset" : 0,
"end_offset" : 8,
"type" : "word",
"position" : 6
},
{
"token" : "colorado",
"start_offset" : 0,
"end_offset" : 8,
"type" : "SYNONYM",
"position" : 6
},
{
"token" : "sc",
"start_offset" : 9,
"end_offset" : 11,
"type" : "word",
"position" : 7
},
{
"token" : "so",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 7
},
{
"token" : "sou",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 8
},
{
"token" : "sout",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 9
},
{
"token" : "south",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 10
},
{
"token" : "ca",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 11
},
{
"token" : "car",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 12
},
{
"token" : "caro",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 13
},
{
"token" : "carol",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 14
},
{
"token" : "caroli",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 15
},
{
"token" : "carolin",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 16
},
{
"token" : "carolina",
"start_offset" : 9,
"end_offset" : 11,
"type" : "SYNONYM",
"position" : 17
}
]
}
Проблема, похоже, в том, что, поскольку ElasticSearch разбивает текст на n-граммы, он видит токен «co» (начало слова «columbia»), который совпадает с синонимом «Colorado,CO». Однако я не могу этого избежать, потому что установка min_gram: 3 приводит к ошибке "term: FL was completely eliminated by analyzer".
Я предполагаю, что если разбить адрес на части и индексировать каждую часть как поле типа completion, а не через edge_ngram, это может решить некоторые из этих проблем. Но тогда я не знаю, как заставить работать подсветку. В настоящее время у меня есть:
{
highlight: {
fields: {
fullName: {
type: 'plain'
}
}
}
}
Правка:
Копипаст из Kibana:
DELETE territories
PUT territories
{
"settings": {
"analysis": {
"analyzer": {
"stateAnalyzer": {
"tokenizer": "autocomplete",
"filter": [
"asciifolding",
"lowercase",
"synonymFilter"
]
}
},
"tokenizer": {
"autocomplete": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 30,
"token_chars": [
"letter",
"digit"
]
}
},
"filter": {
"synonymFilter": {
"type": "synonym",
"synonyms": [
"FL => Florida",
"VI => United States Virgin Islands",
"MT => Montana",
"MN => Minnesota",
"MD => Maryland",
"SC => South Carolina",
"ME => Maine",
"HI => Hawaii",
"DC => District of Columbia",
"MP => Commonwealth of the Northern Mariana Islands",
"RI => Rhode Island",
"NE => Nebraska",
"WA => Washington",
"NM => New Mexico",
"PR => Puerto Rico",
"SD => South Dakota",
"TX => Texas",
"CA => California",
"AL => Alabama",
"GA => Georgia",
"AR => Arkansas",
"PA => Pennsylvania",
"MO => Missouri",
"UT => Utah",
"OK => Oklahoma",
"TN => Tennessee",
"WY => Wyoming",
"IN => Indiana",
"KS => Kansas",
"ID => Idaho",
"AK => Alaska",
"NV => Nevada",
"IL => Illinois",
"VT => Vermont",
"CT => Connecticut",
"NJ => New Jersey",
"ND => North Dakota",
"IA => Iowa",
"NH => New Hampshire",
"AZ => Arizona",
"DE => Delaware",
"GU => Guam",
"AS => American Samoa",
"KY => Kentucky",
"OH => Ohio",
"WI => Wisconsin",
"OR => Oregon",
"MS => Mississippi",
"CO => Colorado",
"NC => North Carolina",
"VA => Virginia",
"WV => West Virginia",
"LA => Louisiana",
"NY => New York",
"MI => Michigan",
"MA => Massachusetts"
],
"expand": true
}
}
}
},
"mappings": {
"properties": {
"fullName": {
"type": "text",
"analyzer": "stateAnalyzer",
"search_analyzer": "stateAnalyzer"
},
"route": {
"type": "text"
}
}
}
}
POST territories/_analyze
{
"analyzer": "stateAnalyzer",
"text": "columbia SC"
}