Elasticsearch Fuzzy Phrases
Eventually figured out that I needed to use a combination of span
queries, which give an excellent amount of fine-tuning over fuzziness and slop. I needed to add a function to manually tokenize my phrases and add them to the "clauses" array programmatically:
{"query":
{
"span_near": {
"clauses": [
{
"span_multi": {
"match": {
"fuzzy": {
"content": {
"fuzziness": "2",
"value": "word"
}
}
}
}
},
{
"span_multi": {
"match": {
"fuzzy": {
"content": {
"fuzziness": "2",
"value": "another"
}
}
}
}
}
],
"slop": 1,
"in_order": "true"
}
}
}
@econgineer Excellent post.
I wanted to try this for an ES query we are working on - but I am too lazy to keep doing the JSON data....
I think this code works... strangely, it causes jq to complain, but Elasticsearch works....
import json
import pprint
from collections import defaultdict
def nested_dict():
    """Return a dict whose missing keys are auto-created as further nested dicts."""
    return defaultdict(nested_dict)


# Top-level request document; intermediate keys spring into existence on access.
query = nested_dict()
query['span_near']['clauses'] = []
# NOTE(review): "slop" and "in_order" are stored at the top level here, and each
# clause below nests "value"/"fuzziness" under an extra "fuzziness" key. The
# Elasticsearch span_near and fuzzy query docs appear to expect slop/in_order
# inside "span_near" and value/fuzziness directly under the field name --
# confirm against your cluster before relying on this shape.
query['slop'] = '1'
query['in_order'] = "true"

words = ['what', 'is', 'this']
for w in words:
    # One span_multi/fuzzy clause per word, matched against the "msg" field.
    nest = nested_dict()
    nest["span_multi"]["match"]["fuzzy"]["msg"]["fuzziness"]["value"] = w
    nest["span_multi"]["match"]["fuzzy"]["msg"]["fuzziness"]["fuzziness"] = "2"
    query['span_near']['clauses'].append(nest)

# Round-trip through JSON once to turn every defaultdict into a plain dict so
# pprint shows clean literals. pprint emits a Python repr (single-quoted
# strings), not JSON, so the quotes must be converted before JSON tools read it.
pprint.pprint(json.loads(json.dumps(query)))
If you beautify the output by
# Replace every single quote in t2.json with a double quote, then pretty-print the result with jq.
cat t2.json | tr "\'" "\"" | jq '.'
You should see something like
{
"in_order": "true",
"slop": "1",
"span_near": {
"clauses": [
{
"span_multi": {
"match": {
"fuzzy": {
"msg": {
"fuzziness": {
"fuzziness": "2",
"value": "what"
}
}
}
}
}
},
{
"span_multi": {
"match": {
"fuzzy": {
"msg": {
"fuzziness": {
"fuzziness": "2",
"value": "is"
}
}
}
}
}
},
{
"span_multi": {
"match": {
"fuzzy": {
"msg": {
"fuzziness": {
"fuzziness": "2",
"value": "this"
}
}
}
}
}
}
]
}
}
And then to query ES it is just a normal
# POST the saved query to the index's _search endpoint; the JSON Content-Type header is required by Elasticsearch 6.0 and later.
curl --silent -H 'Content-Type: application/json' My_ES_Server:9200/INDEX/_search -d @t2.json
Many thanks for the initial guidance; I hope someone else finds this useful.
Indeed, an excellent question and answer. I'm surprised that this 'fuzzy phrase match' doesn't have support out of the box.
Here's some tested Node.js code that generates the fuzzy phrase match (multi-clause) query block, in the context of a multi search (msearch), but it should work just the same with a single search.
Usage:
// First entry names the target index; second entry is the fuzzy phrase query
// built for the given phrase, field name, and fuzziness of 2.
const searchHeader = { index: 'YOUR_INDEX' };
const phraseQuery = createESFuzzyPhraseQueryBlock('YOUR PHRASE', 'YOUR_FIELD_NAME', 2);
const queryBody = [searchHeader, phraseQuery];

// Hand the assembled pair to the client's multi-search call.
client.msearch({ body: queryBody });
Functions:
/**
 * Build a single clause for a span_near query: a fuzzy term query on one
 * field, wrapped in span_multi.
 *
 * @param {string} word - The term stored as the fuzzy query's "value".
 * @param {string} esFieldName - Field name used as the key inside "fuzzy".
 * @param {number|string} fuzziness - Stored as the fuzzy query's "fuzziness".
 * @returns {object} The `{ span_multi: { match: { fuzzy: ... } } }` clause.
 */
const createESFuzzyPhraseClauseBlock = (word, esFieldName, fuzziness) => {
  const clauseBlock = {
    span_multi: {
      match: {
        fuzzy: {
          // Computed key: the field to search is supplied by the caller.
          [esFieldName]: {
            fuzziness,
            value: word,
          },
        },
      },
    },
  };
  return clauseBlock;
};
/**
 * Build a span_near query body that matches the words of a phrase fuzzily.
 *
 * The phrase is tokenized on whitespace and each word becomes one clause via
 * createESFuzzyPhraseClauseBlock. Clauses are combined with slop 1 and
 * in_order "true".
 *
 * @param {string} phrase - Whitespace-separated words to match.
 * @param {string} esFieldName - Field name passed through to every clause.
 * @param {number|string} fuzziness - Fuzziness passed through to every clause.
 * @returns {object} A `{ query: { span_near: { clauses, slop, in_order } } }` object.
 */
const createESFuzzyPhraseQueryBlock = (phrase, esFieldName, fuzziness) => {
  // Split on any run of whitespace and drop empty tokens so that leading,
  // trailing, or repeated spaces never produce a clause with an empty "value".
  const words = phrase.split(/\s+/).filter((word) => word.length > 0);
  const clauses = words.map((word) => createESFuzzyPhraseClauseBlock(word, esFieldName, fuzziness));

  const queryBlock = {
    query: {
      span_near: {
        clauses,
        slop: 1,
        in_order: 'true',
      },
    },
  };
  return queryBlock;
};