ICU Analyzer 예제

Intro::

ICU Analyzer를 통한 오타교정 인덱스 생성 예제이다. (icu_normalizer 캐릭터 필터를 사용하므로 analysis-icu 플러그인이 설치되어 있어야 한다.)

오타교정

인덱스 및 설명


PUT icu_ex
{
  "settings": {
    "analysis": {
      // Character filter: ICU normalizer running in "decompose" mode
      "char_filter": {
        "nfd_normalizer": {
          "type": "icu_normalizer",
          "mode": "decompose"
        }
      },
      // Custom analyzer, listed in pipeline order
      "analyzer": {
        "nfd_analyzer": {
          "type": "custom",
          // character filter
          "char_filter": ["nfd_normalizer"],
          // tokenizer
          "tokenizer": "standard",
          // token filter
          "filter": ["lowercase"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "fields": {
          // keyword sub-field
          "raw": { "type": "keyword" },
          // text sub-field analyzed with nfd_analyzer
          "spell": { "type": "text", "analyzer": "nfd_analyzer" }
        }
      }
    }
  }
}

데이터 추가 및 suggest 사용


// 데이터 추가
POST _bulk
{"index":{"_index":"icu_ex", "_id":"1"}}
{ "title": "해리포터와 마법사의 돌" }
{"index":{"_index":"icu_ex", "_id":"2"}}
{ "title": "해리포터와 비밀의 방" }
{"index":{"_index":"icu_ex", "_id":"3"}}
{ "title": "해리포터와 아즈카반의 죄수" }
{"index":{"_index":"icu_ex", "_id":"4"}}
{ "title": "해리포터와 불의 잔" }
{"index":{"_index":"icu_ex", "_id":"5"}}
{ "title": "해리포터와 불사조 기사단" }
{"index":{"_index":"icu_ex", "_id":"6"}}
{ "title": "해리포터" }

// Term suggester: ask for spelling corrections of a misspelled phrase.
// The text below is the input that yields the response shown after this
// request (entries "해리표퉈와" / "불살종" / "기수닭" at offsets 0 / 6 / 10).
POST /icu_ex/_search
{
  "suggest": {
    "my-suggestion": {
      "text": "해리표퉈와 불살종 기수닭",
      "term": {
        "field": "title.spell",
        "string_distance": "jaro_winkler"
      }
    }
  }
}


// 응답 결과
{
  "took" : 11,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 0,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "suggest" : {
    "my-suggestion" : [
      {
        "text" : "해리표퉈와",
        "offset" : 0,
        "length" : 5,
        "options" : [
          {
            "text" : "해리포터와",
            "score" : 0.9333333,
            "freq" : 5
          }
        ]
      },
      {
        "text" : "불살종",
        "offset" : 6,
        "length" : 3,
        "options" : [
          {
            "text" : "불사조",
            "score" : 0.962963,
            "freq" : 1
          }
        ]
      },
      {
        "text" : "기수닭",
        "offset" : 10,
        "length" : 3,
        "options" : [
          {
            "text" : "기사단",
            "score" : 0.82,
            "freq" : 1
          }
        ]
      }
    ]
  }
}

Trouble Shooting


// Analyze API: run the analyzer mapped to title.spell on a sample title
// and return the tokens it produces.
GET icu_ex/_analyze
{
  "field": "title.spell",
  "text": "해리포터와 마법사의 돌"
}

위와 같은 쿼리를 실행했을 때 title.spell의 결과는 다음과 같다.


{
  "tokens" : [
    {
      "token" : "ㅎㅐㄹㅣㅍㅗㅌㅓㅇㅘ",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "<HANGUL>",
      "position" : 0
    },
    {
      "token" : "ㅁㅏㅂㅓㅂㅅㅏㅇㅢ",
      "start_offset" : 6,
      "end_offset" : 10,
      "type" : "<HANGUL>",
      "position" : 1
    },
    {
      "token" : "ᄃㅗㄹ",
      "start_offset" : 11,
      "end_offset" : 12,
      "type" : "<HANGUL>",
      "position" : 2
    }
  ]
}

이처럼 자모가 분리되어 보이는데, 다음과 같은 쿼리에서는 자모가 분리되지 않은 것처럼 보인다.


// Match every document; return "title" from _source and request both
// sub-fields through the "fields" option.
GET icu_ex/_search
{
  "query": { "match_all": {} },
  "_source": [
    "title"
  ],
  "fields": [
    "title.raw",
    "title.spell"
  ]
}

결론

저장된 데이터는 원래 텍스트 형태를 유지합니다.

분석된 데이터는 nfd_analyzer를 통해 자모 단위로 분해됩니다.

검색 결과에서 _source는 원래 텍스트를 반환하며, 분석된 자모 단위의 형태를 직접 보려면 _analyze API를 사용해야 합니다.

💡

이로 인해 검색 결과에서 자모 분해된 형태가 보이지 않을 수 있으며, 이는 저장된 데이터가 원래 형태를 유지하기 때문입니다. 분석된 토큰을 확인하려면 _analyze API를 사용하여 분석 과정을 직접 확인하는 것이 필요합니다.

💡

실제로 색인된 텀들을 확인하고 싶다면 다음과 같이 Term Vectors API를 호출하여 확인하면 된다.


// Term vectors of document 1, limited to the title.spell field.
// Uses the typeless endpoint; the typed /{index}/_doc/{id}/_termvectors
// form is deprecated in 7.x and no longer available in 8.x.
GET icu_ex/_termvectors/1?fields=title.spell

// 응답 결과
{
  "_index" : "icu_ex",
  "_type" : "_doc",
  "_id" : "1",
  "_version" : 1,
  "found" : true,
  "took" : 0,
  "term_vectors" : {
    "title.spell" : {
      "field_statistics" : {
        "sum_doc_freq" : 3,
        "doc_count" : 1,
        "sum_ttf" : 3
      },
      "terms" : {
        "돌" : {
          "term_freq" : 1,
          "tokens" : [
            {
              "position" : 2,
              "start_offset" : 11,
              "end_offset" : 12
            }
          ]
        },
        "마법사의" : {
          "term_freq" : 1,
          "tokens" : [
            {
              "position" : 1,
              "start_offset" : 6,
              "end_offset" : 10
            }
          ]
        },
        "해리포터와" : {
          "term_freq" : 1,
          "tokens" : [
            {
              "position" : 0,
              "start_offset" : 0,
              "end_offset" : 5
            }
          ]
        }
      }
    }
  }
}

한영 변환

이 부분부터는 ICU가 아닌 다른 플러그인(필터 이름이 hanhinsam_으로 시작하는 한글 분석 플러그인)을 사용한 예제이다.


// Create the index.
// NOTE(review): the hanhinsam_* token filters come from a plugin that is not
// shown here; judging by their names they presumably convert keyboard-layout
// mistypes (Korean<->English). Confirm against the plugin documentation.
PUT haneng_test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "hantoeng_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "hanhinsam_hantoeng"]
        },
        "engtohan_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "hanhinsam_engtohan"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      // "name" is copied into both text fields below
      "name": {
        "type": "keyword",
        "copy_to": ["name_hantoeng", "name_engtohan"]
      },
      // The conversion analyzers are set as search_analyzer only
      "name_hantoeng": { "type": "text", "search_analyzer": "hantoeng_analyzer" },
      "name_engtohan": { "type": "text", "search_analyzer": "engtohan_analyzer" }
    }
  }
}


// 데이터 색인
POST /_bulk
{ "index" : { "_index" : "haneng_test", "_id" : "1" } }
{ "name" : "손오공" }
{ "index" : { "_index" : "haneng_test", "_id" : "2" } }
{ "name" : "elastic" }
{ "index" : { "_index" : "haneng_test", "_id" : "3" } }
{ "name" : "아메리카노" }

// Korean->English and English->Korean search
POST /haneng_test/_search
{
  "query": {
    "query_string": {
      "query": "딤ㄴ샻 OR thsdhrhd",
      "fields": [
        "name_engtohan",
        "name_hantoeng"
      ]
    }
  }
}
// 응답 결과
{
  "took" : 23,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 0.9808291,
    "hits" : [
      {
        "_index" : "haneng_test",
        "_id" : "1",
        "_score" : 0.9808291,
        "_source" : {
          "name" : "손오공"
        }
      },
      {
        "_index" : "haneng_test",
        "_id" : "2",
        "_score" : 0.9808291,
        "_source" : {
          "name" : "elastic"
        }
      }
    ]
  }
}

초성


// Create the index.
// NOTE(review): hanhinsam_chosung is a plugin-provided token filter; by its
// name it presumably reduces Hangul to initial consonants (chosung). Confirm.
PUT /chosung_test
{
  "settings": {
    "analysis": {
      "analyzer": {
        "chosung_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "hanhinsam_chosung"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      // "name" is copied into name_chosung
      "name": { "type": "keyword", "copy_to": ["name_chosung"] },
      // analyzed with chosung_analyzer
      "name_chosung": { "type": "text", "analyzer": "chosung_analyzer" }
    }
  }
}


// 데이터 색인
POST /_bulk
{ "index" : { "_index" : "chosung_test", "_id" : "2" } }
{ "name" : "엘라스틱서치" }
{ "index" : { "_index" : "chosung_test", "_id" : "3" } }
{ "name" : "아메리카노" }

// Initial-consonant (chosung) search
POST /chosung_test/_search
{
  "query": {
    "match": { "name_chosung": "ㅇㄹㅅㅌㅅㅊ" }
  }
}

// 응답 결과
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 0.6931471,
    "hits" : [
      {
        "_index" : "chosung_test",
        "_id" : "2",
        "_score" : 0.6931471,
        "_source" : {
          "name" : "엘라스틱서치"
        }
      }
    ]
  }
}

자동완성


// Create the index.
// NOTE(review): hanhinsam_jamo is a plugin-provided token filter; by its name
// it presumably splits Hangul syllables into jamo. Confirm.
PUT /ac_test
{
  "settings": {
    // max_gram - min_gram below is 19, so the allowed n-gram difference is set to 19
    "index.max_ngram_diff": 19,
    "analysis": {
      "filter": {
        "ngram_filter": { "type": "ngram", "min_gram": 1, "max_gram": 20 }
      },
      "analyzer": {
        // same chain as jamo_analyzer plus ngram_filter at the end
        "ngram_jamo_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "hanhinsam_jamo", "ngram_filter"]
        },
        "jamo_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "hanhinsam_jamo"]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      // "name" is copied into name_ngram
      "name": { "type": "keyword", "copy_to": "name_ngram" },
      // n-grams are generated at index time only; queries use jamo_analyzer
      "name_ngram": {
        "type": "text",
        "search_analyzer": "jamo_analyzer",
        "analyzer": "ngram_jamo_analyzer"
      }
    }
  }
}


// 데이터 색인
POST /_bulk
{ "index" : { "_index" : "ac_test", "_id" : "1" } }
{ "name" : "손오공" }
{ "index" : { "_index" : "ac_test", "_id" : "2" } }
{ "name" : "elastic" }
{ "index" : { "_index" : "ac_test", "_id" : "3" } }
{ "name" : "아메리카노" }

// Autocomplete search
POST /ac_test/_search
{
  "query": {
    "match": { "name_ngram": "아멜" }
  }
}

// 응답 결과
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 1,
      "relation" : "eq"
    },
    "max_score" : 1.631392,
    "hits" : [
      {
        "_index" : "ac_test",
        "_id" : "3",
        "_score" : 1.631392,
        "_source" : {
          "name" : "아메리카노"
        }
      }
    ]
  }
}

References::

엘라스틱서치 오타교정 API 만들어보기

엘라스틱서치의 Suggest API를 사용하여 검색 키워드에 대한 교정 키워드를 가져오기

https://danawalab.github.io/elastic/2020/05/21/Elasticsearch-SuggestApi.html