Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応

Avatar for Kunihiko Kido Kunihiko Kido
September 16, 2014

Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応

第6回elasticsearch勉強会「Elasticsearch 日本語スキーマレス環境構築と、ついでに多言語対応」資料

Avatar for Kunihiko Kido

Kunihiko Kido

September 16, 2014
Tweet

More Decks by Kunihiko Kido

Other Decks in Technology

Transcript

  1. マッピング定義と目的 1 2 3 4 5 6 適合率を 向上したい 再現率を

    向上したい 正確に 絞り込みたい 集計したい 表示したい and more しかもフィールド毎に。。。 考えることがいっぱい！
  2. マッピング定義と目的 1 2 3 4 5 6 完全一致で 検索・集計したい 「都」「府」「県」

    を省略して検索したい ヨミで検索したい 都道府県コード 順でソートしたい 「関東」など 別名で検索・集計したい and more… 例えば、都道府県名フィールド1つとっても
  3. Demo DELETE /myindex! ! PUT /myindex/mytype/1! ! {! "title": "Elasticsearch

    特徴まとめ",! "description": "Elasticsearch Features — 主にシステムを中心とした特徴まとめ",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": "2014-03-12T11:09"! } いつも通り、こんな感じでデータ登録するだけで、 フィールド毎に必要なマッピング定義が出来上がる
  4. Demo {! ...! "title" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! },! "yomi" : {! "type" : "string",! "analyzer" : "katakana"! }! }! }! ...! }! titleフィールドのマッピング定義 ※動的に作成されたマッピング定義
  5. Demo {! ...! "description" : {! "type" : "string",! "fields"

    : {! "substring" : {! "type" : "string",! "analyzer" : "cjk"! }! }! },! ...! } ※動的に作成されたマッピング定義 descriptionフィールドのマッピング定義
  6. Demo {! ...! "author" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! },! "yomi" : {! "type" : "string",! "analyzer" : "katakana"! }! }! }! ...! }! authorフィールドのマッピング定義 ※動的に作成されたマッピング定義
  7. Demo {! ...! "link" : {! "type" : "string",! "index"

    : "not_analyzed",! "fields" : {! "domain" : {! "type" : "string",! "analyzer" : "domain_analyzer"! },! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "substring" : {! "type" : "string",! "analyzer" : "standard"! }! }! },! ...! } ※動的に作成されたマッピング定義 linkフィールドのマッピング定義
  8. Demo {! ...! "tags" : {! "type" : "string",! "fields"

    : {! "keyword" : {! "type" : "string",! "analyzer" : "keyword_analyzer"! },! "raw" : {! "type" : "string",! "index" : "not_analyzed"! },! "substring" : {! "type" : "string",! "analyzer" : "cjk"! }! }! },! ...! }! ※動的に作成されたマッピング定義 tagsフィールドのマッピング定義
  9. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch Overview",! "description": "Elasticsearch Features —

    System Overview",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": "2014-03-12T11:09",! "language": "en"! } Demo 「英語」の文章としてインデックスしたい場合 これだけ
  10. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch 개요",! "description": "Elasticsearch Features —

    시스템 개요",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": "2014-03-12T11:09",! "language": "ko"! } Demo 「韓国語」の文章としてインデックスしたい場合 いがいと簡単！
  11. PUT /myindex/mytype/1! ! {! "title": "Elasticsearch Überblick",! "description": "Elasticsearch Features —

    Systemübersicht",! "author": "Kunihiko Kido",! "link": "https://medium.com/hello-elasticsearch/elasticsearch-500996e47c70",! "tags": ["Elasticsearch"],! "pubDate": "2014-03-12T11:09",! "language": "de"! } Demo 「ドイツ語」の文章としてインデックスしたい場合 便利かも！
  12. dynamic templates {! ...! "dynamic_templates": [! {! "my_field": {! "match": "*",

    /* ① フィールド名のパターン */! "match_pattern": "regex", /* ② matchのマッチング方式 （省略可）*/! "match_mapping_type": "string", /* ③ JSONフォーマットのタイプ */! "mapping": {...} /* ④ マッピング定義 */! }! },! ...(複数定義できる)! ],! ...! }! ! 基本的な設定
  13. dynamic templates {! "url_fields": {! "match": ".*url|.*link",! "match_pattern": "regex",! "match_mapping_type": "string",!

    "mapping": {! "type": "string",! "index": "not_analyzed",! "fields": {! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "standard"! },! "domain": {! "type": "string",! "analyzer": "domain_analyzer"! }! }! }! }! }! 例）URLフィールド向け
  14. dynamic templates {! "special_string_fields": {! "match": ".*title.*|.*name.*|.*author.*",! "match_pattern": "regex",! "match_mapping_type": "string",!

    "mapping": {! "type": "string",! "fields": {! "raw": {! "type": "string",! "index": "not_analyzed"! },! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "ja-substring"! },! "yomi": {! "type": "string",! "analyzer": "katakana"! }! }! }! }! } 例）名称等重要なフィールド
  15. dynamic templates {! "long_string_fields": {! "match": ".*message.*|.*content.*|.*description.*|.*text.*|.*body.*|.*note.*|.*memo.*",! "match_pattern": "regex",! "match_mapping_type":

    "string",! "mapping": {! "type": "string",! "fields": {! "substring": {! "type": "string",! "analyzer": "ja-substring"! }! }! }! }! } 例）長文フィールド
  16. dynamic templates {! "short_string_fields": {! "match": "*",! "match_mapping_type": "string",! "mapping": {!

    "type": "string",! "fields": {! "raw": {! "type": "string",! "index": "not_analyzed"! },! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "ja-substring"! }! }! }! }! } 例）その他フィールド
  17. dynamic templates {! ...! "dynamic_templates": [! {! "url_fields": {…}! },! {!

    "special_string_fields": {…}! },! {! "long_string_fields": {…}! },! {! "short_string_fields": {…}! }! ],! ...! } 最初にパターンにマッチしたマッピングが定義される
  18. index template config/templates配下に保存するだけで準備OK ▼elasticsearch ▼config ▼templates base.json custom_analyzers.json japanese_analyzers.json language_analyzers.json string_fields.json

    Demoで使っているindex templateファイルたち ← ・ノードの再起動は必要ない ・新規で作成したインデックスのみに適用される ・テンプレートは複数作れる ・APIでも設定できる ・dynamic templatesの定義も含められる
  19. index template {! "my_template":{ /* テンプレートの名前 */! "template": "*", /* テンプレートを適用するインデックス名のパターン

    */! "order": 0, /* テンプレートを適用する優先順位 */! "settings": {...}, /* シャードの数とか、Analysis の設定等インデックス定義に関わる設定 */! "mappings": {...} /* マッピング定義の設定。dynamic templatesはここで設定 */! . . .! }! }! ! ! ! ! ! ファイル名とテンプレート名はとりあえず同じにしておけば良いかな？ 基本的な設定
  20. index template {! "base": {! "template": "*",! "order": 1,! "mappings": {!

    "_default_": {! "_timestamp" : {! "enabled" : true,! "store" : true! },! "_analyzer": {! "path": "language"! },! "_id": {! "path": "id"! },! "_source": {! "excludes" : ["attachment_file"]! }! }! }! }! }! templates/base.json デフォルトのアナライザー変更するためのlanguageフィールドの定義など
  21. index template {! "language_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "filter": {! "arabic_stop": {! "type": "stop",! "stopwords": "_arabic_"! },! "arabic_stemmer": {! "type": "stemmer",! "language": "arabic"! },! "armenian_stop": {! "type": "stop",! "stopwords": "_armenian_"! },! "armenian_stemmer": {! "type": "stemmer",! "language": "armenian"! },! "basque_stop": {! "type": "stop",! "stopwords": "_basque_"! },! "basque_stemmer": {! "type": "stemmer",! "language": "basque"! },! "brazilian_stop": {! "type": "stop",! "stopwords": "_brazilian_"! },! "brazilian_stemmer": {! "type": "stemmer",! "language": "brazilian"! },! "bulgarian_stop": {! "type": "stop",! "stopwords": "_bulgarian_"! },! "bulgarian_stemmer": {! "type": "stemmer",! "language": "bulgarian"! },! "catalan_elision": {! templates/language_analyzers.json 各国言語毎のフィルターやアナライザーの定義など
  22. index template {! "japanese_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "char_filter": {! "japanese_normalization": {! "type": "kuromoji_iteration_mark",! "normalize_kanji": true,! "normalize_kana": true! }! },! "filter": {! "romaji_readingform": {! "type": "kuromoji_readingform",! "use_romaji": true! },! "katakana_readingform": {! "type": "kuromoji_readingform",! "use_romaji": false! },! "katakana_stemmer": {! "type": "kuromoji_stemmer",! "minimum_length": 4! },! "japanese_stop": {! "type": "kuromoji_part_of_speech"! }! },! "tokenizer": {! "japanese_tokenizer": {! "type": "kuromoji_tokenizer",! "mode": "search"! }! },! "analyzer": {! "default": {! "alias": ["japanese", "ja"],! "type": "custom",! "char_filter": [! "html_strip",! "japanese_normalization"! ],! "tokenizer": "japanese_tokenizer",! "filter": [! "cjk_width",! "lowercase",! templates/japanese_analyzers.json 日本語関連のフィルターやアナライザーの定義など
  23. index template {! "custom_analyzers": {! "template": "*",! "order": 2,! "settings": {!

    "analysis": {! "filter": {! "domain_extractor" : {! "type" : "pattern_replace",! "preserve_original": false,! "pattern" : "https?://([^/]+).*",! "replacement": "$1"! }! },! "tokenizer": {! "comma_tokenizer": {! "type": "pattern",! "pattern":"[,、]+"! }! },! "analyzer": {! "domain_analyzer" : {! "alias": ["domain"],! "tokenizer" : "uax_url_email",! "filter" : [ "domain_extractor", "lowercase", "unique" ]! },! "comma_analyzer":{! "alias": ["comma"],! "type": "custom",! "tokenizer": "comma_tokenizer",! "filter": ["cjk_width", "lowercase", "trim", "unique"]! },! "keyword_analyzer": {! "type": "custom",! "tokenizer": "keyword",! "filter": ["cjk_width", "lowercase", "trim"]! }! }! }! }! }! } templates/custom_analyzers.json URLからドメイン名を抽出したり言語に依存しないアナライザーの定義
  24. index template {! "string_fields": {! "template": "*",! "order": 10,! "mappings": {!

    "_default_": {! "dynamic_templates": [! {! "not_analyzed_fields": {! "match": "method|charSet|mimeType|content_type|language",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! "type": "string",! "index": "not_analyzed"! }! }! },! {! "url_fields": {! "match": ".*url|.*link",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! "type": "string",! "index": "not_analyzed",! "fields": {! "keyword": {! "type": "string",! "analyzer": "keyword_analyzer"! },! "substring": {! "type": "string",! "analyzer": "standard"! },! "domain": {! "type": "string",! "analyzer": "domain_analyzer"! }! }! }! }! },! {! "comma_separated_fields": {! "match": "meta_keywords",! "match_pattern": "regex",! "match_mapping_type": "string",! "mapping": {! templates/string_fields.json string型フィールドのダイナミックテンプレート定義
  25. index template ▼インデックス作成 settings要素の適用 mappings要素の適用 ▼フィールドデータの追加 dynamic templatesの適用 イレギュラーな対応したい場合 ✔️ ✔️ ✔️

    対象のフィールド定義が存在しない場合、dynamic templatesの定義はこの タイミングで適用。 ← イレギュラーなマッピング定義をしたい 場合はこのタイミングで定義すればOK ▼タイプの作成