-- Tokenize each article's NFKC-normalized content with tokenize_ja, explode the
-- resulting array into one row per token (t2.word), and keep only the tokens
-- that pass the length and character-class filters below.
-- NOTE(review): this clause used to sit on a single physical line, so the first
-- "--" comment swallowed the trailing "AND word NOT RLIKE ..." predicate. It is
-- laid out one clause per line here so that both RLIKE filters are applied.
-- NOTE(review): the non-ASCII literals (second array and both regex character
-- classes) look mis-encoded. They are reproduced unchanged; TODO restore them
-- from the authoritative source before running this query.
FROM article t1
LATERAL VIEW explode(
    tokenize_ja(
        normalize_unicode(content, 'NFKC'),
        "normal",
        -- presumably the stopword list; it was truncated with "..." in the
        -- original -- TODO restore the full list
        array("a", "about", "above", "across", "after", "again"),
        -- presumably the part-of-speech tags to drop (verify against the
        -- tokenize_ja documentation); also truncated with "..." in the
        -- original -- TODO restore the full list
        array("෭ࢺ","ॿࢺ","ಈࢺ","ه߸","໊ࢺ-","෭ࢺ-Ұൠ","ॿࢺ-ಛघ","ಈࢺ-ඌ"),
        "https://s3.amazonaws.com/td-cdp-tagging/stable/kuromoji-user-dict-neologd.csv.gz"
    )
) t2 AS word
WHERE
    length(word) >= 2
    -- acceptable characters
    AND word RLIKE '^[͊-ΜʔΝ-ϲʔҰ-ᴱa-zA-Z̰-͉̖-̯ɾʂʁ]+$'
    -- even if word consists of acceptable characters, reject
    -- "len-2 non-kanji word" and "len-3 hiragana-only word"
    -- (the pattern matches up to 2 characters outside the first range, or up
    -- to 3 characters from the second class)
    AND word NOT RLIKE '^([^Ұ-ᴱ]{1,2}|[͊-Μʔ]{1,3})$'