From 0daa2c47b7afd7450666754b7a05306e7bbff526 Mon Sep 17 00:00:00 2001 From: "SAKAI, Kazuaki" Date: Fri, 15 May 2026 17:52:13 +0900 Subject: [PATCH] feat(extras): add history + culture domains from corpus mining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wikipedia 全 dump (4.6GB, 2.4M articles) を dictool candidates corpus で mining し、categorical に分類して 2 新 domain を追加: - history.tsv (17 entries): 元号 / 位階 / 古典書名 / 歴史人物 - culture.tsv (18 entries): 出版社 / 音楽人 / 作品名 Mozc top-1 と衝突する 3 件 (寛保/曹操/阿久悠) は cost 7000-8000 で常用語 (漢方/早々/悪友) を top-1 に維持。既存 Mozc lattice が正解する候補 (徳川家康/三島由紀夫/平家物語 等) は drop。 accuracy-corpus.toml に extras カテゴリで 13 ケース追加 (含む 4 件の cost-conflict ケース: cost 調整済みの 寛保/曹操/阿久悠 と default cost のままの 元亀)。 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../crates/lex-cli/src/dict_source/extras.rs | 15 +++ .../src/dict_source/extras/culture.tsv | 35 +++++++ .../src/dict_source/extras/history.tsv | 38 ++++++++ engine/testcorpus/accuracy-corpus.toml | 93 +++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 engine/crates/lex-cli/src/dict_source/extras/culture.tsv create mode 100644 engine/crates/lex-cli/src/dict_source/extras/history.tsv diff --git a/engine/crates/lex-cli/src/dict_source/extras.rs b/engine/crates/lex-cli/src/dict_source/extras.rs index 7b15ebe..6aa8e55 100644 --- a/engine/crates/lex-cli/src/dict_source/extras.rs +++ b/engine/crates/lex-cli/src/dict_source/extras.rs @@ -38,6 +38,8 @@ const DOMAINS: &[(&str, &str)] = &[ ("it.tsv", include_str!("extras/it.tsv")), ("food.tsv", include_str!("extras/food.tsv")), ("geography.tsv", include_str!("extras/geography.tsv")), + ("history.tsv", include_str!("extras/history.tsv")), + ("culture.tsv", include_str!("extras/culture.tsv")), ]; /// Default cost for entries that don't specify one. 
Mid-range so curated @@ -147,6 +149,19 @@ mod tests { .expect("きららざか must map to 雲母坂"); assert!(kirarazaka.iter().any(|e| e.surface == "雲母坂")); + // history domain + let eiroku = entries.get("えいろく").expect("えいろく must map to 永禄"); + assert!(eiroku.iter().any(|e| e.surface == "永禄")); + // Cost override on a history entry (寛保 cost=7000) must round-trip. + let kanpou = entries.get("かんぽう").expect("かんぽう must map to 寛保"); + assert!(kanpou.iter().any(|e| e.surface == "寛保" && e.cost == 7000)); + + // culture domain + let komuro = entries + .get("こむろてつや") + .expect("こむろてつや must map to 小室哲哉"); + assert!(komuro.iter().any(|e| e.surface == "小室哲哉")); + // All entries use the default POS id. for entry_list in entries.values() { for entry in entry_list { diff --git a/engine/crates/lex-cli/src/dict_source/extras/culture.tsv b/engine/crates/lex-cli/src/dict_source/extras/culture.tsv new file mode 100644 index 0000000..2cb396a --- /dev/null +++ b/engine/crates/lex-cli/src/dict_source/extras/culture.tsv @@ -0,0 +1,35 @@ +# Lexime curated extras: culture & media vocabulary (publishers, music +# producers/artists, popular culture) not covered by Mozc UT. +# +# Format: reading[TAB]surface[TAB]cost(optional) +# Comment lines start with '#'. See it.tsv for full format spec. +# +# Sourcing: surfaces below were sampled from `dictool candidates corpus` +# output on a Japanese Wikipedia dump and verified via `lextool explain` +# to ensure each entry fills a real Mozc gap (top-1 was gibberish or an +# obscure homophone) rather than duplicating an existing dict entry. 
+ +# --- 出版社・文庫・辞典 --- +しんちょうぶんこ 新潮文庫 +ちゅうこうぶんこ 中公文庫 +ちゅうおうこうろんしんしゃ 中央公論新社 +とうきょうだいがくしゅっぱんかい 東京大学出版会 +せいぶんどうしんこうしゃ 誠文堂新光社 +こうぶんどう 弘文堂 +かどかわにほんちめいだいじてん 角川日本地名大辞典 +こくしだいじてん 国史大辞典 + +# --- 作詞家・作曲家・アーティスト --- +こむろてつや 小室哲哉 +あきもとやすし 秋元康 +まつとうやゆみ 松任谷由実 +あむろなみえ 安室奈美恵 +さわだけんじ 沢田研二 +つつみきょうへい 筒美京平 +まつもとたかし 松本隆 +みずきなな 水樹奈々 +# 阿久悠: 悪友 (final_cost ~11800) を top-1 に残すため cost を上げる +あくゆう 阿久悠 8000 + +# --- 作品名 --- +うちゅうせんかん 宇宙戦艦 diff --git a/engine/crates/lex-cli/src/dict_source/extras/history.tsv b/engine/crates/lex-cli/src/dict_source/extras/history.tsv new file mode 100644 index 0000000..6f700ee --- /dev/null +++ b/engine/crates/lex-cli/src/dict_source/extras/history.tsv @@ -0,0 +1,38 @@ +# Lexime curated extras: historical vocabulary (era names, court ranks, +# classical works, historical figures and states) not covered by +# Mozc UT. +# +# Format: reading[TAB]surface[TAB]cost(optional) +# Comment lines start with '#'. See it.tsv for full format spec. +# +# Sourcing: surfaces below were sampled from `dictool candidates corpus` +# output on a Japanese Wikipedia dump and verified via `lextool explain` +# to ensure each entry fills a real Mozc gap (top-1 was gibberish or an +# obscure homophone) rather than duplicating an existing dict entry. 
+ +# --- 元号 (平安〜江戸) --- +えいろく 永禄 +じょうがん 貞観 +ほうれき 宝暦 +げんき 元亀 +えんきょう 延享 +きょうろく 享禄 +# 寛保: 漢方 (final_cost ~10500) を top-1 に残すため cost を上げる +かんぽう 寛保 7000 + +# --- 位階 --- +じゅごい 従五位 +じゅさんみ 従三位 +しょうごい 正五位 +しょうしい 正四位 + +# --- 古典・古書 --- +しょくにほんぎ 続日本紀 +かんせいちょうしゅうしょかふ 寛政重修諸家譜 +せんしそうしょ 戦史叢書 + +# --- 歴史人物・国家 --- +ぶてい 武帝 +ほくぎ 北魏 +# 曹操: 早々 (final_cost ~10600) を top-1 に残すため cost を上げる +そうそう 曹操 8000 diff --git a/engine/testcorpus/accuracy-corpus.toml b/engine/testcorpus/accuracy-corpus.toml index 136034d..444f043 100644 --- a/engine/testcorpus/accuracy-corpus.toml +++ b/engine/testcorpus/accuracy-corpus.toml @@ -496,6 +496,99 @@ expected = "雲母坂" category = "extras" tags = ["geography", "domain-reading"] +# --- history: 元号 / 位階 / 古典 / 歴史人物 --- +# Mozc UT に surface 自体が無く、lattice top-1 が gibberish や同音異字になるケース + +[[cases]] +reading = "えいろく" +expected = "永禄" +category = "extras" +tags = ["history", "era"] +note = "Mozc top-1 は 栄禄 (obscure)。Wikipedia corpus mining で発見" + +[[cases]] +reading = "じゅごい" +expected = "従五位" +category = "extras" +tags = ["history", "court-rank"] + +[[cases]] +reading = "しょくにほんぎ" +expected = "続日本紀" +category = "extras" +tags = ["history", "classical-work"] + +[[cases]] +reading = "せんしそうしょ" +expected = "戦史叢書" +category = "extras" +tags = ["history", "classical-work"] +note = "Mozc top-1 は 戦士叢書 (同音異字)" + +[[cases]] +reading = "ほくぎ" +expected = "北魏" +category = "extras" +tags = ["history", "dynasty"] + +# 共存ケース: extras に追加した surface が Mozc top-1 を上書きしないこと +# (寛保/元亀/曹操 を入れたが、常用語の 漢方/元気/早々 が top-1 を維持) + +[[cases]] +reading = "かんぽう" +expected = "漢方" +category = "extras" +tags = ["history", "era", "cost-conflict"] +note = "extras に 寛保 を追加したが、常用の 漢方 が top-1 を維持 (cost 7000 で調整)" + +[[cases]] +reading = "げんき" +expected = "元気" +category = "extras" +tags = ["history", "era", "cost-conflict"] +note = "extras に 元亀 を追加したが、常用の 元気 が top-1 を維持 (default cost で OK)" + +[[cases]] +reading = "そうそう" +expected = "早々" +category = "extras" +tags = ["history", "person", "cost-conflict"] +note = 
"extras に 曹操 を追加したが、常用の 早々 が top-1 を維持 (cost 8000 で調整)" + +# --- culture: 出版社 / 音楽人 / 作品名 --- + +[[cases]] +reading = "しんちょうぶんこ" +expected = "新潮文庫" +category = "extras" +tags = ["culture", "publisher"] + +[[cases]] +reading = "かどかわにほんちめいだいじてん" +expected = "角川日本地名大辞典" +category = "extras" +tags = ["culture", "publisher"] + +[[cases]] +reading = "こむろてつや" +expected = "小室哲哉" +category = "extras" +tags = ["culture", "person"] + +[[cases]] +reading = "うちゅうせんかん" +expected = "宇宙戦艦" +category = "extras" +tags = ["culture", "work"] +note = "Mozc top-1 は 宇宙船感 (gibberish)" + +[[cases]] +reading = "あくゆう" +expected = "悪友" +category = "extras" +tags = ["culture", "person", "cost-conflict"] +note = "extras に 阿久悠 を追加したが、常用の 悪友 が top-1 を維持 (cost 8000 で調整)" + # ═══════════════════════════════════════════════════════════════════ # Compound nouns (segmentation + POS) # ═══════════════════════════════════════════════════════════════════