Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions engine/crates/lex-cli/src/dict_source/extras.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const DOMAINS: &[(&str, &str)] = &[
("it.tsv", include_str!("extras/it.tsv")),
("food.tsv", include_str!("extras/food.tsv")),
("geography.tsv", include_str!("extras/geography.tsv")),
("history.tsv", include_str!("extras/history.tsv")),
("culture.tsv", include_str!("extras/culture.tsv")),
];

/// Default cost for entries that don't specify one. Mid-range so curated
Expand Down Expand Up @@ -147,6 +149,19 @@ mod tests {
.expect("きららざか must map to 雲母坂");
assert!(kirarazaka.iter().any(|e| e.surface == "雲母坂"));

// history domain
let eiroku = entries.get("えいろく").expect("えいろく must map to 永禄");
assert!(eiroku.iter().any(|e| e.surface == "永禄"));
// Cost override on a history entry (寛保 cost=7000) must round-trip.
let kanpou = entries.get("かんぽう").expect("かんぽう must map to 寛保");
assert!(kanpou.iter().any(|e| e.surface == "寛保" && e.cost == 7000));

// culture domain
let komuro = entries
.get("こむろてつや")
.expect("こむろてつや must map to 小室哲哉");
assert!(komuro.iter().any(|e| e.surface == "小室哲哉"));

// All entries use the default POS id.
for entry_list in entries.values() {
for entry in entry_list {
Expand Down
35 changes: 35 additions & 0 deletions engine/crates/lex-cli/src/dict_source/extras/culture.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Lexime curated extras: culture & media vocabulary (publishers, music
# producers/artists, popular culture) not covered by Mozc UT.
#
# Format: reading[TAB]surface[TAB]cost(optional)
# Comment lines start with '#'. See it.tsv for full format spec.
#
# Sourcing: surfaces below were sampled from `dictool candidates corpus`
# output on a Japanese Wikipedia dump and verified via `lextool explain`
# to ensure each entry fills a real Mozc gap (top-1 was gibberish or an
# obscure homophone) rather than duplicating an existing dict entry.

# --- 出版社・文庫・辞典 ---
しんちょうぶんこ 新潮文庫
ちゅうこうぶんこ 中公文庫
ちゅうおうこうろんしんしゃ 中央公論新社
とうきょうだいがくしゅっぱんかい 東京大学出版会
せいぶんどうしんこうしゃ 誠文堂新光社
こうぶんどう 弘文堂
かどかわにほんちめいだいじてん 角川日本地名大辞典
こくしだいじてん 国史大辞典

# --- 作詞家・作曲家・アーティスト ---
こむろてつや 小室哲哉
あきもとやすし 秋元康
まつとうやゆみ 松任谷由実
あむろなみえ 安室奈美恵
さわだけんじ 沢田研二
つつみきょうへい 筒美京平
まつもとたかし 松本隆
みずきなな 水樹奈々
# 阿久悠: 悪友 (final_cost ~11800) を top-1 に残すため cost を上げる
あくゆう 阿久悠 8000

# --- 作品名 ---
うちゅうせんかん 宇宙戦艦
38 changes: 38 additions & 0 deletions engine/crates/lex-cli/src/dict_source/extras/history.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Lexime curated extras: historical vocabulary (era names, court ranks,
# classical works, historical figures and states/dynasties) not covered by
# Mozc UT.
#
# Format: reading[TAB]surface[TAB]cost(optional)
# Comment lines start with '#'. See it.tsv for full format spec.
#
# Sourcing: surfaces below were sampled from `dictool candidates corpus`
# output on a Japanese Wikipedia dump and verified via `lextool explain`
# to ensure each entry fills a real Mozc gap (top-1 was gibberish or an
# obscure homophone) rather than duplicating an existing dict entry.

# --- 元号 (平安〜江戸) ---
えいろく 永禄
じょうがん 貞観
ほうれき 宝暦
げんき 元亀
えんきょう 延享
きょうろく 享禄
# 寛保: 漢方 (final_cost ~10500) を top-1 に残すため cost を上げる
かんぽう 寛保 7000

# --- 位階 ---
じゅごい 従五位
じゅさんみ 従三位
しょうごい 正五位
しょうしい 正四位

# --- 古典・史料 ---
しょくにほんぎ 続日本紀
かんせいちょうしゅうしょかふ 寛政重修諸家譜
せんしそうしょ 戦史叢書

# --- 歴史人物・国家 ---
ぶてい 武帝
ほくぎ 北魏
# 曹操: 早々 (final_cost ~10600) を top-1 に残すため cost を上げる
そうそう 曹操 8000
93 changes: 93 additions & 0 deletions engine/testcorpus/accuracy-corpus.toml
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,99 @@ expected = "雲母坂"
category = "extras"
tags = ["geography", "domain-reading"]

# --- history: 元号 / 位階 / 古典 / 歴史人物 ---
# Mozc UT に surface 自体が無く、lattice top-1 が gibberish または obscure な同音異字になる

[[cases]]
reading = "えいろく"
expected = "永禄"
category = "extras"
tags = ["history", "era"]
note = "Mozc top-1 は 栄禄 (obscure)。Wikipedia corpus mining で発見"

[[cases]]
reading = "じゅごい"
expected = "従五位"
category = "extras"
tags = ["history", "court-rank"]

[[cases]]
reading = "しょくにほんぎ"
expected = "続日本紀"
category = "extras"
tags = ["history", "classical-work"]

[[cases]]
reading = "せんしそうしょ"
expected = "戦史叢書"
category = "extras"
tags = ["history", "classical-work"]
note = "Mozc top-1 は 戦士叢書 (同音異字)"

[[cases]]
reading = "ほくぎ"
expected = "北魏"
category = "extras"
tags = ["history", "dynasty"]

# 共存ケース: extras に追加した surface が Mozc top-1 を上書きしないこと
# (寛保/元亀/曹操 を入れたが、常用語の 漢方/元気/早々 が top-1 を維持)

[[cases]]
reading = "かんぽう"
expected = "漢方"
category = "extras"
tags = ["history", "era", "cost-conflict"]
note = "extras に 寛保 を追加したが、常用の 漢方 が top-1 を維持 (cost 7000 で調整)"

[[cases]]
reading = "げんき"
expected = "元気"
category = "extras"
tags = ["history", "era", "cost-conflict"]
note = "extras に 元亀 を追加したが、常用の 元気 が top-1 を維持 (default cost で OK)"

[[cases]]
reading = "そうそう"
expected = "早々"
category = "extras"
tags = ["history", "person", "cost-conflict"]
note = "extras に 曹操 を追加したが、常用の 早々 が top-1 を維持 (cost 8000 で調整)"

# --- culture: 出版社 / 音楽人 / 作品名 ---

[[cases]]
reading = "しんちょうぶんこ"
expected = "新潮文庫"
category = "extras"
tags = ["culture", "publisher"]

[[cases]]
reading = "かどかわにほんちめいだいじてん"
expected = "角川日本地名大辞典"
category = "extras"
tags = ["culture", "publisher"]

[[cases]]
reading = "こむろてつや"
expected = "小室哲哉"
category = "extras"
tags = ["culture", "person"]

[[cases]]
reading = "うちゅうせんかん"
expected = "宇宙戦艦"
category = "extras"
tags = ["culture", "work"]
note = "Mozc top-1 は 宇宙船感 (gibberish)"

[[cases]]
reading = "あくゆう"
expected = "悪友"
category = "extras"
tags = ["culture", "person", "cost-conflict"]
note = "extras に 阿久悠 を追加したが、常用の 悪友 が top-1 を維持 (cost 8000 で調整)"

# ═══════════════════════════════════════════════════════════════════
# Compound nouns (segmentation + POS)
# ═══════════════════════════════════════════════════════════════════
Expand Down
Loading