
Commit 5f14d3c

Add definition fetching from web URLs

1 parent 4847c5d commit 5f14d3c

12 files changed: 168 additions & 36 deletions

Cargo.toml: 7 additions & 0 deletions

@@ -57,6 +57,7 @@ all = [
     "regex-unicode",
     "regex-perf",
     "multiversion",
+    "web",
 ]

 # Enables standard library features
@@ -100,6 +101,9 @@ regex-onig = ["dep:onig"]
 # Enables the use of multiversion for generating multiple code paths with different CPU feature utilization
 multiversion = ["dep:multiversion"]

+# Enables fetching definitions from HuggingFace or other URLs
+web = ["std", "dep:reqwest"]
+
 # Enables the use of unstable features
 unstable = []

@@ -157,6 +161,9 @@ serde_json = { version = "1.0", default-features = false, features = ["alloc"],
 # optional dependencies for the multiversion feature
 multiversion = { version = "0.8", default-features = false, optional = true }

+# optional dependencies for the web feature
+reqwest = { version = "0.13", default-features = false, features = ["blocking", "rustls", "system-proxy"], optional = true }
+
 [dev-dependencies]

 kitoken = { path = ".", default-features = false, features = [
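The new `web` feature gates a dependency on reqwest's blocking client. The `src/web.rs` module added by this commit is not shown in the diff, so the following is only a rough sketch of how fetching might look on top of these dependencies; the `fetch_definition_bytes` helper and the `hf:` shorthand expansion are illustrative assumptions, not the crate's actual API.

```rust
// Hypothetical sketch: the commit's real `src/web.rs` is not part of this
// diff, and the `hf:` resolution rule below is an assumed convention.
fn fetch_definition_bytes(source: &str) -> Result<Vec<u8>, reqwest::Error> {
    // Expand the `hf:` shorthand into a Hugging Face download URL and
    // pass plain `http(s):` URLs through unchanged.
    let url = match source.strip_prefix("hf:") {
        Some(repo) => format!("https://huggingface.co/{repo}/resolve/main/tokenizer.json"),
        None => source.to_string(),
    };
    // The `blocking` feature enabled in Cargo.toml above provides this client.
    let response = reqwest::blocking::get(url.as_str())?.error_for_status()?;
    Ok(response.bytes()?.to_vec())
}
```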

README.md: 7 additions & 7 deletions

@@ -11,7 +11,7 @@

 ```rust
 use kitoken::Kitoken;
-let encoder = Kitoken::from_file("models/llama4.kit")?;
+let encoder = Kitoken::from_web("hf:Qwen/Qwen3.5-9B")?;

 let tokens = encoder.encode("Your future belongs to me.", true)?;
 let string = String::from_utf8(encoder.decode(&tokens, true)?)?;
@@ -29,19 +29,19 @@ Kitoken is a fast and versatile tokenizer for language models compatible with [S
   Native in Rust and with bindings for [Web](./packages/javascript), [Node](./packages/javascript) and [Python](./packages/python); see [kitoken.dev](https://kitoken.dev) for a web demo.
 - **Supports input and output processing**\
   Including unicode-aware normalization, pre-tokenization and post-processing options.
-- **Compact data format**\
+- **Compact data encoding**\
   Definitions are stored in an efficient binary format and without merge list.

 See also [`kitoken-cli`](./packages/cli) for Kitoken in the command line.

 ## Compatibility

-Kitoken can load and convert many existing tokenizer formats. Every supported format is [tested](./tests) against the original implementation across a variety of inputs to ensure correctness and compatibility.
+Kitoken can load and convert most existing tokenizer formats. Every supported format is [tested](./tests) against the original implementation across a wide variety of inputs to ensure correctness and compatibility.

 > [!NOTE]
 > Most models on [Hugging Face](https://huggingface.co) are supported. Just take the `tokenizer.json` or `spiece.model` and load it into Kitoken.

-Kitoken aims to be output-identical with existing implementations for all models. See the notes below for differences in specific cases.
+Kitoken aims to be output-identical with existing implementations for all models. <sup>See the notes below for differences in specific cases.</sup>

 ### SentencePiece

@@ -59,8 +59,8 @@ If the model does not contain a trainer definition, `Unigram` is assumed as the
 <details>
 <summary>Notes</summary>

-- SentencePiece uses [different `nfkc` normalization rules in the `nmt_nfkc` and `nmt_nfkc_cf` schemes](https://github.com/google/sentencepiece/blob/master/doc/normalization.md) than during regular `nfkc` normalization. This difference is not entirely additive and prevents the normalization of `～` to `~`. Kitoken uses the regular `nfkc` normalization rules for `nmt_nfkc` and `nmt_nfkc_cf` and normalizes `～` to `~`.
-- SentencePiece's implementation of Unigram merges pieces with the same merge priority in a different order depending on preceding non-encodable pieces. For example, with `xlnet_base_cased`, SentencePiece encodes `.nnn` and `Զnnn` as `.., 8705, 180` but `ԶԶnnn` as `.., 180, 8705`. Kitoken always merges pieces with the same merge priority in the same order, resulting in `.., 180, 8705` for either case in the example and matching the behavior of Tokenizers.
+- SentencePiece uses [different `nfkc` normalization rules in the `nmt_nfkc` and `nmt_nfkc_cf` schemes](https://github.com/google/sentencepiece/blob/master/doc/normalization.md) than during regular `nfkc` normalization, preventing the normalization of `～` to `~`. Kitoken uses the regular `nfkc` normalization rules for `nmt_nfkc` and `nmt_nfkc_cf`.
+- SentencePiece's implementation of Unigram merges pieces with the same merge priority in a different order depending on preceding non-encodable pieces. Kitoken always merges pieces with the same merge priority in the same order, matching the behavior of Tokenizers.

 </details>

@@ -83,7 +83,7 @@ Some normalization, post-processing and decoding options used by Tokenizers are
 <details>
 <summary>Notes</summary>

-- When using a `BPE` definition with an incomplete vocabulary and without an `unk` token, Tokenizers skips over non-encodable pieces and attempts to merge the surrounding ones. Kitoken always considers non-encodable pieces as un-mergeable and encodes the surrounding pieces individually. This can affect models that exploit the behavior of Tokenizers with a deliberately restricted vocabulary.
+- Tokenizers skips over non-encodable pieces and attempts to merge the surrounding ones when using an incomplete vocabulary without an `unk` token. Kitoken always considers non-encodable pieces as un-mergeable and encodes the surrounding pieces individually. This can affect models that exploit the behavior of Tokenizers with a deliberately restricted vocabulary.
 - Tokenizers normalizes inputs character-by-character, while Kitoken normalizes inputs as one. This can result in differences during case-folding in some cases. For example, greek letter `Σ` has two lowercase forms, `σ` for within-word and `ς` for end-of-word use. Tokenizers will always lowercase `Σ` to `σ`, while Kitoken will lowercase it to either depending on the context.

 </details>
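Filled out into a complete program, the updated README snippet looks like the following; this is a sketch assuming `Kitoken::from_web`'s error type is convertible to `Box<dyn std::error::Error>`, as its use with `?` in the README suggests.

```rust
use kitoken::Kitoken;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fetch the definition from the Hugging Face Hub, as in the README;
    // plain `http(s):` URLs are also accepted by the new loader.
    let encoder = Kitoken::from_web("hf:Qwen/Qwen3.5-9B")?;

    // Round-trip a string through encode and decode.
    let tokens = encoder.encode("Your future belongs to me.", true)?;
    let string = String::from_utf8(encoder.decode(&tokens, true)?)?;
    assert_eq!(string, "Your future belongs to me.");
    Ok(())
}
```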

packages/cli/Cargo.toml: 3 additions & 3 deletions

@@ -19,13 +19,13 @@ path = "src/main.rs"

 kitoken = { path = "../..", features = ["all"] }

-log = { version = "0.4.27" }
-clap = { version = "4.5.36", features = [
+log = { version = "0.4" }
+clap = { version = "4.5", features = [
     "std",
     "color",
     "suggestions",
     "derive",
     "unicode",
     "wrap_help",
 ] }
-simple_logger = { version = "5.0" }
+simple_logger = { version = "5.2" }

packages/cli/src/main.rs: 28 additions & 18 deletions

@@ -1,10 +1,10 @@
 use std::fs::File;
 use std::io::{BufReader, BufWriter, Read, Seek};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::Once;

 use clap::Parser;
-use kitoken::{Definition, DeserializationError, Kitoken};
+use kitoken::{Definition, DeserializationError, Kitoken, WebRequestError};

 #[derive(Parser)]
 enum Command {
@@ -82,20 +82,18 @@ pub fn main() {
                 std::process::exit(1);
             }
             for path in paths {
-                convert(&path, true).unwrap_or_else(|error| {
+                convert_path(path.to_str().unwrap(), true).unwrap_or_else(|error| {
                     eprintln!("{}", error);
                     std::process::exit(1);
                 });
             }
         }
         Command::Compare { one, two } => {
-            let one = Path::new(&one);
-            let two = Path::new(&two);
-            let one = convert(one, false).unwrap_or_else(|error| {
+            let one = convert_path(&one, false).unwrap_or_else(|error| {
                 eprintln!("{}", error);
                 std::process::exit(1);
             });
-            let two = convert(two, false).unwrap_or_else(|error| {
+            let two = convert_path(&two, false).unwrap_or_else(|error| {
                 eprintln!("{}", error);
                 std::process::exit(1);
             });
@@ -129,18 +127,16 @@ pub fn main() {
             }
         }
         Command::Inspect { path } => {
-            let path = Path::new(&path);
-            let model = convert(path, false).unwrap_or_else(|error| {
+            let model = convert_path(&path, false).unwrap_or_else(|error| {
                 eprintln!("{}", error);
                 std::process::exit(1);
             });
             println!("Specials: {:#?}", model.specials);
             println!("{:#?}", model);
         }
         Command::Encode { model, input } => {
-            let model = Path::new(&model);
             let inputp = Path::new(&input);
-            let model = convert(model, false).unwrap_or_else(|error| {
+            let model = convert_path(&model, false).unwrap_or_else(|error| {
                 eprintln!("{}", error);
                 std::process::exit(1);
             });
@@ -166,9 +162,8 @@ pub fn main() {
             println!()
         }
         Command::Decode { model, input } => {
-            let model = Path::new(&model);
             let inputp = Path::new(&input);
-            let model = convert(model, false).unwrap_or_else(|error| {
+            let model = convert_path(&model, false).unwrap_or_else(|error| {
                 eprintln!("{}", error);
                 std::process::exit(1);
             });
@@ -202,10 +197,27 @@ pub fn main() {
     }
 }

-pub fn convert(path: &Path, write: bool) -> Result<Definition, DeserializationError> {
+pub fn convert_web(url: &str) -> Result<Definition, WebRequestError> {
+    let definition = Definition::from_web(url)?;
+    eprintln!("Definition loaded from {}", definition.meta.source);
+    convert(definition, None).map_err(|e| e.into())
+}
+
+pub fn convert_path(path: &str, write: bool) -> Result<Definition, WebRequestError> {
+    if path.starts_with("hf:") || path.starts_with("http:") || path.starts_with("https:") {
+        return convert_web(path);
+    }
     let mut reader = BufReader::new(File::open(path)?);
     let definition = Definition::from_reader(&mut reader)?;
-    eprintln!("Definition loaded from {}", path.display());
+    eprintln!("Definition loaded from {}", path);
+    eprintln!("Input size: {} bytes", reader.stream_position()?);
+    convert(definition, write.then(|| PathBuf::from(path).with_extension("kit")))
+        .map_err(|e| e.into())
+}
+
+pub fn convert(
+    definition: Definition, out: Option<PathBuf>,
+) -> Result<Definition, DeserializationError> {
     match definition.model {
         kitoken::Model::BytePair { .. } => eprintln!("Model type: BPE"),
         kitoken::Model::Unigram { .. } => eprintln!("Model type: Unigram"),
@@ -214,9 +226,7 @@ pub fn convert(path: &Path, write: bool) -> Result<Definition, DeserializationEr
     }
     eprintln!("Vocab size: {}", definition.model.vocab().len());
     eprintln!("Specials size: {}", definition.specials.len());
-    eprintln!("Input size: {} bytes", reader.stream_position()?);
-    if write {
-        let out = path.with_extension("kit");
+    if let Some(out) = out {
         let mut writer = BufWriter::new(File::create(&out)?);
         definition.to_writer(&mut writer)?;
         eprintln!("Definition written to {}", out.display());

packages/python/README.md: 4 additions & 4 deletions

@@ -12,7 +12,7 @@
 ```py
 from kitoken import Kitoken

-encoder = Kitoken.from_file("models/llama4.model")
+encoder = Kitoken.from_web("hf:Qwen/Qwen3.5-9B")

 tokens = encoder.encode("hello world!", True)
 string = encoder.decode(tokens).decode("utf-8")
@@ -25,12 +25,12 @@ assert string == "hello world!"
 Kitoken is a fast and versatile tokenizer for language models compatible with [SentencePiece](https://github.com/google/sentencepiece), [HuggingFace Tokenizers](https://github.com/huggingface/tokenizers), [OpenAI Tiktoken](https://github.com/openai/tiktoken) and [Mistral Tekken](https://docs.mistral.ai/guides/tokenization), supporting BPE, Unigram and WordPiece tokenization.

 - **Fast and efficient tokenization**\
-  Faster than most other tokenizers in both common and uncommon scenarios; see the [benchmarks](//github.com/Systemcluster/kitoken#benchmarks) for comparisons with different datasets.
+  Faster than most other tokenizers in both common and uncommon scenarios; see the [benchmarks](#benchmarks) for comparisons with different datasets.
 - **Runs in all environments**\
   Native in Rust and with bindings for [Web](./packages/javascript), [Node](./packages/javascript) and [Python](./packages/python); see [kitoken.dev](https://kitoken.dev) for a web demo.
 - **Supports input and output processing**\
-  Including unicode-aware normalization, pre-tokenization and post-decoding options.
-- **Compact data format**\
+  Including unicode-aware normalization, pre-tokenization and post-processing options.
+- **Compact data encoding**\
   Definitions are stored in an efficient binary format and without merge list.

 See the main [README](//github.com/Systemcluster/kitoken) for more information.

packages/python/kitoken.pyi: 15 additions & 2 deletions

@@ -49,7 +49,9 @@ class Kitoken:
         """
         ...

-    def encode_all(self, text: list[str], encode_specials: Optional[bool] = False) -> list[list[int]]:
+    def encode_all(
+        self, text: list[str], encode_specials: Optional[bool] = False
+    ) -> list[list[int]]:
         """
         Encodes the given texts into sequences of tokens.
         If `encode_specials` is `True`, the text is first split around special tokens which are separately encoded with the special encoder.
@@ -69,7 +71,9 @@
         """
         ...

-    def decode_all(self, data: list[list[int]], decode_specials: Optional[bool] = False) -> list[bytes]:
+    def decode_all(
+        self, data: list[list[int]], decode_specials: Optional[bool] = False
+    ) -> list[bytes]:
         """
         Decodes the given sequences of tokens into texts.
         Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
@@ -177,3 +181,12 @@
         :param path: The path to the file.
         """
         ...
+
+    @staticmethod
+    def from_web(url: str) -> Kitoken:
+        """
+        Initializes the tokenizer from a model URL.
+
+        :param url: The URL to the file.
+        """
+        ...

packages/python/src/lib.rs: 7 additions & 0 deletions

@@ -229,6 +229,13 @@ impl Kitoken {
                 .map_err(convert_error)?,
         })
     }
+
+    #[staticmethod]
+    pub fn from_web(url: &str, py: Python<'_>) -> PyResult<Kitoken> {
+        Ok(Kitoken {
+            inner: py.detach(|| Inner::from_web(url)).map(Arc::new).map_err(convert_error)?,
+        })
+    }
 }

 #[pymodule]
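A note on the binding above: wrapping the call in `py.detach` releases the Python GIL while the definition downloads, so other Python threads keep running during the fetch, and `convert_error` maps the Rust error into a Python exception, consistent with the crate's other constructors.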

packages/python/test.py: 7 additions & 0 deletions

@@ -39,3 +39,10 @@

 encoder.to_bytes()
 print("OK")
+
+try:
+    encoder = Kitoken.from_web("hf:Qwen/Qwen3.5-9B")
+    print(encoder)
+    print("Web OK")
+except Exception as e:
+    print(f"Web ERR: {e}")

src/encoder/wordpiece.rs: 1 addition & 0 deletions

@@ -170,6 +170,7 @@ impl WordPiece {
             || self.max_word_chars > 0 && indices.clone().count() > self.max_word_chars
         {
             if fallback.peek() == Some(&Fallback::Unknown) && self.unknown.is_some() {
+                #[allow(clippy::unnecessary_unwrap)]
                 result.push(self.unknown.as_ref().unwrap().id);
             } else if fallback.peek() == Some(&Fallback::Skip) {
             } else {

src/lib.rs: 14 additions & 2 deletions

@@ -53,6 +53,7 @@
 //! - `regex-onig`: Enables use of the `oniguruma` regex engine instead of `fancy-regex`.
 //!   Generally not recommended since it has worse runtime performance and adds a dependency on the native `oniguruma` library.
 //!   However, it may be useful for certain models that require specific regex behavior that is not supported by or differs with `fancy-regex`.
+//! - `web`: Enables fetching definitions from HuggingFace or other URLs.

 #![no_std]
 #![cfg_attr(docsrs, feature(doc_auto_cfg, doc_cfg_hide))]
@@ -73,6 +74,8 @@ mod vocab;

 #[cfg(feature = "serialization")]
 mod serialization;
+#[cfg(feature = "web")]
+mod web;

 pub mod convert;

@@ -95,6 +98,11 @@ pub use crate::vocab::*;

 #[cfg(feature = "serialization")]
 pub use crate::serialization::*;
+#[cfg(feature = "web")]
+pub use crate::web::*;
+
+#[doc(hidden)]
+pub mod util;

 /// Errors encountered during initialization.
 #[non_exhaustive]
@@ -104,13 +112,17 @@ pub enum InitializationError {
     #[error("invalid config: {0}")]
     InvalidConfig(ConfigurationError),
     /// The encoder and scores must have the same length in unigram mode.
-    #[error("encoder and scores must have the same length in unigram mode and every token must have a score")]
+    #[error(
+        "encoder and scores must have the same length in unigram mode and every token must have a score"
+    )]
     InvalidScores,
     /// The encoder and decoder must have the same length and the encoder must not have duplicates.
     #[error("encoder and decoder must have the same length and vocab must not have duplicates")]
     InvalidEncoder,
     /// The special encoder and decoder must have the same length and the special encoder must not have duplicates.
-    #[error("special encoder and decoder must have the same length and specials must not have duplicates")]
+    #[error(
+        "special encoder and decoder must have the same length and specials must not have duplicates"
+    )]
     InvalidSpecialEncoder,
     /// The split regex failed to compile.
     #[error("invalid regex: {0}")]
