From b9cd30fffb7a2ddff13dc2b9ead5c561b694dc72 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 4 Jun 2026 16:08:12 +1200 Subject: [PATCH 1/4] re-write the obs paragraph in index.md --- docs/src/img/obs.drawio | 1 + docs/src/img/obs.svg | 3 +++ docs/src/index.md | 54 ++++++++++++++++++++++++++--------------- 3 files changed, 39 insertions(+), 19 deletions(-) create mode 100644 docs/src/img/obs.drawio create mode 100644 docs/src/img/obs.svg diff --git a/docs/src/img/obs.drawio b/docs/src/img/obs.drawio new file mode 100644 index 00000000..d534d418 --- /dev/null +++ b/docs/src/img/obs.drawio @@ -0,0 +1 @@ +7Vldb5swFP01SNtDJ8CEpI9r2n1I60NbTWv3MrnggDtjI2MS0l8/O5ivmEaZmoRE7UNV7rHB9jn2PRdigWlSfOUwja9ZiIjl2mFhgUvLdR3H9uU/hSxLZKIiBUQch7pTA9zhZ6RBW6M5DlHW6SgYIwKnXTBglKJAdDDIOVt0u80Y6Y6awggZwF0AiYn+wqGIq3X5503DN4SjWFTrG5cNCaw665VkMQzZogWBKwtMOWOivEqKKSKKvIqX8r4vL7TWE+OIim1uCG5+jp+vb5/S2+KGirPvsyj6faZXkYlltWAUyvXrkHERs4hRSK4a9IKznIZIPdWWUdPnB2OpBB0JPiEhllpMmAsmoVgkRLeiAot7dfunkY4eWi2XhX7yKljqYMao0A90fBmX81aTfZEODWUs5wHawIGntxXkERIb+vm1aHK3I5YgwZfyPo4IFHjenQfU2y6q+zXKyAstzn8I5YwMpV6vA5UzawmhwoeKfBU0UqyiI9LCcYcUQ89yDkmuR9ogjmJ6EWOB7lK4WvtCZsquEG1CvZrQOeICFZspNRnQN9QJVOddT4eLVhKrMlPcyl++vSfORgZneYa45fpEjnzxqK4idRVCAQ02JQ+iS1kmOPuLpowwLhHKqMpMM0zIGgQJjqgMA8mbHA5cKFaxzO+fdUOCw3CV1vo06h6x/n3/OplGXZl8UyavRyV3Xyr5p7ezwfnQW3tskJaUddBb3cWeM/A2rsZvSTLD4gNBkFOVdORT7T/y7+OJaKTiajjLBQD4vm3vRrv6qGjtalHax2nSIx7Yl3h1wdxflGq6B6lCj7BictxtS6bxkCVTNc1jdhZv7Sg4PW58WGdxgMEae5Rl01wqxmh2ItlrF9L4a9JMhnYYs1KSypyuw+xAI7BWBfTVZYc1EnD8RlK/hz90XWUAIxlvaSTDfgcxi+2j95Hek3BYH5kYrGX5YwaTlEiejHfwd49Z5aWh32KqJbRUkydT2Yw2l9JmMA1xgLI35DTeZO2A9X3dOqzVmJ8E3184XygT1sTr/X6zI/Fk2PzAsmpr/UwFrv4B \ No newline at end of file diff --git a/docs/src/img/obs.svg b/docs/src/img/obs.svg new file mode 100644 index 00000000..0b24d7c9 --- /dev/null +++ b/docs/src/img/obs.svg @@ -0,0 +1,3 @@ + + +
user
data
user...
model
model
fit(learner, _ )
fit(learner, _ )
observations
obser...
obs(learner, _ )
obs(learner, _ )
subsampled
observations
subsa...
getobs( _ , indices)
getobs( _ , indices)
fit(learner, _ )
fit(learner, _ )
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index bbe4c372..858d43a7 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -70,25 +70,41 @@ appear as an input to training but not to prediction. ## Data interfaces and front ends -Algorithms are free to consume data in any format. However, a method called [`obs`](@ref -data_interface) (read as "observations") gives developers the option of providing a -separate data front end for their algorithms. In this case `obs` gives users and -meta-algorithms access to an algorithm-specific representation of input data, which is -additionally guaranteed to implement a standard interface for accessing individual -observations, unless the algorithm explicitly opts out. Moreover, the `fit` and `predict` -methods can directly consume these alternative data representations, for performance -benefits in some situations, such as cross-validation. - -The fallback data interface is the [MLCore.jl](https://github.com/JuliaML/MLCore.jl) -`getobs/numobs` interface (previously provided by MLUtils.jl) here tagged as -[`LearnAPI.RandomAccess()`](@ref). However, if the input consumed by the algorithm already -implements that interface (tables, arrays, etc.) then overloading `obs` is completely -optional. Plain iteration interfaces, with or without knowledge of the number of -observations, can also be specified, to support, e.g., data loaders reading images from -disk. - -Some canned data front ends (implementations of [`obs`](@ref)) are provided by the -[LearnDataFrontEnds.jl](https://juliaai.github.io/LearnDataFrontEnds.jl/stable/) package. +Algorithms are free to consume data in any format. However, this means LearnAPI.jl should +provide meta-algorithms, such as cross-validation, with some means of subsampling observations, +without unnecessarily repeating internal conversions of input data into the form needed by +core algorithms. 
LearnAPI.jl's solution to this problem is to provide a method called +[`obs(learner, data)`](@ref data_interface) (read as "observations") which exposes to the +user, and hence to third-party meta-algorithms, a learner-specific, "internal" +representation of the "external" `data` ordinarily supplied to `fit` (or `predict`) by the +user. For example, `data` might be a table with mixed column types, but `obs(learner, +data)` consists only of numerical arrays. Unless the implementation opts out, such a +representation is additionally guaranteed to implement a standard interface for accessing +individual observations, the [MLCore.jl](https://github.com/JuliaML/MLCore.jl) +`getobs/numobs` API (previously provided by MLUtils.jl) which is here tagged as +[`LearnAPI.RandomAccess()`](@ref). These can then be subsampled, without caring about the +details of the representation, as in cross-validation. Moreover, such "observations" +(sampled or not) can be passed on to `fit` and `predict`, instead of the original external +form of `data`. In other words, `obs` factors out of `fit` the internal preprocessing of +user-supplied data, but in a way that ensures the intercepted, internal form of data +implements a standard subsampling API. + +![](img/obs.svg) + +> Two pathways to generating a model, with and without subsampling. Here `obs` is provided +> by a LearnAPI.jl learner implementation, while `getobs` is an MLCore.jl method for +> subsampling. + +If the input consumed by the algorithm already implements the +[`LearnAPI.RandomAccess()`](@ref) interface (tables, arrays, etc.) then overloading `obs` +is completely optional, as LearnAPI.jl provides a no-operation fallback. Plain iteration +interfaces, with or without knowledge of the number of observations, can also be +specified, to support, e.g., data loaders reading images from disk. 
+ +In the typical case, a new implementation can avoid actually coding data preprocessing by +using a canned data front end (an implementation of [`obs`](@ref)). Such front ends are provided by +the [LearnDataFrontEnds.jl](https://juliaai.github.io/LearnDataFrontEnds.jl/stable/) +package. ## Learning more From a4307a269aa5633d1813261f90f68e7577ca403f Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 5 Jun 2026 08:04:48 +1200 Subject: [PATCH 2/4] config typos to ignore SVG files --- typos.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/typos.toml b/typos.toml index 8f5d6f5a..80346412 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,5 @@ +extend-exclude = ["*.svg"] + [default.extend-words] # Don't correct "mape" to "map" mape = "mape" From 5f34df7c753de8c8fb4c729b7100026c91e83588 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 5 Jun 2026 08:21:51 +1200 Subject: [PATCH 3/4] try again --- typos.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/typos.toml b/typos.toml index 80346412..da58b5db 100644 --- a/typos.toml +++ b/typos.toml @@ -1,3 +1,4 @@ +[files] extend-exclude = ["*.svg"] [default.extend-words] From b17f7dea942464c626e2f7c5267751f7de3705ea Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 5 Jun 2026 09:14:13 +1200 Subject: [PATCH 4/4] exclude drawio files from typos --- typos.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/typos.toml b/typos.toml index da58b5db..7c9e621b 100644 --- a/typos.toml +++ b/typos.toml @@ -1,5 +1,5 @@ [files] -extend-exclude = ["*.svg"] +extend-exclude = ["*.svg", "*.drawio"] [default.extend-words] # Don't correct "mape" to "map"