diff --git a/Cslib.lean b/Cslib.lean index b504973c3..403360f05 100644 --- a/Cslib.lean +++ b/Cslib.lean @@ -139,4 +139,5 @@ public import Cslib.Logics.Modal.Denotation public import Cslib.Logics.Propositional.Defs public import Cslib.Logics.Propositional.NaturalDeduction.Basic public import Cslib.MachineLearning.PACLearning.Defs +public import Cslib.MachineLearning.PACLearning.VersionSpace public import Cslib.Probability.PMF diff --git a/Cslib/MachineLearning/PACLearning/VersionSpace.lean b/Cslib/MachineLearning/PACLearning/VersionSpace.lean new file mode 100644 index 000000000..02924bafc --- /dev/null +++ b/Cslib/MachineLearning/PACLearning/VersionSpace.lean @@ -0,0 +1,296 @@ +/- +Copyright (c) 2026 Dhruv Gupta. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Authors: Dhruv Gupta +-/ + +module + +public import Cslib.MachineLearning.PACLearning.Defs +public import Mathlib.MeasureTheory.Measure.Dirac +public import Mathlib.MeasureTheory.Measure.Map + +/-! # Version Space + +The *version space* of a concept class `C` given a labeled sample `S` is the +subset of `C` whose concepts agree with `S` on every observed point — the +classical "concepts still consistent with the data" of Mitchell (1977) and +Angluin (1980). + +## Main definitions + +- `VersionSpace C S`: the subset of `C` whose concepts agree with `S` on every + sample point. +- `IsConsistent A C`: a learner is consistent with `C` if its output always lies + in the version space at the received sample. +- `empiricalMiscount h S`: number of sample points where `h` errs (`[DecidableEq β]`). +- `empiricalMeasure S`: the uniform Dirac mixture over the sample. +- `empiricalError h S`: the empirical distribution's mass on the disagreement set. +- `Realizable C S`: some concept in `C` labels every sample point correctly. + +## Main results + +- `versionSpace_subset`, `versionSpace_empty_sample`, `versionSpace_reindex`, + `versionSpace_antitone`, `versionSpace_mono_C`: structural properties. +- `mem_versionSpace_iff_empiricalMiscount_zero`: combinatorial bridge. +- `mem_versionSpace_iff_empiricalError_zero`: measure-theoretic bridge. +- `IsConsistent.empiricalMiscount_eq_zero`, `IsConsistent.empiricalError_eq_zero`: + consistent learners achieve zero error / miscount on every sample. +- `mem_versionSpace_of_realizable`, `Realizable.versionSpace_nonempty`: realizable + samples give non-empty version spaces. +- `ae_mem_versionSpace_of_realizable`: under iid sampling from a realizable + joint distribution, the target concept lies in the version space almost surely. + +## References + +* [Mitchell1977] +* [Mitchell1982] +* [Angluin1980] +* [Mitchell1997] +-/ + +@[expose] public section + +open MeasureTheory Set +open scoped ENNReal + +namespace Cslib.MachineLearning.PACLearning + +variable {α : Type*} {β : Type*} + +/-! ### Version Space -/ + +/-- The *version space* of a concept class `C` given a labeled sample `S`: +the set of concepts in `C` whose labels agree with `S` on every observed point. -/ +def VersionSpace {m : ℕ} (C : ConceptClass α β) (S : LabeledSample α β m) : + ConceptClass α β := + {h ∈ C | ∀ i : Fin m, h (S i).1 = (S i).2} + +/-- Membership in the version space unfolds to concept membership plus +per-sample consistency. -/ +theorem mem_versionSpace_iff {m : ℕ} {C : ConceptClass α β} + {S : LabeledSample α β m} {h : α → β} : + h ∈ VersionSpace C S ↔ h ∈ C ∧ ∀ i : Fin m, h (S i).1 = (S i).2 := Iff.rfl + +/-- The version space is a subset of the original concept class. -/ +theorem versionSpace_subset {m : ℕ} (C : ConceptClass α β) + (S : LabeledSample α β m) : + VersionSpace C S ⊆ C := fun _ hh => hh.1 + +/-- Version space on the empty sample equals the whole concept class. -/ +theorem versionSpace_empty_sample (C : ConceptClass α β) + (S : LabeledSample α β 0) : + VersionSpace C S = C := by + ext h + refine ⟨fun hh => hh.1, fun hh => ⟨hh, fun i => i.elim0⟩⟩ + +/-- *Version space reindexing.* For any reindexing `f : Fin m → Fin n`, the +version space on `S` is contained in the version space on the reindexed sample +`S ∘ f`. -/ +theorem versionSpace_reindex {m n : ℕ} (f : Fin m → Fin n) (C : ConceptClass α β) + (S : LabeledSample α β n) : + VersionSpace C S ⊆ VersionSpace C (S ∘ f) := + fun _ hh => ⟨hh.1, fun i => hh.2 (f i)⟩ + +/-- *Version space antitonicity.* Given a sample of size `n` and `m ≤ n`, the +version space on all `n` observations is a subset of the version space on the +first `m` observations. Special case of `versionSpace_reindex` with +`f := Fin.castLE hmn`. -/ +theorem versionSpace_antitone {m n : ℕ} (hmn : m ≤ n) (C : ConceptClass α β) + (S : LabeledSample α β n) : + VersionSpace C S ⊆ VersionSpace C (S ∘ Fin.castLE hmn) := + versionSpace_reindex (Fin.castLE hmn) C S + +/-- *Version space is monotone in the concept class.* -/ +theorem versionSpace_mono_C {m : ℕ} {C C' : ConceptClass α β} (hCC' : C ⊆ C') + (S : LabeledSample α β m) : + VersionSpace C S ⊆ VersionSpace C' S := + fun _ hh => ⟨hCC' hh.1, hh.2⟩ + +/-! ### Empirical Error -/ + +/-- The *empirical miscount* of a hypothesis `h` on a labeled sample `S`: the +number of sample points where `h` predicts incorrectly. -/ +def empiricalMiscount [DecidableEq β] {m : ℕ} (h : α → β) + (S : LabeledSample α β m) : ℕ := + (Finset.univ.filter fun i : Fin m => h (S i).1 ≠ (S i).2).card + +section EmpiricalMeasure +variable [MeasurableSpace α] [MeasurableSpace β] + +/-- The *empirical distribution* of a labeled sample: the uniform mixture of +Dirac measures at each sample point. Equals the zero measure when `m = 0`. -/ +noncomputable def empiricalMeasure {m : ℕ} (S : LabeledSample α β m) : + Measure (α × β) := + if _hm : m = 0 then 0 + else (m : ℝ≥0∞)⁻¹ • ∑ i, Measure.dirac (S i) + +/-- The *empirical 0-1 error* of `h` on `S`: the empirical distribution's +mass on the disagreement set. -/ +noncomputable def empiricalError {m : ℕ} (h : α → β) (S : LabeledSample α β m) : + ℝ≥0∞ := + error (empiricalMeasure S) h + +end EmpiricalMeasure + +/-- Version-space membership equals concept-class membership plus zero empirical +miscount (combinatorial bridge). -/ +theorem mem_versionSpace_iff_empiricalMiscount_zero [DecidableEq β] + {m : ℕ} {C : ConceptClass α β} {S : LabeledSample α β m} {h : α → β} : + h ∈ VersionSpace C S ↔ h ∈ C ∧ empiricalMiscount h S = 0 := by + simp only [empiricalMiscount, Finset.card_eq_zero, Finset.filter_eq_empty_iff, + Finset.mem_univ, true_implies, ne_eq, Decidable.not_not] + rfl + +/-- Version-space membership equals concept-class membership plus zero empirical +error (measure-theoretic bridge). -/ +theorem mem_versionSpace_iff_empiricalError_zero + [MeasurableSpace α] [MeasurableSpace β] + [MeasurableSingletonClass α] [MeasurableSingletonClass β] + {m : ℕ} {C : ConceptClass α β} {S : LabeledSample α β m} {h : α → β} : + h ∈ VersionSpace C S ↔ h ∈ C ∧ empiricalError h S = 0 := by + refine and_congr_right fun _ => ?_ + unfold empiricalError empiricalMeasure error + rcases Nat.eq_zero_or_pos m with hm | hm + · subst hm + rw [dif_pos rfl] + simp only [Measure.coe_zero, Pi.zero_apply] + exact iff_of_true (fun i => i.elim0) trivial + · have hm_ne : m ≠ 0 := Nat.pos_iff_ne_zero.mp hm + have hm_inv_ne : (m : ℝ≥0∞)⁻¹ ≠ 0 := + ENNReal.inv_ne_zero.mpr (ENNReal.natCast_ne_top m) + rw [dif_neg hm_ne, Measure.smul_apply, Measure.finsetSum_apply] + simp only [Measure.dirac_apply, Set.indicator, Set.mem_setOf_eq, Pi.one_apply, + smul_eq_mul] + rw [mul_eq_zero] + constructor + · intro hh + right + apply Finset.sum_eq_zero + intro i _ + rw [if_neg] + intro hne + exact hne (hh i) + · rintro (h1 | h2) + · exact absurd h1 hm_inv_ne + · intro i + have hi := (Finset.sum_eq_zero_iff.mp h2) i (Finset.mem_univ i) + by_contra hne + rw [if_pos hne] at hi + exact one_ne_zero hi + +/-! ### Consistent Learners -/ + +/-- A learner is *consistent* with the concept class `C` if, on every labeled +sample it receives, its output hypothesis lies in the version space of `C` at +that sample — i.e. the output is in `C` and agrees with every observed +labeled pair. -/ +def IsConsistent {m : ℕ} (A : Learner α β m) (C : ConceptClass α β) : Prop := + ∀ S : LabeledSample α β m, A S ∈ VersionSpace C S + +/-- A consistent learner's output is always in the concept class. -/ +theorem IsConsistent.output_mem_conceptClass {m : ℕ} {A : Learner α β m} + {C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m) : + A S ∈ C := (hA S).1 + +/-- A consistent learner's output agrees with the sample on every observed +point. -/ +theorem IsConsistent.output_agrees {m : ℕ} {A : Learner α β m} + {C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m) + (i : Fin m) : + A S (S i).1 = (S i).2 := (hA S).2 i + +/-- A consistent learner has zero empirical miscount on every sample. -/ +theorem IsConsistent.empiricalMiscount_eq_zero [DecidableEq β] + {m : ℕ} {A : Learner α β m} {C : ConceptClass α β} (hA : IsConsistent A C) + (S : LabeledSample α β m) : + empiricalMiscount (A S) S = 0 := + (mem_versionSpace_iff_empiricalMiscount_zero.mp (hA S)).2 + +/-- A consistent learner has zero empirical error on every sample. -/ +theorem IsConsistent.empiricalError_eq_zero + [MeasurableSpace α] [MeasurableSpace β] + [MeasurableSingletonClass α] [MeasurableSingletonClass β] + {m : ℕ} {A : Learner α β m} {C : ConceptClass α β} + (hA : IsConsistent A C) (S : LabeledSample α β m) : + empiricalError (A S) S = 0 := + (mem_versionSpace_iff_empiricalError_zero.mp (hA S)).2 + +/-! ### Realizable case -/ + +/-- A labeled sample `S` is *realizable* by concept class `C` if some concept +in `C` labels every sample point correctly. -/ +def Realizable {m : ℕ} (C : ConceptClass α β) (S : LabeledSample α β m) : Prop := + ∃ c ∈ C, ∀ i : Fin m, (S i).2 = c (S i).1 + +/-- *Realizable version-space nonemptiness.* If a target concept `c` lies in +`C` and the sample `S` is labeled by `c` (i.e. every `(S i).2 = c (S i).1`), +then `c` itself lies in the version space `VersionSpace C S`. -/ +theorem mem_versionSpace_of_realizable {m : ℕ} {C : ConceptClass α β} + {c : α → β} (hc : c ∈ C) (S : LabeledSample α β m) + (hS : ∀ i : Fin m, (S i).2 = c (S i).1) : + c ∈ VersionSpace C S := + ⟨hc, fun i => (hS i).symm⟩ + +/-- A realizable sample has nonempty version space. -/ +theorem Realizable.versionSpace_nonempty {m : ℕ} {C : ConceptClass α β} + {S : LabeledSample α β m} (h : Realizable C S) : + (VersionSpace C S).Nonempty := + ⟨h.choose, mem_versionSpace_of_realizable h.choose_spec.1 S h.choose_spec.2⟩ + +/-! ### Probabilistic Realizable -/ + +/-- Under the pushforward of a probability measure `P` along the graph map +`x ↦ (x, c x)`, the graph of `c` has measure `1`. -/ +private lemma map_graph_eq_one + [MeasurableSpace α] [MeasurableSpace β] + {c : α → β} (hcm : Measurable c) (P : Measure α) [IsProbabilityMeasure P] + (hG : MeasurableSet {p : α × β | p.2 = c p.1}) : + (P.map (fun x => (x, c x))) {p : α × β | p.2 = c p.1} = 1 := by + have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop + rw [Measure.map_apply hφ hG] + have hpre : (fun x : α => (x, c x)) ⁻¹' {p : α × β | p.2 = c p.1} = Set.univ := by + ext x; simp + rw [hpre, measure_univ] + +/-- The iid product of the realizable joint distribution assigns measure `1` +to the set of samples where every coordinate lies on the graph of `c`. -/ +private lemma pi_map_graph_eq_one + [MeasurableSpace α] [MeasurableSpace β] + {c : α → β} (hcm : Measurable c) (P : Measure α) [IsProbabilityMeasure P] + (hG : MeasurableSet {p : α × β | p.2 = c p.1}) {m : ℕ} : + (Measure.pi (fun _ : Fin m => P.map (fun x => (x, c x)))) + (Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1})) = 1 := by + have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop + haveI : IsProbabilityMeasure (P.map (fun x : α => (x, c x))) := + Measure.isProbabilityMeasure_map hφ.aemeasurable + rw [Measure.pi_pi] + simp [map_graph_eq_one hcm P hG] + +/-- Under iid sampling from the realizable joint distribution induced by +`c ∈ C` and a probability measure `P` on `α`, the target concept `c` lies in +the version space almost surely. -/ +theorem ae_mem_versionSpace_of_realizable + [MeasurableSpace α] [MeasurableSpace β] + {C : ConceptClass α β} {c : α → β} (hc : c ∈ C) (hcm : Measurable c) + (hG : MeasurableSet {p : α × β | p.2 = c p.1}) + (P : Measure α) [IsProbabilityMeasure P] (m : ℕ) : + ∀ᵐ S : LabeledSample α β m + ∂(Measure.pi (fun _ : Fin m => P.map (fun x => (x, c x)))), + c ∈ VersionSpace C S := by + have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop + haveI : IsProbabilityMeasure (P.map (fun x : α => (x, c x))) := + Measure.isProbabilityMeasure_map hφ.aemeasurable + rw [ae_iff] + have hsub : {S : Fin m → α × β | ¬ c ∈ VersionSpace C S} ⊆ + (Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1}))ᶜ := by + intro S hS hcontra + simp only [Set.mem_pi, Set.mem_univ, true_implies, Set.mem_setOf_eq] at hcontra + exact hS ⟨hc, fun i => (hcontra i).symm⟩ + have hcompl : (Measure.pi (fun _ : Fin m => P.map (fun x : α => (x, c x)))) + ((Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1}))ᶜ) = 0 := by + rw [prob_compl_eq_one_sub (MeasurableSet.univ_pi fun _ => hG), + pi_map_graph_eq_one hcm P hG, tsub_self] + exact measure_mono_null hsub hcompl + +end Cslib.MachineLearning.PACLearning diff --git a/references.bib b/references.bib index f0d122d17..31d2c452b 100644 --- a/references.bib +++ b/references.bib @@ -370,3 +370,42 @@ @incollection{WinskelNielsen1995 url = {https://doi.org/10.1093/oso/9780198537809.003.0001}, eprint = {https://academic.oup.com/book/0/chapter/421962123/chapter-pdf/52352653/isbn-9780198537809-book-part-1.pdf}, } + +@inproceedings{Mitchell1977, + author = {Mitchell, Tom M.}, + title = {Version Spaces: A Candidate Elimination Approach to Rule Learning}, + booktitle = {Proceedings of the 5th International Joint Conference on Artificial Intelligence}, + volume = {1}, + pages = {305--310}, + year = {1977} +} + +@article{Mitchell1982, + author = {Mitchell, Tom M.}, + title = {Generalization as Search}, + journal = {Artificial Intelligence}, + volume = {18}, + number = {2}, + pages = {203--226}, + year = {1982}, + doi = {10.1016/0004-3702(82)90040-6} +} + +@article{Angluin1980, + author = {Angluin, Dana}, + title = {Inductive Inference of Formal Languages from Positive Data}, + journal = {Information and Control}, + volume = {45}, + number = {2}, + pages = {117--135}, + year = {1980}, + doi = {10.1016/S0019-9958(80)90285-5} +} + +@book{Mitchell1997, + author = {Mitchell, Tom M.}, + title = {Machine Learning}, + year = {1997}, + publisher = {McGraw-Hill}, + isbn = {0070428077} +}