Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion pkg/ingest/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,17 @@ func (p *Pipeline) persistTree(ctx context.Context, store docPersister, docID tr
// time so we never persist bytes the LLM SDKs would reject
// later. PDFs with CID-mapped fonts and no ToUnicode CMap
// leak raw glyph IDs into extracted text.
// Only assign a ContentRef when we actually wrote content. A
// leaf whose text is empty after cleanForLLM (heading-only
// sections, or CID-font garbage stripped to nothing) gets NO
// object stored, so it must get NO ref — otherwise every later
// read (summarize, HyDE, treewalk get_pages) chases a key that
// was never written and fails with "storage: object not found".
// Empty ContentRef is already the canonical "no stored content"
// state every reader guards on (summaryFor falls back to the
// title; the treewalk loader returns the summary or empty).
cleanedContent := cleanForLLM(s.Content)
contentRef := ""
if strings.TrimSpace(cleanedContent) != "" {
if err := p.Storage.Put(ctx, contentKey,
bytes.NewReader([]byte(cleanedContent)),
Expand All @@ -638,6 +648,7 @@ func (p *Pipeline) persistTree(ctx context.Context, store docPersister, docID tr
}); err != nil {
return fmt.Errorf("store section %s: %w", id, err)
}
contentRef = contentKey
}

if err := store.UpsertSection(ctx, db.Section{
Expand All @@ -647,7 +658,7 @@ func (p *Pipeline) persistTree(ctx context.Context, store docPersister, docID tr
Ordinal: i,
Depth: depth,
Title: cleanForLLM(s.Title),
ContentRef: contentKey,
ContentRef: contentRef,
TokenCount: approxTokens(cleanedContent),
PageStart: s.PageStart,
PageEnd: s.PageEnd,
Expand Down
75 changes: 75 additions & 0 deletions pkg/ingest/persist_content_ref_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package ingest

import (
"context"
"strings"
"testing"

"github.com/hallelx2/vectorless-engine/pkg/parser"
"github.com/hallelx2/vectorless-engine/pkg/storage"
)

// TestPersistTree_ContentRefMatchesStoredObjects is the HAL-316 regression:
// a leaf only gets a ContentRef when its content was actually written. An
// empty-after-clean leaf must get NO ref, so later reads never chase a key
// that was never written ("object not found").
//
// The test asserts three things about the persisted section rows:
//  1. every non-empty ContentRef resolves to a readable stored object,
//  2. exactly one section (the leaf with a real body) carries a ContentRef,
//  3. both empty-content leaves are present and carry no ContentRef.
func TestPersistTree_ContentRefMatchesStoredObjects(t *testing.T) {
	ctx := context.Background()

	store, err := storage.NewLocal(t.TempDir())
	if err != nil {
		t.Fatalf("NewLocal: %v", err)
	}
	p := &Pipeline{Storage: store}
	fake := &fakeDocStore{}

	doc := &parser.ParsedDoc{
		Title: "Doc",
		Sections: []parser.Section{
			{
				Level: 1, Title: "Parent", // internal node, no content
				Children: []parser.Section{
					{Level: 2, Title: "Has body", Content: "real content here", PageStart: 1, PageEnd: 1},
					{Level: 2, Title: "Heading only", Content: "  \n\t ", PageStart: 2, PageEnd: 2},      // whitespace → empty after clean
					{Level: 2, Title: "Garbage glyphs", Content: "\x00\x01\x02", PageStart: 3, PageEnd: 3}, // stripped to empty
				},
			},
		},
	}

	if err := p.persistTree(ctx, fake, "doc_x", doc); err != nil {
		t.Fatalf("persistTree: %v", err)
	}

	_, _, sections := fake.snapshot()

	// Every section that carries a ContentRef must have a readable object
	// behind it. Sections without a ref are only counted out here; the keys
	// they would have used are not observable from the section rows.
	withRef := 0
	for _, s := range sections {
		if s.ContentRef == "" {
			continue
		}
		withRef++
		rc, _, err := store.Get(ctx, s.ContentRef)
		if err != nil {
			t.Errorf("section %s has ContentRef %q but object is not readable: %v", s.ID, s.ContentRef, err)
			continue
		}
		// Only readability matters; the body itself is not inspected.
		_ = rc.Close()
	}

	// Exactly one leaf ("Has body") had non-empty content, so exactly one
	// ContentRef should exist — the two empty leaves and the parent must
	// carry none.
	if withRef != 1 {
		t.Errorf("expected exactly 1 section with a ContentRef, got %d", withRef)
	}

	// Spot-check the empty leaves explicitly carry no ref. Count the matches
	// so the check cannot pass vacuously when the leaves are missing from the
	// snapshot or their titles no longer match.
	emptyLeaves := 0
	for _, s := range sections {
		if !strings.HasPrefix(s.Title, "Heading only") && !strings.HasPrefix(s.Title, "Garbage glyphs") {
			continue
		}
		emptyLeaves++
		if s.ContentRef != "" {
			t.Errorf("empty-content leaf %q must have no ContentRef, got %q", s.Title, s.ContentRef)
		}
	}
	if emptyLeaves != 2 {
		t.Errorf("expected 2 empty-content leaves in persisted sections, found %d", emptyLeaves)
	}
}
Loading