Skip to content

Commit bc4f488

Browse files
CascadingRadium, Copilot, Likith101, abhinavdangeti
authored
[v17] MB-27666: Nested Fields (#339)
- Zapx now handles nested documents by creating and storing an edge list indicating document relationships.
- Exposed new APIs:
  - `Ancestors`, which allows the caller to fetch the ancestry chain of the requested document.
  - `CountRoot`, which returns the number of root documents in the segment (excluding nested documents).
  - `AddNestedDocuments`, which updates the deleted bitmap from the segment snapshot to contain nested documents as well.
- The total number of documents in the segment will include the nested documents as well.

Requires:
- blevesearch/bleve_index_api#70
- blevesearch/scorch_segment_api#63

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Likith B <likith.b@couchbase.com>
Co-authored-by: Abhi Dangeti <abhinav@couchbase.com>
1 parent 0af0ece commit bc4f488

17 files changed

Lines changed: 2065 additions & 75 deletions

.github/workflows/tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ on:
22
push:
33
branches:
44
- master
5+
- v16.x
56
- v15.x
67
- v14.x
78
- v13.x

.golangci.yml

Lines changed: 0 additions & 28 deletions
This file was deleted.

build.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
170170
invIndexCache: newInvertedIndexCache(),
171171
vecIndexCache: newVectorIndexCache(),
172172
synIndexCache: newSynonymIndexCache(),
173+
nstIndexCache: newNestedIndexCache(),
173174
// the following fields get populated by loadFields
174175
fieldsMap: make(map[string]uint16),
175176
fieldsOptions: make(map[string]index.FieldIndexingOptions),
@@ -189,5 +190,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64
189190
return nil, err
190191
}
191192

193+
// initialize any of the caches if needed
194+
err = sb.nstIndexCache.initialize(sb.numDocs, sb.getEdgeListOffset(), sb.mem)
195+
if err != nil {
196+
return nil, err
197+
}
198+
192199
return sb, nil
193200
}

cmd/zap/cmd/edge.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Copyright (c) 2026 Couchbase, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cmd
16+
17+
import (
18+
"encoding/binary"
19+
"fmt"
20+
21+
"github.com/spf13/cobra"
22+
)
23+
24+
// edgeListCmd represents the edgeList command
25+
var edgeListCmd = &cobra.Command{
26+
Use: "edgeList",
27+
Short: "prints the edge list for nested documents",
28+
Long: `The edgeList command will print the edge list for nested documents in the segment.`,
29+
RunE: func(cmd *cobra.Command, args []string) error {
30+
edgeListAddr, err := segment.EdgeListAddr()
31+
if err != nil {
32+
return fmt.Errorf("error getting edge list: %v", err)
33+
}
34+
if edgeListAddr == 0 {
35+
fmt.Println("no edge list present")
36+
return nil
37+
}
38+
data := segment.Data()
39+
// read edge list
40+
// pos stores the current read position
41+
pos := edgeListAddr
42+
// read number of nested documents which is also the number of edges
43+
numEdges, read := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
44+
if read <= 0 {
45+
return fmt.Errorf("error reading number of edges in nested edge list")
46+
}
47+
pos += uint64(read)
48+
// if no edges or no nested documents, return
49+
if numEdges == 0 {
50+
fmt.Println("no nested documents present")
51+
return nil
52+
}
53+
// edgeList as a map[child]parent
54+
edgeList := make(map[uint64]uint64, numEdges)
55+
for i := uint64(0); i < numEdges; i++ {
56+
child, read := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
57+
if read <= 0 {
58+
return fmt.Errorf("error reading child doc id in nested edge list")
59+
}
60+
pos += uint64(read)
61+
parent, read := binary.Uvarint(data[pos : pos+binary.MaxVarintLen64])
62+
if read <= 0 {
63+
return fmt.Errorf("error reading parent doc id in nested edge list")
64+
}
65+
pos += uint64(read)
66+
edgeList[child] = parent
67+
}
68+
// print number of edges / nested documents
69+
fmt.Printf("number of edges / nested documents: %d\n", len(edgeList))
70+
fmt.Printf("child document number -> parent document number\n")
71+
for child, parent := range edgeList {
72+
fmt.Printf("%d -> %d\n", child, parent)
73+
}
74+
return nil
75+
},
76+
}
77+
78+
func init() {
79+
RootCmd.AddCommand(edgeListCmd)
80+
}

faiss_vector_posting.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,11 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, except *roaring.Bitmap
308308
if rv.vecIndex != nil {
309309
rv.vecIndexSize = rv.vecIndex.Size()
310310
}
311+
312+
// get the number of nested documents in this segment, if any
313+
// to determine if the wrapper needs to handle nested documents
314+
rv.nestedMode = sb.countNested() > 0
315+
311316
return rv, nil
312317
}
313318

faiss_vector_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
// Copyright (c) 2026 Couchbase, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
115
//go:build vectors
216
// +build vectors
317

faiss_vector_wrapper.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ type vectorIndexWrapper struct {
5050
fieldID uint16
5151
vecIndexSize uint64
5252

53+
// nestedMode indicates if the vector index is operating in nested document mode.
54+
// if so we have a reusable ancestry slice to help with docID lookups
55+
nestedMode bool
56+
ancestry []index.AncestorID
57+
5358
sb *SegmentBase
5459
}
5560

@@ -528,9 +533,27 @@ func (v *vectorIndexWrapper) searchClustersFromIVFIndex(eligibleCentroidIDs []in
528533

529534
// Utility function to get the docID for a given vectorID, used for the
530535
// deduplication logic, to map vectorIDs back to their corresponding docIDs
536+
// if we are in nested mode, this method returns the root docID instead of
537+
// the nested docID, by consulting the edge list. This ensures that kNN searches
538+
// return unique root documents when nested documents are involved.
531539
func (v *vectorIndexWrapper) getDocIDForVectorID(vecID int64) (uint32, bool) {
532540
docID, exists := v.mapping.docForVec(uint32(vecID))
533-
return docID, exists
541+
if !v.nestedMode || !exists {
542+
// either not in nested mode, or docID does not exist
543+
// for the vectorID, so just return the docID as is
544+
return docID, exists
545+
}
546+
// in nested mode and docID exists, so we must get the root docID from the edge list
547+
// reuse the wrapper's ancestry slice to avoid allocations
548+
v.ancestry = v.sb.Ancestors(uint64(docID), v.ancestry[:0])
549+
if len(v.ancestry) == 0 {
550+
// should not happen, but just in case, return the docID as is
551+
return docID, exists
552+
}
553+
// return the root docID, which is the last element in the ancestry slice
554+
// in case the docID is a root doc, the ancestry slice would have
555+
// just one element, which is the docID itself
556+
return uint32(v.ancestry[len(v.ancestry)-1]), true
534557
}
535558

536559
// ------------------------------------------------------------------------------

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@ go 1.24
44

55
require (
66
github.com/RoaringBitmap/roaring/v2 v2.4.5
7-
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728
7+
github.com/blevesearch/bleve_index_api v1.3.0
88
github.com/blevesearch/go-faiss v1.0.27
99
github.com/blevesearch/mmap-go v1.0.4
10-
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df
10+
github.com/blevesearch/scorch_segment_api/v2 v2.4.0
1111
github.com/blevesearch/vellum v1.1.0
1212
github.com/golang/snappy v0.0.4
1313
github.com/spf13/cobra v1.7.0

go.sum

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/
33
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
44
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
55
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
6-
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728 h1:qFnvr+SqVOCbhMl5sVynhuwVkv1yrc7Vhrn8lVdw1nU=
7-
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko=
6+
github.com/blevesearch/bleve_index_api v1.3.0 h1:DsMpWVjFNlBw9/6pyWf59XoqcAkhHj3H0UWiQsavb6E=
7+
github.com/blevesearch/bleve_index_api v1.3.0/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko=
88
github.com/blevesearch/go-faiss v1.0.27 h1:7cBImYDDQ82WJd5RUZ1ie6zXztCsC73W94ZzwOjkatk=
99
github.com/blevesearch/go-faiss v1.0.27/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
1010
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
1111
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
12-
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df h1:gBuVkzZLUpGJGnCBRgY0ruZVjppD7WaQLeHZei7QQnU=
13-
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df/go.mod h1:f8fXitmMpzgNziIMqUlpTrfPxVVDN8at9k7POEohvJU=
12+
github.com/blevesearch/scorch_segment_api/v2 v2.4.0 h1:OtipwURRzZv6UFmHQnbEqOY90eotINQ2TtSSpWfYuWU=
13+
github.com/blevesearch/scorch_segment_api/v2 v2.4.0/go.mod h1:JalWE/eyEgISwhqtKXoaHMKf5t+F4kXiYrgg0ds3ylw=
1414
github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
1515
github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
1616
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=

inverted_text_cache.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ type invertedIndexCache struct {
3434
cache map[uint16]*invertedCacheEntry
3535
}
3636

37-
// Clear clears the synonym cache which would mean that the termID to term map would no longer be available.
3837
func (sc *invertedIndexCache) Clear() {
3938
sc.m.Lock()
4039
sc.cache = nil

0 commit comments

Comments
 (0)