diff --git a/go.mod b/go.mod index 56c543097f..c99471ed49 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/Nerzal/gocloak/v13 v13.9.0 github.com/bbalet/stopwords v1.0.0 github.com/beevik/etree v1.6.0 - github.com/blevesearch/bleve/v2 v2.5.7 + github.com/blevesearch/bleve/v2 v2.6.0 github.com/cenkalti/backoff v2.2.1+incompatible github.com/coreos/go-oidc/v3 v3.18.0 github.com/cs3org/go-cs3apis v0.0.0-20260424072047-8d9ef7076ae9 @@ -130,7 +130,7 @@ require ( github.com/Masterminds/sprig v2.22.0+incompatible // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.1.6 // indirect - github.com/RoaringBitmap/roaring/v2 v2.4.5 // indirect + github.com/RoaringBitmap/roaring/v2 v2.14.5 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect github.com/ajg/form v1.5.1 // indirect github.com/alexedwards/argon2id v1.0.0 // indirect @@ -140,24 +140,25 @@ require ( github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bitly/go-simplejson v0.5.0 // indirect - github.com/bits-and-blooms/bitset v1.22.0 // indirect - github.com/blevesearch/bleve_index_api v1.2.11 // indirect - github.com/blevesearch/geo v0.2.4 // indirect - github.com/blevesearch/go-faiss v1.0.26 // indirect + github.com/bits-and-blooms/bitset v1.24.2 // indirect + github.com/blevesearch/bleve_index_api v1.3.11 // indirect + github.com/blevesearch/geo v0.2.5 // indirect + github.com/blevesearch/go-faiss v1.1.0 // indirect github.com/blevesearch/go-porterstemmer v1.0.3 // indirect github.com/blevesearch/gtreap v0.1.1 // indirect - github.com/blevesearch/mmap-go v1.0.4 // indirect - github.com/blevesearch/scorch_segment_api/v2 v2.3.13 // indirect + github.com/blevesearch/mmap-go v1.2.0 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.4.7 // indirect github.com/blevesearch/segment v0.9.1 // indirect github.com/blevesearch/snowballstem v0.9.0 // indirect 
github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect - github.com/blevesearch/vellum v1.1.0 // indirect - github.com/blevesearch/zapx/v11 v11.4.2 // indirect - github.com/blevesearch/zapx/v12 v12.4.2 // indirect - github.com/blevesearch/zapx/v13 v13.4.2 // indirect - github.com/blevesearch/zapx/v14 v14.4.2 // indirect - github.com/blevesearch/zapx/v15 v15.4.2 // indirect - github.com/blevesearch/zapx/v16 v16.2.8 // indirect + github.com/blevesearch/vellum v1.2.0 // indirect + github.com/blevesearch/zapx/v11 v11.4.3 // indirect + github.com/blevesearch/zapx/v12 v12.4.3 // indirect + github.com/blevesearch/zapx/v13 v13.4.3 // indirect + github.com/blevesearch/zapx/v14 v14.4.3 // indirect + github.com/blevesearch/zapx/v15 v15.4.3 // indirect + github.com/blevesearch/zapx/v16 v16.3.4 // indirect + github.com/blevesearch/zapx/v17 v17.1.2 // indirect github.com/bluele/gcache v0.0.2 // indirect github.com/bombsimon/logrusr/v3 v3.1.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect @@ -235,7 +236,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect - github.com/golang/snappy v0.0.4 // indirect + github.com/golang/snappy v1.0.0 // indirect github.com/google/go-querystring v1.1.0 // indirect github.com/google/go-tpm v0.9.8 // indirect github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect diff --git a/go.sum b/go.sum index febbc0c73c..c93bdfe9d6 100644 --- a/go.sum +++ b/go.sum @@ -91,8 +91,8 @@ github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAE github.com/OpenDNS/vegadns2client v0.0.0-20180418235048-a3fa4a771d87/go.mod h1:iGLljf5n9GjT6kc0HBvyI1nOKnGQbNB66VzSNbK5iks= github.com/ProtonMail/go-crypto v1.1.6 h1:ZcV+Ropw6Qn0AX9brlQLAUXfqLBc7Bl+f/DmNxpLfdw= github.com/ProtonMail/go-crypto v1.1.6/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE= 
-github.com/RoaringBitmap/roaring/v2 v2.4.5 h1:uGrrMreGjvAtTBobc0g5IrW1D5ldxDQYe2JW2gggRdg= -github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/hVXDS2dXi7/eUFE0= +github.com/RoaringBitmap/roaring/v2 v2.14.5 h1:ckd0o545JqDPeVJDgeFoaM21eBixUnlWfYgjE5VnyWw= +github.com/RoaringBitmap/roaring/v2 v2.14.5/go.mod h1:eq4wdNXxtJIS/oikeCzdX1rBzek7ANzbth041hrU8Q4= github.com/Shopify/sarama v1.19.0/go.mod h1:FVkBWblsNy7DGZRfXLU0O9RCGt5g3g3yEuWXgklEdEo= github.com/Shopify/toxiproxy v2.1.4+incompatible/go.mod h1:OXgGpZ6Cli1/URJOF1DMxUHB2q5Ap20/P/eIdh4G0pI= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= @@ -145,46 +145,47 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bitly/go-simplejson v0.5.0 h1:6IH+V8/tVMab511d5bn4M7EwGXZf9Hj6i2xSwkNEM+Y= github.com/bitly/go-simplejson v0.5.0/go.mod h1:cXHtHw4XUPsvGaxgjIAn8PhEWG9NfngEKAMDJEczWVA= -github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= -github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= -github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.24.2 h1:M7/NzVbsytmtfHbumG+K2bremQPMJuqv1JD3vOaFxp0= +github.com/bits-and-blooms/bitset v1.24.2/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bketelsen/crypt v0.0.3-0.20200106085610-5cbc8cc4026c/go.mod h1:MKsuJmJgSg28kpZDP6UIiPt0e0Oz0kqKNGyRaWEPv84= -github.com/blevesearch/bleve/v2 v2.5.7 h1:2d9YrL5zrX5EBBW++GOaEKjE+NPWeZGaX77IM26m1Z8= -github.com/blevesearch/bleve/v2 v2.5.7/go.mod h1:yj0NlS7ocGC4VOSAedqDDMktdh2935v2CSWOCDMHdSA= -github.com/blevesearch/bleve_index_api v1.2.11 h1:bXQ54kVuwP8hdrXUSOnvTQfgK0KI1+f9A0ITJT8tX1s= -github.com/blevesearch/bleve_index_api v1.2.11/go.mod 
h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0= -github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk= -github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8= -github.com/blevesearch/go-faiss v1.0.26 h1:4dRLolFgjPyjkaXwff4NfbZFdE/dfywbzDqporeQvXI= -github.com/blevesearch/go-faiss v1.0.26/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/bleve/v2 v2.6.0 h1:Cyd3dd4q5tCbOV8MnKUVRUDYMHOir9xn12NZzXVSEd4= +github.com/blevesearch/bleve/v2 v2.6.0/go.mod h1:gLmI8lWgHgrIYf7UpUX7JISI1CaqC6VScu46mHThuAY= +github.com/blevesearch/bleve_index_api v1.3.11 h1:x29vbV8OjWfLcrDVd7Lr1q+BkLNS0JWNEig0MCVnKH4= +github.com/blevesearch/bleve_index_api v1.3.11/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko= +github.com/blevesearch/geo v0.2.5 h1:yJg9FX1oRwLnjXSXF+ECHfXFTF4diF02Ca/qUGVjJhE= +github.com/blevesearch/geo v0.2.5/go.mod h1:Jhq7WE2K6mJTx1xS44M2pUO6Io+wjCSHh1+co3YOgH4= +github.com/blevesearch/go-faiss v1.1.0 h1:xM7Jc0ZUCv5lssG9Ohj3Jv0SdTpxcUABU1dDt9XVsc4= +github.com/blevesearch/go-faiss v1.1.0/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= -github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= -github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= -github.com/blevesearch/scorch_segment_api/v2 v2.3.13 h1:ZPjv/4VwWvHJZKeMSgScCapOy8+DdmsmRyLmSB88UoY= -github.com/blevesearch/scorch_segment_api/v2 v2.3.13/go.mod h1:ENk2LClTehOuMS8XzN3UxBEErYmtwkE7MAArFTXs9Vc= +github.com/blevesearch/mmap-go v1.2.0 h1:l33nNKPFcBjJUMwem6sAYJPUzhUCABoK9FxZDGiFNBI= 
+github.com/blevesearch/mmap-go v1.2.0/go.mod h1:Vd6+20GBhEdwJnU1Xohgt88XCD/CTWcqbCNxkZpyBo0= +github.com/blevesearch/scorch_segment_api/v2 v2.4.7 h1:GlMzW08hcsM3DnLUxhyF/1PcDal1qtvvIuytuph5djw= +github.com/blevesearch/scorch_segment_api/v2 v2.4.7/go.mod h1://IJ7tG3QCf0cWW/aVSXqy77tc1AvLu3fcJLYEvOAFs= github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= -github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= -github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= -github.com/blevesearch/zapx/v11 v11.4.2 h1:l46SV+b0gFN+Rw3wUI1YdMWdSAVhskYuvxlcgpQFljs= -github.com/blevesearch/zapx/v11 v11.4.2/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= -github.com/blevesearch/zapx/v12 v12.4.2 h1:fzRbhllQmEMUuAQ7zBuMvKRlcPA5ESTgWlDEoB9uQNE= -github.com/blevesearch/zapx/v12 v12.4.2/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= -github.com/blevesearch/zapx/v13 v13.4.2 h1:46PIZCO/ZuKZYgxI8Y7lOJqX3Irkc3N8W82QTK3MVks= -github.com/blevesearch/zapx/v13 v13.4.2/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= -github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT7fWYz0= -github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= -github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k= -github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= 
-github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI= -github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14= +github.com/blevesearch/vellum v1.2.0 h1:xkDiOEsHc2t3Cp0NsNZZ36pvc130sCzcGKOPMzXe+e0= +github.com/blevesearch/vellum v1.2.0/go.mod h1:uEcfBJz7mAOf0Kvq6qoEKQQkLODBF46SINYNkZNae4k= +github.com/blevesearch/zapx/v11 v11.4.3 h1:PTZOO5loKpHC/x/GzmPZNa9cw7GZIQxd5qRjwij9tHY= +github.com/blevesearch/zapx/v11 v11.4.3/go.mod h1:4gdeyy9oGa/lLa6D34R9daXNUvfMPZqUYjPwiLmekwc= +github.com/blevesearch/zapx/v12 v12.4.3 h1:eElXvAaAX4m04t//CGBQAtHNPA+Q6A1hHZVrN3LSFYo= +github.com/blevesearch/zapx/v12 v12.4.3/go.mod h1:TdFmr7afSz1hFh/SIBCCZvcLfzYvievIH6aEISCte58= +github.com/blevesearch/zapx/v13 v13.4.3 h1:qsdhRhaSpVnqDFlRiH9vG5+KJ+dE7KAW9WyZz/KXAiE= +github.com/blevesearch/zapx/v13 v13.4.3/go.mod h1:knK8z2NdQHlb5ot/uj8wuvOq5PhDGjNYQQy0QDnopZk= +github.com/blevesearch/zapx/v14 v14.4.3 h1:GY4Hecx0C6UTmiNC2pKdeA2rOKiLR5/rwpU9WR51dgM= +github.com/blevesearch/zapx/v14 v14.4.3/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8= +github.com/blevesearch/zapx/v15 v15.4.3 h1:iJiMJOHrz216jyO6lS0m9RTCEkprUnzvqAI2lc/0/CU= +github.com/blevesearch/zapx/v15 v15.4.3/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw= +github.com/blevesearch/zapx/v16 v16.3.4 h1:hDAqA8qusZTNbPEL7//w5P65UZ2de6yhSeUaTbp0Po0= +github.com/blevesearch/zapx/v16 v16.3.4/go.mod h1:zqkPPqs9GS9FzVWzCO3Wf1X044yWAV17+4zb+FTiEHg= +github.com/blevesearch/zapx/v17 v17.1.2 h1:avbOk2igaASNoiy0BE/jPgcxAnRI2PGeydeP4hg7Ikk= +github.com/blevesearch/zapx/v17 v17.1.2/go.mod h1:WQObxKrqUX7cd0G1GMvDfc/bmZzQvoy7APOPimx7DiI= github.com/bluele/gcache v0.0.2 h1:WcbfdXICg7G/DGBh1PFfcirkWOQV+v077yF1pSy3DGw= github.com/bluele/gcache v0.0.2/go.mod h1:m15KV+ECjptwSPxKhOhQoAFQVtUFjTVkc3H8o0t/fp0= github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY= @@ -535,8 +536,8 @@ github.com/golang/protobuf v1.5.2/go.mod 
h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= +github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= @@ -1785,7 +1786,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/.gitignore b/vendor/github.com/RoaringBitmap/roaring/v2/.gitignore index 851f323dba..eb738f1610 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/.gitignore +++ b/vendor/github.com/RoaringBitmap/roaring/v2/.gitignore @@ -3,3 +3,4 @@ roaring-fuzz.zip workdir 
coverage.out testdata/all3.classic +/vendor \ No newline at end of file diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/Makefile b/vendor/github.com/RoaringBitmap/roaring/v2/Makefile new file mode 100644 index 0000000000..033fc7b8e9 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/v2/Makefile @@ -0,0 +1,10 @@ +# Display general help about this command +help: + @echo "" + @echo "The following commands are available:" + @echo " make unconvert : Find unnecessary type conversions" + @echo "" + +# Find unnecessary type conversions +unconvert: + go tool unconvert -apply ./... \ No newline at end of file diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/README.md b/vendor/github.com/RoaringBitmap/roaring/v2/README.md index b7e9684aff..eb99b82668 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/README.md +++ b/vendor/github.com/RoaringBitmap/roaring/v2/README.md @@ -25,6 +25,7 @@ Roaring bitmaps are used by several major systems such as [Apache Lucene][lucene [pinot]: http://github.com/linkedin/pinot/wiki [vsts]: https://www.visualstudio.com/team-services/ [atlas]: https://github.com/Netflix/atlas +[quanta]: https://github.com/disney/quanta Roaring bitmaps are found to work well in many important applications: @@ -44,6 +45,10 @@ The ``roaring`` Go library is used by * [trident](https://github.com/NetApp/trident) * [Husky](https://www.datadoghq.com/blog/engineering/introducing-husky/) * [FrostDB](https://github.com/polarsignals/frostdb) +* [Disney Quanta](https://github.com/disney/quanta) + + + This library is used in production in several systems, it is part of the [Awesome Go collection](https://awesome-go.com). @@ -370,7 +375,7 @@ go get github.com/RoaringBitmap/real-roaring-datasets BENCH_REAL_DATA=1 go test -bench BenchmarkRealData -run - ``` -### Iterative use +### Interactive use You can use roaring with gore: @@ -414,4 +419,14 @@ The two versions were written independently. 
### Mailing list/discussion group -https://groups.google.com/forum/#!forum/roaring-bitmaps +https://groups.google.com/g/roaring-bitmaps + +## Stars + + +[![Star History Chart](https://api.star-history.com/svg?repos=RoaringBitmap/roaring&type=Date)](https://www.star-history.com/#RoaringBitmap/roaring&Date) + +### Further reading + +

Mastering Programming: From Testing to Performance in Go

+
diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go index 2e75c5ad4b..784894d820 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/arraycontainer.go @@ -11,6 +11,7 @@ type arrayContainer struct { var ( ErrArrayIncorrectSort = errors.New("incorrectly sorted array") + ErrEmptyArray = errors.New("empty array") ErrArrayInvalidSize = errors.New("invalid array size") ) @@ -61,6 +62,10 @@ func (ac *arrayContainer) getManyIterator() manyIterable { return &shortIterator{ac.content, 0} } +func (ac *arrayContainer) getUnsetIterator() shortPeekable { + return newArrayContainerUnsetIterator(ac.content) +} + func (ac *arrayContainer) minimum() uint16 { return ac.content[0] // assume not empty } @@ -417,8 +422,10 @@ func (ac *arrayContainer) iorArray(value2 *arrayContainer) container { func (ac *arrayContainer) iorBitmap(bc2 *bitmapContainer) container { bc1 := ac.toBitmapContainer() bc1.iorBitmap(bc2) - *ac = *newArrayContainerFromBitmap(bc1) - return ac + // DO NOT DO THIS: + // *ac = *newArrayContainerFromBitmap(bc1) + // This will create gigantic array containers in the case of repeated calls to iorBitmap. 
+ return bc1 } func (ac *arrayContainer) iorRun16(rc *runContainer16) container { @@ -621,6 +628,30 @@ func (ac *arrayContainer) xor(a container) container { panic("unsupported container type") } +func (ac *arrayContainer) ixor(a container) container { + switch x := a.(type) { + case *arrayContainer: + return ac.ixorArray(x) + case *bitmapContainer: + return ac.ixorBitmap(x) + case *runContainer16: + return ac.ixorRun16(x) + } + panic("unsupported container type") +} + +func (ac *arrayContainer) ixorArray(value2 *arrayContainer) container { + return ac.xorArray(value2) +} + +func (ac *arrayContainer) ixorBitmap(value2 *bitmapContainer) container { + return value2.ixor(ac) +} + +func (ac *arrayContainer) ixorRun16(value2 *runContainer16) container { + return value2.ixor(ac) +} + func (ac *arrayContainer) xorArray(value2 *arrayContainer) container { value1 := ac totalCardinality := value1.getCardinality() + value2.getCardinality() @@ -962,12 +993,12 @@ func (ac *arrayContainer) resetTo(a container) { x.fillArray(ac.content) case *runContainer16: - card := int(x.getCardinality()) + card := x.getCardinality() ac.realloc(card) cur := 0 for _, r := range x.iv { - for val := r.start; val <= r.last(); val++ { - ac.content[cur] = val + for val := int(r.start); val <= int(r.last()); val++ { + ac.content[cur] = uint16(val) cur++ } } @@ -1289,7 +1320,7 @@ func (ac *arrayContainer) validate() error { cardinality := ac.getCardinality() if cardinality <= 0 { - return ErrArrayInvalidSize + return ErrEmptyArray } if cardinality > arrayDefaultMaxSize { diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go index 10bc0f1c7c..9972963451 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/bitmapcontainer.go @@ -262,6 +262,39 @@ func (bc *bitmapContainer) getManyIterator() manyIterable { return newBitmapContainerManyIterator(bc) } 
+type bitmapContainerUnsetIterator struct { + ptr *bitmapContainer + i int +} + +func (bcui *bitmapContainerUnsetIterator) next() uint16 { + j := bcui.i + bcui.i = bcui.ptr.NextUnsetBit(uint(bcui.i) + 1) + return uint16(j) +} + +func (bcui *bitmapContainerUnsetIterator) hasNext() bool { + return bcui.i >= 0 && bcui.i < 65536 +} + +func (bcui *bitmapContainerUnsetIterator) peekNext() uint16 { + return uint16(bcui.i) +} + +func (bcui *bitmapContainerUnsetIterator) advanceIfNeeded(minval uint16) { + if bcui.hasNext() && bcui.peekNext() < minval { + bcui.i = bcui.ptr.NextUnsetBit(uint(minval)) + } +} + +func newBitmapContainerUnsetIterator(a *bitmapContainer) *bitmapContainerUnsetIterator { + return &bitmapContainerUnsetIterator{a, a.NextUnsetBit(0)} +} + +func (bc *bitmapContainer) getUnsetIterator() shortPeekable { + return newBitmapContainerUnsetIterator(bc) +} + func (bc *bitmapContainer) getSizeInBytes() int { return len(bc.bitmap) * 8 } @@ -882,6 +915,43 @@ func (bc *bitmapContainer) iandBitmap(value2 *bitmapContainer) container { return bc } +func (bc *bitmapContainer) ixor(a container) container { + switch x := a.(type) { + case *arrayContainer: + return bc.ixorArray(x) + case *bitmapContainer: + return bc.ixorBitmap(x) + case *runContainer16: + return bc.ixorRun16(x) + } + panic("unsupported container type") +} + +func (bc *bitmapContainer) ixorArray(value2 *arrayContainer) container { + vbc := value2.toBitmapContainer() + return bc.ixorBitmap(vbc) +} + +func (bc *bitmapContainer) ixorRun16(value2 *runContainer16) container { + rcb := value2.toBitmapContainer() + return bc.ixorBitmap(rcb) +} + +func (bc *bitmapContainer) ixorBitmap(value2 *bitmapContainer) container { + newCardinality := int(popcntXorSlice(bc.bitmap, value2.bitmap)) + if newCardinality > arrayDefaultMaxSize { + for k := 0; k < len(bc.bitmap); k++ { + bc.bitmap[k] = bc.bitmap[k] ^ value2.bitmap[k] + } + bc.cardinality = newCardinality + return bc + } + ac := 
newArrayContainerSize(newCardinality) + fillArrayXOR(ac.content, bc.bitmap, value2.bitmap) + ac.content = ac.content[:newCardinality] + return ac +} + func (bc *bitmapContainer) andNot(a container) container { switch x := a.(type) { case *arrayContainer: @@ -1100,7 +1170,7 @@ func (bc *bitmapContainer) NextSetBit(i uint) int { return -1 } w := bc.bitmap[x] - w = w >> uint(i%64) + w = w >> (i % 64) if w != 0 { return int(i) + countTrailingZeros(w) } @@ -1113,6 +1183,29 @@ func (bc *bitmapContainer) NextSetBit(i uint) int { return -1 } +func (bc *bitmapContainer) NextUnsetBit(i uint) int { + var ( + x = i / 64 + length = uint(len(bc.bitmap)) + ) + if x >= length { + return int(i) + } + w := bc.bitmap[x] + w = w >> (i % 64) + w = ^w + if w != 0 { + return int(i) + countTrailingZeros(w) + } + x++ + for ; x < length; x++ { + if bc.bitmap[x] != 0xFFFFFFFFFFFFFFFF { + return int(x*64) + countTrailingZeros(^bc.bitmap[x]) + } + } + return int(length * 64) +} + // PrevSetBit returns the previous set bit e.g the previous int packed into the bitmaparray func (bc *bitmapContainer) PrevSetBit(i int) int { if i < 0 { @@ -1136,7 +1229,7 @@ func (bc *bitmapContainer) uPrevSetBit(i uint) int { b := i % 64 - w = w << uint(63-b) + w = w << (63 - b) if w != 0 { return int(i) - countLeadingZeros(w) } diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/iter.go b/vendor/github.com/RoaringBitmap/roaring/v2/iter.go new file mode 100644 index 0000000000..1c43379ee2 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/v2/iter.go @@ -0,0 +1,44 @@ +package roaring + +import "iter" + +// Values returns an iterator that yields the elements of the bitmap in +// increasing order. Starting with Go 1.23, users can use a for loop to iterate +// over it. 
+func Values(b *Bitmap) iter.Seq[uint32] { + return func(yield func(uint32) bool) { + it := b.Iterator() + for it.HasNext() { + if !yield(it.Next()) { + return + } + } + } +} + +// Backward returns an iterator that yields the elements of the bitmap in +// decreasing order. Starting with Go 1.23, users can use a for loop to iterate +// over it. +func Backward(b *Bitmap) iter.Seq[uint32] { + return func(yield func(uint32) bool) { + it := b.ReverseIterator() + for it.HasNext() { + if !yield(it.Next()) { + return + } + } + } +} + +// Unset creates an iterator that yields values in the range [min, max] that are NOT contained in the bitmap. +// The iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). +func Unset(b *Bitmap, min, max uint32) iter.Seq[uint32] { + return func(yield func(uint32) bool) { + it := b.UnsetIterator(uint64(min), uint64(max)+1) + for it.HasNext() { + if !yield(it.Next()) { + return + } + } + } +} diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/parallel.go b/vendor/github.com/RoaringBitmap/roaring/v2/parallel.go index 9208e3e380..01ba856f8a 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/parallel.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/parallel.go @@ -370,23 +370,23 @@ func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { var chunkSize int var chunkCount int - if parallelism*4 > int(keyRange) { + if parallelism*4 > keyRange { chunkSize = 1 - chunkCount = int(keyRange) + chunkCount = keyRange } else { chunkCount = parallelism * 4 - chunkSize = (int(keyRange) + chunkCount - 1) / chunkCount + chunkSize = (keyRange + chunkCount - 1) / chunkCount } - if chunkCount*chunkSize < int(keyRange) { + if chunkCount*chunkSize < keyRange { // it's fine to panic to indicate an implementation error panic(fmt.Sprintf("invariant check failed: chunkCount * chunkSize < keyRange, %d * %d < %d", chunkCount, chunkSize, keyRange)) } chunks := make([]*roaringArray, chunkCount) - chunkSpecChan := make(chan parChunkSpec, 
minOfInt(maxOfInt(64, 2*parallelism), int(chunkCount))) - chunkChan := make(chan parChunk, minOfInt(32, int(chunkCount))) + chunkSpecChan := make(chan parChunkSpec, minOfInt(maxOfInt(64, 2*parallelism), chunkCount)) + chunkChan := make(chan parChunk, minOfInt(32, chunkCount)) orFunc := func() { for spec := range chunkSpecChan { @@ -412,7 +412,7 @@ func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { spec := parChunkSpec{ start: uint16(int(lKey) + i*chunkSize), end: uint16(minOfInt(int(lKey)+(i+1)*chunkSize-1, int(hKey))), - idx: int(i), + idx: i, } chunkSpecChan <- spec } diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go index 9972a51e25..e9fdfe8582 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring.go @@ -68,10 +68,10 @@ func (rb *Bitmap) DenseSize() uint64 { maximum := 1 + uint64(rb.Maximum()) if maximum > (capacity - wordSize + 1) { - return uint64(capacity >> log2WordSize) + return capacity >> log2WordSize } - return uint64((maximum + (wordSize - 1)) >> log2WordSize) + return (maximum + (wordSize - 1)) >> log2WordSize } // ToDense returns a slice of uint64s representing the bitmap as a dense bitmap. 
@@ -421,6 +421,11 @@ func FromBitSet(bitset *bitset.BitSet) *Bitmap { // ToArray creates a new slice containing all of the integers stored in the Bitmap in sorted order func (rb *Bitmap) ToArray() []uint32 { array := make([]uint32, rb.GetCardinality()) + ar := rb.toArray(&array) + return *ar +} + +func (rb *Bitmap) toArray(array *[]uint32) *[]uint32 { pos := 0 pos2 := 0 @@ -428,11 +433,18 @@ func (rb *Bitmap) ToArray() []uint32 { hs := uint32(rb.highlowcontainer.getKeyAtIndex(pos)) << 16 c := rb.highlowcontainer.getContainerAtIndex(pos) pos++ - pos2 = c.fillLeastSignificant16bits(array, pos2, hs) + pos2 = c.fillLeastSignificant16bits(*array, pos2, hs) } return array } +// ToExistingArray stores all of the integers stored in the Bitmap in sorted order in the +// slice that is given to ToExistingArray. It is the callers duty to make sure the slice +// has the right size. +func (rb *Bitmap) ToExistingArray(array *[]uint32) *[]uint32 { + return rb.toArray(array) +} + // GetSizeInBytes estimates the memory usage of the Bitmap. 
Note that this // might differ slightly from the amount of bytes required for persistent storage func (rb *Bitmap) GetSizeInBytes() uint64 { @@ -599,7 +611,7 @@ func (ii *intReverseIterator) init() { ii.shortIter = reverseIterator{t.content, len(t.content) - 1} ii.iter = &ii.shortIter case *runContainer16: - index := int(len(t.iv)) - 1 + index := len(t.iv) - 1 pos := uint16(0) if index >= 0 { @@ -730,6 +742,182 @@ func (ii *manyIntIterator) Initialize(a *Bitmap) { ii.init() } +type unsetIterator struct { + containerIndex int + nextKey int + hs uint32 + iter shortPeekable + highlowcontainer *roaringArray + + arrayUnsetIter arrayContainerUnsetIterator + runUnsetIter runUnsetIterator16 + bitmapUnsetIter bitmapContainerUnsetIterator + emptyContainerVal uint16 + + start, end uint64 +} + +// HasNext returns true if there are more integers to iterate over +func (iui *unsetIterator) HasNext() bool { + // Skip containers that have no unset bits in our range + for iui.nextKey < 65536 && uint64(iui.nextKey)<<16 < iui.end { + if iui.iter == nil { + // We're in an empty container gap, which has unset bits + if uint64(iui.nextKey)<<16|uint64(iui.emptyContainerVal) < iui.end { + return true + } + // Move to next container + iui.nextKey++ + iui.containerIndex++ + iui.init() + continue + } + if iui.iter.hasNext() { + // Check if next value is within range + nextVal := (uint64(iui.nextKey) << 16) | uint64(iui.iter.peekNext()) + if nextVal < iui.end { + return true + } + } + // Current container has no more unset bits in range, move to next + iui.nextKey++ + iui.containerIndex++ + iui.init() + } + return false +} + +func (iui *unsetIterator) init() { + // Check if we've gone past the end range + if uint64(iui.nextKey)<<16 >= iui.end { + iui.iter = nil + return + } + + // Check if we're in an empty container gap + if iui.containerIndex >= iui.highlowcontainer.size() || + iui.highlowcontainer.getKeyAtIndex(iui.containerIndex) > uint16(iui.nextKey) { + // We're in a gap - iterate 
through empty container + iui.emptyContainerVal = 0 + // If this container overlaps with start, advance to start + if uint64(iui.nextKey)<<16 < iui.start && iui.start < uint64(iui.nextKey+1)<<16 { + iui.emptyContainerVal = uint16(iui.start) + } + iui.iter = nil + return + } + + // We're in an actual container + iui.hs = uint32(iui.nextKey) << 16 + c := iui.highlowcontainer.getContainerAtIndex(iui.containerIndex) + switch t := c.(type) { + case *arrayContainer: + iui.arrayUnsetIter = *newArrayContainerUnsetIterator(t.content) + iui.iter = &iui.arrayUnsetIter + case *runContainer16: + iui.runUnsetIter = *t.newRunUnsetIterator16() + iui.iter = &iui.runUnsetIter + case *bitmapContainer: + iui.bitmapUnsetIter = *newBitmapContainerUnsetIterator(t) + iui.iter = &iui.bitmapUnsetIter + } + + // If this container overlaps with start, advance to the low bits of start + if uint64(iui.nextKey)<<16 < iui.start && iui.start < uint64(iui.nextKey+1)<<16 { + iui.iter.advanceIfNeeded(uint16(iui.start)) + } +} + +// Next returns the next integer +func (iui *unsetIterator) Next() uint32 { + if iui.iter == nil { + // We're in an empty container gap + x := (uint32(iui.nextKey) << 16) | uint32(iui.emptyContainerVal) + iui.emptyContainerVal++ + if iui.emptyContainerVal == 0 || uint64(iui.nextKey)<<16|uint64(iui.emptyContainerVal) >= iui.end { + // Wrapped around or reached end, move to next container + iui.nextKey++ + iui.init() + } + return x + } + + x := uint32(iui.iter.next()) | iui.hs + if !iui.iter.hasNext() || uint64(iui.nextKey)<<16|uint64(iui.iter.peekNext()) >= iui.end { + iui.nextKey++ + iui.containerIndex++ + iui.init() + } + return x +} + +// PeekNext peeks the next value without advancing the iterator +func (iui *unsetIterator) PeekNext() uint32 { + if !iui.HasNext() { + panic("PeekNext() called when HasNext() returns false") + } + if iui.iter == nil { + return (uint32(iui.nextKey) << 16) | uint32(iui.emptyContainerVal) + } + return uint32(iui.iter.peekNext()&maxLowBit) | 
iui.hs +} + +// AdvanceIfNeeded advances as long as the next value is smaller than minval +func (iui *unsetIterator) AdvanceIfNeeded(minval uint32) { + targetKey := int(minval >> 16) + + for iui.HasNext() && iui.nextKey < targetKey { + iui.nextKey++ + // Find the next container that matches or exceeds nextKey + for iui.containerIndex < iui.highlowcontainer.size() && + int(iui.highlowcontainer.getKeyAtIndex(iui.containerIndex)) < iui.nextKey { + iui.containerIndex++ + } + iui.init() + } + + if iui.HasNext() && iui.nextKey == targetKey { + if iui.iter != nil { + iui.iter.advanceIfNeeded(lowbits(minval)) + if !iui.iter.hasNext() || uint64(iui.nextKey)<<16|uint64(iui.iter.peekNext()) >= iui.end { + iui.nextKey++ + iui.containerIndex++ + iui.init() + } + } else { + lowVal := lowbits(minval) + if iui.emptyContainerVal < lowVal { + iui.emptyContainerVal = lowVal + } + if uint64(iui.nextKey)<<16|uint64(iui.emptyContainerVal) >= iui.end { + iui.nextKey++ + iui.containerIndex++ + iui.init() + } + } + } +} + +// Initialize configures the unset iterator to iterate over values in [start, end) that are not in the bitmap +func (iui *unsetIterator) Initialize(a *Bitmap, start, end uint64) { + if end > 0x100000000 { + panic("end > 0x100000000") + } + iui.start = start + iui.end = end + iui.containerIndex = 0 + iui.nextKey = int(start >> 16) + iui.highlowcontainer = &a.highlowcontainer + + // Find the first container that matches or exceeds the start key + for iui.containerIndex < iui.highlowcontainer.size() && + int(iui.highlowcontainer.getKeyAtIndex(iui.containerIndex)) < iui.nextKey { + iui.containerIndex++ + } + + iui.init() +} + // String creates a string representation of the Bitmap func (rb *Bitmap) String() string { // inspired by https://github.com/fzandona/goroar/ @@ -812,6 +1000,14 @@ func (rb *Bitmap) ManyIterator() ManyIntIterable { return p } +// UnsetIterator creates a new IntPeekable to iterate over values in the range [start, end) that are NOT contained in the 
bitmap. +// The iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove). +func (rb *Bitmap) UnsetIterator(start, end uint64) IntPeekable { + p := new(unsetIterator) + p.Initialize(rb, start, end) + return p +} + // Clone creates a copy of the Bitmap func (rb *Bitmap) Clone() *Bitmap { ptr := new(Bitmap) @@ -1290,6 +1486,10 @@ main: // Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap func (rb *Bitmap) Xor(x2 *Bitmap) { + if rb == x2 { + rb.Clear() + return + } pos1 := 0 pos2 := 0 length1 := rb.highlowcontainer.size() @@ -1304,14 +1504,12 @@ func (rb *Bitmap) Xor(x2 *Bitmap) { break } } else if s1 > s2 { - c := x2.highlowcontainer.getWritableContainerAtIndex(pos2) - rb.highlowcontainer.insertNewKeyValueAt(pos1, x2.highlowcontainer.getKeyAtIndex(pos2), c) + rb.highlowcontainer.insertNewKeyValueAt(pos1, x2.highlowcontainer.getKeyAtIndex(pos2), x2.highlowcontainer.getContainerAtIndex(pos2).clone()) length1++ pos1++ pos2++ } else { - // TODO: couple be computed in-place for reduced memory usage - c := rb.highlowcontainer.getContainerAtIndex(pos1).xor(x2.highlowcontainer.getContainerAtIndex(pos2)) + c := rb.highlowcontainer.getWritableContainerAtIndex(pos1).ixor(x2.highlowcontainer.getContainerAtIndex(pos2)) if !c.isEmpty() { rb.highlowcontainer.setContainerAtIndex(pos1, c) pos1++ @@ -1358,7 +1556,8 @@ main: } s2 = x2.highlowcontainer.getKeyAtIndex(pos2) } else { - rb.highlowcontainer.replaceKeyAndContainerAtIndex(pos1, s1, rb.highlowcontainer.getUnionedWritableContainer(pos1, x2.highlowcontainer.getContainerAtIndex(pos2)), false) + newcont := rb.highlowcontainer.getUnionedWritableContainer(pos1, x2.highlowcontainer.getContainerAtIndex(pos2)) + rb.highlowcontainer.replaceKeyAndContainerAtIndex(pos1, s1, newcont, false) pos1++ pos2++ if (pos1 == length1) || (pos2 == length2) { @@ -1376,6 +1575,10 @@ main: // AndNot computes the difference between two bitmaps and stores the result in the current 
bitmap func (rb *Bitmap) AndNot(x2 *Bitmap) { + if rb == x2 { + rb.Clear() + return + } pos1 := 0 pos2 := 0 intersectionsize := 0 @@ -1465,7 +1668,6 @@ main: } s2 = x2.highlowcontainer.getKeyAtIndex(pos2) } else { - answer.highlowcontainer.appendContainer(s1, x1.highlowcontainer.getContainerAtIndex(pos1).or(x2.highlowcontainer.getContainerAtIndex(pos2)), false) pos1++ pos2++ @@ -1504,6 +1706,7 @@ main: if !C.isEmpty() { answer.highlowcontainer.appendContainer(s1, C, false) } + pos1++ pos2++ if (pos1 == length1) || (pos2 == length2) { @@ -1531,6 +1734,9 @@ main: // Xor computes the symmetric difference between two bitmaps and returns the result func Xor(x1, x2 *Bitmap) *Bitmap { + if x1 == x2 { + return NewBitmap() + } answer := NewBitmap() pos1 := 0 pos2 := 0 @@ -1568,6 +1774,9 @@ func Xor(x1, x2 *Bitmap) *Bitmap { // AndNot computes the difference between two bitmaps and returns the result func AndNot(x1, x2 *Bitmap) *Bitmap { + if x1 == x2 { + return NewBitmap() + } answer := NewBitmap() pos1 := 0 pos2 := 0 @@ -1669,11 +1878,11 @@ func (rb *Bitmap) Flip(rangeStart, rangeEnd uint64) { for hb := hbStart; hb <= hbLast; hb++ { var containerStart uint32 if hb == hbStart { - containerStart = uint32(lbStart) + containerStart = lbStart } containerLast := max if hb == hbLast { - containerLast = uint32(lbLast) + containerLast = lbLast } i := rb.highlowcontainer.getIndex(uint16(hb)) @@ -1829,11 +2038,11 @@ func Flip(bm *Bitmap, rangeStart, rangeEnd uint64) *Bitmap { for hb := hbStart; hb <= hbLast; hb++ { var containerStart uint32 if hb == hbStart { - containerStart = uint32(lbStart) + containerStart = lbStart } containerLast := max if hb == hbLast { - containerLast = uint32(lbLast) + containerLast = lbLast } i := bm.highlowcontainer.getIndex(uint16(hb)) @@ -1931,8 +2140,8 @@ func (rb *Bitmap) PreviousValue(target uint32) int64 { return -1 } - originalKey := highbits(uint32(target)) - query := lowbits(uint32(target)) + originalKey := highbits(target) + query := 
lowbits(target) var prevValue int64 prevValue = -1 containerIndex := rb.highlowcontainer.advanceUntil(originalKey, -1) @@ -2133,6 +2342,34 @@ func (rb *Bitmap) Stats() Statistics { return stats } +// Describe prints a description of the bitmap's containers to stdout +func (rb *Bitmap) Describe() { + fmt.Printf("Bitmap with %d containers:\n", len(rb.highlowcontainer.containers)) + for i, c := range rb.highlowcontainer.containers { + key := rb.highlowcontainer.keys[i] + shared := "" + if rb.highlowcontainer.needCopyOnWrite[i] { + shared = " (shared)" + } + switch c.(type) { + case *arrayContainer: + fmt.Printf(" Container %d (key %d): array, cardinality %d%s\n", i, key, c.getCardinality(), shared) + case *bitmapContainer: + fmt.Printf(" Container %d (key %d): bitmap, cardinality %d%s\n", i, key, c.getCardinality(), shared) + case *runContainer16: + fmt.Printf(" Container %d (key %d): run, cardinality %d%s\n", i, key, c.getCardinality(), shared) + default: + fmt.Printf(" Container %d (key %d): unknown type, cardinality %d%s\n", i, key, c.getCardinality(), shared) + } + } + valid := rb.Validate() + if valid != nil { + fmt.Printf(" Bitmap is INVALID: %v\n", valid) + } else { + fmt.Printf(" Bitmap is valid\n") + } +} + // Validate checks if the bitmap is internally consistent. // You may call it after deserialization to check that the bitmap is valid. // This function returns an error if the bitmap is invalid, nil otherwise. diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go index 46dbe12103..5d6019db27 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/bsi64.go @@ -66,7 +66,7 @@ func (b *BSI) GetExistenceBitmap() *Bitmap { // ValueExists tests whether the value exists. 
func (b *BSI) ValueExists(columnID uint64) bool { - return b.eBM.Contains(uint64(columnID)) + return b.eBM.Contains(columnID) } // GetCardinality returns a count of unique column IDs for which a value has been set. @@ -115,11 +115,37 @@ func (b *BSI) SetBigValue(columnID uint64, value *big.Int) { b.eBM.Add(columnID) } +func (b *BSI) SetBigMany(foundSet *Bitmap, value *big.Int) { + // If max/min values are set to zero then automatically determine bit array size + if b.MaxValue == 0 && b.MinValue == 0 { + minBits := value.BitLen() + 1 + if minBits == 1 { + minBits = 2 + } + for len(b.bA) < minBits { + b.bA = append(b.bA, Bitmap{}) + } + } + for i := b.BitCount(); i >= 0; i-- { + if value.Bit(i) == 0 { + b.bA[i].AndNot(foundSet) + } else { + b.bA[i].Or(foundSet) + } + } + b.eBM.Or(foundSet) +} + // SetValue sets a value for a given columnID. func (b *BSI) SetValue(columnID uint64, value int64) { b.SetBigValue(columnID, big.NewInt(value)) } +// SetMany sets a value for all columns in foundSet +func (b *BSI) SetMany(foundSet *Bitmap, value int64) { + b.SetBigMany(foundSet, big.NewInt(value)) +} + // GetValue gets the value at the column ID. Second param will be false for non-existent values. 
func (b *BSI) GetValue(columnID uint64) (value int64, exists bool) { bv, exists := b.GetBigValue(columnID) @@ -722,7 +748,7 @@ func transpose(e *task, batch []uint64, resultsChan chan *Bitmap, wg *sync.WaitG results.RunOptimize() } for _, cID := range batch { - if value, ok := e.bsi.GetValue(uint64(cID)); ok { + if value, ok := e.bsi.GetValue(cID); ok { results.Add(uint64(value)) } } @@ -738,7 +764,7 @@ func (b *BSI) ParOr(parallelism int, bsis ...*BSI) { bits := len(b.bA) for i := 0; i < len(bsis); i++ { if len(bsis[i].bA) > bits { - bits = len(bsis[i].bA ) + bits = len(bsis[i].bA) } } @@ -931,7 +957,7 @@ func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap, for i := 0; i < len(batch); i++ { cID := batch[i] - if value, ok := e.bsi.GetBigValue(uint64(cID)); ok { + if value, ok := e.bsi.GetBigValue(cID); ok { if _, yes := e.values[string(value.Bytes())]; yes { results.Add(cID) } @@ -942,11 +968,7 @@ func batchEqual(e *task, batch []uint64, resultsChan chan *Bitmap, // ClearBits cleared the bits that exist in the target if they are also in the found set. 
func ClearBits(foundSet, target *Bitmap) { - iter := foundSet.Iterator() - for iter.HasNext() { - cID := iter.Next() - target.Remove(cID) - } + target.AndNot(foundSet) } // ClearValues removes the values found in foundSet @@ -956,13 +978,13 @@ func (b *BSI) ClearValues(foundSet *Bitmap) { wg.Add(1) go func() { defer wg.Done() - ClearBits(foundSet, &b.eBM) + b.eBM.AndNot(foundSet) }() for i := 0; i < b.BitCount(); i++ { wg.Add(1) go func(j int) { defer wg.Done() - ClearBits(foundSet, &b.bA[j]) + b.bA[j].AndNot(foundSet) }(i) } wg.Wait() @@ -1044,7 +1066,7 @@ func transposeWithCounts(input *BSI, filterSet *Bitmap, batch []uint64, resultsC results.RunOptimize() } for _, cID := range batch { - if value, ok := input.GetValue(uint64(cID)); ok { + if value, ok := input.GetValue(cID); ok { if !filterSet.Contains(uint64(value)) { continue } diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iter.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iter.go new file mode 100644 index 0000000000..ce6c24dbd3 --- /dev/null +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/iter.go @@ -0,0 +1,31 @@ +package roaring64 + +import "iter" + +// Values returns an iterator that yields the elements of the bitmap in +// increasing order. Starting with Go 1.23, users can use a for loop to iterate +// over it. +func Values(b *Bitmap) iter.Seq[uint64] { + return func(yield func(uint64) bool) { + it := b.Iterator() + for it.HasNext() { + if !yield(it.Next()) { + return + } + } + } +} + +// Backward returns an iterator that yields the elements of the bitmap in +// decreasing order. Starting with Go 1.23, users can use a for loop to iterate +// over it. 
+func Backward(b *Bitmap) iter.Seq[uint64] { + return func(yield func(uint64) bool) { + it := b.ReverseIterator() + for it.HasNext() { + if !yield(it.Next()) { + return + } + } + } +} diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go index 5dadc8deac..7bea3c6886 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/parallel64.go @@ -39,9 +39,13 @@ func ParOr(parallelism int, bitmaps ...*Bitmap) *Bitmap { // on some systems, would block indefinitely. keyRange := uint64(hKey) - uint64(lKey) + 1 if keyRange == 1 { - // revert to FastOr. Since the key range is 0 - // no container-level aggregation parallelism is achievable - return FastOr(bitmaps...) + // All bitmaps have the same key, + // we can merge the 32-bit roaring bitmaps in parallel + var bms32s = make([]*roaring.Bitmap, 0, len(bitmaps)) + for _, b := range bitmaps { + bms32s = append(bms32s, b.highlowcontainer.containers...) 
+ } + return roaring32AsRoaring64(roaring.ParOr(parallelism, bms32s...), lKey) } if parallelism == 0 { diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go index ebea5ffcb6..8589925ee4 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaring64/roaring64.go @@ -73,7 +73,7 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) { return n, err } written, err := c.WriteTo(stream) - n += int64(written) + n += written if err != nil { return n, err } @@ -119,7 +119,7 @@ func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) { n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) if n == 0 || err != nil { - return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) + return n, fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) } } @@ -167,9 +167,9 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) { n, err := rb.highlowcontainer.containers[i].ReadFrom(stream) if n == 0 || err != nil { - return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) + return n, fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err) } - p += int64(n) + p += n } return p, nil } @@ -249,7 +249,7 @@ func (rb *Bitmap) String() string { counter := 0 if i.HasNext() { counter = counter + 1 - buffer.WriteString(strconv.FormatUint(uint64(i.Next()), 10)) + buffer.WriteString(strconv.FormatUint(i.Next(), 10)) } for i.HasNext() { buffer.WriteString(",") @@ -259,7 +259,7 @@ func (rb *Bitmap) String() string { buffer.WriteString("...") break } - buffer.WriteString(strconv.FormatUint(uint64(i.Next()), 10)) + buffer.WriteString(strconv.FormatUint(i.Next(), 10)) } buffer.WriteString("}") return buffer.String() @@ -346,7 +346,7 @@ func (rb *Bitmap) CheckedAdd(x uint64) bool { return true } -// AddInt adds the integer x to 
the bitmap (convenience method: the parameter is casted to uint32 and we call Add) +// AddInt adds the integer x to the bitmap (convenience method: the parameter is casted to uint64 and we call Add) func (rb *Bitmap) AddInt(x int) { rb.Add(uint64(x)) } @@ -1248,9 +1248,13 @@ func (rb *Bitmap) Validate() error { // Roaring32AsRoaring64 inserts a 32-bit roaring bitmap into // a 64-bit roaring bitmap. No copy is made. func Roaring32AsRoaring64(bm32 *roaring.Bitmap) *Bitmap { + return roaring32AsRoaring64(bm32, 0) +} + +func roaring32AsRoaring64(bm32 *roaring.Bitmap, key uint32) *Bitmap { rb := NewBitmap() rb.highlowcontainer.resize(0) - rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0) + rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, key) rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32) rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false) return rb diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go b/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go index 40be90a56d..f533902356 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/roaringarray.go @@ -39,7 +39,9 @@ type container interface { not(start, final int) container // range is [firstOfRange,lastOfRange) inot(firstOfRange, endx int) container // i stands for inplace, range is [firstOfRange,endx) xor(r container) container + ixor(r container) container // i stands for inplace getShortIterator() shortPeekable + getUnsetIterator() shortPeekable iterate(cb func(x uint16) bool) bool getReverseIterator() shortIterable getManyIterator() manyIterable @@ -108,7 +110,7 @@ func rangeOfOnes(start, last int) container { if last < 0 { panic("rangeOfOnes called with last < 0") } - return newRunContainer16Range(uint16(start), uint16(last)) + return newRunContainer16Range(uint16(start), uint16(last)).toEfficientContainer() } type roaringArray 
struct { @@ -588,7 +590,7 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte var isRunBitmap []byte if cookie&0x0000FFFF == serialCookie { - size = uint32(cookie>>16 + 1) + size = cookie>>16 + 1 // create is-run-container bitmap isRunBitmapSize := (int(size) + 7) / 8 isRunBitmap, err = stream.Next(isRunBitmapSize) diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go b/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go index ac9ea1b456..4731da7363 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/runcontainer.go @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import ( "errors" "fmt" - "sort" + "slices" ) // runContainer16 does run-length encoding of sets of @@ -113,18 +113,6 @@ func (rc *runContainer16) String() string { return `runContainer16{` + is + `}` } -// uint16Slice is a sort.Sort convenience method -type uint16Slice []uint16 - -// Len returns the length of p. -func (p uint16Slice) Len() int { return len(p) } - -// Less returns p[i] < p[j] -func (p uint16Slice) Less(i, j int) bool { return p[i] < p[j] } - -// Swap swaps elements i and j. -func (p uint16Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - // addHelper helps build a runContainer16. 
type addHelper16 struct { runstart uint16 @@ -183,7 +171,7 @@ func newRunContainer16FromVals(alreadySorted bool, vals ...uint16) *runContainer ah := addHelper16{rc: rc} if !alreadySorted { - sort.Sort(uint16Slice(vals)) + slices.Sort(vals) } n := len(vals) var cur, prev uint16 @@ -386,8 +374,8 @@ func (rc *runContainer16) union(b *runContainer16) *runContainer16 { var m []interval16 - alim := int(len(rc.iv)) - blim := int(len(b.iv)) + alim := len(rc.iv) + blim := len(b.iv) var na int // next from a var nb int // next from b @@ -497,8 +485,8 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint { // call it rc for consistency with the rest of the methods. answer := uint(0) - alim := int(len(rc.iv)) - blim := int(len(b.iv)) + alim := len(rc.iv) + blim := len(b.iv) var na int // next from a var nb int // next from b @@ -617,8 +605,8 @@ func (rc *runContainer16) indexOfIntervalAtOrAfter(key int, startIndex int) int // intersection of rc (also known as 'a') and b. func (rc *runContainer16) intersect(b *runContainer16) *runContainer16 { a := rc - numa := int(len(a.iv)) - numb := int(len(b.iv)) + numa := len(a.iv) + numb := len(b.iv) res := &runContainer16{} if numa == 0 || numb == 0 { return res @@ -719,8 +707,8 @@ func (rc *runContainer16) intersectCardinality(b *runContainer16) int { answer := int(0) a := rc - numa := int(len(a.iv)) - numb := int(len(b.iv)) + numa := len(a.iv) + numb := len(b.iv) if numa == 0 || numb == 0 { return 0 } @@ -847,7 +835,7 @@ func (rc *runContainer16) numIntervals() int { // The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there // no upper bound. 
func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) { - n := int(len(rc.iv)) + n := len(rc.iv) if n == 0 { return -1, false, 0 } @@ -1045,7 +1033,7 @@ func (rc *runContainer16) Add(k uint16) (wasNew bool) { } wasNew = true - n := int(len(rc.iv)) + n := len(rc.iv) if index == -1 { // we may need to extend the first run if n > 0 { @@ -1139,8 +1127,8 @@ func (rc *runContainer16) iterate(cb func(x uint16) bool) bool { // returns true when there is at least one more value // available in the iteration sequence. func (ri *runIterator16) hasNext() bool { - return int(len(ri.rc.iv)) > ri.curIndex+1 || - (int(len(ri.rc.iv)) == ri.curIndex+1 && ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex) + return len(ri.rc.iv) > ri.curIndex+1 || + (len(ri.rc.iv) == ri.curIndex+1 && ri.rc.iv[ri.curIndex].length >= ri.curPosInIndex) } // next returns the next value in the iteration sequence. @@ -1169,7 +1157,7 @@ func (ri *runIterator16) advanceIfNeeded(minval uint16) { } // interval cannot be -1 because of minval > peekNext - interval, isPresent, _ := ri.rc.searchRange(int(minval), ri.curIndex, int(len(ri.rc.iv))) + interval, isPresent, _ := ri.rc.searchRange(int(minval), ri.curIndex, len(ri.rc.iv)) // if the minval is present, set the curPosIndex at the right position if isPresent { @@ -1193,7 +1181,7 @@ type runReverseIterator16 struct { // newRunReverseIterator16 returns a new empty run iterator. 
func (rc *runContainer16) newRunReverseIterator16() *runReverseIterator16 { - index := int(len(rc.iv)) - 1 + index := len(rc.iv) - 1 pos := uint16(0) if index >= 0 { @@ -1254,8 +1242,17 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int { // allows BCE buf2 := buf[n : n+moreVals] - for i := range buf2 { - buf2[i] = base + uint32(i) + i := 0 + for ; i+3 < len(buf2); i += 4 { + buf2[i] = base + buf2[i+1] = base + 1 + buf2[i+2] = base + 2 + buf2[i+3] = base + 3 + base += 4 + } + for ; i < len(buf2); i++ { + buf2[i] = base + base++ } // update values @@ -1266,7 +1263,7 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int { ri.curPosInIndex = 0 ri.curIndex++ - if ri.curIndex == int(len(ri.rc.iv)) { + if ri.curIndex == len(ri.rc.iv) { break } } else { @@ -1295,8 +1292,17 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { // allows BCE buf2 := buf[n : n+moreVals] - for i := range buf2 { - buf2[i] = base + uint64(i) + i := 0 + for ; i+3 < len(buf2); i += 4 { + buf2[i] = base + buf2[i+1] = base + 1 + buf2[i+2] = base + 2 + buf2[i+3] = base + 3 + base += 4 + } + for ; i < len(buf2); i++ { + buf2[i] = base + base++ } // update values @@ -1307,7 +1313,7 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { ri.curPosInIndex = 0 ri.curIndex++ - if ri.curIndex == int(len(ri.rc.iv)) { + if ri.curIndex == len(ri.rc.iv) { break } } else { @@ -1416,7 +1422,7 @@ func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex if w < startIndex { // not found and comes before lower bound startIndex, // so just use the lower bound. 
- if startIndex == int(len(rc.iv)) { + if startIndex == len(rc.iv) { // also this bump up means that we are done return startIndex, true } @@ -1542,7 +1548,7 @@ func (iv interval16) subtractInterval(del interval16) (left []interval16, delcou func (rc *runContainer16) isubtract(del interval16) { origiv := make([]interval16, len(rc.iv)) copy(origiv, rc.iv) - n := int(len(rc.iv)) + n := len(rc.iv) if n == 0 { return // already done. } @@ -1569,8 +1575,8 @@ func (rc *runContainer16) isubtract(del interval16) { // would overwrite values in iv b/c res0 can have len 2. so // write to origiv instead. lost := 1 + ilast - istart - changeSize := int(len(res0)) - lost - newSize := int(len(rc.iv)) + changeSize + changeSize := len(res0) - lost + newSize := len(rc.iv) + changeSize // rc.iv = append(pre, caboose...) // return @@ -1578,19 +1584,19 @@ func (rc *runContainer16) isubtract(del interval16) { if ilast != istart { res1, _ := rc.iv[ilast].subtractInterval(del) res0 = append(res0, res1...) - changeSize = int(len(res0)) - lost - newSize = int(len(rc.iv)) + changeSize + changeSize = len(res0) - lost + newSize = len(rc.iv) + changeSize } switch { case changeSize < 0: // shrink - copy(rc.iv[istart+int(len(res0)):], rc.iv[ilast+1:]) - copy(rc.iv[istart:istart+int(len(res0))], res0) + copy(rc.iv[istart+len(res0):], rc.iv[ilast+1:]) + copy(rc.iv[istart:istart+len(res0)], res0) rc.iv = rc.iv[:newSize] return case changeSize == 0: // stay the same - copy(rc.iv[istart:istart+int(len(res0))], res0) + copy(rc.iv[istart:istart+len(res0)], res0) return default: // changeSize > 0 is only possible when ilast == istart. 
@@ -1647,7 +1653,7 @@ func (rc *runContainer16) isubtract(del interval16) { // INVAR: ilast < n-1 lost := ilast - istart changeSize := -lost - newSize := int(len(rc.iv)) + changeSize + newSize := len(rc.iv) + changeSize if changeSize != 0 { copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) } @@ -1664,8 +1670,8 @@ func (rc *runContainer16) isubtract(del interval16) { rc.iv[istart] = res0[0] } lost := 1 + (ilast - istart) - changeSize := int(len(res0)) - lost - newSize := int(len(rc.iv)) + changeSize + changeSize := len(res0) - lost + newSize := len(rc.iv) + changeSize if changeSize != 0 { copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) } @@ -1676,8 +1682,8 @@ func (rc *runContainer16) isubtract(del interval16) { // we can only shrink or stay the same size res1, _ := rc.iv[ilast].subtractInterval(del) lost := ilast - istart - changeSize := int(len(res1)) - lost - newSize := int(len(rc.iv)) + changeSize + changeSize := len(res1) - lost + newSize := len(rc.iv) + changeSize if changeSize != 0 { // move the tail first to make room for res1 copy(rc.iv[ilast+1+changeSize:], rc.iv[ilast+1:]) @@ -1823,7 +1829,11 @@ func (rc *runContainer16) and(a container) container { } switch c := a.(type) { case *runContainer16: - return rc.intersect(c) + // Important: there is no reason to believe that the + // result of intersecting two run containers is itself + // a run container. Hence we convert to efficient container. + // We only use run containers when they are efficient. 
+ return rc.intersect(c).toEfficientContainer() case *arrayContainer: return rc.andArray(c) case *bitmapContainer: @@ -1835,7 +1845,7 @@ func (rc *runContainer16) and(a container) container { func (rc *runContainer16) andCardinality(a container) int { switch c := a.(type) { case *runContainer16: - return int(rc.intersectCardinality(c)) + return rc.intersectCardinality(c) case *arrayContainer: return rc.andArrayCardinality(c) case *bitmapContainer: @@ -1885,11 +1895,19 @@ func (rc *runContainer16) iand(a container) container { } switch c := a.(type) { case *runContainer16: - return rc.inplaceIntersect(c) + // Important: there is no reason to believe that the + // result of intersecting two run containers is itself + // a run container. Hence we convert to efficient container. + // We only use run containers when they are efficient. + return rc.inplaceIntersect(c).toEfficientContainer() case *arrayContainer: + // inplace intersection with array is not supported + // It is likely not very useful either. return rc.andArray(c) case *bitmapContainer: - return rc.iandBitmapContainer(c) + // inplace intersection with bitmap is not supported + // It is very difficult to do this inplace and likely not useful. 
+ return rc.andBitmapContainer(c) } panic("unsupported container type") } @@ -1900,12 +1918,6 @@ func (rc *runContainer16) inplaceIntersect(rc2 *runContainer16) container { return rc } -func (rc *runContainer16) iandBitmapContainer(bc *bitmapContainer) container { - isect := rc.andBitmapContainer(bc) - *rc = *newRunContainer16FromContainer(isect) - return rc -} - func (rc *runContainer16) andArray(ac *arrayContainer) container { if len(rc.iv) == 0 { return newArrayContainer() @@ -1943,7 +1955,7 @@ func (rc *runContainer16) andNot(a container) container { case *bitmapContainer: return rc.andNotBitmap(c) case *runContainer16: - return rc.andNotRunContainer16(c) + return rc.andNotRunContainer16(c).toEfficientContainer() } panic("unsupported container type") } @@ -1974,6 +1986,61 @@ func (rc *runContainer16) getManyIterator() manyIterable { return rc.newManyRunIterator16() } +type runUnsetIterator16 struct { + rc *runContainer16 + curIndex int + nextVal int +} + +func (rc *runContainer16) newRunUnsetIterator16() *runUnsetIterator16 { + rui := &runUnsetIterator16{rc: rc, curIndex: 0, nextVal: 0} + if len(rc.iv) > 0 && rc.iv[0].start == 0 { + rui.nextVal = int(rc.iv[0].start) + int(rc.iv[0].length) + 1 + rui.curIndex = 1 + } + return rui +} + +func (rui *runUnsetIterator16) hasNext() bool { + return rui.nextVal < 65536 +} + +func (rui *runUnsetIterator16) next() uint16 { + val := rui.nextVal + rui.nextVal++ + if rui.curIndex < len(rui.rc.iv) && uint16(rui.nextVal) >= rui.rc.iv[rui.curIndex].start { + rui.nextVal = int(rui.rc.iv[rui.curIndex].start) + int(rui.rc.iv[rui.curIndex].length) + 1 + rui.curIndex++ + } + return uint16(val) +} + +func (rui *runUnsetIterator16) peekNext() uint16 { + return uint16(rui.nextVal) +} + +func (rui *runUnsetIterator16) advanceIfNeeded(minval uint16) { + if !rui.hasNext() || rui.peekNext() >= minval { + return + } + rui.nextVal = int(minval) + for rui.curIndex < len(rui.rc.iv) { + if 
rui.rc.iv[rui.curIndex].start+rui.rc.iv[rui.curIndex].length < minval { + rui.curIndex++ + } else if rui.rc.iv[rui.curIndex].start <= minval { + rui.nextVal = int(rui.rc.iv[rui.curIndex].start) + int(rui.rc.iv[rui.curIndex].length) + 1 + rui.curIndex++ + break + } else { + break + } + } +} + +func (rc *runContainer16) getUnsetIterator() shortPeekable { + return rc.newRunUnsetIterator16() +} + // add the values in the range [firstOfRange, endx). endx // is still abe to express 2^16 because it is an int not an uint16. func (rc *runContainer16) iaddRange(firstOfRange, endx int) container { @@ -2104,7 +2171,7 @@ func (rc *runContainer16) equals(o container) bool { func (rc *runContainer16) iaddReturnMinimized(x uint16) container { rc.Add(x) - return rc + return rc.toEfficientContainer() } func (rc *runContainer16) iadd(x uint16) (wasNew bool) { @@ -2113,7 +2180,7 @@ func (rc *runContainer16) iadd(x uint16) (wasNew bool) { func (rc *runContainer16) iremoveReturnMinimized(x uint16) container { rc.removeKey(x) - return rc + return rc.toEfficientContainer() } func (rc *runContainer16) iremove(x uint16) bool { @@ -2174,15 +2241,9 @@ func (rc *runContainer16) orArray(ac *arrayContainer) container { if rc.isEmpty() { return ac.clone() } - intervals, cardMinusOne := runArrayUnionToRuns(rc, ac) + intervals, cardminusone := runArrayUnionToRuns(rc, ac) result := newRunContainer16TakeOwnership(intervals) - if len(intervals) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { - return newBitmapContainerFromRun(result) - } - if len(intervals)*2 > 1+int(cardMinusOne) { - return result.toArrayContainer() - } - return result + return result.toEfficientContainerFromCardinality(int(cardminusone) + 1) } // orArray finds the union of rc and ac. 
@@ -2200,7 +2261,7 @@ func (rc *runContainer16) ior(a container) container { case *arrayContainer: return rc.iorArray(c) case *bitmapContainer: - return rc.iorBitmapContainer(c) + return rc.orBitmapContainer(c) } panic("unsupported container type") } @@ -2212,16 +2273,17 @@ func (rc *runContainer16) inplaceUnion(rc2 *runContainer16) container { rc.Add(uint16(i)) } } - return rc + return rc.toEfficientContainer() } -func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container { - it := bc.getShortIterator() - for it.hasNext() { - rc.Add(it.next()) - } - return rc -} +// Such code should not be used as it will not preserve the container invariants: +//func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container { +// it := bc.getShortIterator() +// for it.hasNext() { +// rc.Add(it.next()) +// } +// return rc +//} func (rc *runContainer16) iorArray(ac *arrayContainer) container { if rc.isEmpty() { @@ -2235,13 +2297,8 @@ func (rc *runContainer16) iorArray(ac *arrayContainer) container { // this can be done with methods like the in-place array container union // but maybe lazily moving the remaining elements back. 
rc.iv, cardMinusOne = runArrayUnionToRuns(rc, ac) - if len(rc.iv) >= MaxNumIntervals && cardMinusOne >= arrayDefaultMaxSize { - return newBitmapContainerFromRun(rc) - } - if len(rc.iv)*2 > 1+int(cardMinusOne) { - return rc.toArrayContainer() - } - return rc + return rc.toEfficientContainerFromCardinality(int(cardMinusOne) + 1) + } func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16, uint16) { @@ -2377,6 +2434,30 @@ func (rc *runContainer16) xor(a container) container { panic("unsupported container type") } +func (rc *runContainer16) ixor(a container) container { + switch c := a.(type) { + case *arrayContainer: + return rc.ixorArray(c) + case *bitmapContainer: + return rc.ixorBitmap(c) + case *runContainer16: + return rc.ixorRunContainer16(c) + } + panic("unsupported container type") +} + +func (rc *runContainer16) ixorArray(value2 *arrayContainer) container { + return rc.toBitmapContainer().ixor(value2) +} + +func (rc *runContainer16) ixorBitmap(value2 *bitmapContainer) container { + return value2.ixor(rc) +} + +func (rc *runContainer16) ixorRunContainer16(value2 *runContainer16) container { + return rc.toBitmapContainer().ixor(value2.toBitmapContainer()) +} + func (rc *runContainer16) iandNot(a container) container { switch c := a.(type) { case *arrayContainer: @@ -2384,7 +2465,7 @@ func (rc *runContainer16) iandNot(a container) container { case *bitmapContainer: return rc.iandNotBitmap(c) case *runContainer16: - return rc.iandNotRunContainer16(c) + return rc.iandNotRunContainer16(c).toEfficientContainer() } panic("unsupported container type") } @@ -2399,11 +2480,11 @@ func (rc *runContainer16) inot(firstOfRange, endx int) container { } // TODO: minimize copies, do it all inplace; not() makes a copy. 
rc = rc.Not(firstOfRange, endx) - return rc + return rc.toEfficientContainer() } func (rc *runContainer16) rank(x uint16) int { - n := int(len(rc.iv)) + n := len(rc.iv) xx := int(x) w, already, _ := rc.search(xx) if w < 0 { @@ -2417,13 +2498,13 @@ func (rc *runContainer16) rank(x uint16) int { for i := int(0); i <= w; i++ { rnk += rc.iv[i].runlen() } - return int(rnk) + return rnk } for i := int(0); i < w; i++ { rnk += rc.iv[i].runlen() } rnk += int(x-rc.iv[w].start) + 1 - return int(rnk) + return rnk } func (rc *runContainer16) selectInt(x uint16) int { @@ -2431,7 +2512,7 @@ func (rc *runContainer16) selectInt(x uint16) int { for k := range rc.iv { nextOffset := offset + rc.iv[k].runlen() if nextOffset > int(x) { - return int(int(rc.iv[k].start) + (int(x) - offset)) + return int(rc.iv[k].start) + (int(x) - offset) } offset = nextOffset } @@ -2455,10 +2536,11 @@ func (rc *runContainer16) andNotBitmap(bc *bitmapContainer) container { func (rc *runContainer16) toBitmapContainer() *bitmapContainer { bc := newBitmapContainer() + bc.cardinality = 0 for i := range rc.iv { + bc.cardinality += rc.iv[i].runlen() bc.iaddRange(int(rc.iv[i].start), int(rc.iv[i].last())+1) } - bc.computeCardinality() return bc } @@ -2473,21 +2555,23 @@ func (rc *runContainer16) iandNotArray(ac *arrayContainer) container { rcb := rc.toBitmapContainer() acb := ac.toBitmapContainer() rcb.iandNotBitmapSurely(acb) - // TODO: check size and optimize the return value - // TODO: is inplace modification really required? If not, elide the copy. - rc2 := newRunContainer16FromBitmapContainer(rcb) - *rc = *rc2 - return rc + answer := rcb.toEfficientContainer() + if runrc, ok := answer.(*runContainer16); ok { + *rc = *runrc + return rc + } + return answer } func (rc *runContainer16) iandNotBitmap(bc *bitmapContainer) container { rcb := rc.toBitmapContainer() rcb.iandNotBitmapSurely(bc) - // TODO: check size and optimize the return value - // TODO: is inplace modification really required? 
If not, elide the copy. - rc2 := newRunContainer16FromBitmapContainer(rcb) - *rc = *rc2 - return rc + answer := rcb.toEfficientContainer() + if runrc, ok := answer.(*runContainer16); ok { + *rc = *runrc + return rc + } + return answer } func (rc *runContainer16) xorRunContainer16(x2 *runContainer16) container { @@ -2523,6 +2607,20 @@ func (rc *runContainer16) toEfficientContainer() container { return bc } +func (rc *runContainer16) toEfficientContainerFromCardinality(card int) container { + sizeAsRunContainer := rc.getSizeInBytes() + sizeAsBitmapContainer := bitmapContainerSizeInBytes() + sizeAsArrayContainer := arrayContainerSizeInBytes(card) + if sizeAsRunContainer < minOfInt(sizeAsBitmapContainer, sizeAsArrayContainer) { + return rc + } + if card <= arrayDefaultMaxSize { + return rc.toArrayContainer() + } + bc := newBitmapContainerFromRun(rc) + return bc +} + func (rc *runContainer16) toArrayContainer() *arrayContainer { ac := newArrayContainer() for i := range rc.iv { @@ -2619,7 +2717,7 @@ func (rc *runContainer16) addOffset(x uint16) (container, container) { for _, iv := range rc.iv { val := int(iv.start) + int(x) - finalVal := int(val) + int(iv.length) + finalVal := val + int(iv.length) if val <= 0xffff { if finalVal <= 0xffff { low.iv = append(low.iv, interval16{uint16(val), iv.length}) diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go b/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go index 16d356cafe..397805f802 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/serialization_littleendian.go @@ -6,9 +6,8 @@ package roaring import ( "encoding/binary" "errors" + "fmt" "io" - "reflect" - "runtime" "unsafe" ) @@ -26,51 +25,30 @@ func (bc *bitmapContainer) writeTo(stream io.Writer) (int, error) { } func uint64SliceAsByteSlice(slice []uint64) []byte { - // make a new slice header - header := 
*(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 8 - header.Cap *= 8 - - // instantiate result and use KeepAlive so data isn't unmapped. - result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + const size = unsafe.Sizeof(uint64(0)) + return unsafe.Slice(((*byte)(unsafe.Pointer(ptr))), int(size)*len(slice)) } func uint16SliceAsByteSlice(slice []uint16) []byte { - // make a new slice header - header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 2 - header.Cap *= 2 - - // instantiate result and use KeepAlive so data isn't unmapped. - result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + const size = unsafe.Sizeof(uint16(0)) + return unsafe.Slice(((*byte)(unsafe.Pointer(ptr))), int(size)*len(slice)) } func interval16SliceAsByteSlice(slice []interval16) []byte { - // make a new slice header - header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice)) - - // update its capacity and length - header.Len *= 4 - header.Cap *= 4 - - // instantiate result and use KeepAlive so data isn't unmapped. - result := *(*[]byte)(unsafe.Pointer(&header)) - runtime.KeepAlive(&slice) - - // return it - return result + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + const size = unsafe.Sizeof(interval16{}) + return unsafe.Slice(((*byte)(unsafe.Pointer(ptr))), int(size)*len(slice)) } func (bc *bitmapContainer) asLittleEndianByteSlice() []byte { @@ -86,69 +64,39 @@ func (bc *bitmapContainer) asLittleEndianByteSlice() []byte { // or modified while you hold the returned slince. 
// // func byteSliceAsUint16Slice(slice []byte) (result []uint16) { // here we create a new slice holder - if len(slice)%2 != 0 { - panic("Slice size should be divisible by 2") + const sz = int(unsafe.Sizeof(uint16(0))) + if len(slice)%sz != 0 { + panic(fmt.Sprintf("Slice size should be divisible by %d", sz)) } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 2 - rHeader.Cap = bHeader.Cap / 2 - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*uint16)(unsafe.Pointer(ptr)), len(slice)/sz) } func byteSliceAsUint64Slice(slice []byte) (result []uint64) { - if len(slice)%8 != 0 { - panic("Slice size should be divisible by 8") + const sz = int(unsafe.Sizeof(uint64(0))) + if len(slice)%sz != 0 { + panic(fmt.Sprintf("Slice size should be divisible by %d", sz)) } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 8 - rHeader.Cap = bHeader.Cap / 8 - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*uint64)(unsafe.Pointer(ptr)), len(slice)/sz) } func byteSliceAsInterval16Slice(slice []byte) (result []interval16) { - if len(slice)%4 != 0 { - panic("Slice size should be divisible by 4") + const sz = int(unsafe.Sizeof(interval16{})) + if len(slice)%sz != 0 { + panic(fmt.Sprintf("Slice size should be divisible by %d", sz)) } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / 4 - rHeader.Cap = bHeader.Cap / 4 - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*interval16)(unsafe.Pointer(ptr)), len(slice)/sz) } func byteSliceAsContainerSlice(slice []byte) (result []container) { @@ -158,114 +106,59 @@ func byteSliceAsContainerSlice(slice []byte) (result []container) { if len(slice)%containerSize != 0 { panic("Slice size should be divisible by unsafe.Sizeof(container)") } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / containerSize - rHeader.Cap = bHeader.Cap / containerSize - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*container)(unsafe.Pointer(ptr)), len(slice)/containerSize) } func byteSliceAsBitsetSlice(slice []byte) (result []bitmapContainer) { - bitsetSize := int(unsafe.Sizeof(bitmapContainer{})) + const bitsetSize = int(unsafe.Sizeof(bitmapContainer{})) if len(slice)%bitsetSize != 0 { panic("Slice size should be divisible by unsafe.Sizeof(bitmapContainer)") } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / bitsetSize - rHeader.Cap = bHeader.Cap / bitsetSize - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*bitmapContainer)(unsafe.Pointer(ptr)), len(slice)/bitsetSize) } func byteSliceAsArraySlice(slice []byte) (result []arrayContainer) { - arraySize := int(unsafe.Sizeof(arrayContainer{})) + const arraySize = int(unsafe.Sizeof(arrayContainer{})) if len(slice)%arraySize != 0 { panic("Slice size should be divisible by unsafe.Sizeof(arrayContainer)") } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / arraySize - rHeader.Cap = bHeader.Cap / arraySize - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*arrayContainer)(unsafe.Pointer(ptr)), len(slice)/arraySize) } func byteSliceAsRun16Slice(slice []byte) (result []runContainer16) { - run16Size := int(unsafe.Sizeof(runContainer16{})) + const run16Size = int(unsafe.Sizeof(runContainer16{})) if len(slice)%run16Size != 0 { panic("Slice size should be divisible by unsafe.Sizeof(runContainer16)") } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / run16Size - rHeader.Cap = bHeader.Cap / run16Size - - // instantiate result and use KeepAlive so data isn't unmapped. 
- runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*runContainer16)(unsafe.Pointer(ptr)), len(slice)/run16Size) } func byteSliceAsBoolSlice(slice []byte) (result []bool) { - boolSize := int(unsafe.Sizeof(true)) + const boolSize = int(unsafe.Sizeof(true)) if len(slice)%boolSize != 0 { panic("Slice size should be divisible by unsafe.Sizeof(bool)") } - // reference: https://go101.org/article/unsafe.html - - // make a new slice header - bHeader := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) - rHeader := (*reflect.SliceHeader)(unsafe.Pointer(&result)) - - // transfer the data from the given slice to a new variable (our result) - rHeader.Data = bHeader.Data - rHeader.Len = bHeader.Len / boolSize - rHeader.Cap = bHeader.Cap / boolSize - - // instantiate result and use KeepAlive so data isn't unmapped. - runtime.KeepAlive(&slice) // it is still crucial, GC can free it) - - // return result - return + ptr := unsafe.SliceData(slice) + if ptr == nil { + return nil + } + return unsafe.Slice((*bool)(unsafe.Pointer(ptr)), len(slice)/boolSize) } // FrozenView creates a static view of a serialized bitmap stored in buf. 
diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go b/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go index 8def774f5a..ec51cfc0fb 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/setutil.go @@ -202,10 +202,22 @@ func intersects2by2( set1 []uint16, set2 []uint16, ) bool { - // could be optimized if one set is much larger than the other one if (len(set1) == 0) || (len(set2) == 0) { return false } + if len(set1)*64 < len(set2) { + return onesidedgallopingintersect2by2Bool(set1, set2) + } else if len(set2)*64 < len(set1) { + return onesidedgallopingintersect2by2Bool(set2, set1) + } else { + return intersects2by2Bool(set1, set2) + } +} + +func intersects2by2Bool( + set1 []uint16, + set2 []uint16, +) bool { index1 := 0 index2 := 0 value1 := set1[index1] @@ -244,6 +256,38 @@ mainwhile: return false } +func onesidedgallopingintersect2by2Bool( + smallset []uint16, + largeset []uint16, +) bool { + k1 := 0 + k2 := 0 + s1 := largeset[k1] + s2 := smallset[k2] +mainwhile: + for { + if s1 < s2 { + k1 = advanceUntil(largeset, k1, len(largeset), s2) + if k1 == len(largeset) { + break mainwhile + } + s1 = largeset[k1] + } + if s2 < s1 { + k2++ + if k2 == len(smallset) { + break mainwhile + } + s2 = smallset[k2] + } else { + // (set2[k2] == set1[k1]) + return true + } + + } + return false +} + func localintersect2by2( set1 []uint16, set2 []uint16, diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go b/vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go index 15b78bd0c1..53252580fb 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/shortiterator.go @@ -50,3 +50,53 @@ func (si *reverseIterator) next() uint16 { si.loc-- return a } + +type arrayContainerUnsetIterator struct { + content []uint16 + // pos is the index of the next set bit that is >= nextVal. 
+ // When nextVal reaches content[pos], pos is incremented. + pos int + nextVal int +} + +func (acui *arrayContainerUnsetIterator) next() uint16 { + val := acui.nextVal + acui.nextVal++ + for acui.pos < len(acui.content) && uint16(acui.nextVal) >= acui.content[acui.pos] { + acui.nextVal++ + acui.pos++ + } + return uint16(val) +} + +func (acui *arrayContainerUnsetIterator) hasNext() bool { + return acui.nextVal < 65536 +} + +func (acui *arrayContainerUnsetIterator) peekNext() uint16 { + return uint16(acui.nextVal) +} + +func (acui *arrayContainerUnsetIterator) advanceIfNeeded(minval uint16) { + if !acui.hasNext() || acui.peekNext() >= minval { + return + } + acui.nextVal = int(minval) + acui.pos = binarySearch(acui.content, minval) + if acui.pos < 0 { + acui.pos = -acui.pos - 1 + } + for acui.pos < len(acui.content) && uint16(acui.nextVal) >= acui.content[acui.pos] { + acui.nextVal++ + acui.pos++ + } +} + +func newArrayContainerUnsetIterator(content []uint16) *arrayContainerUnsetIterator { + acui := &arrayContainerUnsetIterator{content: content, pos: 0, nextVal: 0} + for acui.pos < len(acui.content) && uint16(acui.nextVal) >= acui.content[acui.pos] { + acui.nextVal++ + acui.pos++ + } + return acui +} diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/smat.go b/vendor/github.com/RoaringBitmap/roaring/v2/smat.go index c52c5f07cf..d0fcf292da 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/smat.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/smat.go @@ -1,6 +1,3 @@ -//go:build gofuzz -// +build gofuzz - /* # Instructions for smat testing for roaring @@ -11,69 +8,49 @@ To run the smat tests for roaring... ## Prerequisites - $ go get github.com/dvyukov/go-fuzz/go-fuzz - $ go get github.com/dvyukov/go-fuzz/go-fuzz-build +Go 1.18 or later (for native fuzzing support). ## Steps -1. Generate initial smat corpus: +1. 
Generate initial smat corpus: ``` - go test -tags=gofuzz -run=TestGenerateSmatCorpus +go test -tags=gofuzz -run=TestGenerateSmatCorpus ``` +You should see a directory `workdir` created with initial corpus files. -2. Build go-fuzz test program with instrumentation: +2. Run the fuzz test: ``` - go-fuzz-build -func FuzzSmat github.com/RoaringBitmap/roaring +go test -run='^$' -fuzz=FuzzSmat -fuzztime=300s -timeout=60s ``` -3. Run go-fuzz: -``` - go-fuzz -bin=./roaring-fuzz.zip -workdir=workdir/ -timeout=200 -``` - -You should see output like... -``` -2016/09/16 13:58:35 slaves: 8, corpus: 1 (3s ago), crashers: 0, restarts: 1/0, execs: 0 (0/sec), cover: 0, uptime: 3s -2016/09/16 13:58:38 slaves: 8, corpus: 1 (6s ago), crashers: 0, restarts: 1/0, execs: 0 (0/sec), cover: 0, uptime: 6s -2016/09/16 13:58:41 slaves: 8, corpus: 1 (9s ago), crashers: 0, restarts: 1/44, execs: 44 (5/sec), cover: 0, uptime: 9s -2016/09/16 13:58:44 slaves: 8, corpus: 1 (12s ago), crashers: 0, restarts: 1/45, execs: 45 (4/sec), cover: 0, uptime: 12s -2016/09/16 13:58:47 slaves: 8, corpus: 1 (15s ago), crashers: 0, restarts: 1/46, execs: 46 (3/sec), cover: 0, uptime: 15s -2016/09/16 13:58:50 slaves: 8, corpus: 1 (18s ago), crashers: 0, restarts: 1/47, execs: 47 (3/sec), cover: 0, uptime: 18s -2016/09/16 13:58:53 slaves: 8, corpus: 1 (21s ago), crashers: 0, restarts: 1/63, execs: 63 (3/sec), cover: 0, uptime: 21s -2016/09/16 13:58:56 slaves: 8, corpus: 1 (24s ago), crashers: 0, restarts: 1/65, execs: 65 (3/sec), cover: 0, uptime: 24s -2016/09/16 13:58:59 slaves: 8, corpus: 1 (27s ago), crashers: 0, restarts: 1/66, execs: 66 (2/sec), cover: 0, uptime: 27s -2016/09/16 13:59:02 slaves: 8, corpus: 1 (30s ago), crashers: 0, restarts: 1/67, execs: 67 (2/sec), cover: 0, uptime: 30s -2016/09/16 13:59:05 slaves: 8, corpus: 1 (33s ago), crashers: 0, restarts: 1/83, execs: 83 (3/sec), cover: 0, uptime: 33s -2016/09/16 13:59:08 slaves: 8, corpus: 1 (36s ago), crashers: 0, restarts: 1/84, execs: 84 (2/sec), 
cover: 0, uptime: 36s -2016/09/16 13:59:11 slaves: 8, corpus: 2 (0s ago), crashers: 0, restarts: 1/85, execs: 85 (2/sec), cover: 0, uptime: 39s -2016/09/16 13:59:14 slaves: 8, corpus: 17 (2s ago), crashers: 0, restarts: 1/86, execs: 86 (2/sec), cover: 480, uptime: 42s -2016/09/16 13:59:17 slaves: 8, corpus: 17 (5s ago), crashers: 0, restarts: 1/66, execs: 132 (3/sec), cover: 487, uptime: 45s -2016/09/16 13:59:20 slaves: 8, corpus: 17 (8s ago), crashers: 0, restarts: 1/440, execs: 2645 (55/sec), cover: 487, uptime: 48s - -``` - -Let it run, and if the # of crashers is > 0, check out the reports in -the workdir where you should be able to find the panic goroutine stack -traces. +Adjust `-fuzztime` as needed for longer or shorter runs. If crashes are found, +check the test output and the reproducer files in the `workdir` directory. +You may copy the reproducers to roaring_tests.go */ package roaring import ( + "encoding/base64" "fmt" - "sort" + "os" + "path/filepath" + "runtime/debug" + "slices" + "strings" + "time" "github.com/bits-and-blooms/bitset" "github.com/mschoch/smat" ) -// fuzz test using state machine driven by byte stream. -func FuzzSmat(data []byte) int { - return smat.Fuzz(&smatContext{}, smat.ActionID('S'), smat.ActionID('T'), - smatActionMap, data) -} +// The native fuzz entry point lives in a _test.go file so the go test +// fuzz engine discovers it. See smat_fuzz_test.go for the fuzz wrapper. -var smatDebug = false +var smatDebug = true + +const max_value = 1048576 +const max_pairs = 10 func smatLog(prefix, format string, args ...interface{}) { if smatDebug { @@ -90,22 +67,33 @@ type smatContext struct { y int actions int + // per-context last action for this fuzz worker + lastAction *actionRecord +} + +// actionRecord stores a snapshot of the state just before an action runs. 
+type actionRecord struct { + Name string + X, Y int + PairSnapshots []string // base64-encoded MarshalBinary of each pair's Bitmap } type smatPair struct { bm *Bitmap bs *bitset.BitSet + // parent context (nil if unknown) + ctx *smatContext } // ------------------------------------------------------------------ var smatActionMap = smat.ActionMap{ - smat.ActionID('X'): smatAction("x++", smatWrap(func(c *smatContext) { c.x++ })), - smat.ActionID('x'): smatAction("x--", smatWrap(func(c *smatContext) { c.x-- })), - smat.ActionID('Y'): smatAction("y++", smatWrap(func(c *smatContext) { c.y++ })), - smat.ActionID('y'): smatAction("y--", smatWrap(func(c *smatContext) { c.y-- })), - smat.ActionID('*'): smatAction("x*y", smatWrap(func(c *smatContext) { c.x = c.x * c.y })), - smat.ActionID('<'): smatAction("x<<", smatWrap(func(c *smatContext) { c.x = c.x << 1 })), + smat.ActionID('X'): smatAction("x++", smatWrap(func(c *smatContext) { c.x = (c.x + 1) % max_value })), + smat.ActionID('x'): smatAction("x--", smatWrap(func(c *smatContext) { c.x = (c.x - 1 + max_value) % max_value })), + smat.ActionID('Y'): smatAction("y++", smatWrap(func(c *smatContext) { c.y = (c.y + 1) % max_value })), + smat.ActionID('y'): smatAction("y--", smatWrap(func(c *smatContext) { c.y = (c.y - 1 + max_value) % max_value })), + smat.ActionID('*'): smatAction("x*y", smatWrap(func(c *smatContext) { c.x = (c.x * c.y) % max_value })), + smat.ActionID('<'): smatAction("x<<", smatWrap(func(c *smatContext) { c.x = (c.x << 1) % max_value })), smat.ActionID('^'): smatAction("swap", smatWrap(func(c *smatContext) { c.x, c.y = c.y, c.x })), @@ -117,11 +105,13 @@ var smatActionMap = smat.ActionMap{ smat.ActionID('o'): smatAction(" or", smatWrap(smatOr)), smat.ActionID('a'): smatAction(" and", smatWrap(smatAnd)), + smat.ActionID('z'): smatAction(" xor", smatWrap(smatXor)), smat.ActionID('#'): smatAction(" cardinality", smatWrap(smatCardinality)), smat.ActionID('O'): smatAction(" orCardinality", 
smatWrap(smatOrCardinality)), smat.ActionID('A'): smatAction(" andCardinality", smatWrap(smatAndCardinality)), + smat.ActionID('Z'): smatAction(" xorCardinality", smatWrap(smatXorCardinality)), smat.ActionID('c'): smatAction(" clear", smatWrap(smatClear)), smat.ActionID('r'): smatAction(" runOptimize", smatWrap(smatRunOptimize)), @@ -142,12 +132,12 @@ func init() { for actionId := range smatActionMap { ids = append(ids, int(actionId)) } - sort.Ints(ids) + slices.Sort(ids) pct := 100 / len(smatActionMap) for _, actionId := range ids { smatRunningPercentActions = append(smatRunningPercentActions, - smat.PercentAction{pct, smat.ActionID(actionId)}) + smat.PercentAction{Percent: pct, Action: smat.ActionID(actionId)}) } smatActionMap[smat.ActionID('S')] = smatAction("SETUP", smatSetupFunc) @@ -162,14 +152,153 @@ func smatRunning(next byte) smat.ActionID { func smatAction(name string, f func(ctx smat.Context) (smat.State, error)) func(smat.Context) (smat.State, error) { return func(ctx smat.Context) (smat.State, error) { c := ctx.(*smatContext) - c.actions++ - smatLog(" ", "%s\n", name) + // Snapshot all pairs' bitmaps (base64 of MarshalBinary) before action + rec := actionRecord{Name: name, X: c.x, Y: c.y} + if len(c.pairs) > 0 { + rec.PairSnapshots = make([]string, 0, len(c.pairs)) + for _, pair := range c.pairs { + if pair == nil || pair.bm == nil { + rec.PairSnapshots = append(rec.PairSnapshots, "") + continue + } + b, err := pair.bm.MarshalBinary() + if err != nil { + rec.PairSnapshots = append(rec.PairSnapshots, "") + } else { + rec.PairSnapshots = append(rec.PairSnapshots, base64.StdEncoding.EncodeToString(b)) + } + } + } + + // record per-context last action (no global mutex required) + if c != nil { + c.lastAction = &rec + } + // catch panics inside action to dump a repro and stack before re-panicking + defer func() { + if r := recover(); r != nil { + // best-effort: write quick repro with lastAction from context + var lastAction *actionRecord + if c != nil { + 
lastAction = c.lastAction + } + ts := time.Now().UnixNano() + repro := "// Reproducer generated by smat (panic)\n" + repro += "package roaring\n\n" + repro += "import (\n\t\"encoding/base64\"\n\t\"testing\"\n)\n\n" + repro += fmt.Sprintf("func TestFuzzerPanicRepro_%d(t *testing.T) {\n", ts) + // similar to checkEquals repro + if lastAction != nil && len(lastAction.PairSnapshots) > 0 { + pairIndex := lastAction.X % len(lastAction.PairSnapshots) + if pairIndex < len(lastAction.PairSnapshots) { + snapshot := lastAction.PairSnapshots[pairIndex] + if snapshot != "" && !strings.HasPrefix(snapshot, "<") { + repro += fmt.Sprintf("\tb, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshot) + repro += "\tbm := NewBitmap()\n" + repro += "\tbm.UnmarshalBinary(b)\n" + // perform the action that caused panic + if strings.Contains(lastAction.Name, "setBit") { + repro += fmt.Sprintf("\tbm.AddInt(%d)\n", lastAction.Y) + } else if strings.Contains(lastAction.Name, "removeBit") { + repro += fmt.Sprintf("\tbm.Remove(%d)\n", lastAction.Y) + } else if strings.Contains(lastAction.Name, "flip") { + repro += fmt.Sprintf("\tbm.Flip(uint64(%d), uint64(%d)+1)\n", lastAction.Y, lastAction.Y) + } else if strings.Contains(lastAction.Name, "runOptimize") { + repro += "\tbm.RunOptimize()\n" + } else if strings.Contains(lastAction.Name, "clear") { + repro += "\tbm.Clear()\n" + } else if lastAction.Name == " or" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.Or(bm2)\n" + } + } + } else if lastAction.Name == " and" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := 
lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.And(bm2)\n" + } + } + } else if lastAction.Name == " difference" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.AndNot(bm2)\n" + } + } + } else if lastAction.Name == " xor" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.Xor(bm2)\n" + } + } + } else { + repro += fmt.Sprintf("\t// Unhandled action: %s\n", lastAction.Name) + } + } else { + repro += "\t// invalid snapshot\n" + } + } + } + repro += "}\n" + if path, werr := saveReproFile("smat_panic_repro", ts, repro); werr == nil { + fmt.Printf("wrote panic repro to %s\n", path) + } else { + fmt.Printf("failed writing panic repro: %v\n", werr) + } + fmt.Printf("PANIC in action %s: %v\n", rec.Name, r) + fmt.Printf("stack:\n%s\n", debug.Stack()) + panic(r) + } + }() + + c.actions++ return f(ctx) } } +// saveReproFile writes the given repro content to workdir/__test.go +// or falls back to the OS temp dir. Returns full path or error. 
+func saveReproFile(prefix string, ts int64, content string) (string, error) { + // try workdir + if err := os.MkdirAll("workdir", 0o755); err == nil { + fname := fmt.Sprintf("workdir/%s_%d_test.go", prefix, ts) + if err := os.WriteFile(fname, []byte(content), 0o644); err == nil { + return fname, nil + } + } + // fallback to temp + tmp := os.TempDir() + fname := fmt.Sprintf("%s_%d_test.go", prefix, ts) + full := filepath.Join(tmp, fname) + if err := os.WriteFile(full, []byte(content), 0o644); err == nil { + return full, nil + } else { + return "", err + } +} + // Creates an smat action func based on a simple callback. func smatWrap(cb func(c *smatContext)) func(smat.Context) (next smat.State, err error) { return func(ctx smat.Context) (next smat.State, err error) { @@ -203,10 +332,15 @@ func smatTeardownFunc(ctx smat.Context) (next smat.State, err error) { // ------------------------------------------------------------------ func smatPushPair(c *smatContext) { - c.pairs = append(c.pairs, &smatPair{ - bm: NewBitmap(), - bs: bitset.New(100), - }) + if len(c.pairs) >= max_pairs { + return + } + p := &smatPair{ + bm: NewBitmap(), + bs: bitset.New(100), + ctx: c, + } + c.pairs = append(c.pairs, p) } func smatPopPair(c *smatContext) { @@ -217,6 +351,7 @@ func smatPopPair(c *smatContext) { func smatSetBit(c *smatContext) { c.withPair(c.x, func(p *smatPair) { + p.Validate() y := uint32(c.y) p.bm.AddInt(int(y)) p.bs.Set(uint(y)) @@ -226,6 +361,7 @@ func smatSetBit(c *smatContext) { func smatRemoveBit(c *smatContext) { c.withPair(c.x, func(p *smatPair) { + p.Validate() y := uint32(c.y) p.bm.Remove(y) p.bs.Clear(uint(y)) @@ -236,6 +372,8 @@ func smatRemoveBit(c *smatContext) { func smatAnd(c *smatContext) { c.withPair(c.x, func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() px.bm.And(py.bm) px.bs = px.bs.Intersection(py.bs) px.checkEquals() @@ -247,6 +385,8 @@ func smatAnd(c *smatContext) { func smatOr(c *smatContext) { c.withPair(c.x, 
func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() px.bm.Or(py.bm) px.bs = px.bs.Union(py.bs) px.checkEquals() @@ -255,9 +395,24 @@ func smatOr(c *smatContext) { }) } +func smatXor(c *smatContext) { + c.withPair(c.x, func(px *smatPair) { + c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() + px.bm.Xor(py.bm) + px.bs = px.bs.SymmetricDifference(py.bs) + px.checkEquals() + py.checkEquals() + }) + }) +} + func smatAndCardinality(c *smatContext) { c.withPair(c.x, func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() c0 := px.bm.AndCardinality(py.bm) c1 := px.bs.IntersectionCardinality(py.bs) if c0 != uint64(c1) { @@ -272,6 +427,8 @@ func smatAndCardinality(c *smatContext) { func smatOrCardinality(c *smatContext) { c.withPair(c.x, func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() c0 := px.bm.OrCardinality(py.bm) c1 := px.bs.UnionCardinality(py.bs) if c0 != uint64(c1) { @@ -283,8 +440,25 @@ func smatOrCardinality(c *smatContext) { }) } +func smatXorCardinality(c *smatContext) { + c.withPair(c.x, func(px *smatPair) { + c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() + c0 := px.bm.OrCardinality(py.bm) - px.bm.AndCardinality(py.bm) + c1 := px.bs.SymmetricDifferenceCardinality(py.bs) + if c0 != uint64(c1) { + panic("expected same xor cardinality") + } + px.checkEquals() + py.checkEquals() + }) + }) +} + func smatRunOptimize(c *smatContext) { c.withPair(c.x, func(px *smatPair) { + px.Validate() px.bm.RunOptimize() px.checkEquals() }) @@ -292,6 +466,7 @@ func smatRunOptimize(c *smatContext) { func smatClear(c *smatContext) { c.withPair(c.x, func(px *smatPair) { + px.Validate() px.bm.Clear() px.bs = px.bs.ClearAll() px.checkEquals() @@ -321,6 +496,8 @@ func smatIsEmpty(c *smatContext) { func smatIntersects(c *smatContext) { c.withPair(c.x, func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + 
py.Validate() v0 := px.bm.Intersects(py.bm) v1 := px.bs.IntersectionCardinality(py.bs) > 0 if v0 != v1 { @@ -335,6 +512,7 @@ func smatIntersects(c *smatContext) { func smatFlip(c *smatContext) { c.withPair(c.x, func(p *smatPair) { + p.Validate() y := uint32(c.y) p.bm.Flip(uint64(y), uint64(y)+1) p.bs = p.bs.Flip(uint(y)) @@ -345,6 +523,8 @@ func smatFlip(c *smatContext) { func smatDifference(c *smatContext) { c.withPair(c.x, func(px *smatPair) { c.withPair(c.y, func(py *smatPair) { + px.Validate() + py.Validate() px.bm.AndNot(py.bm) px.bs = px.bs.Difference(py.bs) px.checkEquals() @@ -354,11 +534,164 @@ func smatDifference(c *smatContext) { } func (p *smatPair) checkEquals() { + valid := p.bm.Validate() + if valid != nil { + // marshal current bitmap + var curSnap string + if p != nil && p.bm != nil { + if b, err := p.bm.MarshalBinary(); err == nil { + curSnap = base64.StdEncoding.EncodeToString(b) + } else { + curSnap = "" + } + } else { + curSnap = "" + } + + // collect last action summary from context (per-worker) + last := "" + if p != nil && p.ctx != nil { + c := p.ctx + if c.lastAction != nil { + last = fmt.Sprintf("action=%s x=%d y=%d pairs=%d", c.lastAction.Name, c.lastAction.X, c.lastAction.Y, len(c.lastAction.PairSnapshots)) + } + } + + // If debugging enabled, log extra info + smatLog("ERROR: ", "bitmap invalid: %v\n", valid) + + // build a reproducible test snippet that reconstructs the bitmap and replays the failing action + ts := time.Now().UnixNano() + testName := fmt.Sprintf("TestFuzzerRepro_%d", ts) + repro := "// Reproducer generated by smat\n" + repro += "package roaring\n\n" + repro += "import (\n\t\"encoding/base64\"\n\t\"testing\"\n)\n\n" + repro += fmt.Sprintf("func %s(t *testing.T) {\n", testName) + var lastAction *actionRecord + if p != nil && p.ctx != nil { + lastAction = p.ctx.lastAction + } + // use the snapshot of the modified pair + if lastAction != nil && len(lastAction.PairSnapshots) > 0 { + // assume the modified pair is x % 
len(pairs), but since pairs are in order, and x is lastAction.X + pairIndex := lastAction.X % len(lastAction.PairSnapshots) + if pairIndex < len(lastAction.PairSnapshots) { + snapshot := lastAction.PairSnapshots[pairIndex] + if snapshot != "" && !strings.HasPrefix(snapshot, "<") { + repro += fmt.Sprintf("\tb, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshot) + repro += "\tbm := NewBitmap()\n" + repro += "\tbm.UnmarshalBinary(b)\n" + repro += "\tif err := bm.Validate(); err != nil {\n" + repro += "\t\tt.Errorf(\"Initial Validate failed: %v\", err)\n" + repro += "\t}\n" + // perform the action + if strings.Contains(lastAction.Name, "setBit") { + repro += fmt.Sprintf("\tbm.AddInt(%d)\n", lastAction.Y) + } else if strings.Contains(lastAction.Name, "removeBit") { + repro += fmt.Sprintf("\tbm.Remove(%d)\n", lastAction.Y) + } else if strings.Contains(lastAction.Name, "flip") { + repro += fmt.Sprintf("\tbm.Flip(uint64(%d), uint64(%d)+1)\n", lastAction.Y, lastAction.Y) + } else if strings.Contains(lastAction.Name, "runOptimize") { + repro += "\tbm.RunOptimize()\n" + } else if strings.Contains(lastAction.Name, "clear") { + repro += "\tbm.Clear()\n" + } else if lastAction.Name == " or" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.Or(bm2)\n" + } + } + } else if lastAction.Name == " and" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += 
"\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.And(bm2)\n" + } + } + } else if lastAction.Name == " difference" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.AndNot(bm2)\n" + } + } + } else if lastAction.Name == " xor" { + pairIndexY := lastAction.Y % len(lastAction.PairSnapshots) + if pairIndexY < len(lastAction.PairSnapshots) { + snapshotY := lastAction.PairSnapshots[pairIndexY] + if snapshotY != "" && !strings.HasPrefix(snapshotY, "<") { + repro += fmt.Sprintf("\tb2, _ := base64.StdEncoding.DecodeString(\"%s\")\n", snapshotY) + repro += "\tbm2 := NewBitmap()\n" + repro += "\tbm2.UnmarshalBinary(b2)\n" + repro += "\tbm.Xor(bm2)\n" + } + } + } else { + repro += fmt.Sprintf("\t// Unhandled action: %s\n", lastAction.Name) + } + repro += "\tif err := bm.Validate(); err != nil {\n" + repro += "\t\tt.Errorf(\"Validate failed: %v\", err)\n" + repro += "\t} else {\n" + repro += "\t\tt.Logf(\"Validate succeeded\")\n" + repro += "\t}\n" + } else { + repro += "\t// invalid snapshot\n" + } + } + } + repro += "}\n" + + // print the repro snippet for the developer + fmt.Println() + fmt.Println("=== SMAT REPRODUCER SNIPPET ===") + if len(repro) > 10000 { + fmt.Println("// Reproducer too large, skipping full print") + } else { + fmt.Println(repro) + } + + // also write the repro snippet to a timestamped file in workdir/ + if len(repro) > 10000 { + repro = "// Reproducer too large, skipping\n" + } + if err := os.MkdirAll("workdir", 0o755); err == nil { + fname := fmt.Sprintf("workdir/smat_repro_%d_test.go", ts) + if werr := os.WriteFile(fname, []byte(repro), 0o644); werr == nil { + 
fmt.Printf("Wrote repro to %s\n", fname) + } else { + fmt.Printf("Failed writing repro file: %v\n", werr) + } + } else { + fmt.Printf("Failed creating workdir: %v\n", err) + } + + panic(fmt.Sprintf("[checkEquals] bitmap invalid: %v\ncurrentBase64:%s\nlastAction:%s\n", valid, curSnap, last)) + } if !p.equalsBitSet(p.bs, p.bm) { panic("bitset mismatch") } } +func (p *smatPair) Validate() { + valid := p.bm.Validate() + if valid != nil { + panic(fmt.Sprintf("[Validate] bitmap invalid: %v", valid)) + } +} + func (p *smatPair) equalsBitSet(a *bitset.BitSet, b *Bitmap) bool { for i, e := a.NextSet(0); e; i, e = a.NextSet(i + 1) { if !b.ContainsInt(int(i)) { diff --git a/vendor/github.com/RoaringBitmap/roaring/v2/util.go b/vendor/github.com/RoaringBitmap/roaring/v2/util.go index f58a86b2ed..031dfa307d 100644 --- a/vendor/github.com/RoaringBitmap/roaring/v2/util.go +++ b/vendor/github.com/RoaringBitmap/roaring/v2/util.go @@ -1,9 +1,10 @@ package roaring import ( + "cmp" "math" "math/rand" - "sort" + "slices" ) const ( @@ -123,7 +124,7 @@ func combineLoHi16(lob uint16, hob uint16) uint32 { } func combineLoHi32(lob uint32, hob uint32) uint32 { - return uint32(lob) | (hob << 16) + return lob | (hob << 16) } const maxLowBit = 0xFFFF @@ -264,19 +265,13 @@ type ph struct { rand int } -type pha []ph - -func (p pha) Len() int { return len(p) } -func (p pha) Less(i, j int) bool { return p[i].rand < p[j].rand } -func (p pha) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - func getRandomPermutation(n int) []int { r := make([]ph, n) for i := 0; i < n; i++ { r[i].orig = i r[i].rand = rand.Intn(1 << 29) } - sort.Sort(pha(r)) + slices.SortFunc(r, func(a, b ph) int { return cmp.Compare(a.rand, b.rand) }) m := make([]int, n) for i := range m { m[i] = r[i].orig diff --git a/vendor/github.com/bits-and-blooms/bitset/README.md b/vendor/github.com/bits-and-blooms/bitset/README.md index b245facb7e..599982f2ee 100644 --- a/vendor/github.com/bits-and-blooms/bitset/README.md +++ 
b/vendor/github.com/bits-and-blooms/bitset/README.md @@ -164,3 +164,13 @@ Before committing the code, please check if it passes tests, has adequate covera go test go test -cover ``` + +## Stars + + +[![Star History Chart](https://api.star-history.com/svg?repos=bits-and-blooms/bitset&type=Date)](https://www.star-history.com/#bits-and-blooms/bitset&Date) + +## Further reading + +

Mastering Programming: From Testing to Performance in Go

+
diff --git a/vendor/github.com/bits-and-blooms/bitset/bitset.go b/vendor/github.com/bits-and-blooms/bitset/bitset.go index 46d05b9ed8..c8d8ddba8c 100644 --- a/vendor/github.com/bits-and-blooms/bitset/bitset.go +++ b/vendor/github.com/bits-and-blooms/bitset/bitset.go @@ -905,7 +905,9 @@ func (b *BitSet) DifferenceCardinality(compare *BitSet) uint { l = b.wordCount() } cnt := uint64(0) - cnt += popcntMaskSlice(b.set[:l], compare.set[:l]) + if l > 0 { + cnt += popcntMaskSlice(b.set[:l], compare.set[:l]) + } cnt += popcntSlice(b.set[l:]) return uint(cnt) } @@ -960,6 +962,9 @@ func (b *BitSet) Intersection(compare *BitSet) (result *BitSet) { func (b *BitSet) IntersectionCardinality(compare *BitSet) uint { panicIfNull(b) panicIfNull(compare) + if b.length == 0 || compare.length == 0 { + return 0 + } b, compare = sortByLength(b, compare) cnt := popcntAndSlice(b.set, compare.set) return uint(cnt) @@ -1016,7 +1021,10 @@ func (b *BitSet) UnionCardinality(compare *BitSet) uint { panicIfNull(b) panicIfNull(compare) b, compare = sortByLength(b, compare) - cnt := popcntOrSlice(b.set, compare.set) + cnt := uint64(0) + if len(b.set) > 0 { + cnt += popcntOrSlice(b.set, compare.set) + } if len(compare.set) > len(b.set) { cnt += popcntSlice(compare.set[len(b.set):]) } @@ -1071,7 +1079,10 @@ func (b *BitSet) SymmetricDifferenceCardinality(compare *BitSet) uint { panicIfNull(b) panicIfNull(compare) b, compare = sortByLength(b, compare) - cnt := popcntXorSlice(b.set, compare.set) + cnt := uint64(0) + if len(b.set) > 0 { + cnt += popcntXorSlice(b.set, compare.set) + } if len(compare.set) > len(b.set) { cnt += popcntSlice(compare.set[len(b.set):]) } @@ -1473,7 +1484,7 @@ func (b *BitSet) ShiftLeft(bits uint) { dst := b.set // not using extendSet() to avoid unneeded data copying - nsize := wordsNeeded(top + bits) + nsize := wordsNeeded(top + bits + 1) if len(b.set) < nsize { dst = make([]uint64, nsize) } @@ -1520,7 +1531,7 @@ func (b *BitSet) ShiftRight(bits uint) { return } - if bits >= 
top { + if bits > top { b.set = make([]uint64, wordsNeeded(b.length)) return } diff --git a/vendor/github.com/bits-and-blooms/bitset/popcnt.go b/vendor/github.com/bits-and-blooms/bitset/popcnt.go index 93492390bf..a6d62f7239 100644 --- a/vendor/github.com/bits-and-blooms/bitset/popcnt.go +++ b/vendor/github.com/bits-and-blooms/bitset/popcnt.go @@ -2,58 +2,51 @@ package bitset import "math/bits" -func popcntSlice(s []uint64) uint64 { - var cnt int +func popcntSlice(s []uint64) (cnt uint64) { for _, x := range s { - cnt += bits.OnesCount64(x) + cnt += uint64(bits.OnesCount64(x)) } - return uint64(cnt) + return } -func popcntMaskSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } +func popcntMaskSlice(s, m []uint64) (cnt uint64) { + // The next line is to help the bounds checker, it matters! + _ = m[len(s)-1] // BCE for i := range s { - cnt += bits.OnesCount64(s[i] &^ m[i]) + cnt += uint64(bits.OnesCount64(s[i] &^ m[i])) } - return uint64(cnt) + return } -func popcntAndSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } +// popcntAndSlice computes the population count of the AND of two slices. +// It assumes that len(m) >= len(s) > 0. +func popcntAndSlice(s, m []uint64) (cnt uint64) { + // The next line is to help the bounds checker, it matters! + _ = m[len(s)-1] // BCE for i := range s { - cnt += bits.OnesCount64(s[i] & m[i]) + cnt += uint64(bits.OnesCount64(s[i] & m[i])) } - return uint64(cnt) + return } -func popcntOrSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } +// popcntOrSlice computes the population count of the OR of two slices. +// It assumes that len(m) >= len(s) > 0. 
+func popcntOrSlice(s, m []uint64) (cnt uint64) { + // The next line is to help the bounds checker, it matters! + _ = m[len(s)-1] // BCE for i := range s { - cnt += bits.OnesCount64(s[i] | m[i]) + cnt += uint64(bits.OnesCount64(s[i] | m[i])) } - return uint64(cnt) + return } -func popcntXorSlice(s, m []uint64) uint64 { - var cnt int - // this explicit check eliminates a bounds check in the loop - if len(m) < len(s) { - panic("mask slice is too short") - } +// popcntXorSlice computes the population count of the XOR of two slices. +// It assumes that len(m) >= len(s) > 0. +func popcntXorSlice(s, m []uint64) (cnt uint64) { + // The next line is to help the bounds checker, it matters! + _ = m[len(s)-1] // BCE for i := range s { - cnt += bits.OnesCount64(s[i] ^ m[i]) + cnt += uint64(bits.OnesCount64(s[i] ^ m[i])) } - return uint64(cnt) + return } diff --git a/vendor/github.com/blevesearch/bleve/v2/README.md b/vendor/github.com/blevesearch/bleve/v2/README.md index 47ff007732..c0e85e53a3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/README.md +++ b/vendor/github.com/blevesearch/bleve/v2/README.md @@ -24,6 +24,7 @@ A modern indexing + search library in GO * [geo spatial search](https://github.com/blevesearch/bleve/blob/master/geo/README.md) * approximate k-nearest neighbors via [vector search](https://github.com/blevesearch/bleve/blob/master/docs/vectors.md) * [synonym search](https://github.com/blevesearch/bleve/blob/master/docs/synonyms.md) + * [hierarchical nested search](https://github.com/blevesearch/bleve/blob/master/docs/hierarchy.md) * [tf-idf](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#tf-idf) / [bm25](https://github.com/blevesearch/bleve/blob/master/docs/scoring.md#bm25) scoring models * Hybrid search: exact + semantic * Supports [RRF (Reciprocal Rank Fusion) and RSF (Relative Score Fusion)](docs/score_fusion.md) diff --git a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go 
b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go index 5df940e5ee..9040e02830 100644 --- a/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go +++ b/vendor/github.com/blevesearch/bleve/v2/analysis/analyzer/custom/custom.go @@ -140,7 +140,7 @@ func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType st if ok { stringSlice[i] = stringObj } else { - return nil, fmt.Errorf(objType + " name must be a string") + return nil, fmt.Errorf("%s name must be a string", objType) } } diff --git a/vendor/github.com/blevesearch/bleve/v2/builder.go b/vendor/github.com/blevesearch/bleve/v2/builder.go index f170317ee2..5398739f52 100644 --- a/vendor/github.com/blevesearch/bleve/v2/builder.go +++ b/vendor/github.com/blevesearch/bleve/v2/builder.go @@ -73,7 +73,10 @@ func newBuilder(path string, mapping mapping.IndexMapping, config map[string]int // do not use real config, as these are options for the builder, // not the resulting index - meta := newIndexMeta(scorch.Name, scorch.Name, map[string]interface{}{}) + meta, err := newIndexMeta(scorch.Name, scorch.Name, map[string]interface{}{}, path) + if err != nil { + return nil, err + } err = meta.Save(path) if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/bleve/v2/document/document.go b/vendor/github.com/blevesearch/bleve/v2/document/document.go index 569d57bd68..cb3f537870 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/document.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/document.go @@ -30,8 +30,9 @@ func init() { } type Document struct { - id string `json:"id"` - Fields []Field `json:"fields"` + id string + Fields []Field `json:"fields"` + NestedDocuments []*Document `json:"nested_documents"` CompositeFields []*CompositeField StoredFieldsSize uint64 indexed bool @@ -68,6 +69,12 @@ func (d *Document) Size() int { sizeInBytes += entry.Size() } + for _, entry := range d.NestedDocuments { + if entry != nil { + 
sizeInBytes += entry.Size() + } + } + return sizeInBytes } @@ -111,6 +118,11 @@ func (d *Document) NumPlainTextBytes() uint64 { } } } + for _, nestedDoc := range d.NestedDocuments { + if nestedDoc != nil { + rv += nestedDoc.NumPlainTextBytes() + } + } return rv } @@ -157,3 +169,13 @@ func (d *Document) SetIndexed() { func (d *Document) Indexed() bool { return d.indexed } + +func (d *Document) AddNestedDocument(doc *Document) { + d.NestedDocuments = append(d.NestedDocuments, doc) +} + +func (d *Document) VisitNestedDocuments(visitor func(doc index.Document)) { + for _, doc := range d.NestedDocuments { + visitor(doc) + } +} diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go b/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go index 5795043f2f..ef8938f70d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_geopoint.go @@ -180,6 +180,15 @@ func NewGeoPointField(name string, arrayPositions []uint64, lon, lat float64) *G func NewGeoPointFieldWithIndexingOptions(name string, arrayPositions []uint64, lon, lat float64, options index.FieldIndexingOptions) *GeoPointField { mhash := geo.MortonHash(lon, lat) prefixCoded := numeric.MustNewPrefixCodedInt64(int64(mhash), 0) + + // docvalues are always enabled for geopoint fields, even if the + // indexing options are set to not include docvalues. + // snappy compression and chunking are always skipped for geopoint + // to avoid mem copies and faster lookups. 
+ options |= index.DocValues + options |= index.SkipDVChunking + options |= index.SkipDVCompression + return &GeoPointField{ name: name, arrayPositions: arrayPositions, diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go index 6282ff12b3..2eb7aa3f2b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_geoshape.go @@ -180,7 +180,11 @@ func NewGeoShapeFieldFromShapeWithIndexingOptions(name string, arrayPositions [] // docvalues are always enabled for geoshape fields, even if the // indexing options are set to not include docvalues. + // snappy compression and chunking are always skipped for geoshape + // to avoid mem copies and faster lookups. options |= index.DocValues + options |= index.SkipDVChunking + options |= index.SkipDVCompression return &GeoShapeField{ shape: shape, @@ -232,7 +236,11 @@ func NewGeometryCollectionFieldFromShapesWithIndexingOptions(name string, // docvalues are always enabled for geoshape fields, even if the // indexing options are set to not include docvalues. + // snappy compression and chunking are always skipped for geoshape + // to avoid mem copies and faster lookups. options |= index.DocValues + options |= index.SkipDVChunking + options |= index.SkipDVCompression return &GeoShapeField{ shape: shape, diff --git a/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go index 4c20013c77..136b79bbcb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go +++ b/vendor/github.com/blevesearch/bleve/v2/document/field_vector.go @@ -114,6 +114,13 @@ func NewVectorFieldWithIndexingOptions(name string, arrayPositions []uint64, // skip freq/norms for vector field options |= index.SkipFreqNorm + // bivf-sq8 indexes only supports hamming distance for the primary + // binary index. 
Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons + if index.OptimizationRequiresBinaryIndex(vectorIndexOptimizedFor) { + similarity = index.CosineSimilarity + } + return &VectorField{ name: name, dims: dims, diff --git a/vendor/github.com/blevesearch/bleve/v2/error.go b/vendor/github.com/blevesearch/bleve/v2/error.go index b57a61543d..e5ffe07e97 100644 --- a/vendor/github.com/blevesearch/bleve/v2/error.go +++ b/vendor/github.com/blevesearch/bleve/v2/error.go @@ -28,6 +28,7 @@ const ( ErrorIndexReadInconsistency ErrorTwoPhaseSearchInconsistency ErrorSynonymSearchNotSupported + ErrorTrainingNotSupported ) // Error represents a more strongly typed bleve error for detecting diff --git a/vendor/github.com/blevesearch/bleve/v2/index.go b/vendor/github.com/blevesearch/bleve/v2/index.go index 2f1ba5fbf1..889da1faea 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index.go @@ -389,6 +389,11 @@ type SynonymIndex interface { IndexSynonym(id string, collection string, definition *SynonymDefinition) error } +type IndexWithCallbacks interface { + FileWriterIDsInUse() (map[string]struct{}, error) + DropFileWriterIDs(ids map[string]struct{}) error +} + type InsightsIndex interface { Index // TermFrequencies returns the tokens ordered by frequencies for the field index. @@ -396,3 +401,13 @@ type InsightsIndex interface { // CentroidCardinalities returns the centroids (clusters) from IVF indexes ordered by data density. 
CentroidCardinalities(field string, limit int, desceding bool) ([]index.CentroidCardinality, error) } + +type TrainableIndex interface { + Index + Train(*Batch) error +} + +type IndexFileCopyable interface { + SetPathInBolt(key []byte, value []byte) error //dest index + CopyFile(file string, d index.IndexDirectory) error // source index +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go index d4d8e9c075..bd78f51e99 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/builder.go @@ -20,9 +20,9 @@ import ( "sync" "github.com/RoaringBitmap/roaring/v2" + "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" - bolt "go.etcd.io/bbolt" ) const DefaultBuilderBatchSize = 1000 @@ -291,7 +291,7 @@ func (o *Builder) Close() error { // create the root bolt rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt" - rootBolt, err := bolt.Open(rootBoltPath, 0600, nil) + rootBolt, err := util.OpenBolt(rootBoltPath, 0600, nil) if err != nil { return err } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go index cb11d5072d..5f0ea19952 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/introducer.go @@ -154,22 +154,24 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { cachedDocs: root.segment[i].cachedDocs, cachedMeta: root.segment[i].cachedMeta, creator: root.segment[i].creator, + internal: root.segment[i].internal, } // apply new obsoletions if root.segment[i].deleted == nil { newss.deleted = delta } else { - if delta.IsEmpty() { - newss.deleted = root.segment[i].deleted - } else { - newss.deleted = 
roaring.Or(root.segment[i].deleted, delta) - } + newss.deleted = roaring.Or(root.segment[i].deleted, delta) } if newss.deleted.IsEmpty() { newss.deleted = nil } + // update the deleted bitmap to include any nested/sub-documents as well + // if the segment supports that + if ns, ok := newss.segment.(segment.NestedSegment); ok { + newss.deleted = ns.AddNestedDocuments(newss.deleted) + } // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) @@ -201,6 +203,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { stats: next.stats, cachedDocs: &cachedDocs{cache: nil}, cachedMeta: &cachedMeta{meta: nil}, + internal: make(map[string][]byte), creator: "introduceSegment", } newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) @@ -210,6 +213,12 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // queued for persistence. atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) + + // track the internal values of this segment so that when we update the + // bolt we keep the internal values in sync with the segments on disk, and + // if this segment didn't get persisted we need to undo that info from the + // indexSnapshot's internal map as part of the bolt update. 
+ newSegmentSnapshot.internal = next.internal } // copy old values for key, oldVal := range root.internal { @@ -398,6 +407,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { cachedDocs: root.segment[i].cachedDocs, cachedMeta: root.segment[i].cachedMeta, creator: root.segment[i].creator, + internal: root.segment[i].internal, }) root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go index e17288410e..0101dd1e36 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/merge.go @@ -31,6 +31,19 @@ import ( const merger = "merger" +// used in the context of mergerCtrl to provide a way to verify +// the completion of a merge operation +const mergeDoneKey = "mergeDone" + +type mergeDoneChan chan error + +// used in the context of mergerCtrl to provide a way to use +// a custom merge plan instead of the one generated by the +// default merge planner +const mergePlanFuncKey = "mergePlanFunc" + +type mergePlanFunc func(*IndexSnapshot) (*mergeplan.MergePlan, error) + func (s *Scorch) mergerLoop() { defer func() { if r := recover(); r != nil { @@ -95,11 +108,9 @@ OUTER: continue OUTER } - startTime := time.Now() - // lets get started - err := s.planMergeAtSnapshot(ctrlMsg.ctx, ctrlMsg.options, - ourSnapshot) + startTime := time.Now() + err := s.planMergeAtSnapshot(ctrlMsg, ourSnapshot) if err != nil { atomic.StoreUint64(&s.iStats.mergeEpoch, 0) if err == segment.ErrClosed { @@ -286,42 +297,64 @@ func (w *closeChWrapper) listen() { } } -func (s *Scorch) planMergeAtSnapshot(ctx context.Context, - options *mergeplan.MergePlanOptions, ourSnapshot *IndexSnapshot) error { - // build list of persisted segments in this snapshot - var onlyPersistedSnapshots []mergeplan.Segment - for _, segmentSnapshot := range ourSnapshot.segment 
{ - if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { - onlyPersistedSnapshots = append(onlyPersistedSnapshots, segmentSnapshot) +// planMergeAtSnapshot plans and executes the merge operations for a given snapshot +// if there is a custom merge plan function provided, it uses that to get the merge plan +// otherwise, it builds the merge plan using the default planner and executes the merge tasks in the plan. +func (s *Scorch) planMergeAtSnapshot(ctrlMsg *mergerCtrl, ourSnapshot *IndexSnapshot) error { + var mergePlan *mergeplan.MergePlan + // if a merge plan function is provided in the context, use it to get the merge plan + if mergePlanFunc, ok := ctrlMsg.ctx.Value(mergePlanFuncKey).(mergePlanFunc); ok { + var err error + mergePlan, err = mergePlanFunc(ourSnapshot) + if err != nil { + atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) + return fmt.Errorf("merge planning err: %v", err) } } - atomic.AddUint64(&s.stats.TotFileMergePlan, 1) + // default to making a merge plan if a custom one is not provided + if mergePlan == nil { + // build list of persisted segments in this snapshot + var onlyPersistedSnapshots []mergeplan.Segment + for _, segmentSnapshot := range ourSnapshot.segment { + if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { + onlyPersistedSnapshots = append(onlyPersistedSnapshots, segmentSnapshot) + } + } - // give this list to the planner - resultMergePlan, err := mergeplan.Plan(onlyPersistedSnapshots, options) - if err != nil { - atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) - return fmt.Errorf("merge planning err: %v", err) - } - if resultMergePlan == nil { - // nothing to do - atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) - return nil - } - atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) + atomic.AddUint64(&s.stats.TotFileMergePlan, 1) - atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) + // give this list to the planner + var err error + mergePlan, err = 
mergeplan.Plan(onlyPersistedSnapshots, ctrlMsg.options) + if err != nil { + atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) + return fmt.Errorf("merge planning err: %v", err) + } + if mergePlan == nil { + // nothing to do + atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) + return nil + } + } - // process tasks in serial for now - var filenames []string + atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) + atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(mergePlan.Tasks))) - cw := newCloseChWrapper(s.closeCh, ctx) + cw := newCloseChWrapper(s.closeCh, ctrlMsg.ctx) defer cw.close() - go cw.listen() - for _, task := range resultMergePlan.Tasks { + var filenames []string + var err error + defer func() { + // send error to done channel if present + if done, ok := cw.ctx.Value(mergeDoneKey).(chan error); ok { + done <- err + } + }() + + for _, task := range mergePlan.Tasks { if len(task.Segments) == 0 { atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) continue @@ -329,7 +362,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) - oldMap := make(map[uint64]*SegmentSnapshot, len(task.Segments)) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]segment.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) @@ -337,7 +369,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { - oldMap[segSnapshot.id] = segSnapshot mergedSegHistory[segSnapshot.id] = &mergedSegmentHistory{ workerID: 0, oldSegment: segSnapshot, @@ -345,7 +376,6 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, if persistedSeg, ok := segSnapshot.segment.(segment.PersistedSegment); ok { if segSnapshot.LiveSize() == 0 { atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) - oldMap[segSnapshot.id] = nil 
delete(mergedSegHistory, segSnapshot.id) } else { segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) @@ -372,8 +402,9 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) prevBytesReadTotal := cumulateBytesRead(segmentsToMerge) - newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - cw.cancelCh, s) + var newDocNums [][]uint64 + newDocNums, _, err = s.segPlugin.MergeUsing(segmentsToMerge, docsToDrop, path, + cw.cancelCh, s, s.segmentConfig) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -391,7 +422,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return fmt.Errorf("merging failed: %v", err) } - seg, err = s.segPlugin.Open(path) + seg, err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -425,7 +456,8 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, select { case <-s.closeCh: _ = seg.Close() - return segment.ErrClosed + err = segment.ErrClosed + return err case s.merges <- sm: atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } @@ -540,7 +572,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, // the newly merged segment is already flushed out to disk, just needs // to be opened using mmap. 
newDocIDs, _, err := - s.segPlugin.Merge(segsBatch, dropsBatch, path, s.closeCh, s) + s.segPlugin.MergeUsing(segsBatch, dropsBatch, path, s.closeCh, s, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) @@ -555,7 +587,7 @@ func (s *Scorch) mergeAndPersistInMemorySegments(snapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) newMergedSegmentIDs[id] = newSegmentID newDocIDsSet[id] = newDocIDs - newMergedSegments[id], err = s.segPlugin.Open(path) + newMergedSegments[id], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { em.Lock() errs = append(errs, err) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go index 20a0706ef9..658fb08dd6 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize.go @@ -395,5 +395,7 @@ func (i *IndexSnapshot) unadornedTermFieldReader( recycle: false, // signal downstream that this is a special unadorned termFieldReader unadorned: true, + // unadorned TFRs do not require bytes read tracking + updateBytesRead: false, } } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go index affb4ff13b..29f80b4ccb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/optimize_knn.go @@ -34,8 +34,6 @@ type OptimizeVR struct { totalCost uint64 // maps field to vector readers vrs map[string][]*IndexSnapshotVectorReader - // if at least one of the vector readers requires filtered kNN. - requiresFiltering bool } // This setting _MUST_ only be changed during init and not after. 
@@ -85,8 +83,7 @@ func (o *OptimizeVR) Finish() error { continue } - vecIndex, err := segment.InterpretVectorIndex(field, - o.requiresFiltering, origSeg.deleted) + vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted) if err != nil { errorsM.Lock() errors = append(errors, err) @@ -109,7 +106,7 @@ func (o *OptimizeVR) Finish() error { // kNN search. if vr.eligibleSelector != nil { pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k, - vr.eligibleSelector.SegmentEligibleDocs(index), vr.searchParams) + vr.eligibleSelector.SegmentEligibleDocuments(index), vr.searchParams) } else { pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams) } @@ -163,9 +160,6 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context, return octx, nil } o.ctx = ctx - if !o.requiresFiltering { - o.requiresFiltering = s.eligibleSelector != nil - } if o.snapshot != s.snapshot { o.invokeSearcherEndCallback() diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go index d0c013a1d4..6b7eee5963 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/persister.go @@ -425,7 +425,6 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste var totSize int var numSegsToFlushOut int var totDocs uint64 - // legacy behaviour of merge + flush of all in-memory segments in one-shot if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) { val := &flushable{ @@ -538,10 +537,15 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste exclude := make(map[uint64]struct{}) // copy to the equiv the segments that weren't replaced - for _, segment := range snapshot.segment { - if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged { - equiv.segment = append(equiv.segment, segment) - exclude[segment.id] = struct{}{} + for _, ss := 
range snapshot.segment { + if _, wasMerged := mergedSegmentIDs[ss.id]; !wasMerged { + equiv.segment = append(equiv.segment, ss) + // this can be either in-memory or persisted segment, but while + // preparing the bolt snapshot we avoid the in-memory segments to be + // flushed out + if _, ok := ss.segment.(segment.PersistedSegment); !ok { + exclude[ss.id] = struct{}{} + } } } @@ -549,10 +553,11 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persiste for _, segment := range newSnapshot.segment { if _, ok := newMergedSegmentIDs[segment.id]; ok { equiv.segment = append(equiv.segment, &SegmentSnapshot{ - id: segment.id, - segment: segment.segment, - deleted: nil, // nil since merging handled deletions - stats: nil, + id: segment.id, + segment: segment.segment, + deleted: nil, // nil since merging handled deletions + stats: nil, + internal: nil, // segment is persisted and equiv is already updated }) } } @@ -575,6 +580,11 @@ func copyToDirectory(srcPath string, d index.Directory) (int64, error) { return 0, fmt.Errorf("GetWriter err: %v", err) } + // skip + if dest == nil { + return 0, nil + } + sourceFileStat, err := os.Stat(srcPath) if err != nil { return 0, err @@ -616,9 +626,8 @@ func persistToDirectory(seg segment.UnpersistedSegment, d index.Directory, return err } -func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, - segPlugin SegmentPlugin, exclude map[uint64]struct{}, d index.Directory) ( - []string, map[uint64]string, error) { +func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *util.BoltTxImpl, path string, segPlugin SegmentPlugin, + exclude map[uint64]struct{}, d index.Directory) ([]string, map[uint64]string, error) { snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) if err != nil { return nil, nil, err @@ -634,13 +643,29 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, if err != nil { return nil, nil, err } - err = 
metaBucket.Put(util.BoltMetaDataSegmentTypeKey, []byte(segPlugin.Type())) + err = metaBucket.Put(util.BoltMetaDataSegmentTypeKey, []byte(segPlugin.Type()), nil) if err != nil { return nil, nil, err } buf := make([]byte, binary.MaxVarintLen32) binary.BigEndian.PutUint32(buf, segPlugin.Version()) - err = metaBucket.Put(util.BoltMetaDataSegmentVersionKey, buf) + err = metaBucket.Put(util.BoltMetaDataSegmentVersionKey, buf, nil) + if err != nil { + return nil, nil, err + } + // always obtain the path from the parent snapshot if available + // since that is the primary source of truth for context + if snapshot.parent != nil { + path = snapshot.parent.path + } + writer, err := util.NewFileWriter( + []byte(path + string(os.PathSeparator) + "root.bolt")) + if err != nil { + return nil, nil, err + } + + // persist the writer ID used for the bolt snapshot + err = metaBucket.Put(util.BoltMetaDataFileWriterIDKey, []byte(writer.Id()), writer) if err != nil { return nil, nil, err } @@ -654,7 +679,7 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, if err != nil { return nil, nil, err } - err = metaBucket.Put(util.BoltMetaDataTimeStamp, timeStampBinary) + err = metaBucket.Put(util.BoltMetaDataTimeStamp, timeStampBinary, writer) if err != nil { return nil, nil, err } @@ -664,19 +689,19 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, if err != nil { return nil, nil, err } - // TODO optimize writing these in order? 
+ + // deep copy the internal map since we'll be keeping only the persisted info + // in bolt and some of the information might be deleted + internal := make(map[string][]byte, len(snapshot.internal)) for k, v := range snapshot.internal { - err = internalBucket.Put([]byte(k), v) - if err != nil { - return nil, nil, err - } + internal[k] = v } if snapshot.parent != nil { val := make([]byte, 8) bytesWritten := atomic.LoadUint64(&snapshot.parent.stats.TotBytesWrittenAtIndexTime) binary.LittleEndian.PutUint64(val, bytesWritten) - err = internalBucket.Put(util.TotBytesWrittenKey, val) + err = internalBucket.Put(util.TotBytesWrittenKey, val, writer) if err != nil { return nil, nil, err } @@ -687,79 +712,112 @@ func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, // first ensure that each segment in this snapshot has been persisted for _, segmentSnapshot := range snapshot.segment { - snapshotSegmentKey := encodeUvarintAscending(nil, segmentSnapshot.id) - snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) - if err != nil { - return nil, nil, err - } + var persistedSeg bool + var snapshotSegmentBucket *util.BoltBucketImpl switch seg := segmentSnapshot.segment.(type) { case segment.PersistedSegment: + snapshotSegmentKey := encodeUvarintAscending(nil, segmentSnapshot.id) + snapshotSegmentBucket, err = snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) + if err != nil { + return nil, nil, err + } segPath := seg.Path() _, err = copyToDirectory(segPath, d) if err != nil { return nil, nil, fmt.Errorf("segment: %s copy err: %v", segPath, err) } filename := filepath.Base(segPath) - err = snapshotSegmentBucket.Put(util.BoltPathKey, []byte(filename)) + err = snapshotSegmentBucket.Put(util.BoltPathKey, []byte(filename), writer) if err != nil { return nil, nil, err } filenames = append(filenames, filename) + persistedSeg = true case segment.UnpersistedSegment: // need to persist this to disk if its not part of exclude 
list (which // restricts which in-memory segment to be persisted to disk) if _, ok := exclude[segmentSnapshot.id]; !ok { + snapshotSegmentKey := encodeUvarintAscending(nil, segmentSnapshot.id) + snapshotSegmentBucket, err = snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) + if err != nil { + return nil, nil, err + } filename := zapFileName(segmentSnapshot.id) path := filepath.Join(path, filename) - err := persistToDirectory(seg, d, path) + err = persistToDirectory(seg, d, path) if err != nil { return nil, nil, fmt.Errorf("segment: %s persist err: %v", path, err) } newSegmentPaths[segmentSnapshot.id] = path - err = snapshotSegmentBucket.Put(util.BoltPathKey, []byte(filename)) + err = snapshotSegmentBucket.Put(util.BoltPathKey, []byte(filename), nil) if err != nil { return nil, nil, err } filenames = append(filenames, filename) + persistedSeg = true + } else { + // this segment is not going to be persisted in this cycle, so any + // of the corresponding internal values need to be removed since + // on recovery they shouldn't be loaded as part of the indexSnapshot + for k, v := range segmentSnapshot.internal { + if v != nil { + delete(internal, k) + } + } } default: return nil, nil, fmt.Errorf("unknown segment type: %T", seg) } - // store current deleted bits - var roaringBuf bytes.Buffer - if segmentSnapshot.deleted != nil { - _, err = segmentSnapshot.deleted.WriteTo(&roaringBuf) - if err != nil { - return nil, nil, fmt.Errorf("error persisting roaring bytes: %v", err) - } - err = snapshotSegmentBucket.Put(util.BoltDeletedKey, roaringBuf.Bytes()) - if err != nil { - return nil, nil, err + + // if the segment was excluded from persistence, then skip updating the metadata + // or helper data corresponding to it - we need to keep things in-line with + // the on-disk information + if persistedSeg { + // store current deleted bits + var roaringBuf bytes.Buffer + if segmentSnapshot.deleted != nil { + _, err = segmentSnapshot.deleted.WriteTo(&roaringBuf) + if err 
!= nil { + return nil, nil, fmt.Errorf("error persisting roaring bytes: %v", err) + } + err = snapshotSegmentBucket.Put(util.BoltDeletedKey, roaringBuf.Bytes(), writer) + if err != nil { + return nil, nil, err + } } - } - // store segment stats - if segmentSnapshot.stats != nil { - b, err := json.Marshal(segmentSnapshot.stats.Fetch()) - if err != nil { - return nil, nil, err + // store segment stats + if segmentSnapshot.stats != nil { + statsBytes, err := json.Marshal(segmentSnapshot.stats.Fetch()) + if err != nil { + return nil, nil, err + } + err = snapshotSegmentBucket.Put(util.BoltStatsKey, statsBytes, writer) + if err != nil { + return nil, nil, err + } } - err = snapshotSegmentBucket.Put(util.BoltStatsKey, b) - if err != nil { - return nil, nil, err + + // store updated field info + if segmentSnapshot.updatedFields != nil { + updatedFieldsBytes, err := json.Marshal(segmentSnapshot.updatedFields) + if err != nil { + return nil, nil, err + } + err = snapshotSegmentBucket.Put( + util.BoltUpdatedFieldsKey, updatedFieldsBytes, writer) + if err != nil { + return nil, nil, err + } } } + } - // store updated field info - if segmentSnapshot.updatedFields != nil { - b, err := json.Marshal(segmentSnapshot.updatedFields) - if err != nil { - return nil, nil, err - } - err = snapshotSegmentBucket.Put(util.BoltUpdatedFieldsKey, b) - if err != nil { - return nil, nil, err - } + // now the internal values are reflective of the on-disk data, update in bolt + for k, v := range internal { + err = internalBucket.Put([]byte(k), v, writer) + if err != nil { + return nil, nil, err } } @@ -804,7 +862,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = s.segPlugin.Open(path) + newSegments[segmentID], err = s.segPlugin.OpenUsing(path, s.segmentConfig) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -854,9 +912,8 @@ func 
zapFileName(epoch uint64) string { } // bolt snapshot code - func (s *Scorch) loadFromBolt() error { - err := s.rootBolt.View(func(tx *bolt.Tx) error { + err := s.rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil @@ -873,7 +930,7 @@ func (s *Scorch) loadFromBolt() error { s.AddEligibleForRemoval(snapshotEpoch) continue } - snapshot := snapshots.Bucket(k) + snapshot := snapshots.GetBucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) s.AddEligibleForRemoval(snapshotEpoch) @@ -904,6 +961,17 @@ func (s *Scorch) loadFromBolt() error { foundRoot = true } + + // try init trainer and load the trained data + if trainer := initTrainer(s, s.config); trainer != nil { + s.trainer = trainer + trainerBucket := snapshots.GetBucket(util.BoltTrainerKey) + err := s.trainer.loadTrainedData(trainerBucket) + if err != nil { + return err + } + } + return nil }) if err != nil { @@ -921,13 +989,13 @@ func (s *Scorch) loadFromBolt() error { // LoadSnapshot loads the segment with the specified epoch // NOTE: this is currently ONLY intended to be used by the command-line tool func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { - err = s.rootBolt.View(func(tx *bolt.Tx) error { + err = s.rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil } snapshotKey := encodeUvarintAscending(nil, epoch) - snapshot := snapshots.Bucket(snapshotKey) + snapshot := snapshots.GetBucket(snapshotKey) if snapshot == nil { return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch) } @@ -940,7 +1008,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { return rv, nil } -func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { +func (s *Scorch) loadSnapshot(snapshot *util.BoltBucketImpl) (*IndexSnapshot, error) { rv := &IndexSnapshot{ 
parent: s, internal: make(map[string][]byte), @@ -950,45 +1018,64 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { // first we look for the meta-data bucket, this will tell us // which segment type/version was used for this snapshot // all operations for this scorch will use this type/version - metaBucket := snapshot.Bucket(util.BoltMetaDataKey) + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) if metaBucket == nil { _ = rv.DecRef() return nil, fmt.Errorf("meta-data bucket missing") } - segmentType := string(metaBucket.Get(util.BoltMetaDataSegmentTypeKey)) - segmentVersion := binary.BigEndian.Uint32( - metaBucket.Get(util.BoltMetaDataSegmentVersionKey)) - err := s.loadSegmentPlugin(segmentType, segmentVersion) + segmentType, err := metaBucket.Get(util.BoltMetaDataSegmentTypeKey, nil) + if err != nil { + _ = rv.DecRef() + return nil, fmt.Errorf("segment type missing: %v", err) + } + segmentVersionBytes, err := metaBucket.Get(util.BoltMetaDataSegmentVersionKey, nil) + if err != nil { + _ = rv.DecRef() + return nil, fmt.Errorf("segment version missing: %v", err) + } + segmentVersion := binary.BigEndian.Uint32(segmentVersionBytes) + err = s.loadSegmentPlugin(string(segmentType), segmentVersion) if err != nil { _ = rv.DecRef() return nil, fmt.Errorf( "unable to load correct segment wrapper: %v", err) } + fileWriterID, err := metaBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + _ = rv.DecRef() + return nil, fmt.Errorf("file writer id missing: %v", err) + } + reader, err := util.NewFileReader( + string(fileWriterID), []byte(s.path+string(os.PathSeparator)+"root.bolt")) + if err != nil { + _ = rv.DecRef() + return nil, fmt.Errorf("unable to load correct reader: %v", err) + } + var running uint64 c := snapshot.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { if k[0] == util.BoltInternalKey[0] { - internalBucket := snapshot.Bucket(k) + internalBucket := snapshot.GetBucket(k) if internalBucket == nil { _ 
= rv.DecRef() return nil, fmt.Errorf("internal bucket missing") } err := internalBucket.ForEach(func(key []byte, val []byte) error { - copiedVal := append([]byte(nil), val...) - rv.internal[string(key)] = copiedVal + rv.internal[string(key)] = val return nil - }) + }, reader) if err != nil { _ = rv.DecRef() return nil, err } } else if k[0] != util.BoltMetaDataKey[0] { - segmentBucket := snapshot.Bucket(k) + segmentBucket := snapshot.GetBucket(k) if segmentBucket == nil { _ = rv.DecRef() return nil, fmt.Errorf("segment key, but bucket missing %x", k) } - segmentSnapshot, err := s.loadSegment(segmentBucket) + segmentSnapshot, err := s.loadSegment(segmentBucket, reader) if err != nil { _ = rv.DecRef() return nil, fmt.Errorf("failed to load segment: %v", err) @@ -1010,13 +1097,14 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { return rv, nil } -func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, error) { - pathBytes := segmentBucket.Get(util.BoltPathKey) +func (s *Scorch) loadSegment(segmentBucket *util.BoltBucketImpl, reader util.FileReader) ( + *SegmentSnapshot, error) { + pathBytes, err := segmentBucket.Get(util.BoltPathKey, nil) if pathBytes == nil { return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - seg, err := s.segPlugin.Open(segmentPath) + seg, err := s.segPlugin.OpenUsing(segmentPath, s.segmentConfig) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } @@ -1026,7 +1114,11 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro cachedDocs: &cachedDocs{cache: nil}, cachedMeta: &cachedMeta{meta: nil}, } - deletedBytes := segmentBucket.Get(util.BoltDeletedKey) + deletedBytes, err := segmentBucket.Get(util.BoltDeletedKey, reader) + if err != nil { + _ = seg.Close() + return nil, fmt.Errorf("error getting deleted bytes: %v", err) + } if deletedBytes != nil { deletedBitmap := 
roaring.NewBitmap() r := bytes.NewReader(deletedBytes) @@ -1039,23 +1131,28 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro rv.deleted = deletedBitmap } } - statBytes := segmentBucket.Get(util.BoltStatsKey) + statBytes, err := segmentBucket.Get(util.BoltStatsKey, reader) + if err != nil { + _ = seg.Close() + return nil, fmt.Errorf("error getting stat bytes: %v", err) + } if statBytes != nil { var statsMap map[string]map[string]uint64 - err := json.Unmarshal(statBytes, &statsMap) - stats := &fieldStats{statMap: statsMap} if err != nil { _ = seg.Close() return nil, fmt.Errorf("error reading stat bytes: %v", err) } - rv.stats = stats + rv.stats = &fieldStats{statMap: statsMap} + } + updatedFieldBytes, err := segmentBucket.Get(util.BoltUpdatedFieldsKey, reader) + if err != nil { + _ = seg.Close() + return nil, fmt.Errorf("error getting updated field bytes: %v", err) } - updatedFieldBytes := segmentBucket.Get(util.BoltUpdatedFieldsKey) if updatedFieldBytes != nil { var updatedFields map[string]*index.UpdateFieldInfo - - err := json.Unmarshal(updatedFieldBytes, &updatedFields) + err = json.Unmarshal(updatedFieldBytes, &updatedFields) if err != nil { _ = seg.Close() return nil, fmt.Errorf("error reading updated field bytes: %v", err) @@ -1068,6 +1165,152 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } +// identify all the file callback writer ids that are in use by boltdb +func (s *Scorch) boltFileWriterIDsInUse() (map[string]struct{}, error) { + idMap := make(map[string]struct{}) + err := s.rootBolt.View(func(tx *util.BoltTxImpl) error { + snapshots := tx.Bucket(util.BoltSnapshotsBucket) + if snapshots == nil { + return nil + } + c := snapshots.Cursor() + for k, _ := c.First(); k != nil; k, _ = c.Next() { + snapshot := snapshots.GetBucket(k) + if snapshot == nil { + continue + } + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) + if metaBucket == nil { + continue + } + id, err 
:= metaBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return err + } + idMap[string(id)] = struct{}{} + } + return nil + }) + if err != nil { + return nil, err + } + + return idMap, nil +} + +// remove all content in boltdb associated with the file callback +// writer ids and process the data using the latest file writer +func (s *Scorch) removeBoltFileWriterIDs(ids map[string]struct{}) error { + filePath := s.path + string(os.PathSeparator) + "root.bolt" + writer, err := util.NewFileWriter([]byte(filePath)) + if err != nil { + return err + } + + err = s.rootBolt.Update(func(tx *util.BoltTxImpl) error { + snapshots := tx.Bucket(util.BoltSnapshotsBucket) + if snapshots == nil { + return nil + } + c := snapshots.Cursor() + for k, _ := c.First(); k != nil; k, _ = c.Next() { + snapshot := snapshots.GetBucket(k) + if snapshot == nil { + continue + } + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) + if metaBucket == nil { + continue + } + fileWriterIDBytes, err := metaBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return err + } + fileWriterID := string(fileWriterIDBytes) + if _, ok := ids[fileWriterID]; ok { + reader, err := util.NewFileReader(fileWriterID, []byte(filePath)) + if err != nil { + return fmt.Errorf("unable to load correct reader: %v", err) + } + + cc := snapshot.Cursor() + for kk, _ := cc.First(); kk != nil; kk, _ = cc.Next() { + if kk[0] == util.BoltInternalKey[0] { + internalBucket := snapshot.GetBucket(kk) + if internalBucket == nil { + continue + } + // process all of the internal values and replace them with new values + internalBucketVals := make(map[string][]byte) + err := internalBucket.ForEach(func(key []byte, val []byte) error { + internalBucketVals[string(key)] = val + return nil + }, reader) + if err != nil { + return err + } + for key, val := range internalBucketVals { + err = internalBucket.Put([]byte(key), val, writer) + if err != nil { + return err + } + } + } else if kk[0] != 
util.BoltMetaDataKey[0] { + segmentBucket := snapshot.GetBucket(kk) + if segmentBucket == nil { + continue + } + // process the updated field key + updatedFieldBytes, err := segmentBucket.Get(util.BoltUpdatedFieldsKey, reader) + if err != nil { + return fmt.Errorf("error getting updated field bytes: %v", err) + } + if updatedFieldBytes != nil { + err = segmentBucket.Put(util.BoltUpdatedFieldsKey, updatedFieldBytes, writer) + if err != nil { + return err + } + } + + // process the deleted key + deletedBytes, err := segmentBucket.Get(util.BoltDeletedKey, reader) + if err != nil { + return fmt.Errorf("error getting deleted bytes: %v", err) + } + if deletedBytes != nil { + err = segmentBucket.Put(util.BoltDeletedKey, deletedBytes, writer) + if err != nil { + return err + } + } + // process the stats key + statsBytes, err := segmentBucket.Get(util.BoltStatsKey, reader) + if err != nil { + return fmt.Errorf("error getting stats bytes: %v", err) + } + if statsBytes != nil { + err = segmentBucket.Put(util.BoltStatsKey, statsBytes, writer) + if err != nil { + return err + } + } + } + } + err = metaBucket.Put(util.BoltMetaDataFileWriterIDKey, + []byte(writer.Id()), writer) + if err != nil { + return err + } + } + } + return nil + }) + if err != nil { + return err + } + return nil +} + func (s *Scorch) removeOldData() { removed, err := s.removeOldBoltSnapshots() if err != nil { @@ -1359,7 +1602,7 @@ func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) { // for eg for n = 3 the checkpoints preserved should be tc, tc - d, tc - 2d expirationDuration := time.Duration(s.numSnapshotsToKeep-1) * s.rollbackSamplingInterval - err := s.rootBolt.View(func(tx *bolt.Tx) error { + err := s.rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil @@ -1380,15 +1623,18 @@ func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) { continue } - snapshot := snapshots.Bucket(sk) + snapshot 
:= snapshots.GetBucket(sk) if snapshot == nil { continue } - metaBucket := snapshot.Bucket(util.BoltMetaDataKey) + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) if metaBucket == nil { continue } - timeStampBytes := metaBucket.Get(util.BoltMetaDataTimeStamp) + timeStampBytes, err := metaBucket.Get(util.BoltMetaDataTimeStamp, nil) + if err != nil { + continue + } var timeStamp time.Time err = timeStamp.UnmarshalText(timeStampBytes) if err != nil { @@ -1424,7 +1670,7 @@ func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) { func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) { var rv []uint64 - err := s.rootBolt.View(func(tx *bolt.Tx) error { + err := s.rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil @@ -1445,14 +1691,14 @@ func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) { // Returns the *.zap file names that are listed in the rootBolt. func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { rv := map[string]struct{}{} - err := s.rootBolt.View(func(tx *bolt.Tx) error { + err := s.rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil } sc := snapshots.Cursor() for sk, _ := sc.First(); sk != nil; sk, _ = sc.Next() { - snapshot := snapshots.Bucket(sk) + snapshot := snapshots.GetBucket(sk) if snapshot == nil { continue } @@ -1461,11 +1707,14 @@ func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { if segk[0] == util.BoltInternalKey[0] { continue } - segmentBucket := snapshot.Bucket(segk) + segmentBucket := snapshot.GetBucket(segk) if segmentBucket == nil { continue } - pathBytes := segmentBucket.Get(util.BoltPathKey) + pathBytes, err := segmentBucket.Get(util.BoltPathKey, nil) + if err != nil { + continue + } if pathBytes == nil { continue } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go 
b/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go index f047762fac..008b364082 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/rollback.go @@ -44,7 +44,7 @@ func RollbackPoints(path string) ([]*RollbackPoint, error) { rootBoltOpt := &bolt.Options{ ReadOnly: true, } - rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt) + rootBolt, err := util.OpenBolt(rootBoltPath, 0600, rootBoltOpt) if err != nil || rootBolt == nil { return nil, err } @@ -78,27 +78,40 @@ func RollbackPoints(path string) ([]*RollbackPoint, error) { continue } - snapshot := snapshots.Bucket(k) + snapshot := snapshots.GetBucket(k) if snapshot == nil { log.Printf("RollbackPoints:"+ " snapshot key, but bucket missing %x, continuing", k) continue } + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) + if metaBucket == nil { + return nil, fmt.Errorf("meta-data bucket missing") + } + + fileWriterID, err := metaBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return nil, fmt.Errorf("unable to load file writer ID: %v", err) + } + reader, err := util.NewFileReader(string(fileWriterID), []byte(rootBoltPath)) + if err != nil { + return nil, fmt.Errorf("unable to load correct reader: %v", err) + } + meta := map[string][]byte{} c2 := snapshot.Cursor() for j, _ := c2.First(); j != nil; j, _ = c2.Next() { if j[0] == util.BoltInternalKey[0] { - internalBucket := snapshot.Bucket(j) + internalBucket := snapshot.GetBucket(j) if internalBucket == nil { err = fmt.Errorf("internal bucket missing") break } err = internalBucket.ForEach(func(key []byte, val []byte) error { - copiedVal := append([]byte(nil), val...) 
- meta[string(key)] = copiedVal + meta[string(key)] = val return nil - }) + }, reader) if err != nil { break } @@ -136,7 +149,7 @@ func Rollback(path string, to *RollbackPoint) error { rootBoltOpt := &bolt.Options{ ReadOnly: false, } - rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt) + rootBolt, err := util.OpenBolt(rootBoltPath, 0600, rootBoltOpt) if err != nil || rootBolt == nil { return err } @@ -151,7 +164,7 @@ func Rollback(path string, to *RollbackPoint) error { // including the target one. var found bool var eligibleEpochs []uint64 - err = rootBolt.View(func(tx *bolt.Tx) error { + err = rootBolt.View(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go index 287d8e07fd..4f35483d88 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/scorch.go @@ -15,15 +15,19 @@ package scorch import ( + "context" "encoding/json" "fmt" + "io" "os" "path/filepath" + "strconv" "sync" "sync/atomic" "time" "github.com/RoaringBitmap/roaring/v2" + "github.com/blevesearch/bleve/v2/index/scorch/mergeplan" "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" @@ -45,6 +49,7 @@ type Scorch struct { readOnly bool version uint8 config map[string]interface{} + segmentConfig map[string]interface{} analysisQueue *index.AnalysisQueue path string @@ -75,9 +80,11 @@ type Scorch struct { merges chan *segmentMerge introducerNotifier chan *epochWatcher persisterNotifier chan *epochWatcher - rootBolt *bolt.DB + rootBolt *util.RootBoltImpl asyncTasks sync.WaitGroup + trainer trainer + onEvent func(event Event) bool onAsyncError func(err error, path string) @@ -88,6 +95,33 @@ type Scorch struct { spatialPlugin index.SpatialAnalyzerPlugin 
} +// trainer interface is used for training an index that has the concept +// of "learning". Naturally, a vector index is one such thing that would +// implement this interface. There can be multiple implementations of the +// training itself even for the same index type. +// +// this component is not supposed to interact with the other master routines +// of scorch and will be used only for training the index before the actual data +// ingestion starts. The routine should also be released once the +// training is marked as complete - which can be done using the BoltTrainCompleteKey +// key and a bool value. However the struct is still maintained for the pointer to +// the instance so that we can use in the later stages of the index lifecycle. +type trainer interface { + // ephemeral + trainLoop() + // for the training state and the ingestion of the samples + train(batch *index.Batch) error + + // to load the metadata from the bolt under the BoltTrainerKey + loadTrainedData(*util.BoltBucketImpl) error + // to fetch the internal data from the component + getInternal(key []byte) ([]byte, error) + + // trainer specific file transfer operations + copyFileLOCKED(file string, d index.IndexDirectory) error + updateBolt(snapshotsBucket *util.BoltBucketImpl, key []byte, value []byte) error +} + type ScorchErrorType string func (t ScorchErrorType) Error() string { @@ -154,6 +188,7 @@ func NewScorch(storeName string, forceMergeRequestCh: make(chan *mergerCtrl, 1), segPlugin: defaultSegmentPlugin, copyScheduled: map[string]int{}, + segmentConfig: make(map[string]interface{}), } forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config) @@ -168,6 +203,11 @@ func NewScorch(storeName string, } } + segConfig, ok := config["segmentConfig"].(map[string]interface{}) + if ok { + rv.segmentConfig = segConfig + } + typ, ok := config["spatialPlugin"].(string) if ok { if err := rv.loadSpatialAnalyzerPlugin(typ); err != nil { @@ -205,6 +245,10 @@ func 
NewScorch(storeName string, return nil, err } + if trainer := initTrainer(rv, config); trainer != nil { + rv.trainer = trainer + } + return rv, nil } @@ -259,6 +303,11 @@ func (s *Scorch) Open() error { s.asyncTasks.Add(1) go s.introducerLoop() + if s.trainer != nil { + s.asyncTasks.Add(1) + go s.trainer.trainLoop() + } + if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) go s.persisterLoop() @@ -312,7 +361,7 @@ func (s *Scorch) openBolt() error { rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt" var err error if s.path != "" { - s.rootBolt, err = bolt.Open(rootBoltPath, 0o600, &rootBoltOpt) + s.rootBolt, err = util.OpenBolt(rootBoltPath, 0o600, &rootBoltOpt) if err != nil { return err } @@ -424,6 +473,24 @@ func (s *Scorch) Delete(id string) error { return s.Batch(b) } +func (s *Scorch) isTrained(batch *index.Batch) (bool, error) { + trained := true + if len(batch.IndexOps) > 0 && s.trainer != nil { + val, err := s.getInternal(util.BoltTrainCompleteKey) + if err != nil { + return false, err + } + + if val != nil { + trained, err = strconv.ParseBool(string(val)) + if err != nil { + return false, err + } + } + } + return trained, nil +} + // Batch applices a batch of changes to the index atomically func (s *Scorch) Batch(batch *index.Batch) (err error) { start := time.Now() @@ -434,6 +501,15 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { s.fireEvent(EventKindBatchIntroduction, time.Since(start)) }() + trained, err := s.isTrained(batch) + if err != nil { + return err + } + + if !trained { + return fmt.Errorf("index is not trained yet") + } + resultChan := make(chan index.Document, len(batch.IndexOps)) var numUpdates uint64 @@ -497,7 +573,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { stats := newFieldStats() if len(analysisResults) > 0 { - newSegment, bufBytes, err = s.segPlugin.New(analysisResults) + newSegment, bufBytes, err = s.segPlugin.NewUsing(analysisResults, s.segmentConfig) if err != nil { return err } @@ -532,6 
+608,29 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { return err } +func (s *Scorch) getInternal(key []byte) ([]byte, error) { + s.rootLock.RLock() + defer s.rootLock.RUnlock() + + switch string(key) { + case string(util.BoltTrainCompleteKey): + if s.trainer != nil { + return s.trainer.getInternal(key) + } else { + return nil, fmt.Errorf("get on BoltTrainCompleteKey is not supported" + + " with this build") + } + } + return nil, nil +} + +func (s *Scorch) Train(batch *index.Batch) error { + if s.trainer != nil { + return s.trainer.train(batch) + } + return fmt.Errorf("training is not supported with this build") +} + func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, internalOps map[string][]byte, persistedCallback index.BatchCallback, stats *fieldStats, ) error { @@ -741,6 +840,20 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["field:"+fieldName+":"+statName] = val } } + + aggVectorStats := newFieldStats() + for _, segmentSnapshot := range indexSnapshot.Segments() { + if vsr, ok := segmentSnapshot.Segment().(segment.VectorFieldStatsReporter); ok { + segStats := newFieldStats() + vsr.UpdateVectorFieldStats(segStats) + aggVectorStats.Aggregate(segStats) + } + } + for statName, stats := range aggVectorStats.Fetch() { + for fieldName, val := range stats { + m["field:"+fieldName+":"+statName] = val + } + } return m } @@ -799,6 +912,12 @@ func analyze(d index.Document, fn customAnalyzerPluginInitFunc) { } } }) + if nd, ok := d.(index.NestedDocument); ok { + nd.VisitNestedDocuments(func(doc index.Document) { + doc.AddIDField() + analyze(doc, fn) + }) + } } func (s *Scorch) AddEligibleForRemoval(epoch uint64) { @@ -971,6 +1090,65 @@ func (s *Scorch) CopyReader() index.CopyReader { return rv } +func (s *Scorch) SetPathInBolt(key []byte, value []byte) error { + tx, err := s.rootBolt.Begin(true) + if err != nil { + return err + } + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() + + snapshotsBucket, err := 
tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) + if err != nil { + return err + } + + // currently this is specific to trained index file update + err = s.trainer.updateBolt(snapshotsBucket, key, value) + if err != nil { + return err + } + + err = tx.Commit() + if err != nil { + return err + } + + return s.rootBolt.Sync() +} + +// CopyFile copies a specific file to a destination directory which has an access to a bleve index +// doing a io.Copy() isn't enough because the file needs to be tracked in bolt file as well +func (s *Scorch) CopyFile(file string, d index.IndexDirectory) error { + s.rootLock.Lock() + defer s.rootLock.Unlock() + + dest, err := d.GetWriter(filepath.Join("store", file)) + if err != nil { + return err + } + + source, err := os.Open(filepath.Join(s.path, file)) + if err != nil { + return err + } + + defer source.Close() + defer dest.Close() + _, err = io.Copy(dest, source) + if err != nil { + return err + } + + // this code is currently specific to copying trained data but is future proofed for other files + // to be updated in the dest's bolt + err = s.trainer.copyFileLOCKED(file, d) + return err +} + // external API to fire a scorch event (EventKindIndexStart) externally from bleve func (s *Scorch) FireIndexEvent() { s.fireEvent(EventKindIndexStart, 0) @@ -1002,7 +1180,8 @@ func (s *Scorch) OpenMeta() error { // Merge and update deleted field info and rewrite index mapping func (s *Scorch) updateBolt(fieldInfo map[string]*index.UpdateFieldInfo, mappingBytes []byte) error { - return s.rootBolt.Update(func(tx *bolt.Tx) error { + filePath := s.path + string(os.PathSeparator) + "root.bolt" + return s.rootBolt.Update(func(tx *util.BoltTxImpl) error { snapshots := tx.Bucket(util.BoltSnapshotsBucket) if snapshots == nil { return nil @@ -1015,27 +1194,69 @@ func (s *Scorch) updateBolt(fieldInfo map[string]*index.UpdateFieldInfo, mapping fmt.Printf("unable to parse segment epoch %x, continuing", k) continue } - snapshot := snapshots.Bucket(k) + 
snapshot := snapshots.GetBucket(k) + metaBucket := snapshot.GetBucket(util.BoltMetaDataKey) + if metaBucket == nil { + return fmt.Errorf("meta-data bucket missing") + } + + writer, err := util.NewFileWriter([]byte(filePath)) + if err != nil { + return fmt.Errorf("unable to load correct writer: %v", err) + } + + fileWriterID, err := metaBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return fmt.Errorf("unable to get file writer id: %v", err) + } + if fileWriterID == nil { + return fmt.Errorf("file writer id missing in meta data") + } + reader, err := util.NewFileReader(string(fileWriterID), []byte(filePath)) + if err != nil { + return fmt.Errorf("unable to load correct reader: %v", err) + } + + err = metaBucket.Put(util.BoltMetaDataFileWriterIDKey, []byte(writer.Id()), writer) + if err != nil { + return err + } + cc := snapshot.Cursor() for kk, _ := cc.First(); kk != nil; kk, _ = cc.Next() { if kk[0] == util.BoltInternalKey[0] { - internalBucket := snapshot.Bucket(kk) + internalBucket := snapshot.GetBucket(kk) if internalBucket == nil { return fmt.Errorf("segment key, but bucket missing %x", kk) } - err = internalBucket.Put(util.MappingInternalKey, mappingBytes) + + internalVals := make(map[string][]byte) + err := internalBucket.ForEach(func(key []byte, val []byte) error { + internalVals[string(key)] = val + return nil + }, reader) if err != nil { return err } + + for key, val := range internalVals { + err = internalBucket.Put([]byte(key), val, writer) + if err != nil { + return err + } + } } else if kk[0] != util.BoltMetaDataKey[0] { - segmentBucket := snapshot.Bucket(kk) + segmentBucket := snapshot.GetBucket(kk) if segmentBucket == nil { return fmt.Errorf("segment key, but bucket missing %x", kk) } var updatedFields map[string]*index.UpdateFieldInfo - updatedFieldBytes := segmentBucket.Get(util.BoltUpdatedFieldsKey) + updatedFieldBytes, err := segmentBucket.Get(util.BoltUpdatedFieldsKey, reader) + if err != nil { + return fmt.Errorf("error 
getting updated field bytes: %v", err) + } if updatedFieldBytes != nil { - err := json.Unmarshal(updatedFieldBytes, &updatedFields) + err = json.Unmarshal(updatedFieldBytes, &updatedFields) if err != nil { return fmt.Errorf("error reading updated field bytes: %v", err) } @@ -1054,17 +1275,218 @@ func (s *Scorch) updateBolt(fieldInfo map[string]*index.UpdateFieldInfo, mapping } else { updatedFields = fieldInfo } - b, err := json.Marshal(updatedFields) + buf, err := json.Marshal(updatedFields) if err != nil { return err } - err = segmentBucket.Put(util.BoltUpdatedFieldsKey, b) + err = segmentBucket.Put(util.BoltUpdatedFieldsKey, buf, writer) if err != nil { return err } + + deletedBytes, err := segmentBucket.Get(util.BoltDeletedKey, reader) + if err != nil { + return fmt.Errorf("error getting deleted bytes: %v", err) + } + if deletedBytes != nil { + err = segmentBucket.Put(util.BoltDeletedKey, deletedBytes, writer) + if err != nil { + return err + } + } + + statBytes, err := segmentBucket.Get(util.BoltStatsKey, reader) + if err != nil { + return fmt.Errorf("error getting stats bytes: %v", err) + } + if statBytes != nil { + err = segmentBucket.Put(util.BoltStatsKey, statBytes, writer) + if err != nil { + return err + } + } } } } return nil }) } + +// returns the set of file callback writer ids in use by all of the segments and boltdb +func (s *Scorch) FileWriterIDsInUse() (map[string]struct{}, error) { + s.rootLock.RLock() + keyMap := make(map[string]struct{}) + for _, segmentSnapShot := range s.root.segment { + if seg, ok := segmentSnapShot.segment.(segment.SegmentWithCallbacks); ok { + keyMap[seg.CallbackId()] = struct{}{} + } + } + + boltKeys, err := s.boltFileWriterIDsInUse() + if err != nil { + return nil, err + } + + for k, _ := range boltKeys { + keyMap[k] = struct{}{} + } + s.rootLock.RUnlock() + + return keyMap, nil +} + +// removes all file callback writer ids in use from all of the segments and boltdb +// boltdb is updated with the latest callback writer 
while segments are force +// merged blockingly until snapshot is persisted with the latest callback writer +func (s *Scorch) DropFileWriterIDs(ids map[string]struct{}) error { + err := s.removeBoltFileWriterIDs(ids) + if err != nil { + return err + } + + s.rootLock.Lock() + // create a done channel to ensure success of merge + ctx := context.Background() + doneCh := make(chan error) + ctx = context.WithValue(ctx, mergeDoneKey, doneCh) + + // PARTIAL ROLLBACK WILL NOT BE SUPPORTED DURING THIS OPERATION + // this is done because all of the rollback snapshots + // are likely to have the same sequence numbers and + // morever, it is not functionally correct to hold + // data with writer ids that have been removed + prevNumSnapshotsToKeep := s.numSnapshotsToKeep + s.numSnapshotsToKeep = 1 + + // track the zapx files that are expected to be removed after + // the merge so that we can block until they are removed by the persister + filePaths := make([]string, 0) + + var mergePlanner mergePlanFunc = func(ourSnapshot *IndexSnapshot) (*mergeplan.MergePlan, error) { + // Create a merge plan with the filtered segments and force a merge + // to remove the callback from the segments. 
+ mergePlannerOptions, err := s.parseMergePlannerOptions() + if err != nil { + return nil, fmt.Errorf("mergePlannerOption json parsing err: %v", err) + } + atomic.AddUint64(&s.stats.TotFileMergePlan, 1) + + // filter all segments that have callbacks that need to be removed + // and add them to the list of segments to compact + segsToCompact := make([]mergeplan.Segment, 0) + for _, ss := range ourSnapshot.segment { + // only persisted segments needs to be checked + if _, ok := ss.segment.(segment.PersistedSegment); ok { + if segWithCallbacks, ok := ss.segment.(segment.SegmentWithCallbacks); ok { + if _, ok := ids[segWithCallbacks.CallbackId()]; ok { + segsToCompact = append(segsToCompact, ss) + filePaths = append(filePaths, zapFileName(ss.id)) + } + } + } + } + + // attempt a merge plan with the default merge planner options + mergePlan, err := mergeplan.Plan(segsToCompact, mergePlannerOptions) + if err != nil { + atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) + return nil, fmt.Errorf("merge plan creation err: %v", err) + } + + // create a map to track segments included in the default merge plan + segDictionary := make(map[uint64]bool) + for _, seg := range segsToCompact { + segDictionary[seg.Id()] = true + } + + // create a merge plan if the default merge planner is unable + // to create one with the given segments + if mergePlan == nil { + mergePlan = &mergeplan.MergePlan{ + Tasks: make([]*mergeplan.MergeTask, 0), + } + } + + // mark all segments included in the default merge plan + for _, task := range mergePlan.Tasks { + for _, seg := range task.Segments { + segDictionary[seg.Id()] = false + } + } + + // Create additional merge tasks for segments that are unable to be merged + for _, seg := range segsToCompact { + if segDictionary[seg.Id()] { + mergePlan.Tasks = append(mergePlan.Tasks, &mergeplan.MergeTask{ + Segments: []mergeplan.Segment{seg}, + }) + } + } + + return mergePlan, nil + } + + // set the merge plan func in the context for the merger to use when 
it receives the merge request + // this is to ensure that the merge request is triggered with the latest snapshot, thus avoiding + // any races + ctx = context.WithValue(ctx, mergePlanFuncKey, mergePlanner) + + // trigger the merge with the force merge plan + s.forceMergeRequestCh <- &mergerCtrl{ + ctx: ctx, + } + s.rootLock.Unlock() + + // blockingly wait for merge to complete + err = <-doneCh + close(doneCh) + if err != nil { + return err + } + + // wait for files to be cleaned up by persister + err = s.waitTillFileCleanup(filePaths) + if err != nil { + return err + } + + // reset rollback snapshot retention + s.rootLock.Lock() + s.numSnapshotsToKeep = prevNumSnapshotsToKeep + s.rootLock.Unlock() + + return nil +} + +// waitTillFileCleanup blocks until the given files are cleaned up by the persister or merger or +// returns an error after a timeout. It does so by checking the index directory every 5 seconds +// for the presence of the given files, and returns once they are no longer present. 
+func (s *Scorch) waitTillFileCleanup(filePaths []string) error { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + timeout := time.After(5 * time.Minute) + + for { + select { + case <-ticker.C: + files, err := os.ReadDir(s.path) + if err != nil { + return err + } + for _, f := range files { + fname := f.Name() + if filepath.Ext(fname) == ".zap" { + for _, filePath := range filePaths { + if fname == filePath { + continue + } + } + } + } + return nil + case <-timeout: + return fmt.Errorf("timeout waiting for file cleanup for files: %v", filePaths) + } + } +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go index 790a8008a3..16be8e440d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/segment_plugin.go @@ -28,6 +28,7 @@ import ( zapv14 "github.com/blevesearch/zapx/v14" zapv15 "github.com/blevesearch/zapx/v15" zapv16 "github.com/blevesearch/zapx/v16" + zapv17 "github.com/blevesearch/zapx/v17" ) // SegmentPlugin represents the essential functions required by a package to plug in @@ -45,10 +46,14 @@ type SegmentPlugin interface { // New takes a set of Documents and turns them into a new Segment New(results []index.Document) (segment.Segment, uint64, error) + NewUsing(results []index.Document, config map[string]interface{}) (segment.Segment, uint64, error) + // Open attempts to open the file at the specified path and // return the corresponding Segment Open(path string) (segment.Segment, error) + OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) + // Merge takes a set of Segments, and creates a new segment on disk at // the specified path. 
// Drops is a set of bitmaps (one for each segment) indicating which @@ -66,6 +71,10 @@ type SegmentPlugin interface { Merge(segments []segment.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s segment.StatsReporter) ( [][]uint64, uint64, error) + + MergeUsing(segments []segment.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s segment.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) } var supportedSegmentPlugins map[string]map[uint32]SegmentPlugin @@ -73,7 +82,8 @@ var defaultSegmentPlugin SegmentPlugin func init() { ResetSegmentPlugins() - RegisterSegmentPlugin(&zapv16.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv17.ZapPlugin{}, true) + RegisterSegmentPlugin(&zapv16.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv15.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv14.ZapPlugin{}, false) RegisterSegmentPlugin(&zapv13.ZapPlugin{}, false) diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go index 3f2a330c5c..4e9aff009b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index.go @@ -17,7 +17,6 @@ package scorch import ( "container/heap" "context" - "encoding/binary" "fmt" "os" "path/filepath" @@ -28,11 +27,11 @@ import ( "github.com/RoaringBitmap/roaring/v2" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" segment "github.com/blevesearch/scorch_segment_api/v2" "github.com/blevesearch/vellum" lev "github.com/blevesearch/vellum/levenshtein" - bolt "go.etcd.io/bbolt" ) // re usable, threadsafe levenshtein builders @@ -42,9 +41,8 @@ type asynchSegmentResult struct { dict segment.TermDictionary dictItr segment.DictionaryIterator - cardinality int - index int - docs *roaring.Bitmap + index int + docs *roaring.Bitmap 
thesItr segment.ThesaurusIterator @@ -59,11 +57,11 @@ func init() { var err error lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed1 builder err: %v", err)) } lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true) if err != nil { - panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + panic(fmt.Errorf("levenshtein automaton ed2 builder err: %v", err)) } } @@ -91,6 +89,8 @@ type IndexSnapshot struct { // UpdateFieldInfo.Index or .Store or .DocValues). // Used to short circuit queries trying to read stale data updatedFields map[string]*index.UpdateFieldInfo + + fileWriterID string // the file callback writer id associated with this snapshot } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -468,13 +468,17 @@ func (is *IndexSnapshot) Fields() ([]string, error) { } func (is *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { + _, ok := is.internal[string(key)] + if !ok { + return is.parent.getInternal(key) + } return is.internal[string(key)], nil } func (is *IndexSnapshot) DocCount() (uint64, error) { var rv uint64 for _, segment := range is.segment { - rv += segment.Count() + rv += segment.CountRoot() } return rv, nil } @@ -501,7 +505,7 @@ func (is *IndexSnapshot) Document(id string) (rv index.Document, err error) { return nil, nil } - docNum, err := docInternalToNumber(next.ID) + docNum, err := next.ID.Value() if err != nil { return nil, err } @@ -571,7 +575,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in } func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return "", err } @@ -589,7 +593,7 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) 
(int, uint64, error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return 0, 0, err } @@ -700,6 +704,8 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field rv.incrementBytesRead(bytesRead - prevBytesReadItr) } } + // ONLY update the bytes read value beyond this point for this TFR if scoring is enabled + rv.updateBytesRead = rv.includeFreq || rv.includeNorm || rv.includeTermVectors atomic.AddUint64(&is.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil } @@ -776,25 +782,6 @@ func (is *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReade is.m2.Unlock() } -func docNumberToBytes(buf []byte, in uint64) []byte { - if len(buf) != 8 { - if cap(buf) >= 8 { - buf = buf[0:8] - } else { - buf = make([]byte, 8) - } - } - binary.BigEndian.PutUint64(buf, in) - return buf -} - -func docInternalToNumber(in index.IndexInternalID) (uint64, error) { - if len(in) != 8 { - return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) - } - return binary.BigEndian.Uint64(in), nil -} - func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( segmentIndex int, localDocNum uint64, fields []string, cFields []string, visitor index.DocValueVisitor, dvs segment.DocVisitState) ( @@ -826,8 +813,10 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( return filteredFields } - fieldsFiltered := filterUpdatedFields(fields) - vFieldsFiltered := filterUpdatedFields(vFields) + if len(is.updatedFields) > 0 { + fields = filterUpdatedFields(fields) + vFields = filterUpdatedFields(vFields) + } var errCh chan error @@ -836,9 +825,9 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( // if the caller happens to know we're on the same segmentIndex // from a previous invocation if cFields == nil { - cFields = subtractStrings(fieldsFiltered, vFieldsFiltered) + cFields = subtractStrings(fields, vFields) - if !ss.cachedDocs.hasFields(cFields) { + if len(cFields) > 0 && 
!ss.cachedDocs.hasFields(cFields) { errCh = make(chan error, 1) go func() { @@ -851,8 +840,8 @@ func (is *IndexSnapshot) documentVisitFieldTermsOnSegment( } } - if ssvOk && ssv != nil && len(vFieldsFiltered) > 0 { - dvs, err = ssv.VisitDocValues(localDocNum, fieldsFiltered, visitor, dvs) + if ssvOk && ssv != nil && len(vFields) > 0 { + dvs, err = ssv.VisitDocValues(localDocNum, fields, visitor, dvs) if err != nil { return nil, nil, err } @@ -897,7 +886,7 @@ func (dvr *DocValueReader) BytesRead() uint64 { func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, visitor index.DocValueVisitor, ) (err error) { - docNum, err := docInternalToNumber(id) + docNum, err := id.Value() if err != nil { return err } @@ -980,17 +969,15 @@ func subtractStrings(a, b []string) []string { return a } - // Create a map for O(1) lookups - bMap := make(map[string]struct{}, len(b)) - for _, bs := range b { - bMap[bs] = struct{}{} - } - rv := make([]string, 0, len(a)) +OUTER: for _, as := range a { - if _, exists := bMap[as]; !exists { - rv = append(rv, as) + for _, bs := range b { + if as == bs { + continue OUTER + } } + rv = append(rv, as) } return rv } @@ -1006,7 +993,7 @@ func (is *IndexSnapshot) CopyTo(d index.Directory) error { return fmt.Errorf("invalid root.bolt file found") } - copyBolt, err := bolt.Open(rootFile.Name(), 0o600, nil) + copyBolt, err := util.OpenBolt(rootFile.Name(), 0o600, nil) if err != nil { return err } @@ -1297,3 +1284,23 @@ func (is *IndexSnapshot) TermFrequencies(field string, limit int, descending boo return termFreqs[:limit], nil } + +// Ancestors returns the ancestor IDs for the given document ID. The prealloc +// slice can be provided to avoid allocations downstream, and MUST be empty. 
+func (i *IndexSnapshot) Ancestors(ID index.IndexInternalID, prealloc []index.AncestorID) ([]index.AncestorID, error) { + // get segment and local doc num for the ID + seg, ldoc, err := i.segmentIndexAndLocalDocNum(ID) + if err != nil { + return nil, err + } + // get ancestors from the segment + prealloc = i.segment[seg].Ancestors(ldoc, prealloc) + // get global offset for the segment (correcting factor for multi-segment indexes) + globalOffset := i.offsets[seg] + // adjust ancestors to global doc numbers, not local to segment + for idx := range prealloc { + prealloc[idx] = prealloc[idx].Add(globalOffset) + } + // return adjusted ancestors + return prealloc, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go index 0a979bfb5f..4048a199b8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_doc.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "reflect" "github.com/RoaringBitmap/roaring/v2" @@ -49,7 +48,7 @@ func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { next := i.iterators[i.segmentOffset].Next() // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] - return docNumberToBytes(nil, uint64(next)+globalOffset), nil + return index.NewIndexInternalID(nil, uint64(next)+globalOffset), nil } return nil, nil } @@ -63,7 +62,7 @@ func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.Inde if next == nil { return nil, nil } - for bytes.Compare(next, ID) < 0 { + for next.Compare(ID) < 0 { next, err = i.Next() if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go index cd4d82dce2..f31f213e71 100644 --- 
a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_tfr.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "context" "fmt" "reflect" @@ -51,6 +50,10 @@ type IndexSnapshotTermFieldReader struct { bytesRead uint64 ctx context.Context unadorned bool + // flag to indicate whether to increment our bytesRead + // value after creation of the TFR while iterating our postings + // lists + updateBytesRead bool } func (i *IndexSnapshotTermFieldReader) incrementBytesRead(val uint64) { @@ -83,10 +86,15 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in if rv == nil { rv = &index.TermFieldDoc{} } + var prevBytesRead uint64 // find the next hit for i.segmentOffset < len(i.iterators) { - prevBytesRead := i.iterators[i.segmentOffset].BytesRead() - next, err := i.iterators[i.segmentOffset].Next() + // get our current postings iterator + curItr := i.iterators[i.segmentOffset] + if i.updateBytesRead { + prevBytesRead = curItr.BytesRead() + } + next, err := curItr.Next() if err != nil { return nil, err } @@ -94,18 +102,20 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) i.postingToTermFieldDoc(next, rv) i.currID = rv.ID i.currPosting = next - // postingsIterators is maintain the bytesRead stat in a cumulative fashion. - // this is because there are chances of having a series of loadChunk calls, - // and they have to be added together before sending the bytesRead at this point - // upstream. 
- bytesRead := i.iterators[i.segmentOffset].BytesRead() - if bytesRead > prevBytesRead { - i.incrementBytesRead(bytesRead - prevBytesRead) + if i.updateBytesRead { + // postingsIterators maintains the bytesRead stat in a cumulative fashion. + // this is because there are chances of having a series of loadChunk calls, + // and they have to be added together before sending the bytesRead at this point + // upstream. + bytesRead := curItr.BytesRead() + if bytesRead > prevBytesRead { + i.incrementBytesRead(bytesRead - prevBytesRead) + } } return rv, nil } @@ -146,7 +156,7 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { // Check if the TFR is a special unadorned composite optimization. // Such a TFR will NOT have a valid `term` or `field` set, making it // impossible for the TFR to replace itself with a new one. 
@@ -171,7 +181,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } } } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -196,7 +206,7 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo if preAlloced == nil { preAlloced = &index.TermFieldDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.postingToTermFieldDoc(next, preAlloced) i.currID = preAlloced.ID diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go index bd57ad3e06..e572509e91 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_index_vr.go @@ -18,7 +18,6 @@ package scorch import ( - "bytes" "context" "encoding/json" "fmt" @@ -96,7 +95,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) + rv.ID = index.NewIndexInternalID(rv.ID, nnum+globalOffset) rv.Score = float64(next.Score()) i.currID = rv.ID @@ -113,7 +112,7 @@ func (i *IndexSnapshotVectorReader) Next(preAlloced *index.VectorDoc) ( func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, preAlloced *index.VectorDoc) (*index.VectorDoc, error) { - if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + if i.currPosting != nil && i.currID.Compare(ID) >= 0 { i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k, i.searchParams, i.eligibleSelector) if err != nil { @@ -124,7 +123,7 @@ func (i *IndexSnapshotVectorReader) 
Advance(ID index.IndexInternalID, *i = *(i2.(*IndexSnapshotVectorReader)) } - num, err := docInternalToNumber(ID) + num, err := ID.Value() if err != nil { return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) } @@ -149,7 +148,7 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID, if preAlloced == nil { preAlloced = &index.VectorDoc{} } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + preAlloced.ID = index.NewIndexInternalID(preAlloced.ID, next.Number()+ i.snapshot.offsets[segIndex]) i.currID = preAlloced.ID i.currPosting = next @@ -183,8 +182,7 @@ func (i *IndexSnapshot) CentroidCardinalities(field string, limit int, descendin for _, segment := range i.segment { if sv, ok := segment.segment.(segment_api.VectorSegment); ok { - vecIndex, err := sv.InterpretVectorIndex(field, - false /* does not require filtering */, segment.deleted) + vecIndex, err := sv.InterpretVectorIndex(field, segment.deleted) if err != nil { return nil, fmt.Errorf("failed to interpret vector index for field %s in segment: %v", field, err) } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go index c6f3584cc8..136b9f344c 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_segment.go @@ -26,21 +26,23 @@ import ( segment "github.com/blevesearch/scorch_segment_api/v2" ) -var TermSeparator byte = 0xff - -var TermSeparatorSplitSlice = []byte{TermSeparator} - type SegmentSnapshot struct { // this flag is needed to identify whether this // segment was mmaped recently, in which case // we consider the loading cost of the metadata // as part of IO stats. 
- mmaped uint32 - id uint64 - segment segment.Segment - deleted *roaring.Bitmap - creator string - stats *fieldStats + mmaped uint32 + id uint64 + segment segment.Segment + deleted *roaring.Bitmap + creator string + stats *fieldStats + + // if this segment is in-memory then we'll try to undo the internal values + // in the indexSnapshot internal map before updating the bolt, since its + // supposed to be reflective of the on-disk data. + internal map[string][]byte + updatedFields map[string]*index.UpdateFieldInfo cachedMeta *cachedMeta @@ -113,6 +115,19 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } +// this counts the root documents in the segment this differs from Count() in that +// Count() counts all live documents including nested children, whereas this method +// counts only root live documents +func (s *SegmentSnapshot) CountRoot() uint64 { + var rv uint64 + if nsb, ok := s.segment.(segment.NestedSegment); ok { + rv = nsb.CountRoot(s.deleted) + } else { + rv = s.Count() + } + return rv +} + func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { rv, err := s.segment.DocNumbers(docIDs) if err != nil { @@ -220,7 +235,7 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) 
- cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) + cfd.docs[docNum] = append(cfd.docs[docNum], index.DocValueTermSeparator) cfd.size += uint64(len(next.Term) + 1) // map value nextPosting, err2 = postingsItr.Next() } @@ -241,7 +256,7 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { type cachedDocs struct { size uint64 - m sync.Mutex // As the cache is asynchronously prepared, need a lock + m sync.RWMutex // As the cache is asynchronously prepared, need a lock cache map[string]*cachedFieldDocs // Keyed by field } @@ -283,14 +298,14 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e // hasFields returns true if the cache has all the given fields func (c *cachedDocs) hasFields(fields []string) bool { - c.m.Lock() + c.m.RLock() for _, field := range fields { if _, exists := c.cache[field]; !exists { - c.m.Unlock() + c.m.RUnlock() return false // found a field not in cache } } - c.m.Unlock() + c.m.RUnlock() return true } @@ -311,17 +326,17 @@ func (c *cachedDocs) updateSizeLOCKED() { func (c *cachedDocs) visitDoc(localDocNum uint64, fields []string, visitor index.DocValueVisitor) { - c.m.Lock() + c.m.RLock() for _, field := range fields { if cachedFieldDocs, exists := c.cache[field]; exists { - c.m.Unlock() + c.m.RUnlock() <-cachedFieldDocs.readyCh - c.m.Lock() + c.m.RLock() if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { for { - i := bytes.Index(tlist, TermSeparatorSplitSlice) + i := bytes.IndexByte(tlist, index.DocValueTermSeparator) if i < 0 { break } @@ -332,7 +347,7 @@ func (c *cachedDocs) visitDoc(localDocNum uint64, } } - c.m.Unlock() + c.m.RUnlock() } // the purpose of the cachedMeta is to simply allow the user of this type to record @@ -357,7 +372,18 @@ func (c *cachedMeta) updateMeta(field string, val interface{}) { func (c *cachedMeta) fetchMeta(field string) (rv interface{}) { c.m.RLock() + defer c.m.RUnlock() + if c.meta == nil { + return nil + } rv = c.meta[field] 
- c.m.RUnlock() return rv } + +func (s *SegmentSnapshot) Ancestors(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { + nsb, ok := s.segment.(segment.NestedSegment) + if !ok { + return append(prealloc, index.NewAncestorID(docNum)) + } + return nsb.Ancestors(docNum, prealloc) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go index 4fbb8441e5..1d0c03b949 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/snapshot_vector_index.go @@ -22,6 +22,7 @@ import ( "encoding/json" "fmt" + "github.com/bits-and-blooms/bitset" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" ) @@ -45,17 +46,82 @@ func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32, return rv, nil } +// eligibleDocumentList represents the list of eligible documents within a segment. +type eligibleDocumentList struct { + bs *bitset.BitSet +} + +// Iterator returns an iterator for the eligible document IDs. +func (edl *eligibleDocumentList) Iterator() index.EligibleDocumentIterator { + if edl.bs == nil { + // no eligible documents + return emptyEligibleIterator + } + // return the iterator + return &eligibleDocumentIterator{ + bs: edl.bs, + } +} + +// Count returns the number of eligible document IDs. +func (edl *eligibleDocumentList) Count() uint64 { + if edl.bs == nil { + return 0 + } + return uint64(edl.bs.Count()) +} + +// emptyEligibleDocumentList is a reusable empty eligible document list. +var emptyEligibleDocumentList = &eligibleDocumentList{} + +// eligibleDocumentIterator iterates over eligible document IDs within a segment. +type eligibleDocumentIterator struct { + bs *bitset.BitSet + current uint +} + +// Next returns the next eligible document ID and whether it exists. 
+func (it *eligibleDocumentIterator) Next() (id uint64, ok bool) { + next, found := it.bs.NextSet(it.current) + if !found { + return 0, false + } + it.current = next + 1 + return uint64(next), true +} + +// emptyEligibleIterator is a reusable empty eligible document iterator. +var emptyEligibleIterator = &emptyEligibleDocumentIterator{} + +// emptyEligibleDocumentIterator is an iterator that always returns no documents. +type emptyEligibleDocumentIterator struct{} + +// Next always returns false for empty iterator. +func (it *emptyEligibleDocumentIterator) Next() (id uint64, ok bool) { + return 0, false +} + // eligibleDocumentSelector is used to filter out documents that are eligible for // the KNN search from a pre-filter query. type eligibleDocumentSelector struct { - // segment ID -> segment local doc nums - eligibleDocNums map[int][]uint64 + // segment ID -> segment local doc nums in a bitset + eligibleDocNums []*bitset.BitSet is *IndexSnapshot } -// SegmentEligibleDocs returns the list of eligible local doc numbers for the given segment. -func (eds *eligibleDocumentSelector) SegmentEligibleDocs(segmentID int) []uint64 { - return eds.eligibleDocNums[segmentID] +// SegmentEligibleDocuments returns an EligibleDocumentList for the specified segment ID. +func (eds *eligibleDocumentSelector) SegmentEligibleDocuments(segmentID int) index.EligibleDocumentList { + if eds.eligibleDocNums == nil || segmentID < 0 || segmentID >= len(eds.eligibleDocNums) { + return emptyEligibleDocumentList + } + bs := eds.eligibleDocNums[segmentID] + if bs == nil { + // no eligible documents for this segment + return emptyEligibleDocumentList + } + return &eligibleDocumentList{ + bs: bs, + } } // AddEligibleDocumentMatch adds a document match to the list of eligible documents. 
@@ -68,14 +134,19 @@ func (eds *eligibleDocumentSelector) AddEligibleDocumentMatch(id index.IndexInte if err != nil { return err } + // allocate a bitset for this segment if needed + if eds.eligibleDocNums[segIdx] == nil { + // the size of the bitset is the full size of the segment (which is the max local doc num + 1) + eds.eligibleDocNums[segIdx] = bitset.New(uint(eds.is.segment[segIdx].FullSize())) + } // Add the local doc number to the list of eligible doc numbers for this segment. - eds.eligibleDocNums[segIdx] = append(eds.eligibleDocNums[segIdx], docNum) + eds.eligibleDocNums[segIdx].Set(uint(docNum)) return nil } func (is *IndexSnapshot) NewEligibleDocumentSelector() index.EligibleDocumentSelector { return &eligibleDocumentSelector{ - eligibleDocNums: map[int][]uint64{}, + eligibleDocNums: make([]*bitset.BitSet, len(is.segment)), is: is, } } diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go index 9abc8ba96f..c4bf2a8557 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/stats.go @@ -136,6 +136,9 @@ type Stats struct { MaxMemMergeZapTime uint64 TotMemMergeSegments uint64 TotMemorySegmentsAtRoot uint64 + + TotTrainedSamples uint64 + TotTrainTime uint64 } // atomically populates the returned map diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_noop.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_noop.go new file mode 100644 index 0000000000..36752e1e13 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_noop.go @@ -0,0 +1,55 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !vectors +// +build !vectors + +package scorch + +import ( + "fmt" + + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" +) + +func initTrainer(s *Scorch, config map[string]interface{}) *noopTrainer { + return nil +} + +type noopTrainer struct { +} + +func (t *noopTrainer) trainLoop() {} + +func (t *noopTrainer) train(batch *index.Batch) error { + return fmt.Errorf("training is not supported with this build") +} + +func (t *noopTrainer) loadTrainedData(bucket *util.BoltBucketImpl) error { + // noop + return nil +} + +func (t *noopTrainer) getInternal(key []byte) ([]byte, error) { + return nil, nil +} + +func (t *noopTrainer) copyFileLOCKED(file string, d index.IndexDirectory) error { + return nil +} + +func (t *noopTrainer) updateBolt(snapshotsBucket *util.BoltBucketImpl, key []byte, value []byte) error { + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_vector.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_vector.go new file mode 100644 index 0000000000..4c3b15ca8e --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/train_vector.go @@ -0,0 +1,397 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package scorch + +import ( + "bytes" + "encoding/binary" + "fmt" + "maps" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/blevesearch/bleve/v2/util" + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +type trainRequest struct { + finalSample bool + sampleSize int + ackCh chan error + sample segment.Segment +} + +type vectorTrainer struct { + trainingComplete atomic.Bool + trainedSamples uint64 + parent *Scorch + config map[string]interface{} + + m sync.RWMutex + // not a searchable segment in the sense that it won't return + // the data vectors, returns trained centroid layout + trainedIndex *SegmentSnapshot + trainCh chan *trainRequest +} + +const IndexTrainedWithFastMerge = "vector_index_fast_merge" + +func initTrainer(s *Scorch, config map[string]interface{}) *vectorTrainer { + if f, ok := config[IndexTrainedWithFastMerge]; ok { + feature, ok := f.(bool) + if ok && feature { + trainer := vectorTrainer{ + parent: s, + config: maps.Clone(s.config), + trainCh: make(chan *trainRequest, 1), + } + // update the parent scorch config with the trainer's callback to fetch the trained index + s.segmentConfig[index.TrainedIndexCallback] = index.TrainedIndexCallbackFn(trainer.getTrainedIndex) + return &trainer + } + } + return nil +} + +func moveFile(sourcePath, destPath string) error { + // rename is supposed to be atomic on the same filesystem + err := 
os.Rename(sourcePath, destPath) + if err != nil { + return fmt.Errorf("error renaming file: %v", err) + } + return nil +} + +func (t *vectorTrainer) persistToBolt(trainReq *trainRequest) error { + tx, err := t.parent.rootBolt.Begin(true) + if err != nil { + return fmt.Errorf("error starting bolt transaction: %v", err) + } + defer tx.Rollback() + + snapshotsBucket, err := tx.CreateBucketIfNotExists(util.BoltSnapshotsBucket) + if err != nil { + return fmt.Errorf("error creating snapshots bucket: %v", err) + } + + trainerBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltTrainerKey) + if err != nil { + return fmt.Errorf("error creating trained index bucket: %v", err) + } + err = trainerBucket.Put(util.BoltPathKey, []byte(index.TrainedIndexFileName), nil) + if err != nil { + return fmt.Errorf("error updating trained index bucket: %v", err) + } + + t.trainingComplete.Store(trainReq.finalSample) + err = trainerBucket.Put(util.BoltTrainCompleteKey, []byte(strconv.FormatBool(trainReq.finalSample)), nil) + if err != nil { + return fmt.Errorf("error updating train complete key: %v", err) + } + + totSamples := atomic.AddUint64(&t.trainedSamples, uint64(trainReq.sampleSize)) + err = trainerBucket.Put(util.BoltTrainedSamplesKey, binary.LittleEndian.AppendUint64(nil, totSamples), nil) + if err != nil { + return fmt.Errorf("error updating trained samples key: %v", err) + } + + err = tx.Commit() + if err != nil { + return fmt.Errorf("error committing bolt transaction: %v", err) + } + + return t.parent.rootBolt.Sync() +} + +// this is not a routine that will be running throughout the lifetime of the index. It's purpose +// is to only train the vector index before the data ingestion starts. +func (t *vectorTrainer) trainLoop() { + defer t.parent.asyncTasks.Done() + + trainLoopStartTime := time.Now() + path := filepath.Join(t.parent.path, index.TrainedIndexFileName) + for { + // exit once the final sample set has been ingested and training is complete. 
+ if t.trainingComplete.Load() { + atomic.StoreUint64(&t.parent.stats.TotTrainedSamples, t.trainedSamples) + atomic.StoreUint64(&t.parent.stats.TotTrainTime, uint64(time.Since(trainLoopStartTime).Milliseconds())) + return + } + select { + case <-t.parent.closeCh: + select { + case req := <-t.trainCh: + req.ackCh <- fmt.Errorf("trainer is closed") + close(req.ackCh) + default: + } + return + case trainReq := <-t.trainCh: + sampleSeg := trainReq.sample + // no sample segment: just persist state if this is the final sample and move on. + if sampleSeg == nil { + if trainReq.finalSample { + if err := t.persistToBolt(trainReq); err != nil { + trainReq.ackCh <- fmt.Errorf("error persisting to bolt: %v", err) + close(trainReq.ackCh) + return + } + } + close(trainReq.ackCh) + continue + } + + if t.trainedIndex == nil { + switch seg := sampleSeg.(type) { + case segment.UnpersistedSegment: + if err := persistToDirectory(seg, nil, path); err != nil { + trainReq.ackCh <- fmt.Errorf("error persisting segment: %v", err) + close(trainReq.ackCh) + continue + } + } + } else { + // merge the new segment with the existing one into a .tmp file, then + // atomically rename it into place (Os.Open on the live path is unsafe + // during the merge). + t.config[index.TrainingKey] = true + _, _, err := t.parent.segPlugin.MergeUsing([]segment.Segment{t.trainedIndex.segment, sampleSeg}, + []*roaring.Bitmap{nil, nil}, path+".tmp", t.parent.closeCh, nil, t.config) + t.config[index.TrainingKey] = false + if err != nil { + trainReq.ackCh <- fmt.Errorf("error merging trained index: %v", err) + close(trainReq.ackCh) + return + } + + t.trainedIndex.segment.Close() + if err = moveFile(path+".tmp", path); err != nil { + trainReq.ackCh <- fmt.Errorf("error renaming trained index: %v", err) + close(trainReq.ackCh) + return + } + } + + // bolt write acts as a checkpoint for failover-recovery: callers downstream + // can rely on the trained index being available once this completes. 
+ // todo: rethink the frequency of bolt writes + if err := t.persistToBolt(trainReq); err != nil { + trainReq.ackCh <- fmt.Errorf("error persisting to bolt: %v", err) + close(trainReq.ackCh) + return + } + + trainedIndex, err := t.parent.segPlugin.OpenUsing(path, t.parent.segmentConfig) + if err != nil { + trainReq.ackCh <- fmt.Errorf("error opening trained index: %v", err) + close(trainReq.ackCh) + return + } + + t.m.Lock() + t.trainedIndex = &SegmentSnapshot{segment: trainedIndex} + t.m.Unlock() + close(trainReq.ackCh) + } + } +} + +// loads the metadata specific to the trained index from boltdb, happens during init +// no lock needed +func (t *vectorTrainer) loadTrainedData(bucket *util.BoltBucketImpl) error { + if bucket == nil { + return nil + } + writerID, err := bucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return fmt.Errorf("error getting writer id: %v", err) + } + reader, err := util.NewFileReader(string(writerID), nil) + if err != nil { + return fmt.Errorf("error creating file reader: %v", err) + } + + segmentSnapshot, err := t.parent.loadSegment(bucket, reader) + if err != nil { + return err + } + + // get the training status out of bolt + trainComplete, err := bucket.Get(util.BoltTrainCompleteKey, nil) + if err != nil { + return fmt.Errorf("error getting train complete: %v", err) + } + trainedSamples, err := bucket.Get(util.BoltTrainedSamplesKey, nil) + if err != nil { + return fmt.Errorf("error getting trained samples: %v", err) + } + atomic.StoreUint64(&t.trainedSamples, binary.LittleEndian.Uint64(trainedSamples)) + comp, err := strconv.ParseBool(string(trainComplete)) + if err != nil { + return fmt.Errorf("error parsing train complete: %v", err) + } + t.trainingComplete.Store(comp) + + t.m.Lock() + defer t.m.Unlock() + t.trainedIndex = segmentSnapshot + return nil +} + +func (t *vectorTrainer) train(batch *index.Batch) error { + // regulate the Train function + t.parent.FireIndexEvent() + + var trainData []index.Document + 
for _, doc := range batch.IndexOps { + if doc != nil { + // insert _id field + // no need to track updates/deletes over here since + // the API is singleton + doc.AddIDField() + } + trainData = append(trainData, doc) + } + + trainComplete := batch.InternalOps[string(util.BoltTrainCompleteKey)] + if trainComplete == nil { + trainComplete = []byte("false") + } + fin, err := strconv.ParseBool(string(trainComplete)) + if err != nil { + return fmt.Errorf("error parsing train complete: %v", err) + } + + trainReq := &trainRequest{ + finalSample: fin, + sampleSize: len(trainData), + ackCh: make(chan error), + } + // just builds a new vector index out of the train data provided + // this is not necessarily the final train data since this is submitted + // as a request to the trainer component to be merged. once the training + // is complete, the template will be used for other operations down the line + // like merge and search. + // + // note: this might index text data too, how to handle this? s.segmentConfig? 
+ // todo: updates/deletes -> data drift detection + if len(trainData) > 0 { + trainReq.sample, _, err = t.parent.segPlugin.NewUsing(trainData, t.parent.segmentConfig) + if err != nil { + return err + } + } + + t.trainCh <- trainReq + err = <-trainReq.ackCh + if err != nil { + return fmt.Errorf("train_vector: train() err'd out with: %w", err) + } + + return err +} + +func (t *vectorTrainer) getInternal(key []byte) ([]byte, error) { + switch string(key) { + case string(util.BoltTrainCompleteKey): + return []byte(strconv.FormatBool(t.trainingComplete.Load())), nil + } + return nil, nil +} + +func (t *vectorTrainer) getTrainedIndex(field string) (interface{}, error) { + // return the coarse quantizer of the trained faiss index belonging to the field + // if its not available then zap performs naive merge + t.m.RLock() + defer t.m.RUnlock() + if t.trainedIndex != nil { + trainedSegment, ok := t.trainedIndex.segment.(segment.TrainedSegment) + if !ok { + return nil, fmt.Errorf("segment is not a trained index segment") + } + + coarseQuantizer, err := trainedSegment.GetCoarseQuantizer(field) + if err != nil { + return nil, err + } + return coarseQuantizer, nil + } + return nil, nil +} + +func (t *vectorTrainer) copyFileLOCKED(file string, d index.IndexDirectory) error { + if strings.HasSuffix(file, index.TrainedIndexFileName) { + // trained index file - this is outside the snapshots domain so the bolt update is different + err := d.SetPathInBolt(util.BoltTrainerKey, []byte(file)) + if err != nil { + return fmt.Errorf("error updating dest index bolt: %w", err) + } + } + + return nil +} + +func (t *vectorTrainer) updateBolt(snapshotsBucket *util.BoltBucketImpl, key []byte, value []byte) error { + if bytes.Equal(key, util.BoltTrainerKey) { + trainerBucket, err := snapshotsBucket.CreateBucketIfNotExists(util.BoltTrainerKey) + if err != nil { + return err + } + if trainerBucket == nil { + return fmt.Errorf("trainer bucket not found") + } + + // guard against duplicate updates + 
existingValue, err := trainerBucket.Get(util.BoltPathKey, nil) + if err != nil { + return fmt.Errorf("error checking existing value: %v", err) + } + if existingValue != nil { + return fmt.Errorf("key already exists %v %v", t.parent.path, string(existingValue)) + } + + err = trainerBucket.Put(util.BoltPathKey, value, nil) + if err != nil { + return err + } + + writerID, err := trainerBucket.Get(util.BoltMetaDataFileWriterIDKey, nil) + if err != nil { + return fmt.Errorf("error getting writer id: %v", err) + } + reader, err := util.NewFileReader(string(writerID), nil) + if err != nil { + return fmt.Errorf("error creating file reader: %v", err) + } + + // update the centroid index pointer + t.trainedIndex, err = t.parent.loadSegment(trainerBucket, reader) + if err != nil { + return err + } + } + + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go b/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go index 18ce1c5823..a37fb37ff1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go +++ b/vendor/github.com/blevesearch/bleve/v2/index/scorch/unadorned.go @@ -38,6 +38,7 @@ func init() { type unadornedPostingsIteratorBitmap struct { actual roaring.IntPeekable actualBM *roaring.Bitmap + next UnadornedPosting // reused across Next() calls } func (i *unadornedPostingsIteratorBitmap) Next() (segment.Posting, error) { @@ -53,7 +54,10 @@ func (i *unadornedPostingsIteratorBitmap) nextAtOrAfter(atOrAfter uint64) (segme if !exists { return nil, nil } - return UnadornedPosting(docNum), nil + i.next = UnadornedPosting{} // clear the struct + rv := &i.next + rv.docNum = docNum + return rv, nil } func (i *unadornedPostingsIteratorBitmap) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { @@ -112,8 +116,9 @@ func newUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) segment.Postings const docNum1HitFinished = math.MaxUint64 type unadornedPostingsIterator1Hit struct { - docNumOrig uint64 // original 
1-hit docNum used to create this iterator - docNum uint64 // current docNum + docNumOrig uint64 // original 1-hit docNum used to create this iterator + docNum uint64 // current docNum + next UnadornedPosting // reused across Next() calls } func (i *unadornedPostingsIterator1Hit) Next() (segment.Posting, error) { @@ -129,7 +134,10 @@ func (i *unadornedPostingsIterator1Hit) nextAtOrAfter(atOrAfter uint64) (segment if !exists { return nil, nil } - return UnadornedPosting(docNum), nil + i.next = UnadornedPosting{} // clear the struct + rv := &i.next + rv.docNum = docNum + return rv, nil } func (i *unadornedPostingsIterator1Hit) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { @@ -176,24 +184,26 @@ type ResetablePostingsIterator interface { ResetIterator() } -type UnadornedPosting uint64 +type UnadornedPosting struct { + docNum uint64 +} -func (p UnadornedPosting) Number() uint64 { - return uint64(p) +func (p *UnadornedPosting) Number() uint64 { + return p.docNum } -func (p UnadornedPosting) Frequency() uint64 { +func (p *UnadornedPosting) Frequency() uint64 { return 0 } -func (p UnadornedPosting) Norm() float64 { +func (p *UnadornedPosting) Norm() float64 { return 0 } -func (p UnadornedPosting) Locations() []segment.Location { +func (p *UnadornedPosting) Locations() []segment.Location { return nil } -func (p UnadornedPosting) Size() int { +func (p *UnadornedPosting) Size() int { return reflectStaticSizeUnadornedPosting } diff --git a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go index 8212c74b92..c58891d664 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_alias_impl.go @@ -103,6 +103,24 @@ func (i *indexAliasImpl) IndexSynonym(id string, collection string, definition * return ErrorSynonymSearchNotSupported } +func (i *indexAliasImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + if !i.open 
{ + return ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + return err + } + + if vi, ok := i.indexes[0].(TrainableIndex); ok { + return vi.Train(batch) + } + return ErrorTrainingNotSupported +} + func (i *indexAliasImpl) Delete(id string) error { i.mutex.RLock() defer i.mutex.RUnlock() @@ -985,6 +1003,15 @@ func MultiSearch(ctx context.Context, req *SearchRequest, params *multiSearchPar searchStart := time.Now() asyncResults := make(chan *asyncSearchResult, len(indexes)) + var preSearchData map[string]map[string]interface{} + var rescorer *rescorer + var fusionKnnHits search.DocumentMatchCollection + if params != nil { + preSearchData = params.preSearchData + rescorer = params.rescorer + fusionKnnHits = params.fusionKnnHits + } + var reverseQueryExecution bool if req.SearchBefore != nil { reverseQueryExecution = true @@ -1006,8 +1033,8 @@ func MultiSearch(ctx context.Context, req *SearchRequest, params *multiSearchPar waitGroup.Add(len(indexes)) for _, in := range indexes { var payload map[string]interface{} - if params.preSearchData != nil { - payload = params.preSearchData[in.Name()] + if preSearchData != nil { + payload = preSearchData[in.Name()] } go searchChildIndex(in, createChildSearchRequest(req, payload)) } @@ -1047,9 +1074,9 @@ func MultiSearch(ctx context.Context, req *SearchRequest, params *multiSearchPar } } - if params.rescorer != nil { - sr.Hits, sr.Total, sr.MaxScore = params.rescorer.rescore(sr.Hits, params.fusionKnnHits) - params.rescorer.restoreSearchRequest() + if rescorer != nil { + sr.Hits, sr.Total, sr.MaxScore = rescorer.rescore(sr.Hits, fusionKnnHits) + rescorer.restoreSearchRequest() } sr.Hits = hitsInCurrentPage(req, sr.Hits) diff --git a/vendor/github.com/blevesearch/bleve/v2/index_impl.go b/vendor/github.com/blevesearch/bleve/v2/index_impl.go index 8065d9c1e8..2545a47a3b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_impl.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_impl.go @@ -91,7 
+91,10 @@ func newIndexUsing(path string, mapping mapping.IndexMapping, indexType string, path: path, name: path, m: mapping, - meta: newIndexMeta(indexType, kvstore, kvconfig), + } + rv.meta, err = newIndexMeta(indexType, kvstore, kvconfig, path) + if err != nil { + return nil, err } rv.stats = &IndexStat{i: &rv} // at this point there is hope that we can be successful, so save index meta @@ -369,6 +372,20 @@ func (i *indexImpl) IndexSynonym(id string, collection string, definition *Synon return err } +func (i *indexImpl) Train(batch *Batch) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + if vi, ok := i.i.(index.TrainableIndex); ok { + return vi.Train(batch.internal) + } + return ErrorTrainingNotSupported +} + // IndexAdvanced takes a document.Document object // skips the mapping and indexes it. func (i *indexImpl) IndexAdvanced(doc *document.Document) (err error) { @@ -479,6 +496,55 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { return i.SearchInContext(context.Background(), req) } +// returns the set of file callback writer ids in use by the index +func (i *indexImpl) FileWriterIDsInUse() (map[string]struct{}, error) { + ids := map[string]struct{}{i.meta.fileReader.Id(): {}} + + if cidx, ok := i.i.(IndexWithCallbacks); ok { + cIds, err := cidx.FileWriterIDsInUse() + if err != nil { + return nil, err + } + for k := range cIds { + ids[k] = struct{}{} + } + } else { + // if the underlying index does not support callbacks, we + // assume that the data being written is with the default + // writer id which is the empty string + ids[util.DefaultFileCallbackId] = struct{}{} + } + + return ids, nil +} + +// drops the file callback writer ids from the index and +// re-processes data with the latest file callback writer id +func (i *indexImpl) DropFileWriterIDs(ids map[string]struct{}) error { + i.mutex.Lock() + if _, ok := ids[i.meta.fileReader.Id()]; ok { + var err error + err = 
i.meta.UpdateWriter(i.path) + if err != nil { + return err + } + } + i.mutex.Unlock() + + if cidx, ok := i.i.(IndexWithCallbacks); ok { + return cidx.DropFileWriterIDs(ids) + } else { + // if the underlying index does not support callbacks and the request is + // to drop the empty id, which is the default id, we return an error + // because it is not possible to drop it + if _, ok := ids[util.DefaultFileCallbackId]; ok { + return fmt.Errorf("underlying index does not support DropFileWriterIDs") + } + } + + return nil +} + var ( documentMatchEmptySize int searchContextEmptySize int @@ -572,8 +638,7 @@ func (i *indexImpl) preSearch(ctx context.Context, req *SearchRequest, reader in return nil, err } - fs := make(query.FieldSet) - fs, err := query.ExtractFields(req.Query, i.m, fs) + fs, err := query.ExtractFields(req.Query, i.m, search.NewFieldSet()) if err != nil { return nil, err } @@ -642,7 +707,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // ------------------------------------------------------------------------------------------ // set up additional contexts for any search operation that will proceed from - // here, such as presearch, collectors etc. + // here, such as presearch, knn collector, topn collector etc. 
// Scoring model callback to be used to get scoring model scoringModelCallback := func() string { @@ -687,6 +752,13 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } ctx = context.WithValue(ctx, search.GeoBufferPoolCallbackKey, search.GeoBufferPoolCallbackFunc(getBufferPool)) + // check if the index mapping has any nested fields, which should force + // all collectors and searchers to be run in nested mode + if nm, ok := i.m.(mapping.NestedMapping); ok { + if nm.CountNested() > 0 { + ctx = context.WithValue(ctx, search.NestedSearchKey, true) + } + } // ------------------------------------------------------------------------------------------ if _, ok := ctx.Value(search.PreSearchKey).(bool); ok { @@ -716,11 +788,9 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.SearchBefore = nil } - var coll *collector.TopNCollector - if req.SearchAfter != nil { - coll = collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) - } else { - coll = collector.NewTopNCollector(req.Size, req.From, req.Sort) + coll, err := i.buildTopNCollector(ctx, req, indexReader) + if err != nil { + return nil, err } var knnHits []*search.DocumentMatch @@ -795,7 +865,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr // if score fusion, no faceting for knn hits is done // hence we can skip setting the knn hits in the collector if !contextScoreFusionKeyExists { - setKnnHitsInCollector(knnHits, req, coll) + setKnnHitsInCollector(knnHits, coll) } if fts != nil { @@ -937,7 +1007,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr if i.name != "" && hit.Index == "" { hit.Index = i.name } - err, storedFieldsBytes := LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter) + err, storedFieldsBytes := LoadAndHighlightAllFields(hit, req, i.name, indexReader, highlighter) if err != nil { return nil, err } @@ -1105,6 +1175,56 @@ func 
LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, return nil, totalStoredFieldsBytes } +const NestedDocumentKey = "_$nested" + +// LoadAndHighlightAllFields loads stored fields + highlights for root and its descendants. +// All descendant documents are collected into a _$nested array in the root DocumentMatch. +func LoadAndHighlightAllFields( + root *search.DocumentMatch, + req *SearchRequest, + indexName string, + r index.IndexReader, + highlighter highlight.Highlighter, +) (error, uint64) { + var totalStoredFieldsBytes uint64 + // load root fields/highlights + err, bytes := LoadAndHighlightFields(root, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err, totalStoredFieldsBytes + } + // collect all descendant documents + nestedDocs := make([]*search.NestedDocumentMatch, 0, len(root.Descendants)) + // create a dummy desc DocumentMatch to reuse LoadAndHighlightFields + desc := &search.DocumentMatch{} + for _, descID := range root.Descendants { + extID, err := r.ExternalID(descID) + if err != nil { + return err, totalStoredFieldsBytes + } + // reset desc for reuse + desc.ID = extID + desc.IndexInternalID = descID + desc.Locations = root.Locations + err, bytes := LoadAndHighlightFields(desc, req, indexName, r, highlighter) + totalStoredFieldsBytes += bytes + if err != nil { + return err, totalStoredFieldsBytes + } + // copy fields to nested doc and append + if len(desc.Fields) != 0 || len(desc.Fragments) != 0 { + nestedDocs = append(nestedDocs, search.NewNestedDocumentMatch(desc.Fields, desc.Fragments)) + } + desc.Fields = nil + desc.Fragments = nil + } + // add nested documents to root under _$nested key + if len(nestedDocs) > 0 { + root.AddFieldValue(NestedDocumentKey, nestedDocs) + } + return nil, totalStoredFieldsBytes +} + // Fields returns the name of all the fields this // Index has operated on. 
func (i *indexImpl) Fields() (fields []string, err error) { @@ -1388,11 +1508,43 @@ func (i *indexImpl) CopyTo(d index.Directory) (err error) { err = copyReader.CopyTo(d) if err != nil { - return fmt.Errorf("error copying index metadata: %v", err) + return fmt.Errorf("error copying index data: %v", err) } // copy the metadata - return i.meta.CopyTo(d) + return i.meta.CopyTo(i.path, d) +} + +func (i *indexImpl) CopyFile(file string, d index.IndexDirectory) (err error) { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + fileCopyIndex, ok := i.i.(IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support file copy reader") + } + + return fileCopyIndex.CopyFile(file, d) +} + +func (i *indexImpl) SetPathInBolt(key []byte, value []byte) error { + i.mutex.RLock() + defer i.mutex.RUnlock() + + if !i.open { + return ErrorIndexClosed + } + + fileCopyIndex, ok := i.i.(IndexFileCopyable) + if !ok { + return fmt.Errorf("index implementation does not support file copy") + } + + return fileCopyIndex.SetPathInBolt(key, value) } func (f FileSystemDirectory) GetWriter(filePath string) (io.WriteCloser, @@ -1487,3 +1639,39 @@ func (i *indexImpl) CentroidCardinalities(field string, limit int, descending bo return centroidCardinalities, nil } + +func (i *indexImpl) buildTopNCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader) (*collector.TopNCollector, error) { + newCollector := func() *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) + } + return collector.NewTopNCollector(req.Size, req.From, req.Sort) + } + + newNestedCollector := func(nr index.NestedReader) *collector.TopNCollector { + if req.SearchAfter != nil { + return collector.NewNestedTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter, nr) + } + return collector.NewNestedTopNCollector(req.Size, req.From, req.Sort, nr) + } + + // check if we 
are in nested mode + if nestedMode, ok := ctx.Value(search.NestedSearchKey).(bool); ok && nestedMode { + // get the nested reader from the index reader + if nr, ok := reader.(index.NestedReader); ok { + // check if the mapping has any nested fields that intersect + if nm, ok := i.m.(mapping.NestedMapping); ok { + var fs search.FieldSet + var err error + fs, err = query.ExtractFields(req.Query, i.m, fs) + if err != nil { + return nil, err + } + if fs.HasID() || nm.IntersectsPrefix(fs) { + return newNestedCollector(nr), nil + } + } + } + } + return newCollector(), nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/index_meta.go b/vendor/github.com/blevesearch/bleve/v2/index_meta.go index 14b88dcbc0..ca6c7c7ad1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_meta.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_meta.go @@ -15,6 +15,7 @@ package bleve import ( + "encoding/binary" "fmt" "os" "path/filepath" @@ -27,17 +28,30 @@ import ( const metaFilename = "index_meta.json" type indexMeta struct { - Storage string `json:"storage"` - IndexType string `json:"index_type"` - Config map[string]interface{} `json:"config,omitempty"` + Storage string `json:"storage"` + IndexType string `json:"index_type"` + Config map[string]interface{} `json:"config,omitempty"` + fileWriter util.FileWriter + fileReader util.FileReader } -func newIndexMeta(indexType string, storage string, config map[string]interface{}) *indexMeta { - return &indexMeta{ - IndexType: indexType, - Storage: storage, - Config: config, +func newIndexMeta(indexType string, storage string, config map[string]interface{}, path string) (*indexMeta, error) { + indexMetaPath := indexMetaPath(path) + fileWriter, err := util.NewFileWriter([]byte(indexMetaPath)) + if err != nil { + return nil, fmt.Errorf("failed to create file writer for index meta: %w", err) + } + fileReader, err := util.NewFileReader(fileWriter.Id(), []byte(indexMetaPath)) + if err != nil { + return nil, fmt.Errorf("failed to create 
file reader for index meta: %w", err) } + return &indexMeta{ + IndexType: indexType, + Storage: storage, + Config: config, + fileWriter: fileWriter, + fileReader: fileReader, + }, nil } func openIndexMeta(path string) (*indexMeta, error) { @@ -49,11 +63,60 @@ func openIndexMeta(path string) (*indexMeta, error) { if err != nil { return nil, ErrorIndexMetaMissing } + + // check if indexMetaPath+_temp exists, if so, this means a writer update was in progress + // and we should attempt to recover using the temp file + if _, err := os.Stat(indexMetaPath + "_temp"); err == nil { + tempBytes, err := os.ReadFile(indexMetaPath + "_temp") + if err == nil { + err = os.Rename(indexMetaPath+"_temp", indexMetaPath) + if err != nil { + return nil, err + } + metaBytes = tempBytes + } + } + var im indexMeta + var fileReader util.FileReader + // attempt to unmarshal metabytes directly. If this succeeds, + // then we know there was no file callback writer used and we can + // proceed as normal. err = util.UnmarshalJSON(metaBytes, &im) if err != nil { - return nil, ErrorIndexMetaCorrupt + // on failure, we expect the last 4 bytes to be the length of the file + // callback id and the preceding bytes to be the file callback id, which + // we can use to obtain the file reader to read the actual meta data bytes + if len(metaBytes) < 4 { + return nil, ErrorIndexMetaCorrupt + } + + // read the length of the file callback id from the last 4 bytes + pos := len(metaBytes) - 4 + fileWriterIDLen := int(binary.BigEndian.Uint32(metaBytes[pos:])) + pos -= fileWriterIDLen + if pos < 0 { + return nil, ErrorIndexMetaCorrupt + } + + // read and initialize the file reader using the file callback id + fileWriterID := metaBytes[pos : pos+fileWriterIDLen] + fileReader, err = util.NewFileReader(string(fileWriterID), []byte(indexMetaPath)) + if err != nil { + return nil, err + } + + buf, err := fileReader.Process(metaBytes[0:pos]) + if err != nil { + return nil, err + } + err = util.UnmarshalJSON(buf, &im) + 
if err != nil { + return nil, ErrorIndexMetaCorrupt + } } + im.fileReader = fileReader + if im.IndexType == "" { im.IndexType = upsidedown.Name } @@ -86,15 +149,29 @@ func (i *indexMeta) Save(path string) (err error) { err = ierr } }() + + metaBytes = i.fileWriter.Process(metaBytes) + _, err = indexMetaFile.Write(metaBytes) if err != nil { return err } + + _, err = indexMetaFile.Write([]byte(i.fileWriter.Id())) + if err != nil { + return err + } + + err = binary.Write(indexMetaFile, binary.BigEndian, uint32(len(i.fileWriter.Id()))) + if err != nil { + return err + } + return nil } -func (i *indexMeta) CopyTo(d index.Directory) (err error) { - metaBytes, err := util.MarshalJSON(i) +func (i *indexMeta) CopyTo(path string, d index.Directory) (err error) { + metaBytes, err := os.ReadFile(indexMetaPath(path)) if err != nil { return err } @@ -110,6 +187,69 @@ func (i *indexMeta) CopyTo(d index.Directory) (err error) { return err } +// updates the file callback writer id in the index meta, +// and re-processes data with the latest file callback writer +// returns the new file callback writer and reader to be used for +// future processing of index meta data +func (i *indexMeta) UpdateWriter(path string) error { + indexMetaPath := indexMetaPath(path) + metaBytes, err := util.MarshalJSON(i) + if err != nil { + return err + } + + i.fileWriter, err = util.NewFileWriter([]byte(indexMetaPath)) + if err != nil { + return err + } + metaBytes = i.fileWriter.Process(metaBytes) + + // write out new meta with new writer id, using temp file and rename to ensure atomicity + // if we crash in the middle of this, on next open we will see the temp file and recover using it + tempMetaPath := indexMetaPath + "_temp" + tempMetaFile, err := os.OpenFile(tempMetaPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) + if err != nil { + if os.IsExist(err) { + return ErrorIndexPathExists + } + return err + } + + // write the meta bytes + _, err = tempMetaFile.Write(metaBytes) + if err != nil { + return err 
+ } + // write the file callback id + _, err = tempMetaFile.Write([]byte(i.fileWriter.Id())) + if err != nil { + return err + } + // write the length of the file callback id + err = binary.Write(tempMetaFile, binary.BigEndian, uint32(len(i.fileWriter.Id()))) + if err != nil { + return err + } + // close file before renaming + err = tempMetaFile.Close() + if err != nil { + return err + } + // atomically rename temp file to index meta file + err = os.Rename(tempMetaPath, indexMetaPath) + if err != nil { + return err + } + + // initialize the new file reader for index meta + i.fileReader, err = util.NewFileReader(string(i.fileWriter.Id()), []byte(indexMetaPath)) + if err != nil { + return err + } + + return nil +} + func indexMetaPath(path string) string { return filepath.Join(path, metaFilename) } diff --git a/vendor/github.com/blevesearch/bleve/v2/index_update.go b/vendor/github.com/blevesearch/bleve/v2/index_update.go index 5666d035be..f16b640d63 100644 --- a/vendor/github.com/blevesearch/bleve/v2/index_update.go +++ b/vendor/github.com/blevesearch/bleve/v2/index_update.go @@ -180,6 +180,10 @@ func checkUpdatedMapping(ori, upd *mapping.DocumentMapping) error { return nil } + if ori.Nested != upd.Nested { + return fmt.Errorf("nested property cannot be changed") + } + var err error // Recursively go through the child mappings for name, updDMapping := range upd.Properties { @@ -507,6 +511,9 @@ func compareFieldMapping(original, updated *mapping.FieldMapping) (*index.Update if original.VectorIndexOptimizedFor != updated.VectorIndexOptimizedFor { return nil, fmt.Errorf("vectorIndexOptimizedFor cannot be updated for vector and vector_base64 fields") } + if original.GPU != updated.GPU { + return nil, fmt.Errorf("gpu cannot be updated for vector and vector_base64 fields") + } } if original.IncludeInAll != updated.IncludeInAll { return nil, fmt.Errorf("includeInAll cannot be changed") diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping.go 
b/vendor/github.com/blevesearch/bleve/v2/mapping.go index 723105a294..af02db386a 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping.go @@ -34,6 +34,20 @@ func NewDocumentStaticMapping() *mapping.DocumentMapping { return mapping.NewDocumentStaticMapping() } +// NewNestedDocumentMapping returns a new document mapping +// that will treat all objects as nested documents. +func NewNestedDocumentMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentMapping() +} + +// NewNestedDocumentStaticMapping returns a new document mapping +// that will treat all objects as nested documents and +// will not automatically index parts of a nested document +// without an explicit mapping. +func NewNestedDocumentStaticMapping() *mapping.DocumentMapping { + return mapping.NewNestedDocumentStaticMapping() +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *mapping.DocumentMapping { diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go index a78b27e11d..3da925038b 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/document.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/document.go @@ -22,6 +22,7 @@ import ( "reflect" "time" + "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" "github.com/blevesearch/bleve/v2/util" ) @@ -44,6 +45,7 @@ type DocumentMapping struct { Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` + Nested bool `json:"nested,omitempty"` DefaultAnalyzer string `json:"default_analyzer,omitempty"` DefaultSynonymSource string `json:"default_synonym_source,omitempty"` @@ -230,6 +232,17 @@ func NewDocumentMapping() *DocumentMapping { } } +// NewNestedDocumentMapping returns a new document 
+// mapping that treats sub-documents as nested +// objects. +func NewNestedDocumentMapping() *DocumentMapping { + return &DocumentMapping{ + Nested: true, + Enabled: true, + Dynamic: true, + } +} + // NewDocumentStaticMapping returns a new document // mapping that will not automatically index parts // of a document without an explicit mapping. @@ -239,6 +252,17 @@ func NewDocumentStaticMapping() *DocumentMapping { } } +// NewNestedDocumentStaticMapping returns a new document +// mapping that treats sub-documents as nested +// objects and will not automatically index parts +// of the nested document without an explicit mapping. +func NewNestedDocumentStaticMapping() *DocumentMapping { + return &DocumentMapping{ + Enabled: true, + Nested: true, + } +} + // NewDocumentDisabledMapping returns a new document // mapping that will not perform any indexing. func NewDocumentDisabledMapping() *DocumentMapping { @@ -312,6 +336,11 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "nested": + err := util.UnmarshalJSON(v, &dm.Nested) + if err != nil { + return err + } case "default_analyzer": err := util.UnmarshalJSON(v, &dm.DefaultAnalyzer) if err != nil { @@ -381,6 +410,18 @@ func (dm *DocumentMapping) defaultSynonymSource(path []string) string { return rv } +// baseType returns the base type of v by dereferencing pointers +func baseType(v interface{}) reflect.Type { + if v == nil { + return nil + } + t := reflect.TypeOf(v) + for t.Kind() == reflect.Pointer { + t = t.Elem() + } + return t +} + func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) { // allow default "json" tag to be overridden structTagKey := dm.StructTagKey @@ -434,11 +475,39 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } } case reflect.Slice, reflect.Array: + subDocMapping, _ := dm.documentMappingForPathElements(path) + allowNested := subDocMapping != nil && 
subDocMapping.Nested for i := 0; i < val.Len(); i++ { - if val.Index(i).CanInterface() { - fieldVal := val.Index(i).Interface() - dm.processProperty(fieldVal, path, append(indexes, uint64(i)), context) + // for each array element, check if it can be represented as an interface + idxVal := val.Index(i) + // skip invalid values + if !idxVal.CanInterface() { + continue + } + // get the actual value in interface form + actual := idxVal.Interface() + // if nested mapping, only create nested document for object elements + if allowNested && actual != nil { + // check the kind of the actual value, is it an object (struct or map)? + typ := baseType(actual) + if typ == nil { + continue + } + kind := typ.Kind() + // only create nested docs for real JSON objects + if kind == reflect.Struct || kind == reflect.Map { + // Create nested document only for only object elements + nestedDocument := document.NewDocument( + fmt.Sprintf("%s_$%s_$%d", context.doc.ID(), encodePath(path), i)) + nestedContext := context.im.newWalkContext(nestedDocument, dm) + dm.processProperty(actual, path, append(indexes, uint64(i)), nestedContext) + context.doc.AddNestedDocument(nestedDocument) + continue + } } + // non-nested mapping, or non-object element in nested mapping + // process the element normally + dm.processProperty(actual, path, append(indexes, uint64(i)), context) } case reflect.Ptr: ptrElem := val.Elem() diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go index 0b60749102..53c8dc61d9 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/field.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/field.go @@ -83,6 +83,9 @@ type FieldMapping struct { VectorIndexOptimizedFor string `json:"vector_index_optimized_for,omitempty"` SynonymSource string `json:"synonym_source,omitempty"` + + // Applicable to vector fields only - enables GPU acceleration for indexing and searching + GPU bool `json:"gpu,omitempty"` } // 
NewTextFieldMapping returns a default field mapping for text @@ -226,6 +229,9 @@ func (fm *FieldMapping) Options() index.FieldIndexingOptions { if fm.SkipFreqNorm { rv |= index.SkipFreqNorm } + if fm.GPU { + rv |= index.GPU + } return rv } @@ -479,6 +485,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "gpu": + err := util.UnmarshalJSON(v, &fm.GPU) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go index 7878cce8bb..143ff5a313 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/index.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/index.go @@ -17,12 +17,14 @@ package mapping import ( "encoding/json" "fmt" + "strings" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/analysis/analyzer/standard" "github.com/blevesearch/bleve/v2/analysis/datetime/optional" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/registry" + "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -195,11 +197,19 @@ func (im *IndexMappingImpl) Validate() error { // the map will hold the fully qualified field name to FieldMapping, so we can // check for conflicts as we validate each DocumentMapping. 
fieldAliasCtx := make(map[string]*FieldMapping) + // ensure that the nested property is not set for top-level default mapping + if im.DefaultMapping.Nested { + return fmt.Errorf("default mapping cannot be nested") + } err = im.DefaultMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err } - for _, docMapping := range im.TypeMapping { + for name, docMapping := range im.TypeMapping { + // ensure that the nested property is not set for top-level mappings + if docMapping.Nested { + return fmt.Errorf("type mapping named: %s cannot be nested", name) + } err = docMapping.Validate(im.cache, []string{}, fieldAliasCtx) if err != nil { return err @@ -574,3 +584,70 @@ func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceV } return nil } + +func (im *IndexMappingImpl) buildNestedPrefixes() map[string]int { + prefixDepth := make(map[string]int) + var collectNestedFields func(dm *DocumentMapping, pathComponents []string, currentDepth int) + collectNestedFields = func(dm *DocumentMapping, pathComponents []string, currentDepth int) { + for name, docMapping := range dm.Properties { + newPathComponents := append(pathComponents, name) + if docMapping.Nested { + // This is a nested field boundary + newDepth := currentDepth + 1 + prefixDepth[strings.Join(newPathComponents, pathSeparator)] = newDepth + // Continue deeper with incremented depth + collectNestedFields(docMapping, newPathComponents, newDepth) + } else { + // Not nested, continue with same depth + collectNestedFields(docMapping, newPathComponents, currentDepth) + } + } + } + // Start from depth 0 (root) + if im.DefaultMapping != nil && im.DefaultMapping.Enabled { + collectNestedFields(im.DefaultMapping, []string{}, 0) + } + // Now do this for each type mapping + for _, docMapping := range im.TypeMapping { + if docMapping.Enabled { + collectNestedFields(docMapping, []string{}, 0) + } + } + return prefixDepth +} + +func (im *IndexMappingImpl) NestedDepth(fs search.FieldSet) 
(int, int) { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0, 0 + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.NestedDepth(fs) +} + +func (im *IndexMappingImpl) CountNested() int { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return 0 + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.CountNested() +} + +func (im *IndexMappingImpl) IntersectsPrefix(fs search.FieldSet) bool { + if im.cache == nil || im.cache.NestedPrefixes == nil { + return false + } + + im.cache.NestedPrefixes.InitOnce(func() map[string]int { + return im.buildNestedPrefixes() + }) + + return im.cache.NestedPrefixes.IntersectsPrefix(fs) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go index a6c1591b88..7ff2f99278 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping.go @@ -20,6 +20,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/document" + "github.com/blevesearch/bleve/v2/search" ) // A Classifier is an interface describing any object which knows how to @@ -74,3 +75,21 @@ type SynonymMapping interface { SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error } + +// A NestedMapping extends the IndexMapping interface to provide +// additional methods for working with nested object mappings. 
+type NestedMapping interface { + // NestedDepth returns two values: + // - common: the highest nested level that is common to all given field paths, + // if 0 then there is no common nested level among the given field paths + // - max: the highest nested level that applies to at least one of the given field paths + // if 0 then none of the given field paths are nested + NestedDepth(fieldPaths search.FieldSet) (int, int) + + // IntersectsPrefix returns true if any of the given + // field paths intersect with a known nested prefix + IntersectsPrefix(fieldPaths search.FieldSet) bool + + // CountNested returns the number of nested object mappings + CountNested() int +} diff --git a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go index 393262b357..81d2cb9a3e 100644 --- a/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go +++ b/vendor/github.com/blevesearch/bleve/v2/mapping/mapping_vectors.go @@ -151,6 +151,12 @@ func (fm *FieldMapping) processVector(propertyMightBeVector interface{}, if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } + // bivf indexes only supports hamming distance for the primary + // binary index. Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons + if index.OptimizationRequiresBinaryIndex(vectorIndexOptimizedFor) { + similarity = index.CosineSimilarity + } // normalize raw vector if similarity is cosine // Since the vector can be multi-vector (flattened array of multiple vectors), // we use NormalizeMultiVector to normalize each sub-vector independently. @@ -185,6 +191,12 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac if vectorIndexOptimizedFor == "" { vectorIndexOptimizedFor = index.DefaultIndexOptimization } + // bivf indexes only supports hamming distance for the primary + // binary index. 
Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons + if index.OptimizationRequiresBinaryIndex(vectorIndexOptimizedFor) { + similarity = index.CosineSimilarity + } decodedVector, err := document.DecodeVector(encodedString) if err != nil || len(decodedVector) != fm.Dims { return @@ -197,6 +209,7 @@ func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interfac fieldName := getFieldName(pathString, path, fm) options := fm.Options() + field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector, fm.Dims, similarity, vectorIndexOptimizedFor, options) context.doc.AddField(field) @@ -264,6 +277,11 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, "(different vector index optimization values %s and %s)", effectiveFieldName, effectiveOptimizedFor, aliasOptimizedFor) } + if field.GPU != fieldAlias.GPU { + return fmt.Errorf("field: '%s', invalid alias "+ + "(different gpu values %v and %v)", effectiveFieldName, + field.GPU, fieldAlias.GPU) + } return nil } @@ -288,6 +306,11 @@ func validateVectorFieldAlias(field *FieldMapping, path []string, effectiveOptimizedFor, reflect.ValueOf(index.SupportedVectorIndexOptimizations).MapKeys()) } + // bivf indexes requires vector dimensionality to be a multiple of 8 + if index.OptimizationRequiresBinaryIndex(effectiveOptimizedFor) && field.Dims%8 != 0 { + return fmt.Errorf("field: '%s', incompatible vector dimensionality for BIVF: %d,"+ + " dimension should be a multiple of 8", effectiveFieldName, field.Dims) + } if fieldAliasCtx != nil { // writing to a nil map is unsafe fieldAliasCtx[effectiveFieldName] = field diff --git a/vendor/github.com/blevesearch/bleve/v2/numeric/prefix_coded.go b/vendor/github.com/blevesearch/bleve/v2/numeric/prefix_coded.go index 29bd0fc5c1..03ba043e37 100644 --- a/vendor/github.com/blevesearch/bleve/v2/numeric/prefix_coded.go +++ 
b/vendor/github.com/blevesearch/bleve/v2/numeric/prefix_coded.go @@ -66,6 +66,14 @@ func MustNewPrefixCodedInt64(in int64, shift uint) PrefixCoded { return rv } +func MustNewPrefixCodedInt64Prealloc(in int64, shift uint, prealloc []byte) PrefixCoded { + rv, _, err := NewPrefixCodedInt64Prealloc(in, shift, prealloc) + if err != nil { + panic(err) + } + return rv +} + // Shift returns the number of bits shifted // returns 0 if in uninitialized state func (p PrefixCoded) Shift() (uint, error) { diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/nested.go b/vendor/github.com/blevesearch/bleve/v2/registry/nested.go new file mode 100644 index 0000000000..fee7fda62f --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/registry/nested.go @@ -0,0 +1,136 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package registry + +import ( + "strings" + "sync" + + "github.com/blevesearch/bleve/v2/search" +) + +// NestedFieldCache caches nested field prefixes and their corresponding nesting levels. +// A nested field prefix is a field path prefix that indicates the start of a nested document. +// The nesting level indicates how deep the nested document is in the overall document structure. 
+type NestedFieldCache struct { + // nested prefix -> nested level + prefixDepth map[string]int + once sync.Once + m sync.RWMutex +} + +func NewNestedFieldCache() *NestedFieldCache { + return &NestedFieldCache{} +} + +func (nfc *NestedFieldCache) InitOnce(buildFunc func() map[string]int) { + nfc.once.Do(func() { + nfc.m.Lock() + defer nfc.m.Unlock() + nfc.prefixDepth = buildFunc() + }) +} + +// NestedDepth returns two values: +// - common: The nesting level of the longest prefix that applies to every field path +// in the provided FieldSet. A value of 0 means no nested prefix is shared +// across all field paths. +// - max: The nesting level of the longest prefix that applies to at least one +// field path in the provided FieldSet. A value of 0 means none of the +// field paths match any nested prefix. +func (nfc *NestedFieldCache) NestedDepth(fieldPaths search.FieldSet) (common int, max int) { + // if no field paths, no nested depth + if len(fieldPaths) == 0 { + return + } + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no nested depth + if len(nfc.prefixDepth) == 0 { + return + } + // for each prefix, check if its a common prefix or matches any path + // update common and max accordingly with the highest nesting level + // possible for each respective case + for prefix, level := range nfc.prefixDepth { + // only check prefixes that could increase one of the results + if level <= common && level <= max { + continue + } + // check prefix against field paths, getting whether it matches all paths (common) + // and whether it matches at least one path (any) + matchAll, matchAny := nfc.prefixMatch(prefix, fieldPaths) + // if it matches all paths, update common + if matchAll && level > common { + common = level + } + // if it matches any path, update max + if matchAny && level > max { + max = level + } + } + return common, max +} + +// CountNested returns the number of nested prefixes +func (nfc *NestedFieldCache) CountNested() int { + nfc.m.RLock() 
+ defer nfc.m.RUnlock() + + return len(nfc.prefixDepth) +} + +// IntersectsPrefix returns true if any of the given +// field paths have a nested prefix +func (nfc *NestedFieldCache) IntersectsPrefix(fieldPaths search.FieldSet) bool { + // if no field paths, no intersection + if len(fieldPaths) == 0 { + return false + } + nfc.m.RLock() + defer nfc.m.RUnlock() + // if no cached prefixes, no intersection + if len(nfc.prefixDepth) == 0 { + return false + } + // Check each cached nested prefix to see if it intersects with any path + for prefix := range nfc.prefixDepth { + _, matchAny := nfc.prefixMatch(prefix, fieldPaths) + if matchAny { + return true + } + } + return false +} + +// prefixMatch checks whether the prefix matches all paths (common) and whether it matches at least one path (any) +// Caller must hold the read lock. +func (nfc *NestedFieldCache) prefixMatch(prefix string, fieldPaths search.FieldSet) (common bool, any bool) { + common = true + any = false + for path := range fieldPaths { + has := strings.HasPrefix(path, prefix) + if has { + any = true + } else { + common = false + } + // early exit if we have determined both values + if any && !common { + break + } + } + return common, any +} diff --git a/vendor/github.com/blevesearch/bleve/v2/registry/registry.go b/vendor/github.com/blevesearch/bleve/v2/registry/registry.go index 69ee8dd86a..36f209d4f0 100644 --- a/vendor/github.com/blevesearch/bleve/v2/registry/registry.go +++ b/vendor/github.com/blevesearch/bleve/v2/registry/registry.go @@ -49,6 +49,7 @@ type Cache struct { Fragmenters *FragmenterCache Highlighters *HighlighterCache SynonymSources *SynonymSourceCache + NestedPrefixes *NestedFieldCache } func NewCache() *Cache { @@ -63,6 +64,7 @@ func NewCache() *Cache { Fragmenters: NewFragmenterCache(), Highlighters: NewHighlighterCache(), SynonymSources: NewSynonymSourceCache(), + NestedPrefixes: NewNestedFieldCache(), } } diff --git a/vendor/github.com/blevesearch/bleve/v2/search.go 
b/vendor/github.com/blevesearch/bleve/v2/search.go index ee53ac6e2a..708be0871e 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search.go @@ -15,9 +15,12 @@ package bleve import ( + "bytes" + "encoding/json" "fmt" "reflect" "regexp" + "slices" "sort" "strconv" "strings" @@ -625,11 +628,35 @@ func formatHit(rv *strings.Builder, hit *search.DocumentMatch, hitNumber int) *s } } for otherFieldName, otherFieldValue := range hit.Fields { + if otherFieldName == NestedDocumentKey { + continue + } if _, ok := hit.Fragments[otherFieldName]; !ok { fmt.Fprintf(rv, "\t%s\n", otherFieldName) fmt.Fprintf(rv, "\t\t%v\n", otherFieldValue) } } + // nested documents + if nested, ok := hit.Fields[NestedDocumentKey]; ok { + if list, ok := nested.([]*search.NestedDocumentMatch); ok { + fmt.Fprintf(rv, "\t%s (%d nested documents)\n", NestedDocumentKey, len(list)) + for ni, nd := range list { + fmt.Fprintf(rv, "\t\tNested #%d:\n", ni+1) + for f, frags := range nd.Fragments { + fmt.Fprintf(rv, "\t\t\t%s\n", f) + for _, frag := range frags { + fmt.Fprintf(rv, "\t\t\t\t%s\n", frag) + } + } + for f, v := range nd.Fields { + if _, ok := nd.Fragments[f]; !ok { + fmt.Fprintf(rv, "\t\t\t%s\n", f) + fmt.Fprintf(rv, "\t\t\t\t%v\n", v) + } + } + } + } + } if len(hit.DecodedSort) > 0 { fmt.Fprintf(rv, "\t_sort: [") for k, v := range hit.DecodedSort { @@ -806,3 +833,22 @@ func ParseParams(r *SearchRequest, input []byte) (*RequestParams, error) { return params, nil } + +// OptionalRawMessage is a wrapper around json.RawMessage that treats empty or `null` JSON as nil. 
+type OptionalRawMessage json.RawMessage + +func (n *OptionalRawMessage) UnmarshalJSON(data []byte) error { + if len(data) == 0 || bytes.Equal(data, []byte("null")) { + *n = nil + return nil + } + *n = slices.Clone(data) + return nil +} + +func (n OptionalRawMessage) MarshalJSON() ([]byte, error) { + if len(n) == 0 { + return []byte("null"), nil + } + return n, nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/nested.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/nested.go new file mode 100644 index 0000000000..ce2f790908 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/nested.go @@ -0,0 +1,103 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package collector + +import ( + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +type collectStoreNested struct { + // descAdder is used to customize how descendants are merged into their parent + descAdder search.DescendantAdderCallbackFn + // nested reader to retrieve ancestor information + nr index.NestedReader + // the current root document match being built + currRoot *search.DocumentMatch + // the ancestor ID of the current root document being built + currRootAncestorID index.AncestorID + // prealloc slice for ancestor IDs + ancestors []index.AncestorID +} + +func newStoreNested(nr index.NestedReader, descAdder search.DescendantAdderCallbackFn) *collectStoreNested { + rv := &collectStoreNested{ + descAdder: descAdder, + nr: nr, + } + return rv +} + +// ProcessNestedDocument adds a document to the nested store, merging it into its root document +// as needed. If the returned DocumentMatch is nil, the incoming doc has been merged +// into its parent and should not be processed further. If the returned DocumentMatch +// is non-nil, it represents a complete root document that should be processed further. +// NOTE: This implementation assumes that documents are added in increasing order of their internal IDs +// which is guaranteed by all searchers in bleve. 
+func (c *collectStoreNested) ProcessNestedDocument(ctx *search.SearchContext, doc *search.DocumentMatch) (*search.DocumentMatch, error) { + // find ancestors for the doc + var err error + c.ancestors, err = c.nr.Ancestors(doc.IndexInternalID, c.ancestors[:0]) + if err != nil { + return nil, err + } + if len(c.ancestors) == 0 { + // should not happen, every doc should have at least itself as ancestor + return nil, nil + } + // root docID is the last ancestor + rootID := c.ancestors[len(c.ancestors)-1] + // check if there is an interim root already and if the incoming doc belongs to it + if c.currRoot != nil && c.currRootAncestorID.Equals(rootID) { + // there is an interim root already, and the incoming doc belongs to it + if err := c.descAdder(c.currRoot, doc); err != nil { + return nil, err + } + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return nil, nil + } + // completedRoot is the root document match to return, if any + var completedRoot *search.DocumentMatch + if c.currRoot != nil { + // we have an existing interim root, return it for processing + completedRoot = c.currRoot + } + // no interim root for now so either we have a root document incoming + // or we have a child doc and need to create an interim root + if len(c.ancestors) == 1 { + // incoming doc is the root itself + c.currRoot = doc + c.currRootAncestorID = rootID + return completedRoot, nil + } + // this is a child doc, create interim root + newDM := ctx.DocumentMatchPool.Get() + newDM.IndexInternalID = rootID.ToIndexInternalID(newDM.IndexInternalID) + // merge the incoming doc into the new interim root + c.currRoot = newDM + c.currRootAncestorID = rootID + if err := c.descAdder(c.currRoot, doc); err != nil { + return nil, err + } + // recycle the child document now that it's merged into the interim root + ctx.DocumentMatchPool.Put(doc) + return completedRoot, nil +} + +// Current returns the current interim root document match being 
built, if any +func (c *collectStoreNested) Current() *search.DocumentMatch { + return c.currRoot +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go index 739dd8348d..bab318d5c8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/collector/topn.go @@ -78,7 +78,9 @@ type TopNCollector struct { searchAfter *search.DocumentMatch knnHits map[string]*search.DocumentMatch - computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc + hybridMergeCallback search.HybridMergeCallbackFn + + nestedStore *collectStoreNested } // CheckDoneEvery controls how frequently we check the context deadline @@ -88,25 +90,74 @@ const CheckDoneEvery = uint64(1024) // skipping over the first 'skip' hits // ordering hits by the provided sort order func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { - return newTopNCollector(size, skip, sort) + return newTopNCollector(size, skip, sort, nil) } // NewTopNCollectorAfter builds a collector to find the top 'size' hits // skipping over the first 'skip' hits // ordering hits by the provided sort order +// starting after the provided 'after' sort values func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector { - rv := newTopNCollector(size, 0, sort) + rv := newTopNCollector(size, 0, sort, nil) + rv.searchAfter = createSearchAfterDocument(sort, after) + return rv +} + +// NewNestedTopNCollector builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// while ensuring the nested documents are handled correctly +// (i.e. 
parent document is returned instead of nested document) +func NewNestedTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { + return newTopNCollector(size, skip, sort, nr) +} + +// NewNestedTopNCollectorAfter builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +// starting after the provided 'after' sort values +// while ensuring the nested documents are handled correctly +// (i.e. parent document is returned instead of nested document) +func NewNestedTopNCollectorAfter(size int, sort search.SortOrder, after []string, nr index.NestedReader) *TopNCollector { + rv := newTopNCollector(size, 0, sort, nr) rv.searchAfter = createSearchAfterDocument(sort, after) return rv } -func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { +func newTopNCollector(size int, skip int, sort search.SortOrder, nr index.NestedReader) *TopNCollector { hc := &TopNCollector{size: size, skip: skip, sort: sort} hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int { return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j) }) + if nr != nil { + descAdder := func(parent, child *search.DocumentMatch) error { + // add descendant score to parent score + parent.Score += child.Score + // merge explanations + parent.Expl = parent.Expl.MergeWith(child.Expl) + // merge field term locations + parent.FieldTermLocations = search.MergeFieldTermLocationsFromMatch(parent.FieldTermLocations, child) + // add child's ID to parent's Descendants + // add other as descendant only if it is not the same document + if !parent.IndexInternalID.Equals(child.IndexInternalID) { + // Add a copy of child.IndexInternalID to descendants, because + // child.IndexInternalID will be reset when 'child' is recycled. 
+ var descendantID index.IndexInternalID + // first check if parent's descendants slice has capacity to reuse + if len(parent.Descendants) < cap(parent.Descendants) { + // reuse the buffer element at len(parent.Descendants) + descendantID = parent.Descendants[:len(parent.Descendants)+1][len(parent.Descendants)] + } + // copy the contents of id into descendantID, allocating if needed + parent.Descendants = append(parent.Descendants, index.NewIndexInternalIDFrom(descendantID, child.IndexInternalID)) + } + return nil + } + hc.nestedStore = newStoreNested(nr, search.DescendantAdderCallbackFn(descAdder)) + } + // these lookups traverse an interface, so do once up-front if sort.RequiresDocID() { hc.needDocIds = true @@ -283,8 +334,13 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: next, err = searcher.Next(searchContext) } + // use a local totalDocs for counting total docs seen + // for context deadline checking, as hc.total is only + // incremented for actual(root) collected documents, and + // we need to check deadline for every document seen (root or nested) + var totalDocs uint64 for err == nil && next != nil { - if hc.total%CheckDoneEvery == 0 { + if totalDocs%CheckDoneEvery == 0 { select { case <-ctx.Done(): search.RecordSearchCost(ctx, search.AbortM, 0) @@ -292,27 +348,60 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, default: } } - - err = hc.adjustDocumentMatch(searchContext, reader, next) - if err != nil { - break - } - - err = hc.prepareDocumentMatch(searchContext, reader, next, false) - if err != nil { - break + totalDocs++ + if hc.nestedStore != nil { + // This may be a nested document — add it to the nested store first. + // If the nested store returns nil, the document was merged into its parent + // and should not be processed further. + // If it returns a non-nil document, it represents a complete root document + // and should be processed further. 
+ next, err = hc.nestedStore.ProcessNestedDocument(searchContext, next) + if err != nil { + break + } } - - err = dmHandler(next) - if err != nil { - break + if next != nil { + err = hc.adjustDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + err = hc.prepareDocumentMatch(searchContext, reader, next, false) + if err != nil { + break + } + err = dmHandler(next) + if err != nil { + break + } } - next, err = searcher.Next(searchContext) } if err != nil { return err } + + // if we have a nested store, we may have an interim root + // that needs to be returned for processing + if hc.nestedStore != nil { + currRoot := hc.nestedStore.Current() + if currRoot != nil { + err = hc.adjustDocumentMatch(searchContext, reader, currRoot) + if err != nil { + return err + } + // no descendants at this point + err = hc.prepareDocumentMatch(searchContext, reader, currRoot, false) + if err != nil { + return err + } + + err = dmHandler(currRoot) + if err != nil { + return err + } + } + } + if hc.knnHits != nil { // we may have some knn hits left that did not match any of the top N tf-idf hits // we need to add them to the collector store to consider them as well. 
@@ -366,7 +455,10 @@ func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext, return err } if knnHit, ok := hc.knnHits[d.ID]; ok { - d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit) + // we have a knn hit corresponding to this document + hc.hybridMergeCallback(d, knnHit) + // remove this knn hit from the map as it's already + // been merged delete(hc.knnHits, d.ID) } } @@ -501,6 +593,14 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc } } + // first visit descendants if any + for _, descID := range d.Descendants { + err := hc.dvReader.VisitDocValues(descID, v) + if err != nil { + return err + } + } + // now visit the doc values for this document err := hc.dvReader.VisitDocValues(d.IndexInternalID, v) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() @@ -579,10 +679,10 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { return nil } -func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) { +func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, hybridMergeCallback search.HybridMergeCallbackFn) { hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits)) for _, hit := range knnHits { hc.knnHits[hit.ID] = hit } - hc.computeNewScoreExpl = newScoreExplComputer + hc.hybridMergeCallback = hybridMergeCallback } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/explanation.go b/vendor/github.com/blevesearch/bleve/v2/search/explanation.go index 924050016c..98c5e099db 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/explanation.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/explanation.go @@ -29,6 +29,8 @@ func init() { reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) } +const MergedExplMessage = "sum of merged explanations:" + type Explanation struct { Value float64 `json:"value"` Message string `json:"message"` @@ -54,3 +56,50 @@ func (expl *Explanation) 
Size() int { return sizeInBytes } + +// MergeExpl merges two explanations into one. +// If either explanation is nil, the other is returned. +// If the first explanation is already a merged explanation, +// the second explanation is appended to its children. +// Otherwise, a new merged explanation is created +// with the two explanations as its children. +func (expl *Explanation) MergeWith(other *Explanation) *Explanation { + if expl == nil { + return other + } + if other == nil || expl == other { + return expl + } + + newScore := expl.Value + other.Value + + // if both are merged explanations, combine children + if expl.Message == MergedExplMessage && other.Message == MergedExplMessage { + expl.Value = newScore + expl.Children = append(expl.Children, other.Children...) + return expl + } + + // atleast one is not a merged explanation see which one it is + // if expl is merged, append other + if expl.Message == MergedExplMessage { + // append other as a child to first + expl.Value = newScore + expl.Children = append(expl.Children, other) + return expl + } + + // if other is merged, append expl + if other.Message == MergedExplMessage { + other.Value = newScore + other.Children = append(other.Children, expl) + return other + } + // create a new explanation to hold the merged one + rv := &Explanation{ + Value: expl.Value + other.Value, + Message: MergedExplMessage, + Children: []*Explanation{expl, other}, + } + return rv +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go index e898a1e61c..d0adfa81fb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/highlight/highlighter/simple/highlighter_simple.go @@ -146,12 +146,8 @@ func (s *Highlighter) BestFragmentsInField(dm *search.DocumentMatch, doc index.D 
formattedFragments[i] += s.sep } } - - if dm.Fragments == nil { - dm.Fragments = make(search.FieldFragmentMap, 0) - } if len(formattedFragments) > 0 { - dm.Fragments[field] = formattedFragments + dm.AddFragments(field, formattedFragments) } return formattedFragments diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go b/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go index 3bf6f91456..96df4a6d85 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/boolean.go @@ -204,6 +204,8 @@ func (q *BooleanQuery) Searcher(ctx context.Context, i index.IndexReader, m mapp // Compare document IDs cmp := refDoc.IndexInternalID.Compare(d.IndexInternalID) if cmp < 0 { + // recycle refDoc now that we do not need it + sctx.DocumentMatchPool.Put(refDoc) // filterSearcher is behind the current document, Advance() it refDoc, err = filterSearcher.Advance(sctx, d.IndexInternalID) if err != nil || refDoc == nil { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go b/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go index a2043720a9..631956dca8 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/conjunction.go @@ -54,14 +54,39 @@ func (q *ConjunctionQuery) AddQuery(aq ...Query) { func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { ss := make([]search.Searcher, 0, len(q.Conjuncts)) + cleanup := func() { + for _, searcher := range ss { + if searcher != nil { + _ = searcher.Close() + } + } + } + nestedMode, _ := ctx.Value(search.NestedSearchKey).(bool) + var nm mapping.NestedMapping + if nestedMode { + var ok bool + // get the nested mapping + if nm, ok = m.(mapping.NestedMapping); !ok { + // shouldn't be in nested mode if no nested mapping + 
nestedMode = false + } + } + // set of fields used in this query + var qfs search.FieldSet + var err error + for _, conjunct := range q.Conjuncts { + // Gather fields when nested mode is enabled + if nestedMode { + qfs, err = ExtractFields(conjunct, m, qfs) + if err != nil { + cleanup() + return nil, err + } + } sr, err := conjunct.Searcher(ctx, i, m, options) if err != nil { - for _, searcher := range ss { - if searcher != nil { - _ = searcher.Close() - } - } + cleanup() return nil, err } if _, ok := sr.(*searcher.MatchNoneSearcher); ok && q.queryStringMode { @@ -75,6 +100,23 @@ func (q *ConjunctionQuery) Searcher(ctx context.Context, i index.IndexReader, m return searcher.NewMatchNoneSearcher(i) } + if nestedMode { + // first determine the nested depth info for the query fields + commonDepth, maxDepth := nm.NestedDepth(qfs) + // if we have common depth == max depth then we can just use + // the normal conjunction searcher, as all fields share the same + // nested context, otherwise we need to use the nested conjunction searcher + // also, if we are querying the _all or _id fields, we need to use + // the nested conjunction searcher as well, with common depth 0 + // indicating matches happen only at the root level + if qfs.HasAll() || qfs.HasID() { + commonDepth = 0 + } + if commonDepth < maxDepth { + return searcher.NewNestedConjunctionSearcher(ctx, i, ss, commonDepth, options) + } + } + return searcher.NewConjunctionSearcher(ctx, i, ss, options) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/custom_filter.go b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_filter.go new file mode 100644 index 0000000000..07031312fc --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_filter.go @@ -0,0 +1,130 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package query + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" + index "github.com/blevesearch/bleve_index_api" +) + +// CustomFilterQuery wraps a child query and filters its candidate matches via +// an embedder-provided per-hit callback. +type CustomFilterQuery struct { + Query Query `json:"query"` + Fields []string `json:"fields,omitempty"` + + filterFunc searcher.CustomFilterFunc + payload map[string]interface{} +} + +// CustomFilterQueryParser lets an embedder override parsing of +// {"custom_filter": ...} nodes. It is intended to be assigned once during +// process startup or init, before any queries are parsed; callers must not +// mutate it concurrently with ParseQuery(). 
For example: +// +// func init() { +// query.CustomFilterQueryParser = parseCustomFilterQuery +// } +var CustomFilterQueryParser func([]byte) (Query, error) + +func NewCustomFilterQueryWithFilter(query Query, filter searcher.CustomFilterFunc, fields []string, payload map[string]interface{}) *CustomFilterQuery { + return &CustomFilterQuery{ + Query: query, + Fields: fields, + filterFunc: filter, + payload: payload, + } +} + +func (q *CustomFilterQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { + if q == nil { + return nil, fmt.Errorf("custom filter query is nil") + } + if q.Query == nil { + return nil, fmt.Errorf("custom filter query must have a query") + } + if q.filterFunc == nil { + return nil, fmt.Errorf("custom filter query must have a filter callback") + } + + // Build the inner searcher first; custom filtering wraps its output. + childSearcher, err := q.Query.Searcher(ctx, i, m, options) + if err != nil { + return nil, err + } + + // Create a doc value reader for the requested fields (if any) so the + // searcher can populate d.Fields before invoking the callback. 
+ var dvReader index.DocValueReader + var fieldTypes map[string]string + if len(q.Fields) > 0 { + var err2 error + dvReader, err2 = i.DocValueReader(q.Fields) + if err2 != nil { + _ = childSearcher.Close() + return nil, err2 + } + fieldTypes = resolveFieldTypes(q.Fields, m) + } + + return searcher.NewCustomFilterSearcher(ctx, childSearcher, q.filterFunc, dvReader, i, fieldTypes), nil +} + +func (q *CustomFilterQuery) Validate() error { + if q == nil { + return fmt.Errorf("custom filter query is nil") + } + if q.Query == nil { + return fmt.Errorf("custom filter query must have a query") + } + if q.filterFunc == nil { + return fmt.Errorf("custom filter query must have a filter callback") + } + if vq, ok := q.Query.(ValidatableQuery); ok { + return vq.Validate() + } + return nil +} + +func (q *CustomFilterQuery) MarshalJSON() ([]byte, error) { + inner := make(map[string]interface{}, len(q.payload)+2) + for k, v := range q.payload { + inner[k] = v + } + inner["query"] = q.Query + if len(q.Fields) > 0 { + inner["fields"] = q.Fields + } + return json.Marshal(map[string]interface{}{ + "custom_filter": inner, + }) +} + +func (q *CustomFilterQuery) UnmarshalJSON(data []byte) error { + child, fields, payload, err := unmarshalCustomQueryPayload(data, "custom_filter") + if err != nil { + return err + } + q.Query = child + q.Fields = fields + q.payload = payload + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/custom_payload.go b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_payload.go new file mode 100644 index 0000000000..e220061b57 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_payload.go @@ -0,0 +1,96 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package query + +import ( + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/util" +) + +func unmarshalCustomQueryPayload(data []byte, key string) (Query, []string, map[string]interface{}, error) { + tmp := map[string]json.RawMessage{} + err := util.UnmarshalJSON(data, &tmp) + if err != nil { + return nil, nil, nil, err + } + + innerRaw, ok := tmp[key] + if !ok || innerRaw == nil { + return nil, nil, nil, nil + } + + var inner map[string]json.RawMessage + err = util.UnmarshalJSON(innerRaw, &inner) + if err != nil || inner == nil { + return nil, nil, nil, fmt.Errorf("%s query must be a JSON object", key) + } + + var child Query + if childQuery, ok := inner["query"]; ok && childQuery != nil { + child, err = ParseQuery(childQuery) + if err != nil { + return nil, nil, nil, err + } + } + + var fields []string + if rawFields, ok := inner["fields"]; ok && rawFields != nil { + if err := util.UnmarshalJSON(rawFields, &fields); err != nil { + return nil, nil, nil, fmt.Errorf("%s query has invalid %q: %w", + key, "fields", err) + } + } + + payload := make(map[string]interface{}, len(inner)) + for k, raw := range inner { + if k == "query" || k == "fields" { + continue + } + var v interface{} + if raw != nil { + err = util.UnmarshalJSON(raw, &v) + if err != nil { + return nil, nil, nil, fmt.Errorf("%s query has invalid %q payload: %w", + key, k, err) + } + } + payload[k] = v + } + + return child, fields, payload, nil +} + +// resolveFieldTypes looks up each field name in the index mapping and returns 
+// a map of field name → mapping type (e.g. "datetime", "number", "text"). +// This is used by the searcher layer to correctly decode doc value bytes. +func resolveFieldTypes(fields []string, m mapping.IndexMapping) map[string]string { + if m == nil || len(fields) == 0 { + return nil + } + types := make(map[string]string, len(fields)) + for _, f := range fields { + fm := m.FieldMappingForPath(f) + if fm.Type != "" { + types[f] = fm.Type + } + } + if len(types) == 0 { + return nil + } + return types +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/custom_score.go b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_score.go new file mode 100644 index 0000000000..e7f9d01edc --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/custom_score.go @@ -0,0 +1,130 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package query + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/v2/mapping" + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/search/searcher" + index "github.com/blevesearch/bleve_index_api" +) + +// CustomScoreQuery wraps a child query and re-scores its candidate matches via +// an embedder-provided per-hit callback. 
+type CustomScoreQuery struct { + Query Query `json:"query"` + Fields []string `json:"fields,omitempty"` + + scoreFunc searcher.CustomScoreFunc + payload map[string]interface{} +} + +// CustomScoreQueryParser lets an embedder override parsing of +// {"custom_score": ...} nodes. It is intended to be assigned once during +// process startup or init, before any queries are parsed; callers must not +// mutate it concurrently with ParseQuery(). For example: +// +// func init() { +// query.CustomScoreQueryParser = parseCustomScoreQuery +// } +var CustomScoreQueryParser func([]byte) (Query, error) + +func NewCustomScoreQueryWithScorer(query Query, score searcher.CustomScoreFunc, fields []string, payload map[string]interface{}) *CustomScoreQuery { + return &CustomScoreQuery{ + Query: query, + Fields: fields, + scoreFunc: score, + payload: payload, + } +} + +func (q *CustomScoreQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { + if q == nil { + return nil, fmt.Errorf("custom score query is nil") + } + if q.Query == nil { + return nil, fmt.Errorf("custom score query must have a query") + } + if q.scoreFunc == nil { + return nil, fmt.Errorf("custom score query must have a score callback") + } + + // Build the inner searcher first; custom scoring wraps its output. + childSearcher, err := q.Query.Searcher(ctx, i, m, options) + if err != nil { + return nil, err + } + + // Create a doc value reader for the requested fields (if any) so the + // searcher can populate d.Fields before invoking the callback. 
+ var dvReader index.DocValueReader + var fieldTypes map[string]string + if len(q.Fields) > 0 { + var err2 error + dvReader, err2 = i.DocValueReader(q.Fields) + if err2 != nil { + _ = childSearcher.Close() + return nil, err2 + } + fieldTypes = resolveFieldTypes(q.Fields, m) + } + + return searcher.NewCustomScoreSearcher(ctx, childSearcher, q.scoreFunc, dvReader, i, fieldTypes), nil +} + +func (q *CustomScoreQuery) Validate() error { + if q == nil { + return fmt.Errorf("custom score query is nil") + } + if q.Query == nil { + return fmt.Errorf("custom score query must have a query") + } + if q.scoreFunc == nil { + return fmt.Errorf("custom score query must have a score callback") + } + if vq, ok := q.Query.(ValidatableQuery); ok { + return vq.Validate() + } + return nil +} + +func (q *CustomScoreQuery) MarshalJSON() ([]byte, error) { + inner := make(map[string]interface{}, len(q.payload)+2) + for k, v := range q.payload { + inner[k] = v + } + inner["query"] = q.Query + if len(q.Fields) > 0 { + inner["fields"] = q.Fields + } + return json.Marshal(map[string]interface{}{ + "custom_score": inner, + }) +} + +func (q *CustomScoreQuery) UnmarshalJSON(data []byte) error { + child, fields, payload, err := unmarshalCustomQueryPayload(data, "custom_score") + if err != nil { + return err + } + q.Query = child + q.Fields = fields + q.payload = payload + return nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go index ea8780a417..5282a740f3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/knn.go @@ -84,6 +84,12 @@ func (q *KNNQuery) Searcher(ctx context.Context, i index.IndexReader, if q.K <= 0 || len(q.Vector) == 0 { return nil, fmt.Errorf("k must be greater than 0 and vector must be non-empty") } + // bivf-sq8 indexes only supports hamming distance for the primary + // binary index. 
Similarity here is used for the backing flat index, + // which is set to cosine similarity for recall reasons + if index.OptimizationRequiresBinaryIndex(fieldMapping.VectorIndexOptimizedFor) { + similarityMetric = index.CosineSimilarity + } if similarityMetric == index.CosineSimilarity { // normalize the vector q.Vector = mapping.NormalizeVector(q.Vector) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go index 27c3978b17..cf64189b29 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/query.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/query.go @@ -308,6 +308,20 @@ func ParseQuery(input []byte) (Query, error) { } return &rv, nil } + _, hasCustomFilter := tmp["custom_filter"] + if hasCustomFilter { + if CustomFilterQueryParser == nil { + return nil, fmt.Errorf("custom filter query parser is not registered") + } + return CustomFilterQueryParser(input) + } + _, hasCustomScore := tmp["custom_score"] + if hasCustomScore { + if CustomScoreQueryParser == nil { + return nil, fmt.Errorf("custom score query parser is not registered") + } + return CustomScoreQueryParser(input) + } _, hasDocIds := tmp["ids"] if hasDocIds { var rv DocIDQuery @@ -455,13 +469,10 @@ func DumpQuery(m mapping.IndexMapping, query Query) (string, error) { return string(data), err } -// FieldSet represents a set of queried fields. -type FieldSet map[string]struct{} - // ExtractFields returns a set of fields referenced by the query. // The returned set may be nil if the query does not explicitly reference any field // and the DefaultSearchField is unset in the index mapping. 
-func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, error) { +func ExtractFields(q Query, m mapping.IndexMapping, fs search.FieldSet) (search.FieldSet, error) { if q == nil || m == nil { return fs, nil } @@ -474,9 +485,9 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, erro } if f != "" { if fs == nil { - fs = make(FieldSet) + fs = search.NewFieldSet() } - fs[f] = struct{}{} + fs.AddField(f) } case *QueryStringQuery: var expandedQuery Query @@ -505,6 +516,11 @@ func ExtractFields(q Query, m mapping.IndexMapping, fs FieldSet) (FieldSet, erro break } } + case *DocIDQuery, *MatchAllQuery: + if fs == nil { + fs = search.NewFieldSet() + } + fs.AddField("_id") } return fs, err } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/query/query_string_lex.go b/vendor/github.com/blevesearch/bleve/v2/search/query/query_string_lex.go index c01fa6fc29..b7a1283393 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/query/query_string_lex.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/query/query_string_lex.go @@ -18,9 +18,40 @@ import ( "bufio" "io" "strings" + "sync" "unicode" ) +var queryStringLexPool = sync.Pool{ + New: func() interface{} { + return &queryStringLex{ + in: bufio.NewReader(strings.NewReader("")), + } + }, +} + +func getQueryStringLex(in io.Reader) *queryStringLex { + l := queryStringLexPool.Get().(*queryStringLex) + l.in.Reset(in) + l.currState = startState + l.currConsumed = true + l.buf = "" + l.inEscape = false + l.nextToken = nil + l.nextTokenType = 0 + l.seenDot = false + l.nextRune = 0 + l.nextRuneSize = 0 + l.atEOF = false + return l +} + +func putQueryStringLex(l *queryStringLex) { + l.in.Reset(strings.NewReader("")) + l.nextToken = nil + queryStringLexPool.Put(l) +} + const reservedChars = "+-=&|> 0 { - return nil, fmt.Errorf(strings.Join(lex.errs, "\n")) + return nil, fmt.Errorf("%s", strings.Join(lex.errs, "\n")) } return lex.query, nil } diff --git 
a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go index 8d90434271..06f50cd4a7 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_knn.go @@ -123,7 +123,7 @@ func (sqs *KNNQueryScorer) Score(ctx *search.SearchContext, if sqs.options.Explain { rv.Expl = scoreExplanation } - rv.IndexInternalID = append(rv.IndexInternalID, knnMatch.ID...) + rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, knnMatch.ID) return rv } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go index f5f8ec9356..d7e77f9779 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/scorer/scorer_term.go @@ -243,7 +243,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term } } - rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) + rv.IndexInternalID = index.NewIndexInternalIDFrom(rv.IndexInternalID, termMatch.ID) if len(termMatch.Vectors) > 0 { if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/search.go b/vendor/github.com/blevesearch/bleve/v2/search/search.go index 7240257877..541bbe42a1 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/search.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/search.go @@ -165,9 +165,9 @@ type DocumentMatch struct { // used to indicate the sub-scores that combined to form the // final score for this document match. This is only populated - // when the search request's query is a DisjunctionQuery - // or a ConjunctionQuery. The map key is the index of the sub-query - // in the DisjunctionQuery or ConjunctionQuery. 
The map value is the + // when the search request's query is a DisjunctionQuery. + // The map key is the index of the sub-query + // in the DisjunctionQuery. The map value is the // sub-score for that sub-query. ScoreBreakdown map[int]float64 `json:"score_breakdown,omitempty"` @@ -178,6 +178,10 @@ type DocumentMatch struct { // of the index that this match came from // of the current alias view, used in alias of aliases scenario IndexNames []string `json:"index_names,omitempty"` + + // Descendants holds the IDs of any child/descendant document that contributed + // to this root DocumentMatch. + Descendants []index.IndexInternalID `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -201,6 +205,21 @@ func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { dm.Fields[name] = valSlice } +func (dm *DocumentMatch) AddFragments(field string, fragments []string) { + if dm.Fragments == nil { + dm.Fragments = make(FieldFragmentMap) + } +OUTER: + for _, newFrag := range fragments { + for _, existingFrag := range dm.Fragments[field] { + if existingFrag == newFrag { + continue OUTER // no duplicates allowed + } + } + dm.Fragments[field] = append(dm.Fragments[field], newFrag) + } +} + // Reset allows an already allocated DocumentMatch to be reused func (dm *DocumentMatch) Reset() *DocumentMatch { // remember the []byte used for the IndexInternalID @@ -218,6 +237,11 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { scoreBreakdown := dm.ScoreBreakdown // clear out the score breakdown map clear(scoreBreakdown) + // remember the Descendants backing array + descendants := dm.Descendants + for i := range descendants { // recycle each IndexInternalID + descendants[i] = descendants[i][:0] + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) @@ -228,6 +252,8 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { dm.DecodedSort = 
decodedSort[:0] // reuse the FieldTermLocations already allocated (and reset len to 0) dm.FieldTermLocations = ftls[:0] + // reuse the Descendants already allocated (and reset len to 0) + dm.Descendants = descendants[:0] // reuse the score breakdown map already allocated (after clearing it) dm.ScoreBreakdown = scoreBreakdown return dm @@ -402,3 +428,20 @@ func (sc *SearchContext) Size() int { return sizeInBytes } + +// A NestedDocumentMatch is like a DocumentMatch but used for nested documents +// and does not have score or locations, or a score and is mainly used to +// hold field values and fragments, to be embedded in the parent DocumentMatch +type NestedDocumentMatch struct { + Fields map[string]interface{} `json:"fields,omitempty"` + Fragments FieldFragmentMap `json:"fragments,omitempty"` +} + +// NewNestedDocumentMatch creates a new NestedDocumentMatch instance +// with the given fields and fragments +func NewNestedDocumentMatch(fields map[string]interface{}, fragments FieldFragmentMap) *NestedDocumentMatch { + return &NestedDocumentMatch{ + Fields: fields, + Fragments: fragments, + } +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction_nested.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction_nested.go new file mode 100644 index 0000000000..9ab18dabe9 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_conjunction_nested.go @@ -0,0 +1,480 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "context" + "fmt" + "math" + "reflect" + "slices" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeNestedConjunctionSearcher int + +func init() { + var ncs NestedConjunctionSearcher + reflectStaticSizeNestedConjunctionSearcher = int(reflect.TypeOf(ncs).Size()) +} + +type NestedConjunctionSearcher struct { + nestedReader index.NestedReader + searchers []search.Searcher + queryNorm float64 + currs []*search.DocumentMatch + currAncestors [][]index.AncestorID + currKeys []index.AncestorID + initialized bool + joinIdx int + options search.SearcherOptions + docQueue *CoalesceQueue + // reusable ID buffer for Advance() calls + advanceID index.IndexInternalID + // reusable buffer for Advance() calls + ancestors []index.AncestorID +} + +func NewNestedConjunctionSearcher(ctx context.Context, indexReader index.IndexReader, + searchers []search.Searcher, joinIdx int, options search.SearcherOptions) (search.Searcher, error) { + + var nr index.NestedReader + var ok bool + if nr, ok = indexReader.(index.NestedReader); !ok { + return nil, fmt.Errorf("indexReader does not support nested documents") + } + + // build our searcher + rv := NestedConjunctionSearcher{ + nestedReader: nr, + options: options, + searchers: searchers, + currs: make([]*search.DocumentMatch, len(searchers)), + currAncestors: make([][]index.AncestorID, len(searchers)), + currKeys: make([]index.AncestorID, len(searchers)), + joinIdx: joinIdx, + docQueue: NewCoalesceQueue(), + } + rv.computeQueryNorm() + + return &rv, nil +} + +func (s *NestedConjunctionSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query 
norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *NestedConjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeNestedConjunctionSearcher + size.SizeOfPtr + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + +func (s *NestedConjunctionSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *NestedConjunctionSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *NestedConjunctionSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *NestedConjunctionSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *NestedConjunctionSearcher) Min() int { + return 0 +} + +func (s *NestedConjunctionSearcher) DocumentMatchPoolSize() int { + rv := len(s.currs) + for _, s := range s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +func (s *NestedConjunctionSearcher) initialize(ctx *search.SearchContext) (bool, error) { + var err error + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return false, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return true, nil + } + // get the ancestry chain for this match + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, 
s.currAncestors[i][:0]) + if err != nil { + return false, err + } + // check if the ancestry chain is > joinIdx, if not we reset the joinIdx + // to the minimum possible value across all searchers, ideally this will be + // done in query construction time itself, by using the covering depth across + // all sub-queries, but we do this here as a fallback + if s.joinIdx >= len(s.currAncestors[i]) { + s.joinIdx = len(s.currAncestors[i]) - 1 + } + } + // build currKeys for each searcher, do it here as we may have adjusted joinIdx + for i := range s.searchers { + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + } + s.initialized = true + return false, nil +} + +func (s *NestedConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + // initialize on first call to Next, by getting first match + // from each searcher and their ancestry chains + if !s.initialized { + done, err := s.initialize(ctx) + if err != nil { + return nil, err + } + if done { + return nil, nil + } + } + // check if the docQueue has any buffered matches + if s.docQueue.Len() > 0 { + return s.docQueue.Dequeue(ctx), nil + } + // now enter the main alignment loop + n := len(s.searchers) +OUTER: + for { + // pick the pivot searcher with the highest key (ancestor at joinIdx level) + if s.currs[0] == nil { + return nil, nil + } + maxKey := s.currKeys[0] + for i := 1; i < n; i++ { + // currs[i] is nil means one of the searchers is exhausted + if s.currs[i] == nil { + return nil, nil + } + currKey := s.currKeys[i] + if maxKey.Compare(currKey) < 0 { + maxKey = currKey + } + } + // store maxkey as advanceID only once only if needed + var advanceID index.IndexInternalID + // flag to track if all searchers are aligned + var aligned bool = true + // now try to align all other searchers to the + // we check if the a searchers key matches maxKey + // if not, we advance the pivot searcher to maxKey + // else do nothing and move to the next searcher + for i := 0; i < n; 
i++ { + cmp := s.currKeys[i].Compare(maxKey) + if cmp < 0 { + // not aligned, so advance this searcher to maxKey + // convert maxKey to advanceID only once + if advanceID == nil { + advanceID = s.toAdvanceID(maxKey) + } + var err error + ctx.DocumentMatchPool.Put(s.currs[i]) + s.currs[i], err = s.searchers[i].Advance(ctx, advanceID) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + // recalc cmp + cmp = s.currKeys[i].Compare(maxKey) + } + if cmp != 0 { + // not aligned + aligned = false + } + } + // now check if all the searchers are aligned at the same maxKey + // if they are not aligned, we need to restart the loop of picking + // the pivot searcher with the highest key + if !aligned { + continue OUTER + } + // if we are here, all the searchers are aligned at maxKey + // now we need to buffer all the intermediate matches for every + // searcher at this key, until either the searcher's key changes + // or the searcher is exhausted + var err error + for i := 0; i < n; i++ { + for { + // buffer the current match + s.docQueue.Enqueue(s.currs[i]) + // advance to next match + s.currs[i], err = s.searchers[i].Next(ctx) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // searcher exhausted, break out + break + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + // check if key has changed + if !s.currKeys[i].Equals(maxKey) { + // key changed, break out + break + } + } + } + // finalize the docQueue for dequeueing + 
s.docQueue.Finalize() + // finally return the first buffered match + return s.docQueue.Dequeue(ctx), nil + } +} + +// ancestorFromRoot gets the AncestorID at the given position from the root +// if pos is 0, it returns the root AncestorID, and so on +func ancestorFromRoot(ancestors []index.AncestorID, pos int) index.AncestorID { + return ancestors[len(ancestors)-pos-1] +} + +// toAdvanceID converts an AncestorID to IndexInternalID, reusing the advanceID buffer. +// The returned ID is safe to pass to Advance() since Advance() never retains references. +func (s *NestedConjunctionSearcher) toAdvanceID(key index.AncestorID) index.IndexInternalID { + // Reset length to 0 while preserving capacity for buffer reuse + s.advanceID = s.advanceID[:0] + // Convert key to IndexInternalID, reusing the underlying buffer + s.advanceID = key.ToIndexInternalID(s.advanceID) + return s.advanceID +} + +func (s *NestedConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + done, err := s.initialize(ctx) + if err != nil { + return nil, err + } + if done { + return nil, nil + } + } + // first check if the docQueue has any buffered matches + // if so we first check if any of them can satisfy the Advance(ID) + for s.docQueue.Len() > 0 { + dm := s.docQueue.Dequeue(ctx) + if dm.IndexInternalID.Compare(ID) >= 0 { + return dm, nil + } + // otherwise recycle this match + ctx.DocumentMatchPool.Put(dm) + } + var err error + // now we first get the ancestry chain for the given ID + s.ancestors, err = s.nestedReader.Ancestors(ID, s.ancestors[:0]) + if err != nil { + return nil, err + } + // we now follow the the following logic for each searcher: + // let S be the length of the ancestry chain for the searcher + // let I be the length of the ancestry chain for the given ID + // 1. if S > I: + // then we just Advance() the searcher to the given ID if required + // 2. 
else if S <= I: + // then we get the AncestorID at position (S - 1) from the root of + // the given ID's ancestry chain, and Advance() the searcher to + // it if required + for i, searcher := range s.searchers { + if s.currs[i] == nil { + return nil, nil // already exhausted, nothing to do + } + var targetID index.IndexInternalID + S := len(s.currAncestors[i]) + I := len(s.ancestors) + if S > I { + // case 1: S > I + targetID = ID + } else { + // case 2: S <= I + targetID = s.toAdvanceID(ancestorFromRoot(s.ancestors, S-1)) + } + if s.currs[i].IndexInternalID.Compare(targetID) < 0 { + // need to advance this searcher + ctx.DocumentMatchPool.Put(s.currs[i]) + s.currs[i], err = searcher.Advance(ctx, targetID) + if err != nil { + return nil, err + } + if s.currs[i] == nil { + // one of the searchers is exhausted, so we are done + return nil, nil + } + // recalc ancestors + s.currAncestors[i], err = s.nestedReader.Ancestors(s.currs[i].IndexInternalID, s.currAncestors[i][:0]) + if err != nil { + return nil, err + } + // recalc key + s.currKeys[i] = ancestorFromRoot(s.currAncestors[i], s.joinIdx) + } + } + // we need to call Next() in a loop until we reach or exceed the given ID + // the Next() call basically gives us a match that is aligned correctly, but + // if joinIdx < I, we can have multiple matches for the same joinIdx ancestor + // and they may be < ID, so we need to loop + for { + next, err := s.Next(ctx) + if err != nil { + return nil, err + } + if next == nil { + return nil, nil + } + if next.IndexInternalID.Compare(ID) >= 0 { + return next, nil + } + ctx.DocumentMatchPool.Put(next) + } +} + +// ------------------------------------------------------------------------------------------ +type CoalesceQueue struct { + order []*search.DocumentMatch // queue of DocumentMatch +} + +func NewCoalesceQueue() *CoalesceQueue { + cq := &CoalesceQueue{ + order: make([]*search.DocumentMatch, 0), + } + return cq +} + +// Enqueue appends the given DocumentMatch to the queue. 
Coalescing of duplicates +// is deferred until Dequeue, after Finalize has sorted items by IndexInternalID. +func (cq *CoalesceQueue) Enqueue(it *search.DocumentMatch) { + // append to order slice (this is a stack) + cq.order = append(cq.order, it) +} + +// Finalize prepares the queue for dequeue operations by sorting the items based on +// their IndexInternalID values. This MUST be called before any Dequeue operations, +// and after all Enqueue operations are complete. The sort is done in descending order +// so that dequeueing will basically be popping from the end of the slice, allowing for +// slice reuse. +func (cq *CoalesceQueue) Finalize() { + slices.SortFunc(cq.order, func(a, b *search.DocumentMatch) int { + return b.IndexInternalID.Compare(a.IndexInternalID) + }) +} + +// Dequeue removes and returns the next DocumentMatch in sorted order, merging any +// consecutive duplicates. Merged items are recycled via ctx.DocumentMatchPool. +// Returns nil when the queue is empty. +func (cq *CoalesceQueue) Dequeue(ctx *search.SearchContext) *search.DocumentMatch { + if cq.Len() == 0 { + return nil + } + + // pop from end of slice + rv := cq.order[len(cq.order)-1] + cq.order = cq.order[:len(cq.order)-1] + + // merge duplicates + for cq.Len() > 0 { + // peek at next item + next := cq.order[len(cq.order)-1] + if !rv.IndexInternalID.Equals(next.IndexInternalID) { + // different ID, stop merging + break + } + // pop the next item + cq.order = cq.order[:len(cq.order)-1] + // same ID, merge + rv.Score += next.Score + rv.Expl = rv.Expl.MergeWith(next.Expl) + rv.FieldTermLocations = search.MergeFieldTermLocationsFromMatch( + rv.FieldTermLocations, next) + // recycle the merged item + ctx.DocumentMatchPool.Put(next) + } + + return rv +} + +// Len returns the number of DocumentMatch items currently in the queue. 
+func (cq *CoalesceQueue) Len() int { + return len(cq.order) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_fields.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_fields.go new file mode 100644 index 0000000000..5b4ff93dd6 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_fields.go @@ -0,0 +1,111 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "time" + + "github.com/blevesearch/bleve/v2/numeric" + "github.com/blevesearch/bleve/v2/search" + index "github.com/blevesearch/bleve_index_api" +) + +// loadDocValuesOnHit uses the supplied DocValueReader to visit doc values +// for the given hit and populate hit.Fields. It also resolves hit.ID if empty. +// It is a no-op when dvReader is nil. +// +// fieldTypes maps field name → mapping type (e.g. "datetime", "number"). +// When provided, datetime fields are decoded from their stored nanosecond +// int64 into an RFC3339Nano string, while numeric fields use IEEE 754 bit +// reinterpretation to recover the original float64. When nil, all prefix-coded +// values use the numeric (float64) path. 
+func loadDocValuesOnHit(hit *search.DocumentMatch, dvReader index.DocValueReader, + r index.IndexReader) error { + return loadDocValuesOnHitWithTypes(hit, dvReader, r, nil) +} + +func loadDocValuesOnHitWithTypes(hit *search.DocumentMatch, dvReader index.DocValueReader, + r index.IndexReader, fieldTypes map[string]string) error { + // Always resolve external ID so the callback can read hit.ID. + if hit.ID == "" && r != nil { + extID, err := r.ExternalID(hit.IndexInternalID) + if err != nil { + return err + } + hit.ID = extID + } + + if dvReader == nil { + return nil + } + + err := dvReader.VisitDocValues(hit.IndexInternalID, func(field string, term []byte) { + value := decodeDocValueTerm(term, fieldTypes[field]) + if value != nil { + hit.AddFieldValue(field, value) + } + }) + + return err +} + +// decodeDocValueTerm converts raw doc value bytes into a typed Go value. +// Numeric fields are prefix-coded int64s (only shift-0 terms carry values). +// Boolean fields are stored as "T" or "F". +// Everything else (text/keyword) is returned as a string. +// +// fieldType is the mapping type string for the field (e.g. "datetime", +// "number"). When fieldType is "datetime", the prefix-coded int64 is +// treated as raw nanoseconds (time.UnixNano()) and converted to a UTC +// RFC3339Nano-formatted string. For numeric fields the int64 is decoded via +// Int64ToFloat64 (IEEE 754 bit reinterpretation). +func decodeDocValueTerm(term []byte, fieldType string) interface{} { + if len(term) == 0 { + return nil + } + + // Check if it's a prefix-coded numeric term. + if valid, shift := numeric.ValidPrefixCodedTermBytes(term); valid { + // Only shift-0 terms carry the actual value. + if shift != 0 { + return nil + } + i64, err := numeric.PrefixCoded(term).Int64() + if err != nil { + return nil + } + if fieldType == "datetime" { + // Datetime doc values store time.UnixNano() directly as int64. 
+ // Convert back to a formatted string so callbacks (including + // JS UDFs) receive a human-readable date like "2022-03-10T00:00:00Z". + return time.Unix(0, i64).UTC().Format(time.RFC3339Nano) + } + // Numeric float64 fields use Float64ToInt64 bit manipulation encoding. + return numeric.Int64ToFloat64(i64) + } + + // Boolean fields are stored as "T" or "F". + if len(term) == 1 { + if term[0] == 'T' { + return true + } + if term[0] == 'F' { + return false + } + } + + // Default: text/keyword — return as string. + return string(term) +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_filter.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_filter.go new file mode 100644 index 0000000000..dcf1f7c056 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_filter.go @@ -0,0 +1,121 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "context" + "reflect" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeCustomFilterSearcher int + +func init() { + var cfs CustomFilterSearcher + reflectStaticSizeCustomFilterSearcher = int(reflect.TypeOf(cfs).Size()) +} + +// CustomFilterFunc decides whether a hit (with doc-value fields populated) +// should be kept. 
Unlike FilterFunc it does not receive a SearchContext since +// custom-query callbacks only need the DocumentMatch. +type CustomFilterFunc func(d *search.DocumentMatch) bool + +// CustomFilterSearcher wraps a child searcher, optionally loads doc values +// into each DocumentMatch, then applies a CustomFilterFunc to decide whether +// to keep the hit. Unlike FilteringSearcher this variant is purpose-built for +// custom queries that need field values at callback time. +type CustomFilterSearcher struct { + child search.Searcher + accept CustomFilterFunc + dvReader index.DocValueReader + indexReader index.IndexReader + fieldTypes map[string]string +} + +func NewCustomFilterSearcher(ctx context.Context, child search.Searcher, + filter CustomFilterFunc, dvReader index.DocValueReader, + indexReader index.IndexReader, + fieldTypes map[string]string) *CustomFilterSearcher { + return &CustomFilterSearcher{ + child: child, + accept: filter, + dvReader: dvReader, + indexReader: indexReader, + fieldTypes: fieldTypes, + } +} + +func (f *CustomFilterSearcher) Size() int { + return reflectStaticSizeCustomFilterSearcher + size.SizeOfPtr + + f.child.Size() +} + +func (f *CustomFilterSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + next, err := f.child.Next(ctx) + for next != nil && err == nil { + if err = loadDocValuesOnHitWithTypes(next, f.dvReader, f.indexReader, f.fieldTypes); err != nil { + return nil, err + } + if f.accept(next) { + return next, nil + } + next, err = f.child.Next(ctx) + } + return nil, err +} + +func (f *CustomFilterSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + adv, err := f.child.Advance(ctx, ID) + if err != nil { + return nil, err + } + if adv == nil { + return nil, nil + } + if err = loadDocValuesOnHitWithTypes(adv, f.dvReader, f.indexReader, f.fieldTypes); err != nil { + return nil, err + } + if f.accept(adv) { + return adv, nil + } + return f.Next(ctx) +} + +func (f 
*CustomFilterSearcher) Close() error { + return f.child.Close() +} + +func (f *CustomFilterSearcher) Weight() float64 { + return f.child.Weight() +} + +func (f *CustomFilterSearcher) SetQueryNorm(n float64) { + f.child.SetQueryNorm(n) +} + +func (f *CustomFilterSearcher) Count() uint64 { + return f.child.Count() +} + +func (f *CustomFilterSearcher) Min() int { + return f.child.Min() +} + +func (f *CustomFilterSearcher) DocumentMatchPoolSize() int { + return f.child.DocumentMatchPoolSize() +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_score.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_score.go new file mode 100644 index 0000000000..15eb121266 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_custom_score.go @@ -0,0 +1,114 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "context" + "reflect" + + "github.com/blevesearch/bleve/v2/search" + "github.com/blevesearch/bleve/v2/size" + index "github.com/blevesearch/bleve_index_api" +) + +var reflectStaticSizeCustomScoreSearcher int + +func init() { + var sfs CustomScoreSearcher + reflectStaticSizeCustomScoreSearcher = int(reflect.TypeOf(sfs).Size()) +} + +// CustomScoreFunc defines a function which can mutate document scores. 
+type CustomScoreFunc func(d *search.DocumentMatch) float64 + +// CustomScoreSearcher wraps any other searcher, optionally loads doc values +// into each DocumentMatch, then mutates the score using the supplied +// CustomScoreFunc. +type CustomScoreSearcher struct { + child search.Searcher + mutate CustomScoreFunc + dvReader index.DocValueReader + indexReader index.IndexReader + fieldTypes map[string]string +} + +func NewCustomScoreSearcher(ctx context.Context, s search.Searcher, mutate CustomScoreFunc, + dvReader index.DocValueReader, indexReader index.IndexReader, + fieldTypes map[string]string) *CustomScoreSearcher { + return &CustomScoreSearcher{ + child: s, + mutate: mutate, + dvReader: dvReader, + indexReader: indexReader, + fieldTypes: fieldTypes, + } +} + +func (f *CustomScoreSearcher) Size() int { + return reflectStaticSizeCustomScoreSearcher + size.SizeOfPtr + + f.child.Size() +} + +func (f *CustomScoreSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + next, err := f.child.Next(ctx) + if err != nil { + return nil, err + } + if next != nil { + if err = loadDocValuesOnHitWithTypes(next, f.dvReader, f.indexReader, f.fieldTypes); err != nil { + return nil, err + } + next.Score = f.mutate(next) + } + return next, nil +} + +func (f *CustomScoreSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + adv, err := f.child.Advance(ctx, ID) + if err != nil { + return nil, err + } + if adv != nil { + if err = loadDocValuesOnHitWithTypes(adv, f.dvReader, f.indexReader, f.fieldTypes); err != nil { + return nil, err + } + adv.Score = f.mutate(adv) + } + return adv, nil +} + +func (f *CustomScoreSearcher) Close() error { + return f.child.Close() +} + +func (f *CustomScoreSearcher) Weight() float64 { + return f.child.Weight() +} + +func (f *CustomScoreSearcher) SetQueryNorm(n float64) { + f.child.SetQueryNorm(n) +} + +func (f *CustomScoreSearcher) Count() uint64 { + return f.child.Count() +} + 
+func (f *CustomScoreSearcher) Min() int { + return f.child.Min() +} + +func (f *CustomScoreSearcher) DocumentMatchPoolSize() int { + return f.child.DocumentMatchPoolSize() +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go index 3da876bd35..4c68e5691d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_disjunction_heap.go @@ -15,7 +15,6 @@ package searcher import ( - "bytes" "container/heap" "context" "math" @@ -169,7 +168,7 @@ func (s *DisjunctionHeapSearcher) updateMatches() error { matchingIdxs = append(matchingIdxs, next.matchingIdx) // now as long as top of heap matches, keep popping - for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { + for len(s.heap) > 0 && next.curr.IndexInternalID.Equals(s.heap[0].curr.IndexInternalID) { next = heap.Pop(s).(*SearcherCurr) matching = append(matching, next.curr) matchingCurrs = append(matchingCurrs, next) @@ -264,7 +263,7 @@ func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, // find all searchers that actually need to be advanced // advance them, using s.matchingCurrs as temp storage - for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { + for len(s.heap) > 0 && s.heap[0].curr.IndexInternalID.Compare(ID) < 0 { searcherCurr := heap.Pop(s).(*SearcherCurr) ctx.DocumentMatchPool.Put(searcherCurr.curr) curr, err := searcherCurr.searcher.Advance(ctx, ID) @@ -347,7 +346,7 @@ func (s *DisjunctionHeapSearcher) Less(i, j int) bool { } else if s.heap[j].curr == nil { return false } - return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 + return s.heap[i].curr.IndexInternalID.Compare(s.heap[j].curr.IndexInternalID) < 0 } func (s *DisjunctionHeapSearcher) Swap(i, j 
int) { diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_filter.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_filter.go index 97d706b5f9..ef070c73f4 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_filter.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_filter.go @@ -60,6 +60,9 @@ func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMat if f.accept(ctx, next) { return next, nil } + // recycle this document match now, since + // we do not need it anymore + ctx.DocumentMatchPool.Put(next) next, err = f.child.Next(ctx) } return nil, err @@ -76,6 +79,9 @@ func (f *FilteringSearcher) Advance(ctx *search.SearchContext, ID index.IndexInt if f.accept(ctx, adv) { return adv, nil } + // recycle this document match now, since + // we do not need it anymore + ctx.DocumentMatchPool.Put(adv) return f.Next(ctx) } diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go index c2551a8718..a146fa654d 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoboundingbox.go @@ -53,7 +53,7 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade } return NewFilteringSearcher(ctx, boxSearcher, buildRectFilter(ctx, dvReader, - field, minLon, minLat, maxLon, maxLat)), nil + minLon, minLat, maxLon, maxLat)), nil } } @@ -88,7 +88,7 @@ func NewGeoBoundingBoxSearcher(ctx context.Context, indexReader index.IndexReade } // add filter to check points near the boundary onBoundarySearcher = NewFilteringSearcher(ctx, rawOnBoundarySearcher, - buildRectFilter(ctx, dvReader, field, minLon, minLat, maxLon, maxLat)) + buildRectFilter(ctx, dvReader, minLon, minLat, maxLon, maxLat)) openedSearchers = append(openedSearchers, onBoundarySearcher) 
} @@ -205,28 +205,35 @@ func buildIsIndexedFunc(ctx context.Context, indexReader index.IndexReader, fiel return isIndexed, closeF, err } -func buildRectFilter(ctx context.Context, dvReader index.DocValueReader, field string, +func buildRectFilter(ctx context.Context, dvReader index.DocValueReader, minLon, minLat, maxLon, maxLat float64, ) FilterFunc { + // reuse the following for each document match that is checked using the filter + var lons, lats []float64 + var found bool + dvVisitor := func(_ string, term []byte) { + if found { + // avoid redundant work if already found + return + } + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + var i64 int64 + i64, err = prefixCoded.Int64() + if err == nil { + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) + found = true + } + } + } return func(sctx *search.SearchContext, d *search.DocumentMatch) bool { // check geo matches against all numeric type terms indexed - var lons, lats []float64 - var found bool - err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - var i64 int64 - i64, err = prefixCoded.Int64() - if err == nil { - lons = append(lons, geo.MortonUnhashLon(uint64(i64))) - lats = append(lats, geo.MortonUnhashLat(uint64(i64))) - found = true - } - } - }) - if err == nil && found { + lons, lats = lons[:0], lats[:0] + found = false + if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found { bytes := dvReader.BytesRead() if bytes > 0 { reportIOStats(ctx, bytes) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopointdistance.go 
b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopointdistance.go index 357ac4de35..7591bcc608 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopointdistance.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopointdistance.go @@ -66,7 +66,7 @@ func NewGeoPointDistanceSearcher(ctx context.Context, indexReader index.IndexRea // wrap it in a filtering searcher which checks the actual distance return NewFilteringSearcher(ctx, rectSearcher, - buildDistFilter(ctx, dvReader, field, centerLon, centerLat, dist)), nil + buildDistFilter(ctx, dvReader, centerLon, centerLat, dist)), nil } // boxSearcher builds a searcher for the described bounding box @@ -113,27 +113,33 @@ func boxSearcher(ctx context.Context, indexReader index.IndexReader, return boxSearcher, nil } -func buildDistFilter(ctx context.Context, dvReader index.DocValueReader, field string, +func buildDistFilter(ctx context.Context, dvReader index.DocValueReader, centerLon, centerLat, maxDist float64) FilterFunc { + // reuse the following for each document match that is checked using the filter + var lons, lats []float64 + var found bool + dvVisitor := func(_ string, term []byte) { + if found { + // avoid redundant work if already found + return + } + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + i64, err := prefixCoded.Int64() + if err == nil { + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) + found = true + } + } + } return func(sctx *search.SearchContext, d *search.DocumentMatch) bool { // check geo matches against all numeric type terms indexed - var lons, lats []float64 - var found bool - - err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - 
shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - i64, err := prefixCoded.Int64() - if err == nil { - lons = append(lons, geo.MortonUnhashLon(uint64(i64))) - lats = append(lats, geo.MortonUnhashLat(uint64(i64))) - found = true - } - } - }) - if err == nil && found { + lons, lats = lons[:0], lats[:0] + found = false + if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found { bytes := dvReader.BytesRead() if bytes > 0 { reportIOStats(ctx, bytes) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopolygon.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopolygon.go index dc04bb66a0..fb6e09be49 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopolygon.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geopolygon.go @@ -85,28 +85,37 @@ func almostEqual(a, b float64) bool { // here: https://wrf.ecse.rpi.edu/nikola/pubdetails/pnpoly.html func buildPolygonFilter(ctx context.Context, dvReader index.DocValueReader, field string, coordinates []geo.Point) FilterFunc { + // reuse the following for each document match that is checked using the filter + var lons, lats []float64 + var found bool + dvVisitor := func(_ string, term []byte) { + if found { + // avoid redundant work if already found + return + } + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + i64, err := prefixCoded.Int64() + if err == nil { + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) + found = true + } + } + } + rayIntersectsSegment := func(point, a, b geo.Point) bool { + return (a.Lat > point.Lat) != (b.Lat > point.Lat) && + point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon + } return func(sctx *search.SearchContext, d *search.DocumentMatch) bool { // check geo matches against all 
numeric type terms indexed - var lons, lats []float64 - var found bool - - err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - i64, err := prefixCoded.Int64() - if err == nil { - lons = append(lons, geo.MortonUnhashLon(uint64(i64))) - lats = append(lats, geo.MortonUnhashLat(uint64(i64))) - found = true - } - } - }) - + lons, lats = lons[:0], lats[:0] + found = false // Note: this approach works for points which are strictly inside // the polygon. ie it might fail for certain points on the polygon boundaries. - if err == nil && found { + if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found { bytes := dvReader.BytesRead() if bytes > 0 { reportIOStats(ctx, bytes) @@ -116,10 +125,6 @@ func buildPolygonFilter(ctx context.Context, dvReader index.DocValueReader, fiel if len(coordinates) < 3 { return false } - rayIntersectsSegment := func(point, a, b geo.Point) bool { - return (a.Lat > point.Lat) != (b.Lat > point.Lat) && - point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon - } for i := range lons { pt := geo.Point{Lon: lons[i], Lat: lats[i]} diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go index 703693d781..cd020fafc4 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_geoshape.go @@ -58,18 +58,13 @@ func NewGeoShapeSearcher(ctx context.Context, indexReader index.IndexReader, sha return NewFilteringSearcher(ctx, mSearcher, buildRelationFilterOnShapes(ctx, dvReader, field, relation, shape)), nil } -// Using the same term splitter slice used in the doc values in zap. 
-// TODO: This needs to be revisited whenever we change the zap -// implementation of doc values. -var termSeparatorSplitSlice = []byte{0xff} - func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueReader, field string, relation string, shape index.GeoJSON, ) FilterFunc { // this is for accumulating the shape's actual complete value // spread across multiple docvalue visitor callbacks. var dvShapeValue []byte - var startReading, finishReading bool + var startReading, finishReading, found bool var reader *bytes.Reader var bufPool *s2.GeoBufferPool @@ -77,51 +72,58 @@ func buildRelationFilterOnShapes(ctx context.Context, dvReader index.DocValueRea bufPool = bufPoolCallback() } - return func(sctx *search.SearchContext, d *search.DocumentMatch) bool { - var found bool - - err := dvReader.VisitDocValues(d.IndexInternalID, - func(field string, term []byte) { - // only consider the values which are GlueBytes prefixed or - // if it had already started reading the shape bytes from previous callbacks. - if startReading || len(term) > geo.GlueBytesOffset { - - if !startReading && bytes.Equal(geo.GlueBytes, term[:geo.GlueBytesOffset]) { - startReading = true - - if bytes.Equal(geo.GlueBytes, term[len(term)-geo.GlueBytesOffset:]) { - term = term[:len(term)-geo.GlueBytesOffset] - finishReading = true - } - - dvShapeValue = append(dvShapeValue, term[geo.GlueBytesOffset:]...) - - } else if startReading && !finishReading { - if len(term) > geo.GlueBytesOffset && - bytes.Equal(geo.GlueBytes, term[len(term)-geo.GlueBytesOffset:]) { - term = term[:len(term)-geo.GlueBytesOffset] - finishReading = true - } - - term = append(termSeparatorSplitSlice, term...) - dvShapeValue = append(dvShapeValue, term...) - } - - // apply the filter once the entire docvalue is finished reading. 
- if finishReading { - v, err := geojson.FilterGeoShapesOnRelation(shape, dvShapeValue, relation, &reader, bufPool) - if err == nil && v { - found = true - } - - dvShapeValue = dvShapeValue[:0] - startReading = false - finishReading = false - } + dvVisitor := func(_ string, term []byte) { + if found { + // avoid redundant work if already found + return + } + tl := len(term) + // only consider the values which are GlueBytes prefixed or + // if it had already started reading the shape bytes from previous callbacks. + if startReading || tl > geo.GlueBytesOffset { + + if !startReading && bytes.Equal(geo.GlueBytes, term[:geo.GlueBytesOffset]) { + startReading = true + + if bytes.Equal(geo.GlueBytes, term[tl-geo.GlueBytesOffset:]) { + term = term[:tl-geo.GlueBytesOffset] + finishReading = true } - }) - if err == nil && found { + dvShapeValue = append(dvShapeValue, term[geo.GlueBytesOffset:]...) + + } else if startReading && !finishReading { + if tl > geo.GlueBytesOffset && + bytes.Equal(geo.GlueBytes, term[tl-geo.GlueBytesOffset:]) { + term = term[:tl-geo.GlueBytesOffset] + finishReading = true + } + + dvShapeValue = append(dvShapeValue, index.DocValueTermSeparator) + dvShapeValue = append(dvShapeValue, term...) + } + + // apply the filter once the entire docvalue is finished reading. 
+ if finishReading { + v, err := geojson.FilterGeoShapesOnRelation(shape, dvShapeValue, relation, &reader, bufPool) + if err == nil && v { + found = true + } + + dvShapeValue = dvShapeValue[:0] + startReading = false + finishReading = false + } + } + } + + return func(sctx *search.SearchContext, d *search.DocumentMatch) bool { + // reset state variables for each document + found = false + startReading = false + finishReading = false + dvShapeValue = dvShapeValue[:0] + if err := dvReader.VisitDocValues(d.IndexInternalID, dvVisitor); err == nil && found { bytes := dvReader.BytesRead() if bytes > 0 { reportIOStats(ctx, bytes) diff --git a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_numeric_range.go b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_numeric_range.go index f086051c11..cd8f007196 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_numeric_range.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/searcher/search_numeric_range.go @@ -132,7 +132,7 @@ func filterCandidateTerms(indexReader index.IndexReader, for err == nil && tfd != nil { termBytes := []byte(tfd.Term) i := sort.Search(len(terms), func(i int) bool { return bytes.Compare(terms[i], termBytes) >= 0 }) - if i < len(terms) && bytes.Compare(terms[i], termBytes) == 0 { + if i < len(terms) && bytes.Equal(terms[i], termBytes) { rv = append(rv, terms[i]) } terms = terms[i:] diff --git a/vendor/github.com/blevesearch/bleve/v2/search/sort.go b/vendor/github.com/blevesearch/bleve/v2/search/sort.go index 44e5cd91c9..64230c1165 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/sort.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/sort.go @@ -683,29 +683,29 @@ type SortGeoDistance struct { Field string Desc bool Unit string - values []string + values [][]byte Lon float64 Lat float64 unitMult float64 + tmp []byte } // UpdateVisitor notifies this sort field that in this document // this field has the specified term func (s *SortGeoDistance) 
UpdateVisitor(field string, term []byte) { if field == s.Field { - s.values = append(s.values, string(term)) + s.values = append(s.values, term) } } // Value returns the sort value of the DocumentMatch -// it also resets the state of this SortField for +// it also resets the state of this SortGeoDistance for // processing the next document func (s *SortGeoDistance) Value(i *DocumentMatch) string { - iTerms := s.filterTermsByType(s.values) - iTerm := s.filterTermsByMode(iTerms) + iTerm := s.findPrefixCodedNumericTerm(s.values) s.values = s.values[:0] - if iTerm == "" { + if iTerm == nil { return maxDistance } @@ -723,7 +723,8 @@ func (s *SortGeoDistance) Value(i *DocumentMatch) string { dist /= s.unitMult } distInt64 := numeric.Float64ToInt64(dist) - return string(numeric.MustNewPrefixCodedInt64(distInt64, 0)) + s.tmp = numeric.MustNewPrefixCodedInt64Prealloc(distInt64, 0, s.tmp) + return string(s.tmp) } func (s *SortGeoDistance) DecodeValue(value string) string { @@ -739,25 +740,16 @@ func (s *SortGeoDistance) Descending() bool { return s.Desc } -func (s *SortGeoDistance) filterTermsByMode(terms []string) string { - if len(terms) >= 1 { - return terms[0] - } - - return "" -} - -// filterTermsByType attempts to make one pass on the terms -// return only valid prefix coded numbers with shift of 0 -func (s *SortGeoDistance) filterTermsByType(terms []string) []string { - var termsWithShiftZero []string +// findPrefixCodedNumericTerm looks through the provided terms +// and returns the first valid prefix coded numeric term with shift of 0 +func (s *SortGeoDistance) findPrefixCodedNumericTerm(terms [][]byte) []byte { for _, term := range terms { - valid, shift := numeric.ValidPrefixCodedTerm(term) + valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { - termsWithShiftZero = append(termsWithShiftZero, term) + return term } } - return termsWithShiftZero + return nil } // RequiresDocID says this SearchSort does not require the DocID be loaded 
diff --git a/vendor/github.com/blevesearch/bleve/v2/search/util.go b/vendor/github.com/blevesearch/bleve/v2/search/util.go index 005fda67df..81f22768a9 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search/util.go +++ b/vendor/github.com/blevesearch/bleve/v2/search/util.go @@ -50,41 +50,54 @@ func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap { func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { n := len(dest) for _, dm := range matches { - n += len(dm.FieldTermLocations) + if dm != nil { + n += len(dm.FieldTermLocations) + } } if cap(dest) < n { dest = append(make([]FieldTermLocation, 0, n), dest...) } for _, dm := range matches { - for _, ftl := range dm.FieldTermLocations { - dest = append(dest, FieldTermLocation{ - Field: ftl.Field, - Term: ftl.Term, - Location: Location{ - Pos: ftl.Location.Pos, - Start: ftl.Location.Start, - End: ftl.Location.End, - ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), - }, - }) + if dm != nil { + dest = mergeFieldTermLocationFromMatch(dest, dm) } } return dest } -type SearchIOStatsCallbackFunc func(uint64) +// MergeFieldTermLocationsFromMatch merges field term locations from a single DocumentMatch +// into dest, returning the updated slice. +func MergeFieldTermLocationsFromMatch(dest []FieldTermLocation, match *DocumentMatch) []FieldTermLocation { + if match == nil { + return dest + } + n := len(dest) + len(match.FieldTermLocations) + if cap(dest) < n { + dest = append(make([]FieldTermLocation, 0, n), dest...) + } + return mergeFieldTermLocationFromMatch(dest, match) +} + +// mergeFieldTermLocationFromMatch appends field term locations from a DocumentMatch into dest. +// Assumes dest has sufficient capacity. 
+func mergeFieldTermLocationFromMatch(dest []FieldTermLocation, dm *DocumentMatch) []FieldTermLocation { + for _, ftl := range dm.FieldTermLocations { + dest = append(dest, FieldTermLocation{ + Field: ftl.Field, + Term: ftl.Term, + Location: Location{ + Pos: ftl.Location.Pos, + Start: ftl.Location.Start, + End: ftl.Location.End, + ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), + }, + }) + } -// Implementation of SearchIncrementalCostCallbackFn should handle the following messages -// - add: increment the cost of a search operation -// (which can be specific to a query type as well) -// - abort: query was aborted due to a cancel of search's context (for eg), -// which can be handled differently as well -// - done: indicates that a search was complete and the tracked cost can be -// handled safely by the implementation. -type SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, - SearchQueryType, uint64) + return dest +} type ( SearchIncrementalCostCallbackMsg uint @@ -156,6 +169,10 @@ const ( // ScoreFusionKey is used to communicate whether KNN hits need to be preserved for // hybrid search algorithms (like RRF) ScoreFusionKey ContextKey = "_fusion_rescoring_key" + + // NestedSearchKey is used to communicate whether the search is performed + // in an index with nested documents + NestedSearchKey ContextKey = "_nested_search_key" ) func RecordSearchCost(ctx context.Context, @@ -184,9 +201,7 @@ const ( MinGeoBufPoolSize = 24 ) -type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool - -// *PreSearchDataKey are used to store the data gathered during the presearch phase +// PreSearchDataKey are used to store the data gathered during the presearch phase // which would be use in the actual search phase. 
const ( KnnPreSearchDataKey = "_knn_pre_search_data_key" @@ -197,14 +212,39 @@ const ( const GlobalScoring = "_global_scoring" type ( + // SearcherStartCallbackFn is a callback function type used to signal the start of + // searcher creation phase. SearcherStartCallbackFn func(size uint64) error - SearcherEndCallbackFn func(size uint64) error + // SearcherEndCallbackFn is a callback function type used to signal the end of + // a searcher creation phase. + SearcherEndCallbackFn func(size uint64) error + // GetScoringModelCallbackFn is a callback function type used to get the scoring model + // to be used for scoring documents during search. + GetScoringModelCallbackFn func() string + // HybridMergeCallbackFn is a callback function type used to merge a KNN document match + // into a full text search document match, of the same docID as part of hybrid search. + HybridMergeCallbackFn func(ftsMatch *DocumentMatch, knnMatch *DocumentMatch) + // DescendantAdderCallback is a callback function type used to customize how a descendant + // DocumentMatch is merged into its parent. This allows different descendant addition strategies for + // different use cases (e.g., TopN vs KNN collection). + DescendantAdderCallbackFn func(parent *DocumentMatch, descendant *DocumentMatch) error + // GeoBufferPoolCallbackFunc is a callback function type used to get the geo buffer pool + // to be used during geo searches. + GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool + // SearchIOStatsCallbackFunc is a callback function type used to report search IO stats + // during search. 
+ SearchIOStatsCallbackFunc func(uint64) + // Implementation of SearchIncrementalCostCallbackFn should handle the following messages + // - add: increment the cost of a search operation + // (which can be specific to a query type as well) + // - abort: query was aborted due to a cancel of search's context (for eg), + // which can be handled differently as well + // - done: indicates that a search was complete and the tracked cost can be + // handled safely by the implementation. + SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, + SearchQueryType, uint64) ) -type GetScoringModelCallbackFn func() string - -type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) - // field -> term -> synonyms type FieldTermSynonymMap map[string]map[string][]string @@ -237,3 +277,28 @@ type BM25Stats struct { DocCount float64 `json:"doc_count"` FieldCardinality map[string]int `json:"field_cardinality"` } + +// FieldSet represents a set of queried fields. +type FieldSet map[string]struct{} + +// NewFieldSet creates a new FieldSet. +func NewFieldSet() FieldSet { + return make(map[string]struct{}) +} + +// Add adds a field to the set. +func (fs FieldSet) AddField(field string) { + fs[field] = struct{}{} +} + +// HasID returns true if the field set contains the "_id" field. +func (fs FieldSet) HasID() bool { + _, ok := fs["_id"] + return ok +} + +// HasAll returns true if the field set contains the "_all" field. 
+func (fs FieldSet) HasAll() bool { + _, ok := fs["_all"] + return ok +} diff --git a/vendor/github.com/blevesearch/bleve/v2/search_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_knn.go index 54771ede01..fa119f48cb 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search_knn.go @@ -27,6 +27,7 @@ import ( "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/query" + "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -42,18 +43,18 @@ type SearchRequest struct { Query query.Query `json:"query"` Size int `json:"size"` From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` + Highlight *HighlightRequest `json:"highlight,omitempty"` + Fields []string `json:"fields,omitempty"` + Facets FacetsRequest `json:"facets,omitempty"` Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score,omitempty"` - SearchAfter []string `json:"search_after"` - SearchBefore []string `json:"search_before"` + SearchAfter []string `json:"search_after,omitempty"` + SearchBefore []string `json:"search_before,omitempty"` - KNN []*KNNRequest `json:"knn"` - KNNOperator knnOperator `json:"knn_operator"` + KNN []*KNNRequest `json:"knn,omitempty"` + KNNOperator knnOperator `json:"knn_operator,omitempty"` // PreSearchData will be a map that will be used // in the second phase of any 2-phase search, to provide additional @@ -125,35 +126,35 @@ func (r *SearchRequest) AddKNNOperator(operator knnOperator) { // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { type tempKNNReq struct { - Field string `json:"field"` - Vector []float32 `json:"vector"` - VectorBase64 string `json:"vector_base64"` - K int64 `json:"k"` - Boost 
*query.Boost `json:"boost,omitempty"` - Params json.RawMessage `json:"params"` - FilterQuery json.RawMessage `json:"filter,omitempty"` + Field string `json:"field"` + Vector []float32 `json:"vector"` + VectorBase64 string `json:"vector_base64"` + K int64 `json:"k"` + Boost *query.Boost `json:"boost,omitempty"` + Params OptionalRawMessage `json:"params"` + FilterQuery OptionalRawMessage `json:"filter,omitempty"` } var temp struct { - Q json.RawMessage `json:"query"` - Size *int `json:"size"` - From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` - Explain bool `json:"explain"` - Sort []json.RawMessage `json:"sort"` - IncludeLocations bool `json:"includeLocations"` - Score string `json:"score"` - SearchAfter []string `json:"search_after"` - SearchBefore []string `json:"search_before"` - KNN []*tempKNNReq `json:"knn"` - KNNOperator knnOperator `json:"knn_operator"` - PreSearchData json.RawMessage `json:"pre_search_data"` - Params json.RawMessage `json:"params"` - } - - err := json.Unmarshal(input, &temp) + Q json.RawMessage `json:"query"` + Size *int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort []json.RawMessage `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + KNN []*tempKNNReq `json:"knn"` + KNNOperator knnOperator `json:"knn_operator"` + PreSearchData OptionalRawMessage `json:"pre_search_data"` + Params OptionalRawMessage `json:"params"` + } + + err := util.UnmarshalJSON(input, &temp) if err != nil { return err } @@ -216,11 +217,10 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { r.KNN[i].VectorBase64 = temp.KNN[i].VectorBase64 r.KNN[i].K = temp.KNN[i].K r.KNN[i].Boost = 
temp.KNN[i].Boost - r.KNN[i].Params = temp.KNN[i].Params - if len(knnReq.FilterQuery) == 0 { - // Setting this to nil to avoid ParseQuery() setting it to a match none - r.KNN[i].FilterQuery = nil - } else { + if len(temp.KNN[i].Params) > 0 { + r.KNN[i].Params = json.RawMessage(temp.KNN[i].Params) + } + if len(temp.KNN[i].FilterQuery) > 0 { r.KNN[i].FilterQuery, err = query.ParseQuery(knnReq.FilterQuery) if err != nil { return err @@ -377,7 +377,7 @@ func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMat } } req.Sort.Value(hit) - err, _ = LoadAndHighlightFields(hit, req, "", reader, nil) + err, _ = LoadAndHighlightAllFields(hit, req, "", reader, nil) if err != nil { return err } @@ -474,17 +474,15 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea return knnHits, nil } -func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) { +func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopNCollector) { if len(knnHits) > 0 { - newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) { - totalScore := queryMatch.Score + knnMatch.Score - if !req.Explain { - // exit early as we don't need to compute the explanation - return totalScore, nil - } - return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}} + mergeFn := func(ftsMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) { + // Boost the FTS score using the KNN score + ftsMatch.Score += knnMatch.Score + // Combine the FTS explanation with the KNN explanation, if present + ftsMatch.Expl.MergeWith(knnMatch.Expl) } - coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer)) + coll.SetKNNHits(knnHits, search.HybridMergeCallbackFn(mergeFn)) } } diff --git 
a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go index 172f258ec7..f294a476f3 100644 --- a/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go +++ b/vendor/github.com/blevesearch/bleve/v2/search_no_knn.go @@ -25,6 +25,7 @@ import ( "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/query" + "github.com/blevesearch/bleve/v2/util" index "github.com/blevesearch/bleve_index_api" ) @@ -55,15 +56,15 @@ type SearchRequest struct { Query query.Query `json:"query"` Size int `json:"size"` From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` + Highlight *HighlightRequest `json:"highlight,omitempty"` + Fields []string `json:"fields,omitempty"` + Facets FacetsRequest `json:"facets,omitempty"` Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score,omitempty"` - SearchAfter []string `json:"search_after"` - SearchBefore []string `json:"search_before"` + SearchAfter []string `json:"search_after,omitempty"` + SearchBefore []string `json:"search_before,omitempty"` // PreSearchData will be a map that will be used // in the second phase of any 2-phase search, to provide additional @@ -86,23 +87,23 @@ type SearchRequest struct { // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { var temp struct { - Q json.RawMessage `json:"query"` - Size *int `json:"size"` - From int `json:"from"` - Highlight *HighlightRequest `json:"highlight"` - Fields []string `json:"fields"` - Facets FacetsRequest `json:"facets"` - Explain bool `json:"explain"` - Sort []json.RawMessage `json:"sort"` - IncludeLocations bool `json:"includeLocations"` - Score string `json:"score"` - SearchAfter []string `json:"search_after"` - SearchBefore []string 
`json:"search_before"` - PreSearchData json.RawMessage `json:"pre_search_data"` - Params json.RawMessage `json:"params"` + Q json.RawMessage `json:"query"` + Size *int `json:"size"` + From int `json:"from"` + Highlight *HighlightRequest `json:"highlight"` + Fields []string `json:"fields"` + Facets FacetsRequest `json:"facets"` + Explain bool `json:"explain"` + Sort []json.RawMessage `json:"sort"` + IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` + PreSearchData OptionalRawMessage `json:"pre_search_data"` + Params OptionalRawMessage `json:"params"` } - err := json.Unmarshal(input, &temp) + err := util.UnmarshalJSON(input, &temp) if err != nil { return err } @@ -197,7 +198,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea return nil, nil } -func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) { +func setKnnHitsInCollector(knnHits []*search.DocumentMatch, coll *collector.TopNCollector) { } func requestHasKNN(req *SearchRequest) bool { diff --git a/vendor/github.com/blevesearch/bleve/v2/util/bolt.go b/vendor/github.com/blevesearch/bleve/v2/util/bolt.go new file mode 100644 index 0000000000..d05500b7d0 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/util/bolt.go @@ -0,0 +1,170 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package util + +import ( + "fmt" + "os" + + bolt "go.etcd.io/bbolt" +) + +// All of the bolt impls provide a layer of indirection to allow for processing +// of values as they are read/written to bolt depending on the key or bucket name +// This is used to allow better support for file callbacks + +// wrapper around bolt.DB +type RootBoltImpl struct { + *bolt.DB +} + +// wrapper around bolt.Tx +type BoltTxImpl struct { + *bolt.Tx +} + +// wrapper around bolt.Bucket +type BoltBucketImpl struct { + *bolt.Bucket + + name string // store the name of the bucket during creation +} + +func OpenBolt(path string, mode os.FileMode, options *bolt.Options) (*RootBoltImpl, error) { + db, err := bolt.Open(path, mode, options) + if err != nil { + return nil, err + } + return &RootBoltImpl{DB: db}, nil +} + +func (r *RootBoltImpl) Begin(writable bool) (*BoltTxImpl, error) { + tx, err := r.DB.Begin(writable) + if err != nil { + return nil, err + } + return &BoltTxImpl{Tx: tx}, nil +} + +func (r *RootBoltImpl) View(fn func(*BoltTxImpl) error) error { + return r.DB.View(func(tx *bolt.Tx) error { + return fn(&BoltTxImpl{Tx: tx}) + }) +} + +func (r *RootBoltImpl) Update(fn func(*BoltTxImpl) error) error { + return r.DB.Update(func(tx *bolt.Tx) error { + return fn(&BoltTxImpl{Tx: tx}) + }) +} + +func (tx *BoltTxImpl) CreateBucketIfNotExists(name []byte) (*BoltBucketImpl, error) { + bucket, err := tx.Tx.CreateBucketIfNotExists(name) + if err != nil { + return nil, err + } + return &BoltBucketImpl{ + name: string(name), + Bucket: bucket, + }, nil +} + +func (tx *BoltTxImpl) Bucket(name []byte) *BoltBucketImpl { + bucket := tx.Tx.Bucket(name) + if bucket == nil { + return nil + } + return &BoltBucketImpl{ + name: string(name), + Bucket: bucket, + } +} + +func (b *BoltBucketImpl) GetBucket(name []byte) *BoltBucketImpl { + bucket := b.Bucket.Bucket(name) + if bucket == nil { + return 
nil + } + return &BoltBucketImpl{ + name: string(name), + Bucket: bucket, + } +} + +func (b *BoltBucketImpl) CreateBucketIfNotExists(name []byte) (*BoltBucketImpl, error) { + bucket, err := b.Bucket.CreateBucketIfNotExists(name) + if err != nil { + return nil, err + } + return &BoltBucketImpl{ + name: string(name), + Bucket: bucket, + }, nil +} + +// Process values during ForEach if the bucket name or key is in the boltKeysProcessed map +func (b *BoltBucketImpl) ForEach(fn func(key []byte, value []byte) error, reader FileReader) error { + _, ok1 := boltKeysProcessed[b.name] + return b.Bucket.ForEach(func(k, v []byte) error { + v = append([]byte(nil), v...) + if _, ok2 := boltKeysProcessed[string(k)]; ok1 || ok2 { + if reader == nil { + return fmt.Errorf("reader callback is required for bucket %s", b.name) + } + processedValue, err := reader.Process(v) + if err != nil { + return err + } + return fn(k, processedValue) + } + return fn(k, v) + }) +} + +// Process values during Put/Get if the bucket name or key is in the boltKeysProcessed map +func (b *BoltBucketImpl) Put(key []byte, value []byte, writer FileWriter) error { + _, ok1 := boltKeysProcessed[string(key)] + _, ok2 := boltKeysProcessed[b.name] + value = append([]byte(nil), value...) 
+ if ok1 || ok2 { + if writer == nil { + return fmt.Errorf("writer callback is required for key %s", string(key)) + } + processedValue := writer.Process(value) + return b.Bucket.Put(key, processedValue) + } + return b.Bucket.Put(key, value) +} + +// Process values during Put/Get if the bucket name or key is in the boltKeysProcessed map +func (b *BoltBucketImpl) Get(key []byte, reader FileReader) ([]byte, error) { + _, ok1 := boltKeysProcessed[string(key)] + _, ok2 := boltKeysProcessed[b.name] + if ok1 || ok2 { + if reader == nil { + return nil, fmt.Errorf("reader callback is required for key %s", string(key)) + } + val := b.Bucket.Get(key) + if val == nil { + return nil, nil + } + processedVal, err := reader.Process(val) + if err != nil { + return nil, err + } + return processedVal, nil + } + return b.Bucket.Get(key), nil +} diff --git a/vendor/github.com/blevesearch/bleve/v2/util/file_callbacks.go b/vendor/github.com/blevesearch/bleve/v2/util/file_callbacks.go new file mode 100644 index 0000000000..c485006e24 --- /dev/null +++ b/vendor/github.com/blevesearch/bleve/v2/util/file_callbacks.go @@ -0,0 +1,129 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package util + +import ( + "fmt" + + index "github.com/blevesearch/bleve_index_api" +) + +// This file provides a mechanism for users of bleve to provide callbacks +// that can process data before it is written to disk, and after it is read +// from disk. 
This can be used for things like encryption, compression, etc. + +// The user is responsible for ensuring that the writer and reader callbacks +// are compatible with each other, and that any state needed by the callbacks +// is managed appropriately. For example, if the writer callback uses a +// unique key or nonce per write, the reader callback must be able to +// determine the correct key or nonce to use for each read. + +// The callbacks are identified by an id string, which is returned by the +// WriterHook. The same id string is passed to the ReaderHook when creating a reader. +// This allows the reader to determine which callback to use for a given file. + +// Support for identifying all callbacks used by a given index and to remove +// selected callbacks associated with ids is provided via index.WriterIdsInUse() +// and index.DropWriterIds(). + +const DefaultFileCallbackId = "" + +// FileWriter and FileReader interfaces are wrappers around the callback functions +// provided by the user. They provide a convenient way to apply the callbacks to data +// being written to or read from a file. They also store the id the callbacks, +// which can be useful for managing state across multiple reads and writes. 
+type FileWriter interface { + Process(data []byte) []byte + Id() string +} +type fileWriterImpl struct { + id string + processor func(data []byte) []byte +} + +func NewFileWriter(context []byte) (FileWriter, error) { + rv := &fileWriterImpl{} + + if index.WriterHook != nil { + var err error + rv.id, rv.processor, err = index.WriterHook(context) + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func (w *fileWriterImpl) Process(data []byte) []byte { + if w.processor != nil { + return w.processor(data) + } + return data +} + +func (w *fileWriterImpl) Id() string { + return w.id +} + +type FileReader interface { + Process(data []byte) ([]byte, error) + Id() string +} + +type fileReaderImpl struct { + id string + processor func(data []byte) ([]byte, error) +} + +func NewFileReader(id string, context []byte) (FileReader, error) { + rv := &fileReaderImpl{ + id: id, + } + + if index.ReaderHook != nil { + var err error + rv.processor, err = index.ReaderHook(id, context) + if err != nil { + return nil, err + } + } else if id != "" { + return nil, fmt.Errorf("reader callback id %s provided but no ReaderHook is set", id) + } + + return rv, nil +} + +func (r *fileReaderImpl) Process(data []byte) ([]byte, error) { + if r.processor != nil { + return r.processor(data) + } + return data, nil +} + +func (r *fileReaderImpl) Id() string { + return r.id +} + +// ----------------------------------------------------------------------- + +// set of bolt keys and bucket names that require processing by the reader +// and writer callbacks. 
+var boltKeysProcessed = map[string]struct{}{ + string(BoltDeletedKey): {}, + string(BoltInternalKey): {}, + string(BoltStatsKey): {}, + string(BoltUpdatedFieldsKey): {}, +} diff --git a/vendor/github.com/blevesearch/bleve/v2/util/keys.go b/vendor/github.com/blevesearch/bleve/v2/util/keys.go index b71a7f48ba..fb321eba5f 100644 --- a/vendor/github.com/blevesearch/bleve/v2/util/keys.go +++ b/vendor/github.com/blevesearch/bleve/v2/util/keys.go @@ -17,6 +17,9 @@ package util var ( // Bolt keys BoltSnapshotsBucket = []byte{'s'} + BoltTrainerKey = []byte{'t'} + BoltTrainCompleteKey = []byte{'c'} + BoltTrainedSamplesKey = []byte{'n'} BoltPathKey = []byte{'p'} BoltDeletedKey = []byte{'d'} BoltInternalKey = []byte{'i'} @@ -27,6 +30,7 @@ var ( BoltStatsKey = []byte("stats") BoltUpdatedFieldsKey = []byte("fields") TotBytesWrittenKey = []byte("TotBytesWritten") + BoltMetaDataFileWriterIDKey = []byte("fileWriterID") MappingInternalKey = []byte("_mapping") ) diff --git a/vendor/github.com/blevesearch/bleve_index_api/.golangci.yml b/vendor/github.com/blevesearch/bleve_index_api/.golangci.yml deleted file mode 100644 index a00f6c57e4..0000000000 --- a/vendor/github.com/blevesearch/bleve_index_api/.golangci.yml +++ /dev/null @@ -1,37 +0,0 @@ -linters: - # please, do not use `enable-all`: it's deprecated and will be removed soon. 
- # inverted configuration with `enable-all` and `disable` is not scalable during updates of golangci-lint - disable-all: true - enable: - - bodyclose - - deadcode - - depguard - - dogsled - - dupl - - errcheck - - goconst - - gocritic - - gocyclo - - gofmt - - goimports - - gomnd - - goprintffuncname - - gosimple - - govet - - ineffassign - - interfacer - - lll - - misspell - - nakedret - - nolintlint - - rowserrcheck - - scopelint - - staticcheck - - structcheck - - stylecheck - - typecheck - - unconvert - - unparam - - unused - - varcheck - - whitespace diff --git a/vendor/github.com/blevesearch/bleve_index_api/README.md b/vendor/github.com/blevesearch/bleve_index_api/README.md index 46daa68322..84d05e3f9f 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/README.md +++ b/vendor/github.com/blevesearch/bleve_index_api/README.md @@ -1,11 +1,10 @@ # Bleve Index API -[![PkgGoDev](https://pkg.go.dev/badge/github.com/blevesearch/bleve_index_api)](https://pkg.go.dev/github.com/blevesearch/bleve_index_api) -[![Tests](https://github.com/blevesearch/bleve_index_api/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve_index_api/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) -[![Lint](https://github.com/blevesearch/bleve_index_api/workflows/Lint/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve_index_api/actions?query=workflow%3ALint+event%3Apush+branch%3Amaster) +[![Go Reference](https://pkg.go.dev/badge/github.com/blevesearch/bleve_index_api.svg)](https://pkg.go.dev/github.com/blevesearch/bleve_index_api) +[![Tests](https://github.com/blevesearch/bleve_index_api/actions/workflows/tests.yml/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve_index_api/actions/workflows/tests.yml?query=event%3Apush+branch%3Amaster) Bleve supports a pluggable Index interface. 
By placing these interfaces in their own, *hopefully* slowly evolving module, it frees up Bleve and the underlying index to each introduce new major versions without interfering with one another. -With that in mind, we anticipate introducing non-breaking changes only to this module, and keeping the major version at 1.x for some time. \ No newline at end of file +With that in mind, we anticipate introducing non-breaking changes only to this module, and keeping the major version at 1.x for some time. diff --git a/vendor/github.com/blevesearch/bleve_index_api/directory.go b/vendor/github.com/blevesearch/bleve_index_api/directory.go index 709a384565..4a9df17f21 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/directory.go +++ b/vendor/github.com/blevesearch/bleve_index_api/directory.go @@ -21,3 +21,8 @@ import ( type Directory interface { GetWriter(filePath string) (io.WriteCloser, error) } + +type IndexDirectory interface { + Directory + SetPathInBolt(key []byte, value []byte) error +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/document.go b/vendor/github.com/blevesearch/bleve_index_api/document.go index bc91c6c4cc..e3fd48841b 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/document.go +++ b/vendor/github.com/blevesearch/bleve_index_api/document.go @@ -124,3 +124,11 @@ type SynonymDocument interface { // The provided visitor function is called for each synonym field. VisitSynonymFields(visitor SynonymFieldVisitor) } + +// NestedDocument is a document that contains other documents inside it. +type NestedDocument interface { + Document + // VisitNestedDocuments allows iteration over all nested documents in the document. + // The provided visitor function is called for each nested document. 
+ VisitNestedDocuments(visitor func(doc Document)) +} diff --git a/vendor/github.com/blevesearch/bleve_index_api/index.go b/vendor/github.com/blevesearch/bleve_index_api/index.go index 12d907e590..679dd4ff0f 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/index.go +++ b/vendor/github.com/blevesearch/bleve_index_api/index.go @@ -17,6 +17,8 @@ package index import ( "bytes" "context" + "encoding/binary" + "fmt" "reflect" ) @@ -57,6 +59,11 @@ type CopyIndex interface { CopyReader() CopyReader } +type TrainableIndex interface { + Index + Train(*Batch) error +} + // EventIndex is an optional interface for exposing the support for firing event // callbacks for various events in the index. type EventIndex interface { @@ -185,17 +192,46 @@ func (tfv *TermFieldVector) Size() int { len(tfv.Field) + len(tfv.ArrayPositions)*sizeOfUint64 } -// IndexInternalID is an opaque document identifier interal to the index impl +// IndexInternalID is an opaque document identifier internal to the index impl type IndexInternalID []byte +// NewIndexInternalID encodes a uint64 into an 8-byte big-endian ID, reusing `buf` when possible. +func NewIndexInternalID(buf []byte, in uint64) IndexInternalID { + if len(buf) != 8 { + if cap(buf) >= 8 { + buf = buf[0:8] + } else { + buf = make([]byte, 8) + } + } + binary.BigEndian.PutUint64(buf, in) + return buf +} + +// NewIndexInternalIDFrom creates a new IndexInternalID by copying from `other`, reusing `buf` when possible. +func NewIndexInternalIDFrom(buf IndexInternalID, other IndexInternalID) IndexInternalID { + buf = buf[:0] + return append(buf, other...) +} + +// Equals checks if two IndexInternalID values are equal. func (id IndexInternalID) Equals(other IndexInternalID) bool { return id.Compare(other) == 0 } +// Compare compares two IndexInternalID values, inherently comparing the encoded uint64 values. 
func (id IndexInternalID) Compare(other IndexInternalID) int { return bytes.Compare(id, other) } +// Value returns the uint64 value encoded in the IndexInternalID. +func (id IndexInternalID) Value() (uint64, error) { + if len(id) != 8 { + return 0, fmt.Errorf("wrong len for IndexInternalID: %q", id) + } + return binary.BigEndian.Uint64(id), nil +} + type TermFieldDoc struct { Term string ID IndexInternalID @@ -353,6 +389,21 @@ type ThesaurusReader interface { ThesaurusKeysPrefix(name string, termPrefix []byte) (ThesaurusKeys, error) } +// EligibleDocumentIterator provides an interface to iterate over eligible document IDs. +type EligibleDocumentIterator interface { + // Next returns the next document ID and whether it exists. + // When ok is false, iteration is complete. + Next() (id uint64, ok bool) +} + +// EligibleDocumentList represents a list of eligible document IDs for filtering. +type EligibleDocumentList interface { + // Iterator returns an iterator for the eligible document IDs. + Iterator() EligibleDocumentIterator + // Count returns the number of eligible document IDs. + Count() uint64 +} + // EligibleDocumentSelector filters documents based on specific eligibility criteria. // It can be extended with additional methods for filtering and retrieval. type EligibleDocumentSelector interface { @@ -360,10 +411,9 @@ type EligibleDocumentSelector interface { // id is the internal identifier of the document to be added. AddEligibleDocumentMatch(id IndexInternalID) error - // SegmentEligibleDocs returns a list of eligible document IDs within a given segment. - // segmentID identifies the segment for which eligible documents are retrieved. - // This must be called after all eligible documents have been added. - SegmentEligibleDocs(segmentID int) []uint64 + // SegmentEligibleDocuments returns an EligibleDocumentList for the specified segment. + // This must be called after all eligible documents have been added via AddEligibleDocumentMatch. 
+ SegmentEligibleDocuments(segmentID int) EligibleDocumentList } // ----------------------------------------------------------------------------- @@ -391,3 +441,55 @@ type IndexInsightsReader interface { // cluster densities (or cardinalities) CentroidCardinalities(field string, limit int, descending bool) (cenCards []CentroidCardinality, err error) } + +// ----------------------------------------------------------------------------- +// NestedReader is an extended index reader that supports hierarchical document structures. +type NestedReader interface { + IndexReader + // Ancestors returns the ancestral chain for a given document ID in the index. + // For nested documents, this method retrieves all parent documents in the hierarchy + // leading up to the root document ID. + Ancestors(id IndexInternalID, prealloc []AncestorID) ([]AncestorID, error) +} + +// AncestorID represents the identifier of an ancestor document in an ancestor chain. +type AncestorID uint64 + +// NewAncestorID creates a new AncestorID from the given uint64 value. +func NewAncestorID(val uint64) AncestorID { + return AncestorID(val) +} + +// Compare compares two AncestorID values. +func (a AncestorID) Compare(b AncestorID) int { + switch { + case a < b: + return -1 + case a > b: + return 1 + default: + return 0 + } +} + +// Equals checks if two AncestorID values are equal. +func (a AncestorID) Equals(b AncestorID) bool { + return a == b +} + +// Add returns a new AncestorID by adding the given uint64 value to the current AncestorID. +func (a AncestorID) Add(n uint64) AncestorID { + return AncestorID(uint64(a) + n) +} + +// ToIndexInternalID converts the AncestorID to an IndexInternalID. +func (a AncestorID) ToIndexInternalID(prealloc IndexInternalID) IndexInternalID { + return NewIndexInternalID(prealloc, uint64(a)) +} + +// Default no-op implementation. Is called before writing any user data to a file. 
+var WriterHook func(context []byte) (string, func(data []byte) []byte, error) + +// Default no-op implementation. Is called after reading any user data from a file. +var ReaderHook func(id string, context []byte) ( + func(data []byte) ([]byte, error), error) diff --git a/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go b/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go index 4e92024b9b..c77dd5a559 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go +++ b/vendor/github.com/blevesearch/bleve_index_api/indexing_options.go @@ -14,7 +14,7 @@ package index -type FieldIndexingOptions int +type FieldIndexingOptions uint64 const ( IndexField FieldIndexingOptions = 1 << iota @@ -22,6 +22,9 @@ const ( IncludeTermVectors DocValues SkipFreqNorm + SkipDVCompression + SkipDVChunking + GPU ) const ( @@ -33,6 +36,9 @@ const ( // for a query performed on a text field. const DefaultScoringModel = TFIDFScoring +// Sentinel value used to separate terms in doc values encoding +const DocValueTermSeparator byte = 0xff + // Supported similarity models var SupportedScoringModels = map[string]struct{}{ BM25Scoring: {}, @@ -59,6 +65,18 @@ func (o FieldIndexingOptions) SkipFreqNorm() bool { return o&SkipFreqNorm != 0 } +func (o FieldIndexingOptions) SkipDVCompression() bool { + return o&SkipDVCompression != 0 +} + +func (o FieldIndexingOptions) SkipDVChunking() bool { + return o&SkipDVChunking != 0 +} + +func (o FieldIndexingOptions) UseGPU() bool { + return o&GPU != 0 +} + func (o FieldIndexingOptions) String() string { rv := "" if o.IsIndexed() { @@ -88,5 +106,23 @@ func (o FieldIndexingOptions) String() string { } rv += "FN" } + if !o.SkipDVCompression() { + if rv != "" { + rv += ", " + } + rv += "DV_COMPRESSION" + } + if !o.SkipDVChunking() { + if rv != "" { + rv += ", " + } + rv += "DV_CHUNKING" + } + if o.UseGPU() { + if rv != "" { + rv += ", " + } + rv += "GPU" + } return rv } diff --git 
a/vendor/github.com/blevesearch/bleve_index_api/vector.go b/vendor/github.com/blevesearch/bleve_index_api/vector.go index 1057cf980a..a19f463032 100644 --- a/vendor/github.com/blevesearch/bleve_index_api/vector.go +++ b/vendor/github.com/blevesearch/bleve_index_api/vector.go @@ -18,6 +18,9 @@ package index type VectorField interface { + // Name of the field + Name() string + // The vector data Vector() []float32 // Dimensionality of the vector Dims() int @@ -25,6 +28,8 @@ type VectorField interface { Similarity() string // nlist/nprobe config (recall/latency) the index is optimized for IndexOptimizedFor() string + // Field indexing options + Options() FieldIndexingOptions } // ----------------------------------------------------------------------------- @@ -49,9 +54,12 @@ var SupportedVectorSimilarityMetrics = map[string]struct{}{ // ----------------------------------------------------------------------------- const ( - IndexOptimizedForRecall = "recall" - IndexOptimizedForLatency = "latency" - IndexOptimizedForMemoryEfficient = "memory-efficient" + IndexOptimizedForRecall = "recall" // Flat or IVF,SQ8 indexes + IndexOptimizedForLatency = "latency" // Flat or IVF,SQ8 indexes; nprobe halved + IndexOptimizedForMemoryEfficient = "memory-efficient" // Flat or IVF,SQ4 indexes + IndexBIVFWithBackingFlat = "bivf-flat" // BFlat or BIVF with Flat backing index + IndexBIVFWithBackingSQ8 = "bivf-sq8" // BFlat or BIVF with SQ8 backing index + IndexIVFRaBitQ = "ivf,rabitq" // Flat or IVF,RaBitQ indexes ) const DefaultIndexOptimization = IndexOptimizedForRecall @@ -60,6 +68,9 @@ var SupportedVectorIndexOptimizations = map[string]int{ IndexOptimizedForRecall: 0, IndexOptimizedForLatency: 1, IndexOptimizedForMemoryEfficient: 2, + IndexBIVFWithBackingFlat: 3, + IndexBIVFWithBackingSQ8: 4, + IndexIVFRaBitQ: 5, } // Reverse maps vector index optimizations': int -> string @@ -67,4 +78,23 @@ var VectorIndexOptimizationsReverseLookup = map[int]string{ 0: IndexOptimizedForRecall, 1: 
IndexOptimizedForLatency, 2: IndexOptimizedForMemoryEfficient, + 3: IndexBIVFWithBackingFlat, + 4: IndexBIVFWithBackingSQ8, + 5: IndexIVFRaBitQ, } + +func OptimizationRequiresBinaryIndex(optimization string) bool { + switch optimization { + case IndexBIVFWithBackingFlat, IndexBIVFWithBackingSQ8: + return true + default: + return false + } +} + +const TrainedIndexFileName = "trained_index" +const TrainingKey = "_training" + +const TrainedIndexCallback = "_trained_index_callback" + +type TrainedIndexCallbackFn func(string) (interface{}, error) diff --git a/vendor/github.com/blevesearch/geo/geojson/geojson_s2_util.go b/vendor/github.com/blevesearch/geo/geojson/geojson_s2_util.go index 91ad975e30..8afbbffba4 100644 --- a/vendor/github.com/blevesearch/geo/geojson/geojson_s2_util.go +++ b/vendor/github.com/blevesearch/geo/geojson/geojson_s2_util.go @@ -136,44 +136,53 @@ func geometryCollectionIntersectsShape(gc *GeometryCollection, func polygonsContainsLineStrings(s2pgns []*s2.Polygon, pls []*s2.Polyline) bool { - linesWithIn := make(map[int]struct{}) checker := s2.NewCrossingEdgeQuery(s2.NewShapeIndex()) -nextLine: - for lineIndex, pl := range pls { + + // Every line segment in every linestring must be + // fully contained in atleast one of the polygons + for _, pl := range pls { for i := 0; i < len(*pl)-1; i++ { start := (*pl)[i] end := (*pl)[i+1] + contains := false for _, s2pgn := range s2pgns { containsStart := s2pgn.ContainsPoint(start) containsEnd := s2pgn.ContainsPoint(end) + // check if both end points are contained and if so, + // check if the line segment between them crosses the boundary of the polygon if containsStart && containsEnd { crossings := checker.Crossings(start, end, s2pgn, s2.CrossingTypeInterior) if len(crossings) > 0 { - continue nextLine + continue } - linesWithIn[lineIndex] = struct{}{} - continue nextLine + contains = true + break } else { + // else we check if the line segment is an edge of the polygon for _, loop := range s2pgn.Loops() { 
for i := 0; i < loop.NumVertices(); i++ { if !containsStart && start.ApproxEqual(loop.Vertex(i)) { containsStart = true - } else if !containsEnd && end.ApproxEqual(loop.Vertex(i)) { + } + if !containsEnd && end.ApproxEqual(loop.Vertex(i)) { containsEnd = true } if containsStart && containsEnd { - linesWithIn[lineIndex] = struct{}{} - continue nextLine + contains = true + break } } } } } + if !contains { + return false + } } } - return len(pls) == len(linesWithIn) + return true } func rectangleIntersectsWithPolygons(s2rect *s2.Rect, diff --git a/vendor/github.com/blevesearch/geo/geojson/geojson_shapes_impl.go b/vendor/github.com/blevesearch/geo/geojson/geojson_shapes_impl.go index 7d8e096738..5f80030473 100644 --- a/vendor/github.com/blevesearch/geo/geojson/geojson_shapes_impl.go +++ b/vendor/github.com/blevesearch/geo/geojson/geojson_shapes_impl.go @@ -1717,11 +1717,14 @@ func checkEnvelopeIntersectsShape(s2rect *s2.Rect, shapeIn, // check if the other shape is a circle. if c, ok := other.(*Circle); ok { - s2pgn := s2PolygonFromS2Rectangle(s2rect) - cp := c.s2cap.Center() - projected := s2pgn.Project(&cp) - distance := projected.Distance(cp) - return distance <= c.s2cap.Radius(), nil + // check if the distance of the center of the circle from the + // rectangle is less than the radius of the circle. + if s2rect.DistanceToLatLng(s2.LatLngFromPoint(c.s2cap.Center())) <= + c.s2cap.Radius() { + return true, nil + } + + return false, nil } // check if the other shape is a envelope. 
diff --git a/vendor/github.com/blevesearch/go-faiss/faiss.go b/vendor/github.com/blevesearch/go-faiss/faiss.go index a7087e7459..f151b8576b 100644 --- a/vendor/github.com/blevesearch/go-faiss/faiss.go +++ b/vendor/github.com/blevesearch/go-faiss/faiss.go @@ -12,7 +12,10 @@ package faiss #include */ import "C" -import "errors" +import ( + "errors" + "fmt" +) func getLastError() error { return errors.New(C.GoString(C.faiss_get_last_error())) @@ -39,3 +42,11 @@ func NormalizeVector(vector []float32) []float32 { return vector } + +var ( + errNotIVFIndex = fmt.Errorf("index is not of ivf type") + errMergeFromNotSupported = fmt.Errorf("merge api not supported") + errNotBIVFIndex = fmt.Errorf("index is not of bivf type") + errFailedToSetQuantizers = fmt.Errorf("couldn't set the quantizers") + errSourceIndexNil = fmt.Errorf("source index is nil") +) diff --git a/vendor/github.com/blevesearch/go-faiss/gpu.go b/vendor/github.com/blevesearch/go-faiss/gpu.go new file mode 100644 index 0000000000..b305f6b613 --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/gpu.go @@ -0,0 +1,309 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build gpu + +package faiss + +/* +#include +#include +#include +#include +#include +*/ +import "C" +import ( + "errors" + "fmt" + "math/rand" + "sort" + "sync" + "sync/atomic" + "time" + "unsafe" +) + +var ( + errAccessingGPUDevices = errors.New("error accessing GPU devices") + errNilIndex = errors.New("index is nil") + errNoGPUDevices = errors.New("no GPU devices available") +) + +// memorySpace controls where GPU index data is allocated. +type memorySpace int + +const ( + // memorySpaceDevice uses standard GPU memory (cudaMalloc). + memorySpaceDevice memorySpace = 1 + // memorySpaceUnified uses CUDA managed memory (cudaMallocManaged), + // allowing the index to exceed GPU memory on Pascal+ (CC 6.0+) GPUs. + memorySpaceUnified memorySpace = 2 +) + +const ( + // the minimum amount of free memory that must be available on a GPU to be considered for index cloning. + minGPUFreeMemory = 512 * 1024 * 1024 // 512 MiB + // the default memory space to use for GPU indices + defaultGPUMemoryMode = memorySpaceUnified +) + +var ( + gpuCount int + loadBalancer *gpuLoadBalancer +) + +func init() { + var err error + gpuCount, err = numGPUs() + if err != nil || gpuCount <= 0 { + gpuCount = 0 + } + + // With exactly one GPU there is nothing to balance; getBestGPUDevice() + // returns device 0 directly when loadBalancer is nil. + // TODO: verify if 500 milliseconds is a good interval + if gpuCount > 1 { + loadBalancer = newGPULoadBalancer(500 * time.Millisecond) + go loadBalancer.monitor() + } +} + +// numGPUs returns the number of available GPU devices. +func numGPUs() (int, error) { + var rv C.int + c := C.faiss_get_num_gpus(&rv) + if c != 0 { + return 0, fmt.Errorf("error getting number of GPUs, err: %v", getLastError()) + } + return int(rv), nil +} + +// gpuLoadBalancer monitors GPU free memory on a fixed interval, keeps a +// memory-sorted list of devices, and hands them out in round-robin order. 
+// At each interval the list is re-sorted and the round-robin counter resets +// to 0, so the next cycle always starts from the GPU with the most free memory. +type gpuLoadBalancer struct { + mu sync.RWMutex + sortedDevices []int + idx atomic.Uint32 + interval time.Duration + // scratch buffers reused across refresh calls; only accessed by the monitor goroutine + freeMemory []uint64 + scratchDevs []int +} + +func newGPULoadBalancer(interval time.Duration) *gpuLoadBalancer { + lb := &gpuLoadBalancer{ + interval: interval, + freeMemory: make([]uint64, gpuCount), + scratchDevs: make([]int, 0, gpuCount), + sortedDevices: make([]int, 0, gpuCount), + } + return lb +} + +func (lb *gpuLoadBalancer) monitor() { + ticker := time.NewTicker(lb.interval) + defer ticker.Stop() + + // Perform an initial sort before any requests come in. + lb.refresh() + + for range ticker.C { + lb.refresh() + } +} + +// refresh queries every GPU for free memory, sorts the device list in descending +// order of free memory, and resets the round-robin counter to 0. +// If all queries fail the sorted list becomes empty, causing nextDevice to error. +func (lb *gpuLoadBalancer) refresh() { + // Zero freeMemory before querying; failed queries leave their slot as 0, + // which naturally excludes those devices from selection. + clear(lb.freeMemory) + lb.scratchDevs = lb.scratchDevs[:0] + + var wg sync.WaitGroup + wg.Add(gpuCount) + for i := 0; i < gpuCount; i++ { + go func(device int) { + defer wg.Done() + var freeBytes C.size_t + if C.faiss_gpu_free_memory(C.int(device), &freeBytes) == 0 { + lb.freeMemory[device] = uint64(freeBytes) + } + }(i) + } + wg.Wait() + + // Only include devices that reported non-zero free memory, and have at least minGPUFreeMemory free. 
+ for i, mem := range lb.freeMemory { + if mem > minGPUFreeMemory { + lb.scratchDevs = append(lb.scratchDevs, i) + } + } + + // Shuffle first, then sort descending by free memory to make the + // sort as "unstable" as possible + // This is useful to add fairness between GPUs with the same memory + rand.Shuffle(len(lb.scratchDevs), func(i, j int) { + lb.scratchDevs[i], lb.scratchDevs[j] = lb.scratchDevs[j], lb.scratchDevs[i] + }) + // Sort in a descending order by free memory so index 0 is the most appealing GPU. + sort.Slice(lb.scratchDevs, func(i, j int) bool { + return lb.freeMemory[lb.scratchDevs[i]] > lb.freeMemory[lb.scratchDevs[j]] + }) + + lb.mu.Lock() + old := lb.sortedDevices + lb.sortedDevices = lb.scratchDevs + lb.scratchDevs = old[:0] + lb.idx.Store(0) + lb.mu.Unlock() +} + +// nextDevice returns the next GPU device in round-robin order. +// Returns an error if no devices are currently available. +func (lb *gpuLoadBalancer) nextDevice() (int, error) { + lb.mu.RLock() + defer lb.mu.RUnlock() + + devices := lb.sortedDevices + n := len(devices) + if n == 0 { + return 0, errAccessingGPUDevices + } + + // atomically allocates the GPU. Minus 1 for zero based index + idx := lb.idx.Add(1) - 1 + return devices[int(idx%uint32(n))], nil +} + +func getBestGPUDevice() (int, error) { + if gpuCount == 0 { + return 0, errNoGPUDevices + } + // With exactly one GPU there is nothing to balance; always use device 0. 
+ if loadBalancer == nil { + return 0, nil + } + return loadBalancer.nextDevice() +} + +// only expose API used by zapx +type GPUIndexImpl struct { + idx *faissIndex + gpuResource *C.FaissStandardGpuResources +} + +func (g *GPUIndexImpl) cPtr() *C.FaissIndex { + return g.idx.idx +} + +func (g *GPUIndexImpl) Train(x []float32) error { + return g.idx.Train(x) +} + +func (g *GPUIndexImpl) Add(x []float32) error { + return g.idx.Add(x) +} + +func (g *GPUIndexImpl) Search(x []float32, k int64) ([]float32, []int64, error) { + return g.idx.Search(x, k) +} + +func (g *GPUIndexImpl) Close() { + if g.idx != nil { + g.idx.Close() + g.idx = nil + } + if g.gpuResource != nil { + C.faiss_StandardGpuResources_free(g.gpuResource) + g.gpuResource = nil + } +} + +// CloneToGPU transfers a CPU index to the best available GPU based on free memory. +func CloneToGPU(cpuIndex *IndexImpl) (*GPUIndexImpl, error) { + if cpuIndex == nil { + return nil, errNilIndex + } + + // Use the load balancer to select the best GPU device + device, err := getBestGPUDevice() + if err != nil { + return nil, err + } + + var gpuResource *C.FaissStandardGpuResources + if code := C.faiss_StandardGpuResources_new(&gpuResource); code != 0 { + return nil, fmt.Errorf("failed to initialize GPU resources: error code %d, err: %v", code, getLastError()) + } + + // Disable the pre-allocated temp memory pool so that all GPU memory is + // available for index data; unified memory mode handles intermediate + // allocations via cudaMalloc/cudaFree on demand. 
+ if code := C.faiss_StandardGpuResources_noTempMemory(gpuResource); code != 0 { + C.faiss_StandardGpuResources_free(gpuResource) + return nil, fmt.Errorf("failed to disable GPU temp memory: error code %d, err: %v", code, getLastError()) + } + + var clonerOpts *C.FaissGpuClonerOptions + if code := C.faiss_GpuClonerOptions_new(&clonerOpts); code != 0 { + C.faiss_StandardGpuResources_free(gpuResource) + return nil, fmt.Errorf("failed to create cloner options: error code %d, err: %v", code, getLastError()) + } + defer C.faiss_GpuClonerOptions_free(clonerOpts) + + C.faiss_GpuClonerOptions_set_memorySpace(clonerOpts, C.int(defaultGPUMemoryMode)) + + var gpuIdx *C.FaissGpuIndex + code := C.faiss_index_cpu_to_gpu_with_options( + gpuResource, + C.int(device), + cpuIndex.cPtr(), + clonerOpts, + &gpuIdx, + ) + if code != 0 { + C.faiss_StandardGpuResources_free(gpuResource) + return nil, fmt.Errorf("failed to transfer index to GPU device %d: error code %d, err: %v", device, code, getLastError()) + } + + idx := &faissIndex{ + idx: (*C.FaissIndex)(unsafe.Pointer(gpuIdx)), + } + + return &GPUIndexImpl{ + idx: idx, + gpuResource: gpuResource, + }, nil +} + +func CloneToCPU(gpuIndex *GPUIndexImpl) (*IndexImpl, error) { + if gpuIndex == nil { + return nil, errNilIndex + } + + var cpuIdx *C.FaissIndex + code := C.faiss_index_gpu_to_cpu( + gpuIndex.cPtr(), + &cpuIdx, + ) + if code != 0 { + return nil, fmt.Errorf("failed to transfer index to CPU: %v", getLastError()) + } + return &IndexImpl{&faissIndex{idx: cpuIdx}}, nil +} diff --git a/vendor/github.com/blevesearch/go-faiss/gpu_stub.go b/vendor/github.com/blevesearch/go-faiss/gpu_stub.go new file mode 100644 index 0000000000..427f8b65f4 --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/gpu_stub.go @@ -0,0 +1,41 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !gpu + +package faiss + +import "errors" + +// GPUIndexImpl is an opaque type when not built with GPU support. +type GPUIndexImpl struct{} + +func (g *GPUIndexImpl) Train(x []float32) error { return errGPUNotBuilt } +func (g *GPUIndexImpl) Add(x []float32) error { return errGPUNotBuilt } +func (g *GPUIndexImpl) Search(x []float32, k int64) ([]float32, []int64, error) { + return nil, nil, errGPUNotBuilt +} +func (g *GPUIndexImpl) Close() {} + +var errGPUNotBuilt = errors.New("not built with GPU support (requires -tags gpu)") + +// CloneToGPU is not available without the gpu build tag. +func CloneToGPU(_ *IndexImpl) (*GPUIndexImpl, error) { + return nil, errGPUNotBuilt +} + +// CloneToCPU is not available without the gpu build tag. +func CloneToCPU(_ *GPUIndexImpl) (*IndexImpl, error) { + return nil, errGPUNotBuilt +} diff --git a/vendor/github.com/blevesearch/go-faiss/index.go b/vendor/github.com/blevesearch/go-faiss/index.go index 3a399e5b63..e716a1f881 100644 --- a/vendor/github.com/blevesearch/go-faiss/index.go +++ b/vendor/github.com/blevesearch/go-faiss/index.go @@ -33,6 +33,15 @@ type Index interface { // Ntotal returns the number of indexed vectors. Ntotal() int64 + // set the direct map type for IVF indexes. + // 0 for No Map + // 1 for Array + // 2 for Hash + SetDirectMap(maptype int) error + + // set the number of probes for IVF indexes + SetNProbe(nprobe int32) + // MetricType returns the metric type of the index. MetricType() int @@ -48,42 +57,45 @@ type Index interface { // Returns true if the index is an IVF index. 
IsIVFIndex() bool - // Applicable only to IVF indexes: Returns a map where the keys - // are cluster IDs and the values represent the count of input vectors that belong - // to each cluster. - // This method only considers the given vecIDs and does not account for all - // vectors in the index. - // Example: - // If vecIDs = [1, 2, 3, 4, 5], and: - // - Vectors 1 and 2 belong to cluster 1 - // - Vectors 3, 4, and 5 belong to cluster 2 - // The output will be: map[1:2, 2:3] - ObtainClusterVectorCountsFromIVFIndex(vecIDs []int64) (map[int64]int64, error) - - // Applicable only to IVF indexes: Returns the centroid IDs in decreasing order - // of proximity to query 'x' and their distance from 'x' - ObtainClustersWithDistancesFromIVFIndex(x []float32, centroidIDs []int64) ( + // Returns true if the index is a scalar quantization (SQ) index. + IsSQIndex() bool + + // Returns true if the index has RaBitQ + HasRaBitQ() bool + + // Returns the IVF parameters nprobe and nlist for IVF indexes. + IVFParams() (nprobe, nlist int) + + // Applicable only to IVF indexes: Returns a slice where each index represents + // a cluster (list) ID and the value is the count of selected vectors belonging + // to that cluster. Only vectors specified by the given Selector are considered. + ObtainClusterVectorCountsFromIVFIndex(include Selector, nlist int) ([]int64, error) + + // Applicable only to IVF indexes: Returns the centroid IDs in the selector in + // decreasing order of proximity to query 'x' and their distance from 'x' + ObtainClustersWithDistancesFromIVFIndex(x []float32, centroids Selector, numCentroids int64) ( []int64, []float32, error) // Applicable only to IVF indexes: Returns the top k centroid cardinalities and // their vectors in chosen order (descending or ascending) ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ([]uint64, [][]float32, error) + // fetch centroid count + Nlist() int + // Search queries the index with the vectors in x. 
// Returns the IDs of the k nearest neighbors for each query vector and the // corresponding distances. Search(x []float32, k int64) (distances []float32, labels []int64, err error) - SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) (distances []float32, - labels []int64, err error) - - SearchWithIDs(x []float32, k int64, include []int64, params json.RawMessage) (distances []float32, - labels []int64, err error) + // SearchWithOptions performs a search with additional optional constraints. + // - Selector can be used to restrict the search to a subset of the indexed vectors based on their IDs. + // - params is a JSON object that can contain additional search parameters specific to the index type, such as IVF search parameters. + SearchWithOptions(x []float32, k int64, sel Selector, params json.RawMessage) (distances []float32, labels []int64, err error) // Applicable only to IVF indexes: Search clusters whose IDs are in eligibleCentroidIDs - SearchClustersFromIVFIndex(selector Selector, eligibleCentroidIDs []int64, - minEligibleCentroids int, k int64, x, centroidDis []float32, - params json.RawMessage) ([]float32, []int64, error) + SearchClustersFromIVFIndex(eligibleCentroidIDs []int64, centroidDis []float32, centroidsToProbe int, + x []float32, k int64, include Selector, params json.RawMessage) ([]float32, []int64, error) Reconstruct(key int64) ([]float32, error) @@ -95,6 +107,9 @@ type Index interface { // Returns all vectors with distance < radius. RangeSearch(x []float32, radius float32) (*RangeSearchResult, error) + // DistCompute computes the distance between the query vector and the vectors specified by ids. + DistCompute(x []float32, labels []int64) ([]float32, error) + // Reset removes all vectors from the index. 
Reset() error @@ -109,6 +124,10 @@ type Index interface { Size() uint64 cPtr() *C.FaissIndex + + // set the quantizers from a source index into this index, applicable only + // for IVF indexes + SetQuantizers(source Index) error } type faissIndex struct { @@ -156,24 +175,34 @@ func (idx *faissIndex) Add(x []float32) error { return nil } -func (idx *faissIndex) ObtainClusterVectorCountsFromIVFIndex(vecIDs []int64) (map[int64]int64, error) { - if !idx.IsIVFIndex() { - return nil, fmt.Errorf("index is not an IVF index") +func (idx *faissIndex) ObtainClusterVectorCountsFromIVFIndex(includedVectors Selector, nlist int) ([]int64, error) { + // Applicable only to IVF indexes + ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) + if ivfPtr == nil { + return nil, errNotIVFIndex } - clusterIDs := make([]int64, len(vecIDs)) - if c := C.faiss_get_lists_for_keys( - idx.idx, - (*C.idx_t)(unsafe.Pointer(&vecIDs[0])), - (C.size_t)(len(vecIDs)), - (*C.idx_t)(unsafe.Pointer(&clusterIDs[0])), + // Creating a slice to hold the count of vectors per cluster + // Since we have nlist clusters, we create a slice of size nlist + // listCount[i] will hold the count of vectors in cluster i + listCount := make([]int64, nlist) + // Creating a FAISS selector based on the include bitmap. + params, err := NewStandardSearchParams(includedVectors) + if err != nil { + return nil, err + } + defer params.Delete() + // Calling the C function to populate listCount + // with the count of vectors per cluster, considering only + // the vectors specified in the include selector. 
+ if c := C.faiss_IndexIVF_list_vector_count( + ivfPtr, + (*C.idx_t)(unsafe.Pointer(&listCount[0])), + C.size_t(nlist), + params.sp, ); c != 0 { return nil, getLastError() } - rv := make(map[int64]int64, len(vecIDs)) - for _, v := range clusterIDs { - rv[v]++ - } - return rv, nil + return listCount, nil } func (idx *faissIndex) IsIVFIndex() bool { @@ -183,36 +212,38 @@ func (idx *faissIndex) IsIVFIndex() bool { return true } -func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, centroidIDs []int64) ( +func (idx *faissIndex) HasRaBitQ() bool { + return C.faiss_IndexIVF_has_RaBitQ(idx.idx) == 0 +} + +func (idx *faissIndex) ObtainClustersWithDistancesFromIVFIndex(x []float32, includedCentroids Selector, numCentroids int64) ( []int64, []float32, error) { - // Selector to include only the centroids whose IDs are part of 'centroidIDs'. - includeSelector, err := NewIDSelectorBatch(centroidIDs) - if err != nil { - return nil, nil, err + // Applicable only to IVF indexes + ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) + if ivfPtr == nil { + return nil, nil, errNotIVFIndex } - defer includeSelector.Delete() - - params, err := NewSearchParams(idx, json.RawMessage{}, includeSelector.Get(), nil) + params, err := NewStandardSearchParams(includedCentroids) if err != nil { return nil, nil, err } defer params.Delete() // Populate these with the centroids and their distances. 
- centroids := make([]int64, len(centroidIDs)) - centroidDistances := make([]float32, len(centroidIDs)) + centroids := make([]int64, numCentroids) + centroidDistances := make([]float32, numCentroids) n := len(x) / idx.D() - c := C.faiss_Search_closest_eligible_centroids( - idx.idx, + if c := C.faiss_IndexIVF_search_closest_eligible_centroids( + ivfPtr, (C.idx_t)(n), (*C.float)(&x[0]), - (C.idx_t)(len(centroidIDs)), + (C.idx_t)(numCentroids), (*C.float)(¢roidDistances[0]), (*C.idx_t)(¢roids[0]), - params.sp) - if c != 0 { + params.sp, + ); c != 0 { return nil, nil, getLastError() } @@ -284,19 +315,38 @@ func getIndicesOfKCentroidCardinalities(cardinalities []C.size_t, k int, descend return indices[:k] } +func (idx *faissIndex) Nlist() int { + ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) + if ivfPtr == nil { + return 0 + } + return int(C.faiss_IndexIVF_nlist(idx.idx)) +} -func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, - eligibleCentroidIDs []int64, minEligibleCentroids int, k int64, x, - centroidDis []float32, params json.RawMessage) ([]float32, []int64, error) { - +func (idx *faissIndex) SearchClustersFromIVFIndex(eligibleCentroidIDs []int64, centroidDis []float32, centroidsToProbe int, + x []float32, k int64, include Selector, params json.RawMessage) ([]float32, []int64, error) { + // Applicable only to IVF indexes + ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) + if ivfPtr == nil { + return nil, nil, errNotIVFIndex + } + // If no include selector is provided, we have no results to return. + // return an error indicating that the SearchClustersFromIVFIndex requires a valid selector. 
+ if include == nil { + return nil, nil, fmt.Errorf("SearchClustersFromIVFIndex requires a valid include selector") + } + // create a temporary search params object to set nprobe, this will override + // the nprobe and the nlist set at index time, this will allow the search to + // probe only the clusters specified in eligibleCentroidIDs tempParams := &defaultSearchParamsIVF{ + // Nlist is set to the number of eligible centroids, which will override + // the nlist set at index time. Nlist: len(eligibleCentroidIDs), // Have to override nprobe so that more clusters will be searched for this // query, if required. - Nprobe: minEligibleCentroids, + Nprobe: centroidsToProbe, } - - searchParams, err := NewSearchParams(idx, params, selector.Get(), tempParams) + searchParams, err := NewSearchParams(idx, params, include, tempParams) if err != nil { return nil, nil, err } @@ -306,13 +356,17 @@ func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, distances := make([]float32, int64(n)*k) labels := make([]int64, int64(n)*k) - - effectiveNprobe := getNProbeFromSearchParams(searchParams) + // Adjust the slices to match the effective nprobe set in searchParams, as the input + // parameters may have different nprobe value, which will be a hard override, over the + // centroidsToProbe value passed to this function. + // If the effective nprobe is greater than the length of eligibleCentroidIDs, + // we limit it to the length of eligibleCentroidIDs. 
+ effectiveNprobe := min(getNProbeFromSearchParams(searchParams), int32(len(eligibleCentroidIDs))) eligibleCentroidIDs = eligibleCentroidIDs[:effectiveNprobe] centroidDis = centroidDis[:effectiveNprobe] if c := C.faiss_IndexIVF_search_preassigned_with_params( - idx.idx, + ivfPtr, (C.idx_t)(n), (*C.float)(&x[0]), (C.idx_t)(k), @@ -321,7 +375,8 @@ func (idx *faissIndex) SearchClustersFromIVFIndex(selector Selector, (*C.float)(&distances[0]), (*C.idx_t)(&labels[0]), (C.int)(0), - searchParams.sp); c != 0 { + searchParams.sp, + ); c != 0 { return nil, nil, getLastError() } @@ -341,6 +396,9 @@ func (idx *faissIndex) AddWithIDs(x []float32, xids []int64) error { return nil } +// Always use SearchWithOptions for indexes involving RaBitQ, as +// simple Search is highly unoptimized for RaBitQ indexes and +// will not leverage the quantizer for search. func (idx *faissIndex) Search(x []float32, k int64) ( distances []float32, labels []int64, err error, ) { @@ -361,51 +419,11 @@ func (idx *faissIndex) Search(x []float32, k int64) ( return } -func (idx *faissIndex) SearchWithoutIDs(x []float32, k int64, exclude []int64, params json.RawMessage) ( - distances []float32, labels []int64, err error, -) { - if params == nil && len(exclude) == 0 { +func (idx *faissIndex) SearchWithOptions(x []float32, k int64, sel Selector, params json.RawMessage) ([]float32, []int64, error) { + if sel == nil && params == nil && !idx.HasRaBitQ() { return idx.Search(x, k) } - - var selector *C.FaissIDSelector - if len(exclude) > 0 { - excludeSelector, err := NewIDSelectorNot(exclude) - if err != nil { - return nil, nil, err - } - selector = excludeSelector.Get() - defer excludeSelector.Delete() - } - - searchParams, err := NewSearchParams(idx, params, selector, nil) - if err != nil { - return nil, nil, err - } - defer searchParams.Delete() - - distances, labels, err = idx.searchWithParams(x, k, searchParams.sp) - - return -} - -func (idx *faissIndex) SearchWithIDs(x []float32, k int64, include 
[]int64, - params json.RawMessage) (distances []float32, labels []int64, err error, -) { - includeSelector, err := NewIDSelectorBatch(include) - if err != nil { - return nil, nil, err - } - defer includeSelector.Delete() - - searchParams, err := NewSearchParams(idx, params, includeSelector.Get(), nil) - if err != nil { - return nil, nil, err - } - defer searchParams.Delete() - - distances, labels, err = idx.searchWithParams(x, k, searchParams.sp) - return + return idx.searchWithOptions(x, k, sel, params) } func (idx *faissIndex) Reconstruct(key int64) (recons []float32, err error) { @@ -436,22 +454,17 @@ func (idx *faissIndex) ReconstructBatch(keys []int64, recons []float32) ([]float return recons, err } -func (i *IndexImpl) MergeFrom(other Index, add_id int64) error { - if impl, ok := other.(*IndexImpl); ok { - return i.Index.MergeFrom(impl.Index, add_id) - } - return fmt.Errorf("merge not support") -} - func (idx *faissIndex) MergeFrom(other Index, add_id int64) (err error) { - otherIdx, ok := other.(*faissIndex) - if !ok { - return fmt.Errorf("merge api not supported") + // currrently we support the mergeFrom API only for IVF and SQ indexes + // todo: support on Flat index as well + if !(idx.IsIVFIndex() && other.IsIVFIndex()) && + !(idx.IsSQIndex() && other.IsSQIndex()) { + return fmt.Errorf("faissIndex MergeFrom err: %w", errMergeFromNotSupported) } if c := C.faiss_Index_merge_from( - idx.idx, - otherIdx.idx, + idx.cPtr(), + other.cPtr(), (C.idx_t)(add_id), ); c != 0 { err = getLastError() @@ -480,6 +493,16 @@ func (idx *faissIndex) RangeSearch(x []float32, radius float32) ( return &RangeSearchResult{rsr}, nil } +func (idx *faissIndex) DistCompute(queryData []float32, ids []int64) ([]float32, error) { + distances := make([]float32, len(ids)) + if c := C.faiss_Index_dist_compute(idx.idx, (*C.float)(&queryData[0]), + (*C.idx_t)(&ids[0]), (C.size_t)(len(ids)), (*C.float)(&distances[0])); c != 0 { + return nil, getLastError() + } + + return distances, nil +} + 
func (idx *faissIndex) Reset() error { if c := C.faiss_Index_reset(idx.idx); c != 0 { return getLastError() @@ -499,26 +522,30 @@ func (idx *faissIndex) Close() { C.faiss_Index_free(idx.idx) } -func (idx *faissIndex) searchWithParams(x []float32, k int64, searchParams *C.FaissSearchParameters) ( - distances []float32, labels []int64, err error, -) { +func (idx *faissIndex) searchWithOptions(x []float32, k int64, sel Selector, params json.RawMessage) ([]float32, []int64, error) { + // Build a search params object to contain either the selector, the additional params, or both. + searchParams, err := NewSearchParams(idx, params, sel, nil) + if err != nil { + return nil, nil, err + } + defer searchParams.Delete() + n := len(x) / idx.D() - distances = make([]float32, int64(n)*k) - labels = make([]int64, int64(n)*k) + distances := make([]float32, int64(n)*k) + labels := make([]int64, int64(n)*k) if c := C.faiss_Index_search_with_params( idx.idx, C.idx_t(n), (*C.float)(&x[0]), C.idx_t(k), - searchParams, + searchParams.sp, (*C.float)(&distances[0]), (*C.idx_t)(&labels[0]), ); c != 0 { - err = getLastError() + return nil, nil, getLastError() } - - return + return distances, labels, nil } // ----------------------------------------------------------------------------- diff --git a/vendor/github.com/blevesearch/go-faiss/index_binary.go b/vendor/github.com/blevesearch/go-faiss/index_binary.go new file mode 100644 index 0000000000..83f6dfe53a --- /dev/null +++ b/vendor/github.com/blevesearch/go-faiss/index_binary.go @@ -0,0 +1,465 @@ +package faiss + +/* +#include +#include +#include +#include +#include +#include +#include +*/ +import "C" +import ( + "encoding/json" + "fmt" + "unsafe" +) + +type BinaryIndex interface { + // D returns the dimension of the indexed vectors. + D() int + + // MetricType returns the metric type of the index. + MetricType() int + + // Ntotal returns the total number of vectors currently stored in the index. 
+ Ntotal() int64 + + // set the direct map type for IVF indexes. + // 0 for No Map + // 1 for Array + // 2 for Hash + SetDirectMap(maptype int) error + + // set the number of probes for IVF indexes + SetNProbe(nprobe int32) + + // returns true if the underlying index is an IVF index + IsIVFIndex() bool + + // IVFParams returns the nlist and nprobe parameters for IVF indexes + IVFParams() (nprobe int, nlist int) + + // trains the index on a representative set of vectors + Train(xb []uint8) error + + // adds vectors to the index + Add(xb []uint8) error + + // sets the qunatizers from the source index, supposed to be used only for + // BIVF indexes and returns error otherwise + SetQuantizers(srcIndex BinaryIndex) error + + // merges another binary index into this one, currently applicable only for + // IVF indexes returns an error + MergeFrom(other BinaryIndex, add_id int64) error + + // queries the index with the vectors in xb + // returns the IDs of the k nearest neighbors for each query vector and + // their corresponding distances + Search(xb []uint8, k int64) (distances []int32, labels []int64, err error) + + // SearchWithOptions performs a search with additional optional constraints. + // - Selector can be used to restrict the search to a subset of the indexed vectors based on their IDs. + // - params is a JSON object that can contain additional search parameters specific to the index type, such as IVF search parameters. + SearchWithOptions(xb []uint8, k int64, sel Selector, params json.RawMessage) (distances []int32, labels []int64, err error) + + // returns a slice where each index corresponds to a cluster in an IVF + // index, and the value at each index is the count of vectors in that + // cluster, considering only the vectors specified in the include selector. 
+ ObtainClusterVectorCountsFromIVFIndex(include Selector, nlist int) ( + []int64, error) + + // returns the IDs and distances of the closest numCentroids centroids to + // the query vector xb, considering only the centroids specified in the + // includedCentroids selector. + ObtainClustersWithDistancesFromIVFIndex(xb []uint8, includedCentroids Selector, + numCentroids int64) ([]int64, []int32, error) + + // Applicable only to IVF indexes: Returns the top k centroid cardinalities and + // their vectors in chosen order (descending or ascending) + ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ([]uint64, [][]uint8, error) + + // searches the specified clusters in an IVF index for the k nearest neighbors + // of the query vector xb, considering only the vectors specified in the include selector + // and additional search parameters passed as a JSON object. + SearchClustersFromIVFIndex(eligibleCentroidIDs []int64, centroidDis []int32, + centroidsToProbe int, xb []uint8, k int64, include Selector, + params json.RawMessage) ([]int32, []int64, error) + + // returns the total size of the index in bytes + Size() uint64 + + // frees the memory associated with the index + Close() + + bPtr() *C.FaissIndexBinary +} + +type faissBinaryIndex struct { + bIdx *C.FaissIndexBinary +} + +func (b *faissBinaryIndex) bPtr() *C.FaissIndexBinary { + return b.bIdx +} + +func (b *faissBinaryIndex) D() int { + return int(C.faiss_IndexBinary_d(b.bIdx)) +} + +func (b *faissBinaryIndex) MetricType() int { + return int(C.faiss_IndexBinary_metric_type(b.bIdx)) +} + +func (b *faissBinaryIndex) Ntotal() int64 { + return int64(C.faiss_IndexBinary_ntotal(b.bIdx)) +} + +func (b *faissBinaryIndex) SetDirectMap(mapType int) (err error) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return errNotBIVFIndex + } + if c := C.faiss_IndexBinaryIVF_set_direct_map( + ivfPtrBinary, + C.int(mapType), + ); c != 0 { + err = 
getLastError() + } + return err +} + +func (b *faissBinaryIndex) SetNProbe(nprobe int32) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return + } + C.faiss_IndexBinaryIVF_set_nprobe(ivfPtrBinary, C.size_t(nprobe)) +} + +func (b *faissBinaryIndex) IsIVFIndex() bool { + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + return ivfPtrBinary != nil +} + +func (b *faissBinaryIndex) IVFParams() (nprobe int, nlist int) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return 0, 0 + } + nlist = int(C.faiss_IndexBinaryIVF_nlist(ivfPtrBinary)) + nprobe = int(C.faiss_IndexBinaryIVF_nprobe(ivfPtrBinary)) + return nprobe, nlist +} + +func (b *faissBinaryIndex) Train(x []uint8) error { + n := (len(x) * 8) / b.D() + if c := C.faiss_IndexBinary_train(b.bIdx, C.idx_t(n), + (*C.uint8_t)(&x[0])); c != 0 { + return getLastError() + } + return nil +} + +func (b *faissBinaryIndex) Add(x []uint8) error { + n := (len(x) * 8) / b.D() + if c := C.faiss_IndexBinary_add(b.bIdx, C.idx_t(n), + (*C.uint8_t)(&x[0])); c != 0 { + return getLastError() + } + return nil +} + +func (b *faissBinaryIndex) Search(xb []uint8, k int64) ( + []int32, []int64, error) { + nq := (len(xb) * 8) / b.D() + distances := make([]int32, int64(nq)*k) + labels := make([]int64, int64(nq)*k) + + if c := C.faiss_IndexBinary_search( + b.bIdx, + C.idx_t(nq), + (*C.uint8_t)(&xb[0]), + C.idx_t(k), + (*C.int32_t)(&distances[0]), + (*C.idx_t)(&labels[0]), + ); c != 0 { + return nil, nil, getLastError() + } + return distances, labels, nil +} + +func (b *faissBinaryIndex) SearchWithOptions(xb []uint8, k int64, sel Selector, params json.RawMessage) ([]int32, []int64, error) { + if sel == nil && params == nil { + return b.Search(xb, k) + } + return b.searchWithOptions(xb, k, sel, params) +} + +func (b *faissBinaryIndex) searchWithOptions(xb []uint8, k int64, selector Selector, + params 
json.RawMessage) ([]int32, []int64, error) { + // Build a binary search params object to contain either the selector, the additional params, or both. + searchParams, err := NewBinarySearchParams(b, params, selector, nil) + if err != nil { + return nil, nil, err + } + defer searchParams.Delete() + + nq := (len(xb) * 8) / b.D() + distances := make([]int32, int64(nq)*k) + labels := make([]int64, int64(nq)*k) + + if c := C.faiss_IndexBinary_search_with_params( + b.bIdx, + C.idx_t(nq), + (*C.uint8_t)(&xb[0]), + C.idx_t(k), + searchParams.sp, + (*C.int32_t)(&distances[0]), + (*C.idx_t)(&labels[0]), + ); c != 0 { + return nil, nil, getLastError() + } + return distances, labels, nil +} + +func (b *faissBinaryIndex) ObtainClusterVectorCountsFromIVFIndex(includedVectors Selector, nlist int) ([]int64, error) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return nil, errNotBIVFIndex + } + // Creating a slice to hold the count of vectors per cluster + // Since we have nlist clusters, we create a slice of size nlist + // listCount[i] will hold the count of vectors in cluster i + listCount := make([]int64, nlist) + // Creating a FAISS selector based on the include bitmap. + params, err := NewStandardSearchParams(includedVectors) + if err != nil { + return nil, err + } + defer params.Delete() + // Calling the C function to populate listCount + // with the count of vectors per cluster, considering only + // the vectors specified in the include selector. 
+ if c := C.faiss_IndexBinaryIVF_list_vector_count( + ivfPtrBinary, + (*C.idx_t)(unsafe.Pointer(&listCount[0])), + C.size_t(nlist), + params.sp, + ); c != 0 { + return nil, getLastError() + } + return listCount, nil +} + +func (b *faissBinaryIndex) ObtainClustersWithDistancesFromIVFIndex(xb []uint8, includedCentroids Selector, numCentroids int64) ([]int64, []int32, error) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return nil, nil, errNotBIVFIndex + } + params, err := NewStandardSearchParams(includedCentroids) + if err != nil { + return nil, nil, err + } + defer params.Delete() + + // Populate these with the centroids and their distances. + centroids := make([]int64, numCentroids) + centroidDistances := make([]int32, numCentroids) + + n := (len(xb) * 8) / b.D() + + if c := C.faiss_IndexBinaryIVF_search_closest_eligible_centroids( + ivfPtrBinary, + (C.idx_t)(n), + (*C.uint8_t)(&xb[0]), + (C.idx_t)(numCentroids), + (*C.int32_t)(¢roidDistances[0]), + (*C.idx_t)(¢roids[0]), + params.sp, + ); c != 0 { + return nil, nil, getLastError() + } + + return centroids, centroidDistances, nil +} + +func (b *faissBinaryIndex) ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ( + []uint64, [][]uint8, error) { + if limit <= 0 { + return nil, nil, nil + } + + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return nil, nil, errNotBIVFIndex + } + + nlist := int(C.faiss_IndexBinaryIVF_nlist(ivfPtrBinary)) + if nlist == 0 { + return nil, nil, nil + } + + centroidCardinalities := make([]C.size_t, nlist) + + // Allocate a flat buffer for all centroids, then slice it per centroid + d := b.D() + flatCentroids := make([]uint8, nlist*d/8) + + // Call the C function to fill centroid vectors and cardinalities + c := C.faiss_IndexBinaryIVF_get_centroids_and_cardinality( + ivfPtrBinary, + (*C.uint8_t)(&flatCentroids[0]), + 
(*C.size_t)(¢roidCardinalities[0]), + nil, + ) + if c != 0 { + return nil, nil, getLastError() + } + + topIndices := getIndicesOfKCentroidCardinalities( + centroidCardinalities, + min(limit, nlist), + descending) + + rvCardinalities := make([]uint64, len(topIndices)) + rvCentroids := make([][]uint8, len(topIndices)) + + for i, idx := range topIndices { + rvCardinalities[i] = uint64(centroidCardinalities[idx]) + rvCentroids[i] = flatCentroids[idx*d : (idx+1)*d] + } + + return rvCardinalities, rvCentroids, nil + +} + +func (b *faissBinaryIndex) SearchClustersFromIVFIndex(eligibleCentroidIDs []int64, centroidDis []int32, centroidsToProbe int, + xb []uint8, k int64, include Selector, params json.RawMessage) ([]int32, []int64, error) { + // Applicable only to IVF indexes + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(b.bIdx) + if ivfPtrBinary == nil { + return nil, nil, errNotBIVFIndex + } + // If no include selector is provided, we have no results to return. + // return an error indicating that the SearchClustersFromIVFIndex requires a valid selector. + if include == nil { + return nil, nil, fmt.Errorf("SearchClustersFromIVFIndex requires a valid include selector") + } + // create a temporary search params object to set nprobe, this will override + // the nprobe and the nlist set at index time, this will allow the search to + // probe only the clusters specified in eligibleCentroidIDs + tempParams := &defaultSearchParamsIVF{ + // Nlist is set to the number of eligible centroids, which will override + // the nlist set at index time. + Nlist: len(eligibleCentroidIDs), + // Have to override nprobe so that more clusters will be searched for this + // query, if required. 
+ Nprobe: centroidsToProbe, + } + searchParams, err := NewBinarySearchParams(b, params, include, tempParams) + if err != nil { + return nil, nil, err + } + defer searchParams.Delete() + + n := (len(xb) * 8) / b.D() + + distances := make([]int32, int64(n)*k) + labels := make([]int64, int64(n)*k) + // Adjust the slices to match the effective nprobe set in searchParams, as the input + // parameters may have different nprobe value, which will be a hard override, over the + // centroidsToProbe value passed to this function. + // If the effective nprobe is greater than the length of eligibleCentroidIDs, + // we limit it to the length of eligibleCentroidIDs. + effectiveNprobe := min(getNProbeFromSearchParams(searchParams), int32(len(eligibleCentroidIDs))) + eligibleCentroidIDs = eligibleCentroidIDs[:effectiveNprobe] + centroidDis = centroidDis[:effectiveNprobe] + + if c := C.faiss_IndexBinaryIVF_search_preassigned_with_params( + ivfPtrBinary, + (C.idx_t)(n), + (*C.uint8_t)(&xb[0]), + (C.idx_t)(k), + (*C.idx_t)(&eligibleCentroidIDs[0]), + (*C.int32_t)(¢roidDis[0]), + (*C.int32_t)(&distances[0]), + (*C.idx_t)(&labels[0]), + (C.int)(0), + searchParams.sp, + ); c != 0 { + return nil, nil, getLastError() + } + + return distances, labels, nil +} + +func (b *faissBinaryIndex) Size() uint64 { + size := C.faiss_IndexBinary_size(b.bIdx) + return uint64(size) +} + +func (idx *faissBinaryIndex) Close() { + C.faiss_IndexBinary_free(idx.bIdx) +} + +type BinaryIndexImpl struct { + BinaryIndex +} + +func BinaryIndexFactory(dims int, description string) (*BinaryIndexImpl, error) { + var cDescription *C.char + if description != "" { + cDescription = C.CString(description) + defer C.free(unsafe.Pointer(cDescription)) + } + var idx faissBinaryIndex + if c := C.faiss_index_binary_factory(&idx.bIdx, C.int(dims), cDescription); c != 0 { + return nil, getLastError() + } + + return &BinaryIndexImpl{&idx}, nil +} + +func (idx *faissBinaryIndex) SetQuantizers(srcIndex BinaryIndex) error { + bivf := 
C.faiss_IndexBinaryIVF_cast(idx.bPtr()) + if bivf == nil { + return errNotBIVFIndex + } + + srcIndexPtr := srcIndex.bPtr() + if srcIndexPtr == nil { + return fmt.Errorf("coarse quantizer is not valid") + } + + err := C.faiss_Set_quantizers_binary(idx.bIdx, srcIndexPtr) + if err != 0 { + return fmt.Errorf("faissBinaryIndex err: %w", errFailedToSetQuantizers) + } + + return nil +} + +func (idx *faissBinaryIndex) MergeFrom(other BinaryIndex, add_id int64) (err error) { + if !idx.IsIVFIndex() && !other.IsIVFIndex() { + return fmt.Errorf("faissBinaryIndex err: %w", errNotBIVFIndex) + } + + if c := C.faiss_IndexBinaryIVF_merge_from( + idx.bPtr(), + other.bPtr(), + (C.idx_t)(add_id), + ); c != 0 { + err = getLastError() + } + + return err +} diff --git a/vendor/github.com/blevesearch/go-faiss/index_flat.go b/vendor/github.com/blevesearch/go-faiss/index_flat.go index b8a3c03880..65d3bb64ee 100644 --- a/vendor/github.com/blevesearch/go-faiss/index_flat.go +++ b/vendor/github.com/blevesearch/go-faiss/index_flat.go @@ -44,13 +44,3 @@ func (idx *IndexFlat) Xb() []float32 { C.faiss_IndexFlat_xb(idx.cPtr(), &ptr, &size) return (*[1 << 30]float32)(unsafe.Pointer(ptr))[:size:size] } - -// AsFlat casts idx to a flat index. -// AsFlat panics if idx is not a flat index. 
-func (idx *IndexImpl) AsFlat() *IndexFlat { - ptr := C.faiss_IndexFlat_cast(idx.cPtr()) - if ptr == nil { - panic("index is not a flat index") - } - return &IndexFlat{&faissIndex{ptr}} -} diff --git a/vendor/github.com/blevesearch/go-faiss/index_io.go b/vendor/github.com/blevesearch/go-faiss/index_io.go index 608f4d75fe..830877f6f0 100644 --- a/vendor/github.com/blevesearch/go-faiss/index_io.go +++ b/vendor/github.com/blevesearch/go-faiss/index_io.go @@ -118,3 +118,43 @@ func ReadIndex(filename string, ioflags int) (*IndexImpl, error) { } return &IndexImpl{&idx}, nil } + +func WriteBinaryIndexIntoBuffer(idx BinaryIndex) ([]byte, error) { + // the values to be returned by the faiss APIs + tempBuf := (*C.uchar)(nil) + bufSize := C.size_t(0) + + if c := C.faiss_write_index_binary_buf( + idx.bPtr(), + &bufSize, + &tempBuf, + ); c != 0 { + C.faiss_free_buf(&tempBuf) + return nil, getLastError() + } + + val := unsafe.Slice((*byte)(unsafe.Pointer(tempBuf)), uint(bufSize)) + + rv := make([]byte, uint(bufSize)) + copy(rv, val) + + C.faiss_free_buf(&tempBuf) + val = nil + + return rv, nil +} + +func ReadBinaryIndexFromBuffer(buf []byte, ioflags int) (*BinaryIndexImpl, error) { + ptr := (*C.uchar)(unsafe.Pointer(&buf[0])) + size := C.size_t(len(buf)) + + var bIdx faissBinaryIndex + if c := C.faiss_read_index_binary_buf(ptr, + size, + C.int(ioflags), + &bIdx.bIdx); c != 0 { + return nil, getLastError() + } + + return &BinaryIndexImpl{&bIdx}, nil +} diff --git a/vendor/github.com/blevesearch/go-faiss/index_ivf.go b/vendor/github.com/blevesearch/go-faiss/index_ivf.go index 38f023aa90..0ae0fc737b 100644 --- a/vendor/github.com/blevesearch/go-faiss/index_ivf.go +++ b/vendor/github.com/blevesearch/go-faiss/index_ivf.go @@ -6,17 +6,18 @@ package faiss #include #include #include +#include */ import "C" import ( "fmt" ) -func (idx *IndexImpl) SetDirectMap(mapType int) (err error) { +func (idx *faissIndex) SetDirectMap(mapType int) (err error) { ivfPtr := 
C.faiss_IndexIVF_cast(idx.cPtr()) if ivfPtr == nil { - return fmt.Errorf("index is not of ivf type") + return errNotIVFIndex } if c := C.faiss_IndexIVF_set_direct_map( ivfPtr, @@ -27,7 +28,7 @@ func (idx *IndexImpl) SetDirectMap(mapType int) (err error) { return err } -func (idx *IndexImpl) GetSubIndex() (*IndexImpl, error) { +func (idx *faissIndex) GetSubIndex() (Index, error) { ptr := C.faiss_IndexIDMap2_cast(idx.cPtr()) if ptr == nil { @@ -44,7 +45,7 @@ func (idx *IndexImpl) GetSubIndex() (*IndexImpl, error) { // pass nprobe to be set as index time option for IVF indexes only. // varying nprobe impacts recall but with an increase in latency. -func (idx *IndexImpl) SetNProbe(nprobe int32) { +func (idx *faissIndex) SetNProbe(nprobe int32) { ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) if ivfPtr == nil { return @@ -52,10 +53,35 @@ func (idx *IndexImpl) SetNProbe(nprobe int32) { C.faiss_IndexIVF_set_nprobe(ivfPtr, C.size_t(nprobe)) } -func (idx *IndexImpl) GetNProbe() int32 { +func (idx *faissIndex) IVFParams() (nprobe, nlist int) { ivfPtr := C.faiss_IndexIVF_cast(idx.cPtr()) if ivfPtr == nil { - return 0 + return 0, 0 } - return int32(C.faiss_IndexIVF_nprobe(ivfPtr)) + return int(C.faiss_IndexIVF_nprobe(ivfPtr)), + int(C.faiss_IndexIVF_nlist(ivfPtr)) +} + +func (idx *faissIndex) IsSQIndex() bool { + sqPtr := C.faiss_IndexScalarQuantizer_cast(idx.cPtr()) + return sqPtr != nil +} + +func (idx *faissIndex) SetQuantizers(srcIndex Index) error { + if !(idx.IsIVFIndex() && srcIndex.IsIVFIndex()) && + !(idx.IsSQIndex() && srcIndex.IsSQIndex()) { + return fmt.Errorf("faissIndex SetQuantizers: %w, index type not supported", errFailedToSetQuantizers) + } + + srcIndexPtr := srcIndex.cPtr() + if srcIndexPtr == nil { + return fmt.Errorf("coarse quantizer is not valid") + } + + err := C.faiss_Set_quantizers(idx.idx, srcIndexPtr) + if err != 0 { + return fmt.Errorf("faissIndex SetQuantizers: %w", errFailedToSetQuantizers) + } + + return nil } diff --git 
a/vendor/github.com/blevesearch/go-faiss/search_params.go b/vendor/github.com/blevesearch/go-faiss/search_params.go index 6086073823..baa028e458 100644 --- a/vendor/github.com/blevesearch/go-faiss/search_params.go +++ b/vendor/github.com/blevesearch/go-faiss/search_params.go @@ -3,6 +3,8 @@ package faiss /* #include #include +#include +#include #include */ import "C" @@ -54,60 +56,148 @@ func getNProbeFromSearchParams(params *SearchParams) int32 { return int32(C.faiss_SearchParametersIVF_nprobe(params.sp)) } -// Returns a valid SearchParams object, -// thus caller must clean up the object -// by invoking Delete() method. -func NewSearchParams(idx Index, params json.RawMessage, sel *C.FaissIDSelector, +// Returns a valid SearchParams object, configured according to the provided +// parameters and selector. The returned SearchParams object is allocated, +// thus caller must clean up the object by invoking Delete() method. +func NewSearchParams(idx Index, params json.RawMessage, selector Selector, defaultParams *defaultSearchParamsIVF) (*SearchParams, error) { - rv := &SearchParams{} - if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 { - return nil, fmt.Errorf("failed to create faiss search params") + // Get the selector C pointer, if any. + // A nil selector indicates no ID filtering, and it is valid + // to send a nil pointer to Faiss. 
+ var sel *C.FaissIDSelector + if selector != nil { + sel = selector.Get() } - // check if the index is IVF and set the search params - if ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()); ivfIdx != nil { - rv.sp = C.faiss_SearchParametersIVF_cast(rv.sp) - if len(params) == 0 && sel == nil { - return rv, nil + + ivfIdx := C.faiss_IndexIVF_cast(idx.cPtr()) + // if the index is not an IVF index, create a standard SearchParameters object + if ivfIdx == nil { + rv := &SearchParams{} + // Create standard SearchParameters for non-IVF index + if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 { + return nil, fmt.Errorf("failed to create faiss search params") } - var nlist, nprobe, nvecs, maxCodes int - nlist = int(C.faiss_IndexIVF_nlist(ivfIdx)) - nprobe = int(C.faiss_IndexIVF_nprobe(ivfIdx)) - nvecs = int(C.faiss_Index_ntotal(idx.cPtr())) - if defaultParams != nil { - if defaultParams.Nlist > 0 { - nlist = defaultParams.Nlist - } - if defaultParams.Nprobe > 0 { - nprobe = defaultParams.Nprobe - } + return rv, nil + } + + nlist := int(C.faiss_IndexIVF_nlist(ivfIdx)) + nprobe := int(C.faiss_IndexIVF_nprobe(ivfIdx)) + nvecs := int(C.faiss_Index_ntotal(idx.cPtr())) + + maxCodes, nprobe, err := resolveSearchParams(params, defaultParams, nlist, nprobe, nvecs) + if err != nil { + return nil, err + } + + if idx.HasRaBitQ() { + return buildRaBitQSearchParams(maxCodes, nprobe, sel) + } + return buildIVFSearchParams(maxCodes, nprobe, sel) +} + +func resolveSearchParams(params json.RawMessage, defaultParams *defaultSearchParamsIVF, + nlist, nprobe, nvecs int) (int, int, error) { + if defaultParams != nil { + if defaultParams.Nlist > 0 { + nlist = defaultParams.Nlist } - var ivfParams searchParamsIVF - if len(params) > 0 { - if err := json.Unmarshal(params, &ivfParams); err != nil { - rv.Delete() - return nil, fmt.Errorf("failed to unmarshal IVF search params, "+ - "err:%v", err) - } - if err := ivfParams.Validate(); err != nil { - rv.Delete() - return nil, err - } + if 
defaultParams.Nprobe > 0 { + nprobe = defaultParams.Nprobe } - if ivfParams.NprobePct > 0 { - nprobe = max(int(float32(nlist)*(ivfParams.NprobePct/100)), 1) + } + var ivfParams searchParamsIVF + if len(params) > 0 { + if err := json.Unmarshal(params, &ivfParams); err != nil { + return 0, 0, fmt.Errorf("failed to unmarshal IVF search params, "+ + "err:%v", err) } - if ivfParams.MaxCodesPct > 0 { - maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100)) - } // else, maxCodes will be set to the default value of 0, which means no limit - if c := C.faiss_SearchParametersIVF_new_with( - &rv.sp, - sel, - C.size_t(nprobe), - C.size_t(maxCodes), - ); c != 0 { - rv.Delete() - return nil, fmt.Errorf("failed to create faiss IVF search params") + if err := ivfParams.Validate(); err != nil { + return 0, 0, err } } + if ivfParams.NprobePct > 0 { + nprobe = max(int(float32(nlist)*(ivfParams.NprobePct/100)), 1) + } + var maxCodes int + if ivfParams.MaxCodesPct > 0 { + maxCodes = int(float32(nvecs) * (ivfParams.MaxCodesPct / 100)) + } // else, maxCodes will be set to the default value of 0, which means no limit + return maxCodes, nprobe, nil +} + +func buildIVFSearchParams(maxCodes, nprobe int, sel *C.FaissIDSelector) (*SearchParams, error) { + sp := &SearchParams{} + if c := C.faiss_SearchParametersIVF_new_with( + &sp.sp, + sel, + C.size_t(nprobe), + C.size_t(maxCodes), + ); c != 0 { + return nil, fmt.Errorf("failed to create faiss IVF search params") + } + + return sp, nil +} + +func buildRaBitQSearchParams(maxCodes, nprobe int, sel *C.FaissIDSelector) (*SearchParams, error) { + sp := &SearchParams{} + if c := C.faiss_SearchParametersRaBitQ_new_with( + &sp.sp, + sel, + C.size_t(nprobe), + C.size_t(maxCodes), + ); c != 0 { + return nil, fmt.Errorf("failed to create faiss IVF RaBitQ search params") + } + + return sp, nil +} + +// Returns a standard SearchParams object without any special settings with +// the provided selector. 
The returned SearchParams object is allocated, +// thus caller must clean up the object by invoking Delete() method. +func NewStandardSearchParams(selector Selector) (*SearchParams, error) { + var sel *C.FaissIDSelector + if selector != nil { + sel = selector.Get() + } + rv := &SearchParams{} + if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 { + return nil, fmt.Errorf("failed to create faiss search params") + } return rv, nil } + +func NewBinarySearchParams(idx BinaryIndex, params json.RawMessage, selector Selector, + defaultParams *defaultSearchParamsIVF) (*SearchParams, error) { + // Get the selector C pointer, if any. + // A nil selector indicates no ID filtering, and it is valid + // to send a nil pointer to Faiss. + var sel *C.FaissIDSelector + if selector != nil { + sel = selector.Get() + } + + ivfPtrBinary := C.faiss_IndexBinaryIVF_cast(idx.bPtr()) + + // if the index is not an IVF index, create a standard SearchParameters object + if ivfPtrBinary == nil { + rv := &SearchParams{} + // Create standard SearchParameters for non-IVF index + if c := C.faiss_SearchParameters_new(&rv.sp, sel); c != 0 { + return nil, fmt.Errorf("failed to create faiss search params") + } + return rv, nil + } + + nlist := int(C.faiss_IndexBinaryIVF_nlist(ivfPtrBinary)) + nprobe := int(C.faiss_IndexBinaryIVF_nprobe(ivfPtrBinary)) + nvecs := int(C.faiss_IndexBinary_ntotal(idx.bPtr())) + + maxCodes, nprobe, err := resolveSearchParams(params, defaultParams, nlist, nprobe, nvecs) + if err != nil { + return nil, err + } + + return buildIVFSearchParams(maxCodes, nprobe, sel) +} diff --git a/vendor/github.com/blevesearch/go-faiss/selector.go b/vendor/github.com/blevesearch/go-faiss/selector.go index 8e95c4618f..d250096d41 100644 --- a/vendor/github.com/blevesearch/go-faiss/selector.go +++ b/vendor/github.com/blevesearch/go-faiss/selector.go @@ -5,36 +5,30 @@ package faiss */ import "C" +// Note: currently we have only one implementation, but we keep the interface for future 
extensibility type Selector interface { + ExcludeFilter() bool Get() *C.FaissIDSelector Delete() } // IDSelector represents a set of IDs to remove. type IDSelector struct { - sel *C.FaissIDSelector -} - -// Delete frees the memory associated with s. -func (s *IDSelector) Delete() { - if s == nil || s.sel == nil { - return - } - - C.faiss_IDSelector_free(s.sel) + exclude bool + sel *C.FaissIDSelector + inner *C.FaissIDSelector } func (s *IDSelector) Get() *C.FaissIDSelector { return s.sel } -type IDSelectorNot struct { - sel *C.FaissIDSelector - batchSel *C.FaissIDSelector +func (s *IDSelector) ExcludeFilter() bool { + return s.exclude } // Delete frees the memory associated with s. -func (s *IDSelectorNot) Delete() { +func (s *IDSelector) Delete() { if s == nil { return } @@ -42,15 +36,11 @@ func (s *IDSelectorNot) Delete() { if s.sel != nil { C.faiss_IDSelector_free(s.sel) } - if s.batchSel != nil { - C.faiss_IDSelector_free(s.batchSel) + if s.inner != nil { + C.faiss_IDSelector_free(s.inner) } } -func (s *IDSelectorNot) Get() *C.FaissIDSelector { - return s.sel -} - // NewIDSelectorRange creates a selector that removes IDs on [imin, imax). func NewIDSelectorRange(imin, imax int64) (Selector, error) { var sel *C.FaissIDSelectorRange @@ -58,7 +48,7 @@ func NewIDSelectorRange(imin, imax int64) (Selector, error) { if c != 0 { return nil, getLastError() } - return &IDSelector{(*C.FaissIDSelector)(sel)}, nil + return &IDSelector{sel: (*C.FaissIDSelector)(sel)}, nil } // NewIDSelectorBatch creates a new batch selector. @@ -71,12 +61,12 @@ func NewIDSelectorBatch(indices []int64) (Selector, error) { ); c != 0 { return nil, getLastError() } - return &IDSelector{(*C.FaissIDSelector)(sel)}, nil + return &IDSelector{sel: (*C.FaissIDSelector)(sel)}, nil } -// NewIDSelectorNot creates a new Not selector, wrapped around a +// NewIDSelectorBatchNot creates a new Not selector, wrapped around a // batch selector, with the IDs in 'exclude'. 
-func NewIDSelectorNot(exclude []int64) (Selector, error) { +func NewIDSelectorBatchNot(exclude []int64) (Selector, error) { batchSelector, err := NewIDSelectorBatch(exclude) if err != nil { return nil, err @@ -90,6 +80,49 @@ func NewIDSelectorNot(exclude []int64) (Selector, error) { batchSelector.Delete() return nil, getLastError() } - return &IDSelectorNot{sel: (*C.FaissIDSelector)(sel), - batchSel: batchSelector.Get()}, nil + return &IDSelector{exclude: true, + sel: (*C.FaissIDSelector)(sel), + inner: batchSelector.Get()}, nil +} + +// NewIDSelectorBitmap creates a selector using a bitset, where each bit +// indicates whether the corresponding ID is to be selected. +// NOTE: This function assumes that len(bitmap)*8 covers the full range of IDs +// in the index, and only works when we have vector IDs ranging from 0 to N-1, +// where N is the number of vectors in the index. +// The length of the bitmap should be at least ceil(N/8). +func NewIDSelectorBitmap(bitmap []byte) (Selector, error) { + var sel *C.FaissIDSelectorBitmap + if c := C.faiss_IDSelectorBitmap_new( + &sel, + C.size_t(len(bitmap)), + (*C.uint8_t)(&bitmap[0]), + ); c != 0 { + return nil, getLastError() + } + return &IDSelector{sel: (*C.FaissIDSelector)(sel)}, nil +} + +// NewIDSelectorBitmapNot creates a NOT selector using a bitset, where each bit +// indicates whether the corresponding ID is NOT to be selected. +// NOTE: This function assumes that len(bitmap)*8 covers the full range of IDs +// in the index, and only works when we have vector IDs ranging from 0 to N-1, +// where N is the number of vectors in the index. +// The length of the bitmap should be at least ceil(N/8). 
+func NewIDSelectorBitmapNot(bitmap []byte) (Selector, error) { + bitmapSelector, err := NewIDSelectorBitmap(bitmap) + if err != nil { + return nil, err + } + var sel *C.FaissIDSelectorNot + if c := C.faiss_IDSelectorNot_new( + &sel, + bitmapSelector.Get(), + ); c != 0 { + bitmapSelector.Delete() + return nil, getLastError() + } + return &IDSelector{exclude: true, + sel: (*C.FaissIDSelector)(sel), + inner: bitmapSelector.Get()}, nil } diff --git a/vendor/github.com/blevesearch/mmap-go/.gitignore b/vendor/github.com/blevesearch/mmap-go/.gitignore index 0c0a5e4916..6c694e4b7d 100644 --- a/vendor/github.com/blevesearch/mmap-go/.gitignore +++ b/vendor/github.com/blevesearch/mmap-go/.gitignore @@ -7,4 +7,5 @@ _obj _test testdata /.idea -*.iml \ No newline at end of file +*.iml +/notes.txt diff --git a/vendor/github.com/blevesearch/mmap-go/.travis.yml b/vendor/github.com/blevesearch/mmap-go/.travis.yml deleted file mode 100644 index 169eb1f354..0000000000 --- a/vendor/github.com/blevesearch/mmap-go/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -language: go -os: - - linux - - osx - - windows -go: - - 1.11.4 -env: - global: - - GO111MODULE=on -install: - - go mod download - - go get github.com/mattn/goveralls -script: - - go test -v -covermode=count -coverprofile=coverage.out -bench . 
-cpu 1,4 - - '[ "${TRAVIS_PULL_REQUEST}" = "false" ] && $HOME/gopath/bin/goveralls -coverprofile=coverage.out -service=travis-ci -repotoken $COVERALLS_TOKEN || true' diff --git a/vendor/github.com/blevesearch/mmap-go/README.md b/vendor/github.com/blevesearch/mmap-go/README.md index 4cc2bfe1c8..30166aa4a9 100644 --- a/vendor/github.com/blevesearch/mmap-go/README.md +++ b/vendor/github.com/blevesearch/mmap-go/README.md @@ -1,12 +1,14 @@ mmap-go ======= +[![Tests](https://github.com/blevesearch/mmap-go/actions/workflows/tests.yml/badge.svg?branch=master&event=push)](https://github.com/blevesearch/mmap-go/actions/workflows/tests.yml?query=event%3Apush+branch%3Amaster) +[![Go Reference](https://pkg.go.dev/badge/github.com/blevesearch/mmap-go.svg)](https://pkg.go.dev/github.com/blevesearch/mmap-go) mmap-go is a portable mmap package for the [Go programming language](http://golang.org). -It has been tested on Linux (386, amd64), OS X, and Windows (386). It should also -work on other Unix-like platforms, but hasn't been tested with them. I'm interested -to hear about the results. - -I haven't been able to add more features without adding significant complexity, -so mmap-go doesn't support mprotect, mincore, and maybe a few other things. -If you're running on a Unix-like platform and need some of these features, -I suggest Gustavo Niemeyer's [gommap](http://labix.org/gommap). + +Operating System Support +======================== +This package is tested using GitHub Actions on Linux, macOS, and Windows. It should also work on other Unix-like platforms, but hasn't been tested with them. I'm interested to hear about the results. + +This package compiles for Plan 9 and WebAssembly, but its functions always return errors. + +Related functions such as `mprotect` and `mincore` aren't included. I haven't found a way to implement them on Windows without introducing significant complexity. 
If you're running on a Unix-like platform and really need these features, it should still be possible to implement them on top of this package via `syscall`. diff --git a/vendor/github.com/blevesearch/mmap-go/mmap.go b/vendor/github.com/blevesearch/mmap-go/mmap.go index 29655bd222..736f29a2d7 100644 --- a/vendor/github.com/blevesearch/mmap-go/mmap.go +++ b/vendor/github.com/blevesearch/mmap-go/mmap.go @@ -8,10 +8,10 @@ // Package mmap allows mapping files into memory. It tries to provide a simple, reasonably portable interface, // but doesn't go out of its way to abstract away every little platform detail. // This specifically means: -// * forked processes may or may not inherit mappings -// * a file's timestamp may or may not be updated by writes through mappings -// * specifying a size larger than the file's actual size can increase the file's size -// * If the mapped file is being modified by another process while your program's running, don't expect consistent results between platforms +// - forked processes may or may not inherit mappings +// - a file's timestamp may or may not be updated by writes through mappings +// - specifying a size larger than the file's actual size can increase the file's size +// - If the mapped file is being modified by another process while your program's running, don't expect consistent results between platforms package mmap import ( diff --git a/vendor/github.com/blevesearch/mmap-go/mmap_plan9.go b/vendor/github.com/blevesearch/mmap-go/mmap_plan9.go new file mode 100644 index 0000000000..e4c33d39b8 --- /dev/null +++ b/vendor/github.com/blevesearch/mmap-go/mmap_plan9.go @@ -0,0 +1,27 @@ +// Copyright 2020 Evan Shaw. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mmap + +import "syscall" + +func mmap(len int, inprot, inflags, fd uintptr, off int64) ([]byte, error) { + return nil, syscall.EPLAN9 +} + +func (m MMap) flush() error { + return syscall.EPLAN9 +} + +func (m MMap) lock() error { + return syscall.EPLAN9 +} + +func (m MMap) unlock() error { + return syscall.EPLAN9 +} + +func (m MMap) unmap() error { + return syscall.EPLAN9 +} diff --git a/vendor/github.com/blevesearch/mmap-go/mmap_unix.go b/vendor/github.com/blevesearch/mmap-go/mmap_unix.go index 25b13e51fd..62d0aef6cd 100644 --- a/vendor/github.com/blevesearch/mmap-go/mmap_unix.go +++ b/vendor/github.com/blevesearch/mmap-go/mmap_unix.go @@ -2,6 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build darwin || dragonfly || freebsd || linux || openbsd || solaris || netbsd // +build darwin dragonfly freebsd linux openbsd solaris netbsd package mmap diff --git a/vendor/github.com/blevesearch/mmap-go/mmap_wasm.go b/vendor/github.com/blevesearch/mmap-go/mmap_wasm.go new file mode 100644 index 0000000000..cfe1c50b03 --- /dev/null +++ b/vendor/github.com/blevesearch/mmap-go/mmap_wasm.go @@ -0,0 +1,27 @@ +// Copyright 2024 Evan Shaw. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package mmap + +import "syscall" + +func mmap(len int, inprot, inflags, fd uintptr, off int64) ([]byte, error) { + return nil, syscall.ENOTSUP +} + +func (m MMap) flush() error { + return syscall.ENOTSUP +} + +func (m MMap) lock() error { + return syscall.ENOTSUP +} + +func (m MMap) unlock() error { + return syscall.ENOTSUP +} + +func (m MMap) unmap() error { + return syscall.ENOTSUP +} diff --git a/vendor/github.com/blevesearch/mmap-go/mmap_windows.go b/vendor/github.com/blevesearch/mmap-go/mmap_windows.go index 631b3825f9..e0d986f70a 100644 --- a/vendor/github.com/blevesearch/mmap-go/mmap_windows.go +++ b/vendor/github.com/blevesearch/mmap-go/mmap_windows.go @@ -67,6 +67,7 @@ func mmap(len int, prot, flags, hfile uintptr, off int64) ([]byte, error) { fileOffsetLow := uint32(off & 0xFFFFFFFF) addr, errno := windows.MapViewOfFile(h, dwDesiredAccess, fileOffsetHigh, fileOffsetLow, uintptr(len)) if addr == 0 { + windows.CloseHandle(windows.Handle(h)) return nil, os.NewSyscallError("MapViewOfFile", errno) } handleLock.Lock() @@ -101,7 +102,7 @@ func (m MMap) flush() error { return errors.New("unknown base address") } - if handle.writable { + if handle.writable && handle.file != windows.Handle(^uintptr(0)) { if err := windows.FlushFileBuffers(handle.file); err != nil { return os.NewSyscallError("FlushFileBuffers", err) } diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/.golangci.yml b/vendor/github.com/blevesearch/scorch_segment_api/v2/.golangci.yml deleted file mode 100644 index 664f35f27e..0000000000 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/.golangci.yml +++ /dev/null @@ -1,42 +0,0 @@ -linters: - # please, do not use `enable-all`: it's deprecated and will be removed soon. 
- # inverted configuration with `enable-all` and `disable` is not scalable during updates of golangci-lint - disable-all: true - enable: - - bodyclose - - deadcode - - depguard - - dogsled - - dupl - - errcheck - - funlen - - gochecknoinits - - goconst - - gocritic - - gocyclo - - gofmt - - goimports - - golint - - gomnd - - goprintffuncname - - gosec - - gosimple - - govet - - ineffassign - - interfacer - - lll - - misspell - - nakedret - - nolintlint - - rowserrcheck - - scopelint - - staticcheck - - structcheck - - stylecheck - - typecheck - - unconvert - - unparam - - unused - - varcheck - - whitespace - diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/README.md b/vendor/github.com/blevesearch/scorch_segment_api/v2/README.md index dc33b004ed..76a994fe9e 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/README.md +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/README.md @@ -1,8 +1,7 @@ # Scorch Segment API -[![PkgGoDev](https://pkg.go.dev/badge/github.com/blevesearch/scorch_segment_api)](https://pkg.go.dev/github.com/blevesearch/scorch_segment_api) -[![Tests](https://github.com/blevesearch/scorch_segment_api/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/blevesearch/scorch_segment_api/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) -[![Lint](https://github.com/blevesearch/scorch_segment_api/workflows/Lint/badge.svg?branch=master&event=push)](https://github.com/blevesearch/scorch_segment_api/actions?query=workflow%3ALint+event%3Apush+branch%3Amaster) +[![Go Reference](https://pkg.go.dev/badge/github.com/blevesearch/scorch_segment_api/v2.svg)](https://pkg.go.dev/github.com/blevesearch/scorch_segment_api/v2) +[![Tests](https://github.com/blevesearch/scorch_segment_api/actions/workflows/tests.yml/badge.svg?branch=master&event=push)](https://github.com/blevesearch/scorch_segment_api/actions/workflows/tests.yml?query=event%3Apush+branch%3Amaster) Scorch supports a pluggable Segment 
interface. diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go index 122a28d793..00b64a8cf8 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment.go @@ -67,6 +67,11 @@ type UpdatableSegment interface { SetUpdatedFields(fieldInfo map[string]*index.UpdateFieldInfo) } +type SegmentWithCallbacks interface { + Segment + CallbackId() string +} + type TermDictionary interface { PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) @@ -182,6 +187,10 @@ type FieldStatsReporter interface { UpdateFieldStats(FieldStats) } +type VectorFieldStatsReporter interface { + UpdateVectorFieldStats(FieldStats) +} + type FieldStats interface { Store(statName, fieldName string, value uint64) Aggregate(stats FieldStats) @@ -243,3 +252,24 @@ type Synonym interface { Size() int } + +// NestedSegment is an optional interface that a Segment may implement +// to provide access to nested document relationships within that segment. +type NestedSegment interface { + Segment + // Ancestors returns a slice of ancestor IDs for the given document ID. + // If the document has no ancestors or if the segment does not support nested documents, + // a slice containing only the document ID itself is returned. + Ancestors(docID uint64, prealloc []index.AncestorID) []index.AncestorID + + // CountRoot returns the number of root documents in the segment, excluding any documents + // that are marked as deleted in the provided bitmap. If the segment does not support nested + // documents, it returns the total document count minus the count of deleted documents. + // A root document is defined as a document that is not a child of any other document. 
+ CountRoot(deleted *roaring.Bitmap) uint64 + + // AddNestedDocuments updates the provided bitmap to include all nested documents + // associated with documents marked as deleted in the bitmap. This ensures that when + // a parent document is deleted, all its nested child documents are also considered deleted. + AddNestedDocuments(deleted *roaring.Bitmap) *roaring.Bitmap +} diff --git a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go index 7e50ce46f2..430afb36dd 100644 --- a/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go +++ b/vendor/github.com/blevesearch/scorch_segment_api/v2/segment_vector.go @@ -20,8 +20,8 @@ package segment import ( "encoding/json" - index "github.com/blevesearch/bleve_index_api" "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" ) type VecPostingsList interface { @@ -58,21 +58,32 @@ type VecPostingsIterator interface { } type VectorIndex interface { - // @params: Search params for backing vector index (like IVF, HNSW, etc.) + // Search performs a kNN search for the given query vector and returns a postings list. + // - qVector: the query vector + // - k: the number of similar vectors to return + // - params: additional search parameters Search(qVector []float32, k int64, params json.RawMessage) (VecPostingsList, error) - // @eligibleDocIDs: DocIDs in the segment eligible for the kNN query. 
- SearchWithFilter(qVector []float32, k int64, eligibleDocIDs []uint64, - params json.RawMessage) (VecPostingsList, error) + // SearchWithFilter performs a kNN search for the given query vector, filtering results based on eligible documents + // - qVector: the query vector + // - k: the number of similar vectors to return + // - eligibleList: list of eligible documents to consider + // - params: additional search parameters + SearchWithFilter(qVector []float32, k int64, eligibleList index.EligibleDocumentList, params json.RawMessage) (VecPostingsList, error) + // Close releases any resources held by the VectorIndex. Close() Size() uint64 ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ([]index.CentroidCardinality, error) } +type TrainedSegment interface { + Segment + GetCoarseQuantizer(field string) (interface{}, error) +} + type VectorSegment interface { Segment - InterpretVectorIndex(field string, requiresFiltering bool, except *roaring.Bitmap) ( - VectorIndex, error) + InterpretVectorIndex(field string, except *roaring.Bitmap) (VectorIndex, error) } type VecPosting interface { diff --git a/vendor/github.com/blevesearch/vellum/README.md b/vendor/github.com/blevesearch/vellum/README.md index e5c4a8bce8..1357f9d038 100644 --- a/vendor/github.com/blevesearch/vellum/README.md +++ b/vendor/github.com/blevesearch/vellum/README.md @@ -1,19 +1,20 @@ # ![vellum](docs/logo.png) vellum -[![Tests](https://github.com/couchbase/vellum/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/couchbase/vellum/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) -[![Coverage Status](https://coveralls.io/repos/github/couchbase/vellum/badge.svg?branch=master)](https://coveralls.io/github/couchbase/vellum?branch=master) -[![GoDoc](https://godoc.org/github.com/couchbase/vellum?status.svg)](https://godoc.org/github.com/couchbase/vellum) -[![Go Report 
Card](https://goreportcard.com/badge/github.com/couchbase/vellum)](https://goreportcard.com/report/github.com/couchbase/vellum) +[![Tests](https://github.com/blevesearch/vellum/actions/workflows/tests.yml/badge.svg?branch=master&event=push)](https://github.com/blevesearch/vellum/actions/workflows/tests.yml?query=event%3Apush+branch%3Amaster) +[![Go Reference](https://pkg.go.dev/badge/github.com/blevesearch/vellum.svg)](https://pkg.go.dev/github.com/blevesearch/vellum) +[![Go Report Card](https://goreportcard.com/badge/github.com/blevesearch/vellum)](https://goreportcard.com/report/github.com/blevesearch/vellum) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) A Go library implementing an FST (finite state transducer) capable of: - - mapping between keys ([]byte) and a value (uint64) - - enumerating keys in lexicographic order + +- mapping between keys ([]byte) and a value (uint64) +- enumerating keys in lexicographic order Some additional goals of this implementation: - - bounded memory use while building the FST - - streaming out FST data while building - - mmap FST runtime to support very large FTSs (optional) + +- bounded memory use while building the FST +- streaming out FST data while building +- mmap FST runtime to support very large FTSs (optional) ## Usage @@ -22,27 +23,30 @@ Some additional goals of this implementation: To build an FST, create a new builder using the `New()` method. This method takes an `io.Writer` as an argument. As the FST is being built, data will be streamed to the writer as soon as possible. With this builder you **MUST** insert keys in lexicographic order. Inserting keys out of order will result in an error. After inserting the last key into the builder, you **MUST** call `Close()` on the builder. This will flush all remaining data to the underlying writer. 
In memory: + ```go - var buf bytes.Buffer - builder, err := vellum.New(&buf, nil) - if err != nil { - log.Fatal(err) - } +var buf bytes.Buffer +builder, err := vellum.New(&buf, nil) +if err != nil { + log.Fatal(err) +} ``` To disk: + ```go - f, err := os.Create("/tmp/vellum.fst") - if err != nil { - log.Fatal(err) - } - builder, err := vellum.New(f, nil) - if err != nil { - log.Fatal(err) - } +f, err := os.Create("/tmp/vellum.fst") +if err != nil { + log.Fatal(err) +} +builder, err := vellum.New(f, nil) +if err != nil { + log.Fatal(err) +} ``` **MUST** insert keys in lexicographic order: + ```go err = builder.Insert([]byte("cat"), 1) if err != nil { @@ -70,45 +74,49 @@ if err != nil { After closing the builder, the data can be used to instantiate an FST. If the data was written to disk, you can use the `Open()` method to mmap the file. If the data is already in memory, or you wish to load/mmap the data yourself, you can instantiate the FST with the `Load()` method. Load in memory: + ```go - fst, err := vellum.Load(buf.Bytes()) - if err != nil { - log.Fatal(err) - } +fst, err := vellum.Load(buf.Bytes()) +if err != nil { + log.Fatal(err) +} ``` Open from disk: + ```go - fst, err := vellum.Open("/tmp/vellum.fst") - if err != nil { - log.Fatal(err) - } +fst, err := vellum.Open("/tmp/vellum.fst") +if err != nil { + log.Fatal(err) +} ``` Get key/value: + ```go - val, exists, err = fst.Get([]byte("dog")) - if err != nil { - log.Fatal(err) - } - if exists { - fmt.Printf("contains dog with val: %d\n", val) - } else { - fmt.Printf("does not contain dog") - } +val, exists, err = fst.Get([]byte("dog")) +if err != nil { + log.Fatal(err) +} +if exists { + fmt.Printf("contains dog with val: %d\n", val) +} else { + fmt.Printf("does not contain dog") +} ``` Iterate key/values: + ```go - itr, err := fst.Iterator(startKeyInclusive, endKeyExclusive) - for err == nil { - key, val := itr.Current() - fmt.Printf("contains key: %s val: %d", key, val) - err = itr.Next() - } - if err != nil 
{ - log.Fatal(err) - } +itr, err := fst.Iterator(startKeyInclusive, endKeyExclusive) +for err == nil { + key, val := itr.Current() + fmt.Printf("contains key: %s val: %d", key, val) + err = itr.Next() +} +if err != nil { + log.Fatal(err) +} ``` ### How does the FST get built? @@ -169,14 +177,17 @@ The vellum command-line tool has a "dot" subcommand that can emit graphviz dot output data from an input vellum file. The dot file can in turn be converted into an image using graphviz tools. Example... - $ vellum dot myFile.vellum > output.dot - $ dot -Tpng output.dot -o output.png +```shell +vellum dot myFile.vellum > output.dot +dot -Tpng output.dot -o output.png +``` ## Related Work Much credit goes to two existing projects: - - [mafsa](https://github.com/smartystreets/mafsa) - - [BurntSushi/fst](https://github.com/BurntSushi/fst) + +- [mafsa](https://github.com/smartystreets/mafsa) +- [BurntSushi/fst](https://github.com/BurntSushi/fst) Most of the original implementation here started with my digging into the internals of mafsa. As the implementation progressed, I continued to borrow ideas/approaches from the BurntSushi/fst library as well. 
diff --git a/vendor/github.com/blevesearch/vellum/builder.go b/vendor/github.com/blevesearch/vellum/builder.go index 7e545cbec1..b9012fd31a 100644 --- a/vendor/github.com/blevesearch/vellum/builder.go +++ b/vendor/github.com/blevesearch/vellum/builder.go @@ -415,15 +415,17 @@ func outputCat(l, r uint64) uint64 { // // NB: builderNode lifecylce is described by the following interactions - // +------------------------+ +----------------------+ -// | Unfinished Nodes | Transfer once | Registry | +// | Unfinished Nodes | Transfer once | Registry | // |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | // +------------------------+ marked frozen +----------------------+ -// ^ | -// | | -// | Put() -// | Get() on +-------------------+ when -// +-new char--------| builderNode Pool |<-----------evicted -// +-------------------+ +// +// ^ ^ +// | | +// | | +// | Put() +// | Get() on +-------------------+ when +// +-new char--------| builderNode Pool |<-----------evicted +// +-------------------+ type builderNodePool struct { head *builderNode } diff --git a/vendor/github.com/blevesearch/vellum/levenshtein/levenshtein_nfa.go b/vendor/github.com/blevesearch/vellum/levenshtein/levenshtein_nfa.go index 68db5d191c..82655fbce6 100644 --- a/vendor/github.com/blevesearch/vellum/levenshtein/levenshtein_nfa.go +++ b/vendor/github.com/blevesearch/vellum/levenshtein/levenshtein_nfa.go @@ -19,13 +19,13 @@ import ( "sort" ) -/// Levenshtein Distance computed by a Levenshtein Automaton. -/// -/// Levenshtein automata can only compute the exact Levenshtein distance -/// up to a given `max_distance`. -/// -/// Over this distance, the automaton will invariably -/// return `Distance::AtLeast(max_distance + 1)`. +// Levenshtein Distance computed by a Levenshtein Automaton. +// +// Levenshtein automata can only compute the exact Levenshtein distance +// up to a given `max_distance`. 
+// +// Over this distance, the automaton will invariably +// return `Distance::AtLeast(max_distance + 1)`. type Distance interface { distance() uint8 } diff --git a/vendor/github.com/blevesearch/vellum/levenshtein/parametric_dfa.go b/vendor/github.com/blevesearch/vellum/levenshtein/parametric_dfa.go index d08e5da639..41d2fcf632 100644 --- a/vendor/github.com/blevesearch/vellum/levenshtein/parametric_dfa.go +++ b/vendor/github.com/blevesearch/vellum/levenshtein/parametric_dfa.go @@ -15,7 +15,7 @@ package levenshtein import ( - "crypto/md5" + "crypto/sha256" "encoding/json" "fmt" "math" @@ -311,13 +311,13 @@ func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) { } type hash struct { - index map[[16]byte]int + index map[[32]byte]int items []MultiState } func newHash() *hash { return &hash{ - index: make(map[[16]byte]int, 100), + index: make(map[[32]byte]int, 100), items: make([]MultiState, 0, 100), } } @@ -326,9 +326,9 @@ func (h *hash) getOrAllocate(m MultiState) int { size := len(h.items) var exists bool var pos int - md5 := getHash(&m) - if pos, exists = h.index[md5]; !exists { - h.index[md5] = size + sha := getHash(&m) + if pos, exists = h.index[sha]; !exists { + h.index[sha] = size pos = size h.items = append(h.items, m) } @@ -339,11 +339,11 @@ func (h *hash) getFromID(id int) *MultiState { return &h.items[id] } -func getHash(ms *MultiState) [16]byte { +func getHash(ms *MultiState) [32]byte { msBytes := []byte{} for _, state := range ms.states { jsonBytes, _ := json.Marshal(&state) msBytes = append(msBytes, jsonBytes...) } - return md5.Sum(msBytes) + return sha256.Sum256(msBytes) } diff --git a/vendor/github.com/blevesearch/vellum/vellum.go b/vendor/github.com/blevesearch/vellum/vellum.go index b2537b3f00..699d2d0986 100644 --- a/vendor/github.com/blevesearch/vellum/vellum.go +++ b/vendor/github.com/blevesearch/vellum/vellum.go @@ -32,7 +32,6 @@ Once the FST is ready, you can use the Contains() method to see if a keys is in the FST. 
You can use the Get() method to see if a key is in the FST and retrieve it's associated value. And, you can use the Iterator method to enumerate key/value pairs within a specified range. - */ package vellum diff --git a/vendor/github.com/blevesearch/vellum/vellum_mmap.go b/vendor/github.com/blevesearch/vellum/vellum_mmap.go index 81ea165091..789bf5b421 100644 --- a/vendor/github.com/blevesearch/vellum/vellum_mmap.go +++ b/vendor/github.com/blevesearch/vellum/vellum_mmap.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +//go:build !nommap // +build !nommap package vellum diff --git a/vendor/github.com/blevesearch/vellum/vellum_nommap.go b/vendor/github.com/blevesearch/vellum/vellum_nommap.go index e985272872..0b744aa431 100644 --- a/vendor/github.com/blevesearch/vellum/vellum_nommap.go +++ b/vendor/github.com/blevesearch/vellum/vellum_nommap.go @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+//go:build nommap // +build nommap package vellum diff --git a/vendor/github.com/blevesearch/zapx/v11/build.go b/vendor/github.com/blevesearch/zapx/v11/build.go index 3f13a2a6c6..8992b50fb6 100644 --- a/vendor/github.com/blevesearch/zapx/v11/build.go +++ b/vendor/github.com/blevesearch/zapx/v11/build.go @@ -160,7 +160,7 @@ func persistStoredFieldValues(fieldID int, func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { + dictLocs []uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -174,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v11/merge.go b/vendor/github.com/blevesearch/zapx/v11/merge.go index 50bb2ba544..1b8758692a 100644 --- a/vendor/github.com/blevesearch/zapx/v11/merge.go +++ b/vendor/github.com/blevesearch/zapx/v11/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v11/new.go b/vendor/github.com/blevesearch/zapx/v11/new.go index 095388d51d..c34a209138 100644 --- a/vendor/github.com/blevesearch/zapx/v11/new.go +++ b/vendor/github.com/blevesearch/zapx/v11/new.go @@ -45,11 +45,16 @@ var defaultChunkFactor uint32 = 1024 // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkFactor(results, defaultChunkFactor) + return z.newWithChunkFactor(results, defaultChunkFactor, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkFactor(results, defaultChunkFactor, config) } func (*ZapPlugin) newWithChunkFactor(results []index.Document, - chunkFactor uint32) (segment.Segment, uint64, error) { + chunkFactor uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -77,7 +82,7 @@ func (*ZapPlugin) 
newWithChunkFactor(results []index.Document, sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, config) if err == nil && s.reset() == nil { s.lastNumDocs = len(results) diff --git a/vendor/github.com/blevesearch/zapx/v11/segment.go b/vendor/github.com/blevesearch/zapx/v11/segment.go index 7465dac15c..3353a3ffce 100644 --- a/vendor/github.com/blevesearch/zapx/v11/segment.go +++ b/vendor/github.com/blevesearch/zapx/v11/segment.go @@ -37,8 +37,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -56,6 +66,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, }, f: f, mm: mm, @@ -104,6 +115,8 @@ type SegmentBase struct { m sync.Mutex fieldFSTs map[uint16]*vellum.FST + + config map[string]interface{} // config for the segment } func (sb *SegmentBase) Size() int { diff --git a/vendor/github.com/blevesearch/zapx/v12/build.go b/vendor/github.com/blevesearch/zapx/v12/build.go index de8265c140..daac3d466d 100644 --- a/vendor/github.com/blevesearch/zapx/v12/build.go +++ 
b/vendor/github.com/blevesearch/zapx/v12/build.go @@ -160,7 +160,7 @@ func persistStoredFieldValues(fieldID int, func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { + dictLocs []uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -174,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v12/merge.go b/vendor/github.com/blevesearch/zapx/v12/merge.go index e962c6ec17..aace047b1e 100644 --- a/vendor/github.com/blevesearch/zapx/v12/merge.go +++ b/vendor/github.com/blevesearch/zapx/v12/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v12/new.go b/vendor/github.com/blevesearch/zapx/v12/new.go index 94322c8e2d..48e13646d6 100644 --- a/vendor/github.com/blevesearch/zapx/v12/new.go +++ b/vendor/github.com/blevesearch/zapx/v12/new.go @@ -43,11 +43,16 @@ var ValidateDocFields = func(field index.Field) error { // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkMode(results, DefaultChunkMode) + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) } func (*ZapPlugin) newWithChunkMode(results []index.Document, - chunkMode uint32) (segment.Segment, uint64, error) { + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -75,7 +80,7 @@ func (*ZapPlugin) 
newWithChunkMode(results []index.Document, sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, config) if err == nil && s.reset() == nil { s.lastNumDocs = len(results) diff --git a/vendor/github.com/blevesearch/zapx/v12/segment.go b/vendor/github.com/blevesearch/zapx/v12/segment.go index 936b63836f..04b1af18c2 100644 --- a/vendor/github.com/blevesearch/zapx/v12/segment.go +++ b/vendor/github.com/blevesearch/zapx/v12/segment.go @@ -37,8 +37,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -56,6 +66,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, }, f: f, mm: mm, @@ -104,6 +115,8 @@ type SegmentBase struct { m sync.Mutex fieldFSTs map[uint16]*vellum.FST + + config map[string]interface{} // config for the segment } func (sb *SegmentBase) Size() int { diff --git a/vendor/github.com/blevesearch/zapx/v13/build.go b/vendor/github.com/blevesearch/zapx/v13/build.go index 827e5c47e8..5f9a30377e 100644 --- a/vendor/github.com/blevesearch/zapx/v13/build.go +++ 
b/vendor/github.com/blevesearch/zapx/v13/build.go @@ -160,7 +160,7 @@ func persistStoredFieldValues(fieldID int, func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { + dictLocs []uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -174,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v13/merge.go b/vendor/github.com/blevesearch/zapx/v13/merge.go index e962c6ec17..aace047b1e 100644 --- a/vendor/github.com/blevesearch/zapx/v13/merge.go +++ b/vendor/github.com/blevesearch/zapx/v13/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v13/new.go b/vendor/github.com/blevesearch/zapx/v13/new.go index 94322c8e2d..48e13646d6 100644 --- a/vendor/github.com/blevesearch/zapx/v13/new.go +++ b/vendor/github.com/blevesearch/zapx/v13/new.go @@ -43,11 +43,16 @@ var ValidateDocFields = func(field index.Field) error { // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkMode(results, DefaultChunkMode) + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) } func (*ZapPlugin) newWithChunkMode(results []index.Document, - chunkMode uint32) (segment.Segment, uint64, error) { + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -75,7 +80,7 @@ func (*ZapPlugin) 
newWithChunkMode(results []index.Document, sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, config) if err == nil && s.reset() == nil { s.lastNumDocs = len(results) diff --git a/vendor/github.com/blevesearch/zapx/v13/segment.go b/vendor/github.com/blevesearch/zapx/v13/segment.go index 936b63836f..04b1af18c2 100644 --- a/vendor/github.com/blevesearch/zapx/v13/segment.go +++ b/vendor/github.com/blevesearch/zapx/v13/segment.go @@ -37,8 +37,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -56,6 +66,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, }, f: f, mm: mm, @@ -104,6 +115,8 @@ type SegmentBase struct { m sync.Mutex fieldFSTs map[uint16]*vellum.FST + + config map[string]interface{} // config for the segment } func (sb *SegmentBase) Size() int { diff --git a/vendor/github.com/blevesearch/zapx/v14/build.go b/vendor/github.com/blevesearch/zapx/v14/build.go index b36878abbb..9daf0a5316 100644 --- a/vendor/github.com/blevesearch/zapx/v14/build.go +++ 
b/vendor/github.com/blevesearch/zapx/v14/build.go @@ -160,7 +160,7 @@ func persistStoredFieldValues(fieldID int, func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { + dictLocs []uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -174,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v14/merge.go b/vendor/github.com/blevesearch/zapx/v14/merge.go index e962c6ec17..aace047b1e 100644 --- a/vendor/github.com/blevesearch/zapx/v14/merge.go +++ b/vendor/github.com/blevesearch/zapx/v14/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v14/new.go b/vendor/github.com/blevesearch/zapx/v14/new.go index 94322c8e2d..48e13646d6 100644 --- a/vendor/github.com/blevesearch/zapx/v14/new.go +++ b/vendor/github.com/blevesearch/zapx/v14/new.go @@ -43,11 +43,16 @@ var ValidateDocFields = func(field index.Field) error { // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkMode(results, DefaultChunkMode) + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) } func (*ZapPlugin) newWithChunkMode(results []index.Document, - chunkMode uint32) (segment.Segment, uint64, error) { + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -75,7 +80,7 @@ func (*ZapPlugin) 
newWithChunkMode(results []index.Document, sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, config) if err == nil && s.reset() == nil { s.lastNumDocs = len(results) diff --git a/vendor/github.com/blevesearch/zapx/v14/segment.go b/vendor/github.com/blevesearch/zapx/v14/segment.go index 936b63836f..04b1af18c2 100644 --- a/vendor/github.com/blevesearch/zapx/v14/segment.go +++ b/vendor/github.com/blevesearch/zapx/v14/segment.go @@ -37,8 +37,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -56,6 +66,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, }, f: f, mm: mm, @@ -104,6 +115,8 @@ type SegmentBase struct { m sync.Mutex fieldFSTs map[uint16]*vellum.FST + + config map[string]interface{} // config for the segment } func (sb *SegmentBase) Size() int { diff --git a/vendor/github.com/blevesearch/zapx/v15/build.go b/vendor/github.com/blevesearch/zapx/v15/build.go index 5db1d9ee24..1e6d905299 100644 --- a/vendor/github.com/blevesearch/zapx/v15/build.go +++ 
b/vendor/github.com/blevesearch/zapx/v15/build.go @@ -160,7 +160,7 @@ func persistStoredFieldValues(fieldID int, func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { + dictLocs []uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -174,6 +174,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v15/merge.go b/vendor/github.com/blevesearch/zapx/v15/merge.go index 738c24d6b8..f2af1834d4 100644 --- a/vendor/github.com/blevesearch/zapx/v15/merge.go +++ b/vendor/github.com/blevesearch/zapx/v15/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v15/new.go b/vendor/github.com/blevesearch/zapx/v15/new.go index 9da48fa350..594afd6865 100644 --- a/vendor/github.com/blevesearch/zapx/v15/new.go +++ b/vendor/github.com/blevesearch/zapx/v15/new.go @@ -44,11 +44,16 @@ var ValidateDocFields = func(field index.Field) error { // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkMode(results, DefaultChunkMode) + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) } func (*ZapPlugin) newWithChunkMode(results []index.Document, - chunkMode uint32) (segment.Segment, uint64, error) { + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -76,7 +81,7 @@ func (*ZapPlugin) 
newWithChunkMode(results []index.Document, sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, config) // get the bytes written before the interim's reset() call // write it to the newly formed segment base. diff --git a/vendor/github.com/blevesearch/zapx/v15/segment.go b/vendor/github.com/blevesearch/zapx/v15/segment.go index a4938b4ba6..842aaf8092 100644 --- a/vendor/github.com/blevesearch/zapx/v15/segment.go +++ b/vendor/github.com/blevesearch/zapx/v15/segment.go @@ -38,8 +38,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -57,6 +67,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), fieldFSTs: make(map[uint16]*vellum.FST), + config: config, }, f: f, mm: mm, @@ -109,6 +120,8 @@ type SegmentBase struct { m sync.Mutex fieldFSTs map[uint16]*vellum.FST + + config map[string]interface{} // config for the segment } func (sb *SegmentBase) Size() int { diff --git a/vendor/github.com/blevesearch/zapx/v16/build.go b/vendor/github.com/blevesearch/zapx/v16/build.go index 7843653af5..6426e57c9e 100644 --- a/vendor/github.com/blevesearch/zapx/v16/build.go +++ 
b/vendor/github.com/blevesearch/zapx/v16/build.go @@ -159,7 +159,7 @@ func persistStoredFieldValues(fieldID int, } func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64, - storedIndexOffset uint64, sectionsIndexOffset uint64) (*SegmentBase, error) { + storedIndexOffset uint64, sectionsIndexOffset uint64, config map[string]interface{}) (*SegmentBase, error) { sb := &SegmentBase{ mem: mem, memCRC: memCRC, @@ -178,6 +178,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64 fieldsMap: make(map[string]uint16), dictLocs: make([]uint64, 0), fieldsInv: make([]string, 0), + config: config, } sb.updateSize() diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go index 3155bc220f..43e13a6ab0 100644 --- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_posting.go @@ -274,8 +274,7 @@ func (vpItr *VecPostingsIterator) BytesWritten() uint64 { // (2) search limited to a subset of documents within an attached vector index // (3) close attached vector index // (4) get the size of the attached vector index -func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool, - except *roaring.Bitmap) ( +func (sb *SegmentBase) InterpretVectorIndex(field string, except *roaring.Bitmap) ( segment.VectorIndex, error) { rv := &vectorIndexWrapper{sb: sb} @@ -304,7 +303,7 @@ func (sb *SegmentBase) InterpretVectorIndex(field string, requiresFiltering bool var err error rv.vecIndex, rv.vecDocIDMap, rv.docVecIDMap, rv.vectorIDsToExclude, err = - sb.vecIndexCache.loadOrCreate(fieldIDPlus1, sb.mem[pos:], requiresFiltering, + sb.vecIndexCache.loadOrCreate(fieldIDPlus1, sb.mem[pos:], true, // always load docVecIDMap except) if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_wrapper.go 
b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_wrapper.go index 47c269c50e..e85b7d4be1 100644 --- a/vendor/github.com/blevesearch/zapx/v16/faiss_vector_wrapper.go +++ b/vendor/github.com/blevesearch/zapx/v16/faiss_vector_wrapper.go @@ -81,13 +81,8 @@ func (v *vectorIndexWrapper) Search(qVector []float32, k int64, } func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, - eligibleDocIDs []uint64, params json.RawMessage) ( + eligibleList index.EligibleDocumentList, params json.RawMessage) ( segment.VecPostingsList, error) { - // If every element in the index is eligible (full selectivity), - // then this can basically be considered unfiltered kNN. - if len(eligibleDocIDs) == int(v.sb.numDocs) { - return v.Search(qVector, k, params) - } // 1. returned postings list (of type PostingsList) has two types of information - docNum and its score. // 2. both the values can be represented using roaring bitmaps. // 3. the Iterator (of type PostingsIterator) returned would operate in terms of VecPostings. @@ -102,10 +97,32 @@ func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, // vector index not found or dimensionality mismatched return rv, nil } + if eligibleList == nil { + // no eligible documents + return rv, nil + } + numEligible := eligibleList.Count() // Check and proceed only if non-zero documents eligible per the filter query. - if len(eligibleDocIDs) == 0 { + if numEligible == 0 { + // no eligible documents return rv, nil } + // If every element in the index is eligible (full selectivity), + // then this can basically be considered unfiltered kNN. 
+ if numEligible == v.sb.numDocs { + // all documents eligible, no filtering needed + return v.Search(qVector, k, params) + } + eligibleDocIDs := make([]uint32, 0, numEligible) + // get eligible iterator + eligibleItr := eligibleList.Iterator() + for { + docID, ok := eligibleItr.Next() + if !ok { + break + } + eligibleDocIDs = append(eligibleDocIDs, uint32(docID)) + } // vector IDs corresponding to the local doc numbers to be // considered for the search @@ -128,6 +145,10 @@ func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, if len(vectorIDsToInclude) == 0 { return rv, nil } + // If all vectors are eligible, treat as unfiltered search. + if len(vectorIDsToInclude) == len(v.vecDocIDMap) { + return v.Search(qVector, k, params) + } // If the index is not an IVF index, then the search can be // performed directly, using the Flat index. if !v.vecIndex.IsIVFIndex() { @@ -141,10 +162,18 @@ func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, v.addIDsToPostingsList(rv, rs) return rv, nil } + // Getting the nprobe value set at index time. + nprobe, nlist := v.vecIndex.IVFParams() + // include selector for the vector IDs to be considered + includeSelector, err := v.getSelector(vectorIDsToInclude, true) + if err != nil { + return nil, err + } + defer includeSelector.Delete() // Determining which clusters, identified by centroid ID, // have at least one eligible vector and hence, ought to be // probed. - clusterVectorCounts, err := v.vecIndex.ObtainClusterVectorCountsFromIVFIndex(vectorIDsToInclude) + clusterVectorCounts, err := v.vecIndex.ObtainClusterVectorCountsFromIVFIndex(includeSelector, nlist) if err != nil { return nil, err } @@ -185,35 +214,64 @@ func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, // Ordering the retrieved centroid IDs by increasing order // of distance i.e. decreasing order of proximity to query vector. 
centroidIDs := make([]int64, 0, len(clusterVectorCounts)) - for centroidID := range clusterVectorCounts { - centroidIDs = append(centroidIDs, centroidID) + for centroidID, vectorCount := range clusterVectorCounts { + // Only centroids with at least one eligible vector are considered. + if vectorCount > 0 { + // since we are adding only unique centroid IDs, this is simply an increment + // and we can avoid a population count at the end + centroidIDs = append(centroidIDs, int64(centroidID)) + } + } + if len(centroidIDs) == 0 { + // no eligible centroids found + return rv, nil } - closestCentroidIDs, centroidDistances, err := - v.vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector, centroidIDs) + // get centroid selector + centroidSelector, err := v.getSelector(centroidIDs, true) + if err != nil { + return nil, err + } + defer centroidSelector.Delete() + + eligibleCentroidIDs, centroidDistances, err := + v.vecIndex.ObtainClustersWithDistancesFromIVFIndex(qVector, centroidSelector, int64(len(centroidIDs))) if err != nil { return nil, err } - // Getting the nprobe value set at index time. - nprobe := int(v.vecIndex.GetNProbe()) // Determining the minimum number of centroids to be probed // to ensure that at least 'k' vectors are collected while // examining at least 'nprobe' centroids. // centroidsToProbe range: [nprobe, number of eligible centroids] var eligibleVecsTillNow int64 - centroidsToProbe := len(closestCentroidIDs) - for i, centroidID := range closestCentroidIDs { + var eligibleCentroidsTillNow int + centroidsToProbe := len(eligibleCentroidIDs) + for i, centroidID := range eligibleCentroidIDs { + // if we get a -1 somehow here, it means no more centroids + // need to reslice the eligibleCentroidIDs and distances + // accordingly, just a safeguard check as this does not + // really happen. FAISS can pad with -1s if there are not enough + // eligible centroids, but we have already counted the cardinality so + // we should not see -1s here. 
+ if centroidID == -1 { + centroidsToProbe = i + // reslice to only valid centroids + eligibleCentroidIDs = eligibleCentroidIDs[:centroidsToProbe] + centroidDistances = centroidDistances[:centroidsToProbe] + break + } eligibleVecsTillNow += clusterVectorCounts[centroidID] + eligibleCentroidsTillNow = i + 1 // Stop once we've examined at least 'nprobe' centroids and // collected at least 'k' vectors. - if eligibleVecsTillNow >= k && i+1 >= nprobe { - centroidsToProbe = i + 1 + if eligibleVecsTillNow >= k && eligibleCentroidsTillNow >= nprobe { + centroidsToProbe = eligibleCentroidsTillNow break } } - // Search the clusters specified by 'closestCentroidIDs' for + // Search the clusters specified by 'eligibleCentroidIDs' for // vectors whose IDs are present in 'vectorIDsToInclude' rs, err := v.searchClustersFromIVFIndex( - ids, include, closestCentroidIDs, centroidsToProbe, + ids, include, eligibleCentroidIDs, centroidsToProbe, k, qVector, centroidDistances, params) if err != nil { return nil, err @@ -356,7 +414,16 @@ func (v *vectorIndexWrapper) searchWithoutIDs(qVector []float32, k int64, exclud resultSet, error) { return v.docSearch(k, v.sb.numDocs, func() ([]float32, []int64, error) { - return v.vecIndex.SearchWithoutIDs(qVector, k, exclude, params) + var sel faiss.Selector + var err error + if len(exclude) > 0 { + sel, err = v.getSelector(exclude, false) + if err != nil { + return nil, nil, err + } + defer sel.Delete() + } + return v.vecIndex.SearchWithOptions(qVector, k, sel, params) }, func(numIter int, labels []int64) bool { // if this is the first loop iteration and we have < k unique docIDs, @@ -384,7 +451,15 @@ func (v *vectorIndexWrapper) searchWithIDs(qVector []float32, k int64, include [ var includeSet map[int64]struct{} return v.docSearch(k, v.sb.numDocs, func() ([]float32, []int64, error) { - return v.vecIndex.SearchWithIDs(qVector, k, include, params) + // build the selector based on whatever ids is as of now + selector, err := v.getSelector(include, 
true) + if err != nil { + return nil, nil, err + } + // once the main search is done we must free the selector + defer selector.Delete() + + return v.vecIndex.SearchWithOptions(qVector, k, selector, params) }, func(numIter int, labels []int64) bool { // if this is the first loop iteration and we have < k unique docIDs, @@ -441,8 +516,8 @@ func (v *vectorIndexWrapper) searchClustersFromIVFIndex(ids []int64, include boo } // once the main search is done we must free the selector defer selector.Delete() - return v.vecIndex.SearchClustersFromIVFIndex(selector, eligibleCentroidIDs, - centroidsToProbe, k, x, centroidDis, params) + return v.vecIndex.SearchClustersFromIVFIndex(eligibleCentroidIDs, centroidDis, + centroidsToProbe, x, k, selector, params) }, func(numIter int, labels []int64) bool { // if this is the first loop iteration and we have < k unique docIDs, @@ -463,12 +538,12 @@ func (v *vectorIndexWrapper) searchClustersFromIVFIndex(ids []int64, include boo // and still have not found enough unique docIDs, we increase // the number of centroids to probe for the next iteration // to try and find more vectors/documents - if numIter >= nprobeIncreaseThreshold && centroidsToProbe < len(eligibleCentroidIDs) { + if numIter >= nprobeIncreaseThreshold && centroidsToProbe < totalEligibleCentroids { // Calculate how much to increase: increase by 50% of the remaining centroids to probe, // but at least by 1 to ensure progress. 
increaseAmount := max((totalEligibleCentroids-centroidsToProbe)/2, 1) // Update centroidsToProbe, ensuring it does not exceed the total eligible centroids - centroidsToProbe = min(centroidsToProbe+increaseAmount, len(eligibleCentroidIDs)) + centroidsToProbe = min(centroidsToProbe+increaseAmount, totalEligibleCentroids) } // prepare the exclude/include list for the next iteration if include { @@ -502,7 +577,7 @@ func (v *vectorIndexWrapper) getSelector(ids []int64, include bool) (selector fa if include { selector, err = faiss.NewIDSelectorBatch(ids) } else { - selector, err = faiss.NewIDSelectorNot(ids) + selector, err = faiss.NewIDSelectorBatchNot(ids) } if err != nil { return nil, err diff --git a/vendor/github.com/blevesearch/zapx/v16/merge.go b/vendor/github.com/blevesearch/zapx/v16/merge.go index 6197af1178..69b827f984 100644 --- a/vendor/github.com/blevesearch/zapx/v16/merge.go +++ b/vendor/github.com/blevesearch/zapx/v16/merge.go @@ -36,9 +36,21 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path. 
-func (*ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, +func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, closeCh chan struct{}, s seg.StatsReporter) ( [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { switch segmentx := segment.(type) { diff --git a/vendor/github.com/blevesearch/zapx/v16/new.go b/vendor/github.com/blevesearch/zapx/v16/new.go index c99b933d7b..41fcd23af6 100644 --- a/vendor/github.com/blevesearch/zapx/v16/new.go +++ b/vendor/github.com/blevesearch/zapx/v16/new.go @@ -42,11 +42,16 @@ var ValidateDocFields = func(field index.Field) error { // New creates an in-memory zap-encoded SegmentBase from a set of Documents func (z *ZapPlugin) New(results []index.Document) ( segment.Segment, uint64, error) { - return z.newWithChunkMode(results, DefaultChunkMode) + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) } func (*ZapPlugin) newWithChunkMode(results []index.Document, - chunkMode uint32) (segment.Segment, uint64, error) { + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -72,7 +77,7 @@ func (*ZapPlugin) 
newWithChunkMode(results []index.Document, } sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, - uint64(len(results)), storedIndexOffset, sectionsIndexOffset) + uint64(len(results)), storedIndexOffset, sectionsIndexOffset, config) // get the bytes written before the interim's reset() call // write it to the newly formed segment base. diff --git a/vendor/github.com/blevesearch/zapx/v16/segment.go b/vendor/github.com/blevesearch/zapx/v16/segment.go index 461fdf5add..2757b66750 100644 --- a/vendor/github.com/blevesearch/zapx/v16/segment.go +++ b/vendor/github.com/blevesearch/zapx/v16/segment.go @@ -39,8 +39,18 @@ func init() { reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + // Open returns a zap impl of a segment -func (*ZapPlugin) Open(path string) (segment.Segment, error) { +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { f, err := os.Open(path) if err != nil { return nil, err @@ -59,6 +69,7 @@ func (*ZapPlugin) Open(path string) (segment.Segment, error) { vecIndexCache: newVectorIndexCache(), synIndexCache: newSynonymIndexCache(), fieldDvReaders: make([]map[uint16]*docValueReader, len(segmentSections)), + config: config, }, f: f, mm: mm, @@ -111,6 +122,7 @@ type SegmentBase struct { size uint64 updatedFields map[string]*index.UpdateFieldInfo + config map[string]interface{} // config for the segment m sync.Mutex fieldFSTs map[uint16]*vellum.FST diff --git a/vendor/github.com/blevesearch/zapx/v17/.gitignore b/vendor/github.com/blevesearch/zapx/v17/.gitignore new file mode 100644 index 0000000000..46d1cfad54 --- /dev/null +++ 
b/vendor/github.com/blevesearch/zapx/v17/.gitignore @@ -0,0 +1,12 @@ +#* +*.sublime-* +*~ +.#* +.project +.settings +**/.idea/ +**/*.iml +.DS_Store +/cmd/zap/zap +*.test +tags diff --git a/vendor/github.com/blevesearch/zapx/v17/LICENSE b/vendor/github.com/blevesearch/zapx/v17/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/vendor/github.com/blevesearch/zapx/v17/README.md b/vendor/github.com/blevesearch/zapx/v17/README.md new file mode 100644 index 0000000000..4cbf1a145b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/README.md @@ -0,0 +1,163 @@ +# zapx file format + +The zapx module is a fork of the [zap](https://github.com/blevesearch/zap) module which maintains file format compatibility, but removes dependency on bleve, and instead depends only on the independent interface modules: + +- [bleve_index_api](https://github.com/blevesearch/bleve_index_api) +- [scorch_segment_api](https://github.com/blevesearch/scorch_segment_api) + +Advanced ZAP File Format Documentation is [here](zap.md). + +The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. 
+ +Current usage: + +- mmap the entire file +- crc-32 bytes and version are in fixed position at end of the file +- reading remainder of footer could be version specific +- remainder of footer gives us: + - 3 important offsets (docValue , fields index and stored data index) + - 2 important values (number of docs and chunk factor) +- field data is processed once and memoized onto the heap so that we never have to go back to disk for it +- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. +- access to all other indexed data follows the following pattern: + - first know the field name -> convert to id + - next navigate to term dictionary for that field + - some operations stop here and do dictionary ops + - next use dictionary to navigate to posting list for a specific term + - walk posting list + - if necessary, walk posting details as we go + - if location info is desired, consult location bitmap to see if it is there + +## stored fields section + +- for each document + - preparation phase: + - produce a slice of metadata bytes and data bytes + - produce these slices in field id order + - field value is appended to the data slice + - metadata slice is varint encoded with the following values for each field value + - field id (uint16) + - field type (byte) + - field value start offset in uncompressed data slice (uint64) + - field value length (uint64) + - field number of array positions (uint64) + - one additional value for each array position (uint64) + - compress the data slice using snappy + - file writing phase: + - remember the start offset for this document + - write out meta data length (varint uint64) + - write out compressed data length (varint uint64) + - write out the metadata bytes + - write out the compressed data bytes + +## stored 
fields idx + +- for each document + - write start offset (remembered from previous section) of stored data (big endian uint64) + +With this index and a known document number, we have direct access to all the stored field data. + +## posting details (freq/norm) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode term frequency (uint64) + - encode norm factor (float32) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
+ +## posting details (location) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode field (uint16) + - encode field pos (uint64) + - encode field start (uint64) + - encode field end (uint64) + - encode number of array positions to follow (uint64) + - encode each array position (each uint64) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
+ +## postings list section + +- for each posting list + - preparation phase: + - encode roaring bitmap posting list to bytes (so we know the length) + - file writing phase: + - remember the start position for this posting list + - write freq/norm details offset (remembered from previous, as varint uint64) + - write location details offset (remembered from previous, as varint uint64) + - write length of encoded roaring bitmap + - write the serialized roaring bitmap data + +## dictionary + +- for each field + - preparation phase: + - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous) + - file writing phase: + - remember the start position of this persistDictionary + - write length of vellum data (varint uint64) + - write out vellum data + +## fields section + +- for each field + - file writing phase: + - remember start offset for each field + - write dictionary address (remembered from previous) (varint uint64) + - write length of field name (varint uint64) + - write field name bytes + +## fields idx + +- for each field + - file writing phase: + - write big endian uint64 of start offset for each field + +NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. 
+ +## fields DocValue + +- for each field + - preparation phase: + - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data + - produce a slice remembering the length of each chunk + - file writing phase: + - remember the start position of this first field DocValue offset in the footer + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any +read operation leverage that meta information to extract the document specific data from the file. + +## footer + +- file writing phase + - write number of docs (big endian uint64) + - write stored field index location (big endian uint64) + - write field index location (big endian uint64) + - write field docValue location (big endian uint64) + - write out chunk factor (big endian uint32) + - write out version (big endian uint32) + - write out file CRC of everything preceding this (big endian uint32) diff --git a/vendor/github.com/blevesearch/zapx/v17/build.go b/vendor/github.com/blevesearch/zapx/v17/build.go new file mode 100644 index 0000000000..f04cae0baf --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/build.go @@ -0,0 +1,239 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "fmt" + "io" + "math" + "os" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" +) + +const Version uint32 = 17 + +const Type string = "zap" + +const fieldNotUninverted uint64 = math.MaxUint64 + +func (sb *SegmentBase) Persist(path string) error { + return PersistSegmentBase(sb, path) +} + +// WriteTo is an implementation of io.WriterTo interface. +func (sb *SegmentBase) WriteTo(w io.Writer) (int64, error) { + if w == nil { + return 0, fmt.Errorf("invalid writer found") + } + + n, err := persistSegmentBaseToWriter(sb, w) + return int64(n), err +} + +// PersistSegmentBase persists SegmentBase in the zap file format. +func PersistSegmentBase(sb *SegmentBase, path string) error { + // since in-memory data is not processed by any writer callback, + // check with the latest writer to see if data needs to be processed + writer, err := NewFileWriter(nil, []byte(path)) + if err != nil { + return err + } + if writer.id != sb.fileReader.id { + // rewrite the segment base with the latest writer callback; + // the rewrite will persist the segment to the given path (upon + // success), so we should return early to avoid overwriting again. 
+ return rewriteSegmentBase(sb, path) + } + + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + _, err = persistSegmentBaseToWriter(sb, f) + if err != nil { + cleanup() + return err + } + + err = f.Sync() + if err != nil { + cleanup() + return err + } + + err = f.Close() + if err != nil { + cleanup() + return err + } + + return err +} + +// rewrites the segment base with the latest writer callback by leveraging +// the merge path +func rewriteSegmentBase(sb *SegmentBase, path string) error { + closeCh := make(chan struct{}) + defer close(closeCh) + _, _, err := mergeSegmentBases([]*SegmentBase{sb}, []*roaring.Bitmap{nil}, + path, DefaultChunkMode, closeCh, nil, nil) + if err != nil { + return err + } + return nil +} + +type bufWriter struct { + w *bufio.Writer + n int +} + +func (br *bufWriter) Write(in []byte) (int, error) { + n, err := br.w.Write(in) + br.n += n + return n, err +} + +func persistSegmentBaseToWriter(sb *SegmentBase, w io.Writer) (int, error) { + br := &bufWriter{w: bufio.NewWriter(w)} + + _, err := br.Write(sb.mem) + if err != nil { + return 0, err + } + + err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.sectionsIndexOffset, + sb.chunkMode, sb.memCRC, br, sb.fileReader.id) + if err != nil { + return 0, err + } + + err = br.w.Flush() + if err != nil { + return 0, err + } + + return br.n, nil +} + +func persistStoredFieldValues(fieldID int, + storedFieldValues [][]byte, stf []byte, spf [][]uint64, + curr int, metaEncode varintEncoder, data []byte) ( + int, []byte, error) { + for i := 0; i < len(storedFieldValues); i++ { + // encode field + _, err := metaEncode(uint64(fieldID)) + if err != nil { + return 0, nil, err + } + // encode type + _, err = metaEncode(uint64(stf[i])) + if err != nil { + return 0, nil, err + } + // encode start offset + _, err = metaEncode(uint64(curr)) + if err != nil { + return 0, nil, err 
+ } + // end len + _, err = metaEncode(uint64(len(storedFieldValues[i]))) + if err != nil { + return 0, nil, err + } + // encode number of array pos + _, err = metaEncode(uint64(len(spf[i]))) + if err != nil { + return 0, nil, err + } + // encode all array positions + for _, pos := range spf[i] { + _, err = metaEncode(pos) + if err != nil { + return 0, nil, err + } + } + + data = append(data, storedFieldValues[i]...) + curr += len(storedFieldValues[i]) + } + + return curr, data, nil +} + +func InitSegmentBase(mem []byte, memCRC uint32, chunkMode uint32, numDocs uint64, + storedIndexOffset uint64, sectionsIndexOffset uint64, + config map[string]interface{}) (*SegmentBase, error) { + sb := &SegmentBase{ + mem: mem, + memCRC: memCRC, + chunkMode: chunkMode, + numDocs: numDocs, + storedIndexOffset: storedIndexOffset, + sectionsIndexOffset: sectionsIndexOffset, + fieldDvReaders: make([][]*docValueReader, len(segmentSections)), + updatedFields: make(map[string]*index.UpdateFieldInfo), + invIndexCache: newInvertedIndexCache(), + vecIndexCache: newVectorIndexCache(), + synIndexCache: newSynonymIndexCache(), + nstIndexCache: newNestedIndexCache(), + // following fields gets populated by loadFields + fieldsMap: make(map[string]uint16), + fieldsOptions: make(map[string]index.FieldIndexingOptions), + fieldsInv: make([]string, 0), + config: config, + } + sb.updateSize() + + // initialize the file reader with an empty callback + // since the data is not yet persisted, the data has also + // not been processed by any writer callback + fileReader, err := NewFileReader("", nil) + if err != nil { + return nil, err + } + sb.fileReader = fileReader + + // load the data/section starting offsets for each field + // by via the sectionsIndexOffset as starting point. 
+ err = sb.loadFields() + if err != nil { + return nil, err + } + + err = sb.loadDvReaders() + if err != nil { + return nil, err + } + + // initialize any of the caches if needed + err = sb.nstIndexCache.initialize(sb.numDocs, sb.getEdgeListOffset(), sb.mem) + if err != nil { + return nil, err + } + + return sb, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/centroid_index.go b/vendor/github.com/blevesearch/zapx/v17/centroid_index.go new file mode 100644 index 0000000000..cdec662e22 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/centroid_index.go @@ -0,0 +1,82 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "fmt" + + faiss "github.com/blevesearch/go-faiss" +) + +func (sb *SegmentBase) GetCoarseQuantizer(field string) (interface{}, error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 <= 0 { + return nil, fmt.Errorf("field %s does not exist in segment", field) + } + + vectorSection := sb.fieldsSectionsMap[fieldIDPlus1-1][SectionFaissVectorIndex] + // check if the field has a vector section in the segment. 
+ if vectorSection <= 0 { + return nil, fmt.Errorf("field %s does not have a vector section in the segment", field) + } + + pos := int(vectorSection) + // doc values and vector optimization type + for i := 0; i < 3; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + } + + numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + // length of the vector to docID map + _, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + // vector to docID mapping + for i := 0; i < int(numVecs); i++ { + _, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + } + + // type of index + indexType, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + // todo: might wanna use the vector cache here, early tests didn't show a big diff + faissIndex, err := faiss.ReadIndexFromBuffer(sb.mem[pos:pos+int(indexSize)], faissIOFlags) + if err != nil { + return nil, err + } + pos += int(indexSize) + + if faissIndexType(indexType) == faissBIVFIndex { + binaryIndexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + binaryIndex, err := faiss.ReadBinaryIndexFromBuffer(sb.mem[pos:pos+int(binaryIndexSize)], faissIOFlags) + if err != nil { + return nil, err + } + return newFaissBinaryIndex(binaryIndex, faissIndex) + } + return newFaissFloat32Index(faissIndex) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/chunk.go b/vendor/github.com/blevesearch/zapx/v17/chunk.go new file mode 100644 index 0000000000..53d124f063 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/chunk.go @@ -0,0 +1,84 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "errors" + "fmt" +) + +// LegacyChunkMode was the original chunk mode (always chunk size 1024) +// this mode is still used for chunking doc values. +var LegacyChunkMode uint32 = 1024 + +// DefaultChunkMode is the most recent improvement to chunking and should +// be used by default. +var DefaultChunkMode uint32 = 1026 + +var ErrChunkSizeZero = errors.New("chunk size is zero") + +// getChunkSize returns the chunk size for the given chunkMode, cardinality, and +// maxDocs. +// +// In error cases, the returned chunk size will be 0. Caller can differentiate +// between a valid chunk size of 0 and an error by checking for ErrChunkSizeZero. +func getChunkSize(chunkMode uint32, cardinality uint64, maxDocs uint64) (uint64, error) { + switch { + case chunkMode == 0: + return 0, ErrChunkSizeZero + + // any chunkMode <= 1024 will always chunk with chunkSize=chunkMode + case chunkMode <= 1024: + // legacy chunk size + return uint64(chunkMode), nil + + case chunkMode == 1025: + // attempt at simple improvement + // theory - the point of chunking is to put a bound on the maximum number of + // calls to Next() needed to find a random document. ie, you should be able + // to do one jump to the correct chunk, and then walk through at most + // chunk-size items + // previously 1024 was chosen as the chunk size, but this is particularly + // wasteful for low cardinality terms. 
the observation is that if there + // are less than 1024 items, why not put them all in one chunk, + // this way you'll still achieve the same goal of visiting at most + // chunk-size items. + // no attempt is made to tweak any other case + if cardinality <= 1024 { + if maxDocs == 0 { + return 0, ErrChunkSizeZero + } + return maxDocs, nil + } + return 1024, nil + + case chunkMode == 1026: + // improve upon the ideas tested in chunkMode 1025 + // the observation that the fewest number of dense chunks is the most + // desirable layout, given the built-in assumptions of chunking + // (that we want to put an upper-bound on the number of items you must + // walk over without skipping, currently tuned to 1024) + // + // 1. compute the number of chunks needed (max 1024/chunk) + // 2. convert to chunkSize, dividing into maxDocs + numChunks := (cardinality / 1024) + 1 + chunkSize := maxDocs / numChunks + if chunkSize == 0 { + return 0, ErrChunkSizeZero + } + return chunkSize, nil + } + return 0, fmt.Errorf("unknown chunk mode %d", chunkMode) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/contentcoder.go b/vendor/github.com/blevesearch/zapx/v17/contentcoder.go new file mode 100644 index 0000000000..65c784ca40 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/contentcoder.go @@ -0,0 +1,281 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "io" + "reflect" + + "github.com/golang/snappy" +) + +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + +type chunkedContentCoder struct { + bytesWritten uint64 // moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS. + + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + compressed []byte // temp buf for snappy compression + + w io.Writer + progressiveWrite bool + skipCompression bool + + chunkMeta []MetaData + chunkMetaBuf bytes.Buffer + chunkBuf bytes.Buffer +} + +// MetaData represents the data information inside a +// chunk. +type MetaData struct { + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid +} + +// newChunkedContentCoder returns a new chunk content coder which +// packs data into chunks based on the provided chunkSize +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool, skipCompression bool, +) *chunkedContentCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedContentCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, + skipCompression: skipCompression, + } + + return rv +} + +// Reset lets you reuse this chunked content coder. Buffers are reset +// and re used. You cannot change the chunk size. 
+func (c *chunkedContentCoder) Reset() { + c.currChunk = 0 + c.bytesWritten = 0 + c.final = c.final[:0] + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } + c.chunkMeta = c.chunkMeta[:0] +} + +func (c *chunkedContentCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } + if cap(c.chunkMeta) < total { + c.chunkMeta = make([]MetaData, 0, total) + } +} + +// Close indicates you are done calling Add() this allows +// the final chunk to be encoded. +func (c *chunkedContentCoder) Close() error { + return c.flushContents() +} + +func (c *chunkedContentCoder) incrementBytesWritten(val uint64) { + c.bytesWritten += val +} + +func (c *chunkedContentCoder) getBytesWritten() uint64 { + return c.bytesWritten +} + +func (c *chunkedContentCoder) writeChunkMeta() ([]byte, error) { + + // flush the contents, with meta information at first + buf := make([]byte, binary.MaxVarintLen64) + var metaData []byte + n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) + _, err := c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the metaData slice + for _, meta := range c.chunkMeta { + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) + if err != nil { + return nil, err + } + } + + // write the metadata to final data + metaData = c.chunkMetaBuf.Bytes() + c.final = append(c.final, metaData...) 
+ + return metaData, nil +} + +func (c *chunkedContentCoder) flushContents() error { + var metaData []byte + var err error + // Meta data is only needed if we have more than 1 doc in the chunk, + // otherwise we can just write the doc value directly + if c.chunkSize != 1 { + metaData, err = c.writeChunkMeta() + if err != nil { + return err + } + } + + // write the compressed data to the final data + if c.skipCompression { + c.compressed = c.chunkBuf.Bytes() + } else { + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + } + + // process the compressed data using the callback + if fw, ok := c.w.(*FileWriter); ok && fw != nil { + c.compressed = fw.process(c.compressed) + } + + c.incrementBytesWritten(uint64(len(c.compressed))) + c.final = append(c.final, c.compressed...) + + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } + + return nil +} + +// Add encodes the provided byte slice into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // flush out the previous chunk details + err := c.flushContents() + if err != nil { + return err + } + // clearing the chunk specific meta for next chunk + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + c.chunkMeta = c.chunkMeta[:0] + c.currChunk = chunk + } + + // get the starting offset for this doc + dvOffset := c.chunkBuf.Len() + dvSize, err := c.chunkBuf.Write(vals) + if err != nil { + return err + } + + c.chunkMeta = append(c.chunkMeta, MetaData{ + DocNum: docNum, + DocDvOffset: uint64(dvOffset + dvSize), + }) + return nil +} + +// Write commits all the encoded chunked contents to the provided writer. +// +// | ..... data ..... 
| chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +func (c *chunkedContentCoder) Write() (int, error) { + var tw int + + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] + } + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(c.final, chunkOffset) + nw, err := c.w.Write(c.final[:n]) + tw += nw + if err != nil { + return tw, err + } + } + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + c.final = c.final[:0] + + return tw, nil +} + +// ReadDocValueBoundary elicits the start, end offsets from a +// metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + return start, metaHeaders[chunk].DocDvOffset +} diff --git a/vendor/github.com/blevesearch/zapx/v17/count.go b/vendor/github.com/blevesearch/zapx/v17/count.go new file mode 100644 index 0000000000..b6135359fb --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/count.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "hash/crc32" + "io" + + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +// CountHashWriter is a wrapper around a Writer which counts the number of +// bytes which have been written and computes a crc32 hash +type CountHashWriter struct { + w io.Writer + crc uint32 + n int + s segment.StatsReporter +} + +// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer +func NewCountHashWriter(w io.Writer) *CountHashWriter { + return &CountHashWriter{w: w} +} + +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + +// Write writes the provided bytes to the wrapped writer and counts the bytes +func (c *CountHashWriter) Write(b []byte) (int, error) { + n, err := c.w.Write(b) + c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) + c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } + return n, err +} + +// Count returns the number of bytes written +func (c *CountHashWriter) Count() int { + return c.n +} + +// Sum32 returns the CRC-32 hash of the content written to this writer +func (c *CountHashWriter) Sum32() uint32 { + return c.crc +} diff --git a/vendor/github.com/blevesearch/zapx/v17/dict.go b/vendor/github.com/blevesearch/zapx/v17/dict.go new file mode 100644 index 0000000000..5ec7e27fda --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/dict.go @@ -0,0 +1,188 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "fmt" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" +) + +// Dictionary is the zap representation of the term dictionary +type Dictionary struct { + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST + + fstReader *vellum.Reader + + bytesRead uint64 +} + +// represents an immutable, empty dictionary +var emptyDictionary = &Dictionary{} + +func (d *Dictionary) Cardinality() int { + if d.fst != nil { + return d.fst.Len() + } + return 0 +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { + var preallocPL *PostingsList + pl, ok := prealloc.(*PostingsList) + if ok && pl != nil { + preallocPL = pl + } + return d.postingsList(term, except, preallocPL) +} + +func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + if d.fstReader == nil { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + postingsOffset, exists, err := d.fstReader.Get(term) + + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if !exists { + if rv == nil || rv == 
emptyPostingsList { + return emptyPostingsList, nil + } + return d.postingsListInit(rv, except), nil + } + + return d.postingsListFromOffset(postingsOffset, except, rv) +} + +func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + rv = d.postingsListInit(rv, except) + + err := rv.read(postingsOffset, d) + if err != nil { + return nil, err + } + + return rv, nil +} + +func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { + if rv == nil || rv == emptyPostingsList { + rv = &PostingsList{} + } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + + *rv = PostingsList{} // clear the struct + + rv.postings = postings + } + rv.sb = d.sb + rv.except = except + return rv +} + +func (d *Dictionary) Contains(key []byte) (bool, error) { + if d.fst != nil { + return d.fst.Contains(key) + } + return false, nil +} + +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (d *Dictionary) AutomatonIterator(a segment.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { + if d.fst != nil { + rv := &DictionaryIterator{ + d: d, + } + + itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + + return rv + } + return emptyDictionaryIterator +} + +func (d *Dictionary) incrementBytesRead(val uint64) { + d.bytesRead += val +} + +func (d *Dictionary) BytesRead() uint64 { + return d.bytesRead +} + +func (d *Dictionary) ResetBytesRead(val uint64) { + d.bytesRead = val +} + +func (d *Dictionary) BytesWritten() uint64 { + return 0 +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry + omitCount bool +} + 
+var emptyDictionaryIterator = &DictionaryIterator{} + +// Next returns the next entry in the dictionary +func (i *DictionaryIterator) Next() (*index.DictEntry, error) { + if i.err != nil && i.err != vellum.ErrIteratorDone { + return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } + term, postingsOffset := i.itr.Current() + if fitr, ok := i.itr.(vellum.FuzzyIterator); ok { + i.entry.EditDistance = fitr.EditDistance() + } + i.entry.Term = string(term) + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() + } + i.err = i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/docvalues.go b/vendor/github.com/blevesearch/zapx/v17/docvalues.go new file mode 100644 index 0000000000..688bda8926 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/docvalues.go @@ -0,0 +1,409 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "reflect" + "sort" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var reflectStaticSizedocValueReader int + +func init() { + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) +} + +type docNumTermsVisitor func(docNum uint64, terms []byte) error + +type docVisitState struct { + dvrs []*docValueReader + segment *SegmentBase + + bytesRead uint64 +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the disk (pertaining to the +// docvalues) while querying. +// the loadDvChunk retrieves the next chunk of docvalues +// and the bytes retrieved off the disk pertaining to that +// is accounted as well. +func (d *docVisitState) incrementBytesRead(val uint64) { + d.bytesRead += val +} + +func (d *docVisitState) BytesRead() uint64 { + return d.bytesRead +} + +func (d *docVisitState) BytesWritten() uint64 { + return 0 +} + +func (d *docVisitState) ResetBytesRead(val uint64) { + d.bytesRead = val +} + +type docValueReader struct { + field string + indexOptions index.FieldIndexingOptions + curChunkNum uint64 + chunkOffsets []uint64 + dvDataLoc uint64 + curChunkHeader []MetaData // Only populated when chunking is enabled + curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression + + bytesRead uint64 +} + +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + SizeOfPtr + + len(di.field) + + len(di.chunkOffsets)*SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) +} + +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } + + rv.field = di.field + rv.indexOptions = di.indexOptions + rv.curChunkNum = math.MaxUint64 + 
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = rv.curChunkHeader[:0] + rv.curChunkData = nil + rv.uncompressed = rv.uncompressed[:0] + + return rv +} + +func (di *docValueReader) curChunkNumber() uint64 { + return di.curChunkNum +} + +func (sb *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { + // get the docValue offset for the given fields + if fieldDvLocStart == fieldNotUninverted { + // no docValues found, nothing to do + return nil, nil + } + + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(sb.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(sb.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen + + // 16 bytes since it corresponds to the length + // of chunk offsets and the position of the offsets + sb.incrementBytesRead(16) + } else { + return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) + } + + fdvIter := &docValueReader{ + curChunkNum: math.MaxUint64, + field: field, + indexOptions: sb.fieldsOptions[field], + chunkOffsets: make([]uint64, int(numChunks)), + } + + // read the chunk offsets + var offset uint64 + for i := 0; i < int(numChunks); i++ { + loc, read := binary.Uvarint(sb.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("corrupted chunk offset during segment load") + } + fdvIter.chunkOffsets[i] = loc + offset += uint64(read) + } + sb.incrementBytesRead(offset) + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + return fdvIter, nil +} + +func (d *docValueReader) 
getBytesRead() uint64 { + return d.bytesRead +} + +func (d *docValueReader) incrementBytesRead(val uint64) { + d.bytesRead += val +} + +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { + // advance to the chunk where the docValues + // reside for the given docNum + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } + + destChunkDataLoc += start + curChunkEnd += end + + var err error + // if skip chunking is enabled, each chunk has 1 document's docValues + if di.indexOptions.SkipDVChunking() { + di.curChunkData, err = s.fileReader.process(s.mem[destChunkDataLoc:curChunkEnd]) + if err != nil { + return err + } + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } + + // read the number of docs residing in the chunk + numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read the chunk") + } + chunkMetaLoc := destChunkDataLoc + uint64(read) + di.incrementBytesRead(uint64(read)) + offset := uint64(0) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } + for i := 0; i < int(numDocs); i++ { + // a non-positive count means a truncated or overflowing varint; + // bail out instead of advancing offset by a bogus amount + di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read docNum of chunk header entry %d", i) + } + offset += uint64(read) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read docDvOffset of chunk header entry %d", i) + } + offset += uint64(read) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := curChunkEnd - compressedDataLoc + di.incrementBytesRead(uint64(dataLength + offset)) +
di.curChunkData, err = s.fileReader.process(s.mem[compressedDataLoc : compressedDataLoc+dataLength]) + if err != nil { + return err + } + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil +} + +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + + // if chunk data is missing or chunk header is missing (when chunking is enabled), ignore chunk + if di.curChunkData == nil || (len(di.curChunkHeader) == 0 && !di.indexOptions.SkipDVChunking()) { + continue + } + + var uncompressed []byte + if di.indexOptions.SkipDVCompression() { + uncompressed = di.curChunkData + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + } + di.uncompressed = uncompressed + + // if chunking is skipped, then all docValues + // for the chunk belong to a single docNum + if di.indexOptions.SkipDVChunking() { + err = visitor(uint64(i), uncompressed) + if err != nil { + return err + } + continue + } + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + +func (di *docValueReader) visitDocValues(docNum uint64, + visitor index.DocValueVisitor) error { + + var start, end uint64 + if di.indexOptions.SkipDVChunking() { + // docNum directly maps to the chunk number + start = 0 + end = uint64(len(di.curChunkData)) + } else { + // binary search the term locations for the docNum + start, end = di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { + return nil + } + } + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if
len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + if di.indexOptions.SkipDVCompression() { + uncompressed = di.curChunkData + } else { + // uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + } + di.uncompressed = uncompressed + } + + // when chunking is skipped the whole decoded chunk belongs to this + // docNum, so the end bound must be the decoded length rather than + // the length of the (possibly compressed) curChunkData + if di.indexOptions.SkipDVChunking() { + end = uint64(len(uncompressed)) + } + + // pick the terms for the given docNum + uncompressed = uncompressed[start:end] + for { + i := bytes.IndexByte(uncompressed, index.DocValueTermSeparator) + if i < 0 { + break + } + + visitor(di.field, uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + + return nil +} + +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { + i := sort.Search(len(di.curChunkHeader), func(i int) bool { + return di.curChunkHeader[i].DocNum >= docNum + }) + if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { + return ReadDocValueBoundary(i, di.curChunkHeader) + } + return math.MaxUint64, math.MaxUint64 +} + +// VisitDocValues is an implementation of the +// DocValueVisitable interface +func (sb *SegmentBase) VisitDocValues(localDocNum uint64, fields []string, + visitor index.DocValueVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == nil { + dvs = &docVisitState{} + } else { + if dvs.segment != sb { + dvs.segment = sb + dvs.dvrs = nil + dvs.bytesRead = 0 + } + } + + var initDvReaders bool + if dvs.dvrs == nil { + dvs.dvrs = make([]*docValueReader, len(sb.fieldsInv)) + initDvReaders = true + } + + // find the chunkNumber where the docValues are stored + // NOTE: doc values continue to use legacy chunk mode + chunkFactor, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + var fieldIDPlus1, fieldID uint16 + var dvr, dvIter *docValueReader + var docInChunk uint64 + for _, field := range fields { + if fieldIDPlus1, ok = sb.fieldsMap[field]; !ok { + continue + } +
fieldID = fieldIDPlus1 - 1 + + if sb.fieldsOptions[field].SkipDVChunking() { + docInChunk = localDocNum + } else { + docInChunk = localDocNum / chunkFactor + } + + // initialize the docValueReader for the field if needed + if initDvReaders { + dvIter = sb.fieldDvReaders[SectionInvertedTextIndex][fieldID] + if dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + + dvr = dvs.dvrs[fieldID] + if dvr != nil { + // check if the chunk is already loaded + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, sb) + if err != nil { + return dvs, err + } + dvs.ResetBytesRead(dvr.getBytesRead()) + } else { + dvs.ResetBytesRead(0) + } + + _ = dvr.visitDocValues(localDocNum, visitor) + } + } + return dvs, nil +} + +// VisitableDocValueFields returns the list of fields with +// persisted doc value terms ready to be visitable using the +// VisitDocValues method. +func (sb *SegmentBase) VisitableDocValueFields() ([]string, error) { + return sb.fieldDvNames, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/enumerator.go b/vendor/github.com/blevesearch/zapx/v17/enumerator.go new file mode 100644 index 0000000000..972a224165 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/enumerator.go @@ -0,0 +1,138 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package zap + +import ( + "bytes" + + "github.com/blevesearch/vellum" +) + +// enumerator provides an ordered traversal of multiple vellum +// iterators. Like JOIN of iterators, the enumerator produces a +// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, +// then iteratorIndex ASC, where the same key might be seen or +// repeated across multiple child iterators. +type enumerator struct { + itrs []vellum.Iterator + currKs [][]byte + currVs []uint64 + + lowK []byte + lowIdxs []int + lowCurr int +} + +// newEnumerator returns a new enumerator over the vellum Iterators +func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { + rv := &enumerator{ + itrs: itrs, + currKs: make([][]byte, len(itrs)), + currVs: make([]uint64, len(itrs)), + lowIdxs: make([]int, 0, len(itrs)), + } + for i, itr := range rv.itrs { + rv.currKs[i], rv.currVs[i] = itr.Current() + } + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { + return rv, vellum.ErrIteratorDone + } + return rv, nil +} + +// updateMatches maintains the low key matches based on the currKs +func (m *enumerator) updateMatches(skipEmptyKey bool) { + m.lowK = nil + m.lowIdxs = m.lowIdxs[:0] + m.lowCurr = 0 + + for i, key := range m.currKs { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys + continue + } + + cmp := bytes.Compare(key, m.lowK) + if cmp < 0 || len(m.lowIdxs) == 0 { + // reached a new low + m.lowK = key + m.lowIdxs = m.lowIdxs[:0] + m.lowIdxs = append(m.lowIdxs, i) + } else if cmp == 0 { + m.lowIdxs = append(m.lowIdxs, i) + } + } +} + +// Current returns the enumerator's current key, iterator-index, and +// value. If the enumerator is not pointing at a valid value (because +// Next returned an error previously), Current will return nil,0,0. 
+func (m *enumerator) Current() ([]byte, int, uint64) { + var i int + var v uint64 + if m.lowCurr < len(m.lowIdxs) { + i = m.lowIdxs[m.lowCurr] + v = m.currVs[i] + } + return m.lowK, i, v +} + +// GetLowIdxsAndValues will return all of the iterator indices +// which point to the current key, and their corresponding +// values. This can be used by advanced caller which may need +// to peek into these other sets of data before processing. +func (m *enumerator) GetLowIdxsAndValues() ([]int, []uint64) { + values := make([]uint64, 0, len(m.lowIdxs)) + for _, idx := range m.lowIdxs { + values = append(values, m.currVs[idx]) + } + return m.lowIdxs, values +} + +// Next advances the enumerator to the next key/iterator/value result, +// else vellum.ErrIteratorDone is returned. +func (m *enumerator) Next() error { + m.lowCurr += 1 + if m.lowCurr >= len(m.lowIdxs) { + // move all the current low iterators forwards + for _, vi := range m.lowIdxs { + err := m.itrs[vi].Next() + if err != nil && err != vellum.ErrIteratorDone { + return err + } + m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() + } + // can skip any empty keys encountered at this point + m.updateMatches(true) + } + if m.lowK == nil && len(m.lowIdxs) == 0 { + return vellum.ErrIteratorDone + } + return nil +} + +// Close all the underlying Iterators. The first error, if any, will +// be returned. +func (m *enumerator) Close() error { + var rv error + for _, itr := range m.itrs { + err := itr.Close() + if rv == nil { + rv = err + } + } + return rv +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache.go new file mode 100644 index 0000000000..863c105b8b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache.go @@ -0,0 +1,413 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/RoaringBitmap/roaring/v2" + faiss "github.com/blevesearch/go-faiss" +) + +// ----------------------------------------------------------------------------- + +func newVectorIndexCache() *vectorIndexCache { + return &vectorIndexCache{ + cache: make(map[uint16]*cacheEntry), + closeCh: make(chan struct{}), + } +} + +type vectorIndexCache struct { + closeCh chan struct{} + m sync.RWMutex + cache map[uint16]*cacheEntry + isClosed bool +} + +// Clear clears the entire vector index cache. +func (vc *vectorIndexCache) Clear() { + vc.m.Lock() + // if already closed, no-op + if vc.isClosed { + vc.m.Unlock() + return + } + vc.isClosed = true + close(vc.closeCh) + + // forcing a close on all indexes to avoid memory leaks. + for _, entry := range vc.cache { + entry.close() + } + vc.cache = nil + vc.m.Unlock() +} + +// loadOrCreate obtains the vector index from the cache or creates it if it's not present. +// useGPU indicates whether the field mapping requires GPU acceleration for this index. 
+func (vc *vectorIndexCache) loadOrCreate(fieldID uint16, mem []byte, numDocs uint32, except *roaring.Bitmap, useGPU bool, r *FileReader) ( + index faissIndex, mapping *idMapping, exclude *bitmap, err error) { + // first try to read from the cache with a read lock + vc.m.RLock() + if vc.isClosed { + // if cache is closed, no-op + vc.m.RUnlock() + return nil, nil, nil, nil + } + entry, ok := vc.cache[fieldID] + if ok { + vc.m.RUnlock() + return entry.load(except) + } + vc.m.RUnlock() + // cache miss, rebuild the cache entry under a write lock + vc.m.Lock() + defer vc.m.Unlock() + if vc.isClosed { + // if cache is closed, no-op + return nil, nil, nil, nil + } + // check again if we have the entry now + entry, ok = vc.cache[fieldID] + if ok { + return entry.load(except) + } + // still not present, create and cache it + return vc.createAndCacheLOCKED(fieldID, mem, numDocs, except, useGPU, r) +} + +// Rebuilding the cache on a miss. +func (vc *vectorIndexCache) createAndCacheLOCKED(fieldID uint16, mem []byte, + numDocs uint32, except *roaring.Bitmap, useGPU bool, r *FileReader) (index faissIndex, + mapping *idMapping, exclude *bitmap, err error) { + // if the cache doesn't have the entry, construct the vector to doc id map and + // the vector index out of the mem bytes and update the cache under lock. 
+ pos := 0 + numVecs, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read numVecs") + } + // if no vectors or no documents, return empty cache entry + if numVecs == 0 || numDocs == 0 { + return nil, nil, nil, nil + } + pos += n + // read the length of the docID list + listLen, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read docID list length") + } + pos += n + // read the entirety of the docID list through the file reader + buf, err := r.process(mem[pos : pos+int(listLen)]) + if err != nil { + return nil, nil, nil, fmt.Errorf("could not process docID list: %v", err) + } + pos += int(listLen) + bufPos := 0 + bufLen := len(buf) + // create a mapping using the numVecs and numDocs + mapping = newIDMapping(uint32(numVecs), numDocs) + for vecID := uint32(0); vecID < uint32(numVecs); vecID++ { + docID, n := binary.Uvarint(buf[bufPos:min(bufPos+binary.MaxVarintLen64, bufLen)]) + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read docID for vecID %d", vecID) + } + bufPos += n + mapping.add(vecID, uint32(docID)) + } + // read the type of the vector index + indexType, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read faiss index type") + } + pos += n + // read the faiss index size + indexSize, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read faiss index size") + } + pos += n + + // read the index bytes through the file reader + buf, err = r.process(mem[pos : pos+int(indexSize)]) + if err != nil { + return nil, nil, nil, err + } + + // read the serialized vector index + fIndex, err := faiss.ReadIndexFromBuffer(buf, faissIOFlagsReadOnly) + if err != nil { + return nil, nil, nil, fmt.Errorf("faiss index load error: %v", err) + } + pos += int(indexSize) + if
faissIndexType(indexType) == faissBIVFIndex { + // read the faiss binary index size + binSize, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + // a non-positive count means the varint could not be decoded; fail like + // every other header read here instead of moving pos by a bogus amount + if n <= 0 { + return nil, nil, nil, fmt.Errorf("could not read faiss binary index size") + } + pos += n + // read the index bytes through the file reader + buf, err = r.process(mem[pos : pos+int(binSize)]) + if err != nil { + return nil, nil, nil, err + } + // read the serialized binary vector index + bIndex, err := faiss.ReadBinaryIndexFromBuffer(buf, faissIOFlagsReadOnly) + if err != nil { + return nil, nil, nil, fmt.Errorf("faiss binary index load error: %v", err) + } + pos += int(binSize) + index, err = newFaissBinaryIndex(bIndex, fIndex) + if err != nil { + return nil, nil, nil, fmt.Errorf("faiss binary index creation error: %v", err) + } + } else { + if useGPU { + index, err = newFaissGPUFloat32Index(fIndex) + } else { + index, err = newFaissFloat32Index(fIndex) + } + if err != nil { + return nil, nil, nil, fmt.Errorf("faiss float32 index creation error: %v", err) + } + } + // update the cache + vc.insertLOCKED(fieldID, index, mapping) + return index, mapping, getExcludedVectors(mapping, except), nil +} + +func (vc *vectorIndexCache) insertLOCKED(fieldID uint16, + index faissIndex, mapping *idMapping) { + // the first time we've hit the cache, try to spawn a monitoring routine + // which will reconcile the moving averages for all the fields being hit + if len(vc.cache) == 0 { + go vc.monitor() + } + // initializing the alpha with 0.4 essentially means that we are favoring + // the history a little bit more relative to the current sample value. + // this makes the average to be kept above the threshold value for a + // longer time and thereby the index to be resident in the cache + // for longer time.
+ vc.cache[fieldID] = createCacheEntry(index, mapping, 0.4) +} + +func (vc *vectorIndexCache) incHit(fieldID uint16) { + vc.m.RLock() + entry, ok := vc.cache[fieldID] + if ok { + entry.incHit() + } + vc.m.RUnlock() +} + +func (vc *vectorIndexCache) decRef(fieldID uint16) { + vc.m.RLock() + entry, ok := vc.cache[fieldID] + if ok { + entry.decRef() + } + vc.m.RUnlock() +} + +// vectorIndexLocation describes where a cached vector index currently resides. +type vectorIndexLocation uint8 + +const ( + vectorIndexNotCached vectorIndexLocation = iota // not present in the cache + vectorIndexInCPU // loaded in CPU memory + vectorIndexInGPU // loaded in GPU memory +) + +// indexLocation reports where the vector index for fieldID currently resides. +func (vc *vectorIndexCache) indexLocation(fieldID uint16) vectorIndexLocation { + vc.m.RLock() + defer vc.m.RUnlock() + if vc.isClosed { + return vectorIndexNotCached + } + entry, ok := vc.cache[fieldID] + if !ok { + return vectorIndexNotCached + } + if gpuIdx, ok := entry.index.(faissIndexGPU); ok && gpuIdx.inGPURam() { + return vectorIndexInGPU + } + return vectorIndexInCPU +} + +func (vc *vectorIndexCache) cleanup() bool { + vc.m.Lock() + cache := vc.cache + + // for every field reconcile the average with the current sample values + for fieldID, entry := range cache { + sample := atomic.LoadUint64(&entry.tracker.sample) + entry.tracker.add(sample) + + refCount := atomic.LoadInt64(&entry.refs) + // the comparison threshold as of now is (1 - a). mathematically it + // means that there is only 1 query per second on average as per history. + // and in the current second, there were no queries performed against + // this index. 
+ if entry.tracker.avg <= (1-entry.tracker.alpha) && refCount <= 0 { + atomic.StoreUint64(&entry.tracker.sample, 0) + delete(vc.cache, fieldID) + entry.close() + continue + } + atomic.StoreUint64(&entry.tracker.sample, 0) + } + + rv := len(vc.cache) == 0 + vc.m.Unlock() + return rv +} + +var monitorFreq = 1 * time.Second + +func (vc *vectorIndexCache) monitor() { + ticker := time.NewTicker(monitorFreq) + defer ticker.Stop() + for { + select { + case <-vc.closeCh: + return + case <-ticker.C: + exit := vc.cleanup() + if exit { + // no entries to be monitored, exit + return + } + } + } +} + +// ----------------------------------------------------------------------------- + +type ewma struct { + alpha float64 + avg float64 + // every hit to the cache entry is recorded as part of a sample + // which will be used to calculate the average in the next cycle of average + // computation (which is average traffic for the field till now). this is + // used to track the per second hits to the cache entries. + sample uint64 +} + +func (e *ewma) add(val uint64) { + if e.avg == 0.0 { + e.avg = float64(val) + } else { + // the exponentially weighted moving average + // X(t) = a.v + (1 - a).X(t-1) + e.avg = e.alpha*float64(val) + (1-e.alpha)*e.avg + } +} + +// ----------------------------------------------------------------------------- + +func createCacheEntry(index faissIndex, mapping *idMapping, alpha float64) *cacheEntry { + ce := &cacheEntry{ + index: index, + mapping: mapping, + tracker: &ewma{ + alpha: alpha, + sample: 1, + }, + refs: 1, + } + return ce +} + +type cacheEntry struct { + tracker *ewma + + // this is used to track the live references to the cache entry, + // such that while we do a cleanup() and we see that the avg is below a + // threshold we close/cleanup only if the live refs to the cache entry is 0. 
+ refs int64 + + index faissIndex + mapping *idMapping +} + +func (ce *cacheEntry) incHit() { + atomic.AddUint64(&ce.tracker.sample, 1) +} + +func (ce *cacheEntry) addRef() { + atomic.AddInt64(&ce.refs, 1) +} + +func (ce *cacheEntry) decRef() { + atomic.AddInt64(&ce.refs, -1) +} + +func (ce *cacheEntry) load(except *roaring.Bitmap) (faissIndex, *idMapping, *bitmap, error) { + ce.incHit() + ce.addRef() + return ce.index, ce.mapping, getExcludedVectors(ce.mapping, except), nil +} + +func (ce *cacheEntry) close() { + go func() { + if ce.index != nil { + ce.index.close() + } + ce.mapping = nil + }() +} + +// ----------------------------------------------------------------------------- + +func getExcludedVectors(idMap *idMapping, except *roaring.Bitmap) (exclude *bitmap) { + if except != nil && !except.IsEmpty() && idMap != nil { + numVecs := idMap.numVectors() + // if there are no vectors, nothing to exclude + if numVecs == 0 { + return exclude + } + // iterate over the docs present in the except bitmap to + // construct the vector exclude bitmap. 
we can guarantee that + // this except bitmap is immutable and derived from the segment + // snapshot, but the vector exclude bitmap is part of the + // SegmentBase's cache, because of which it is necessary to create + // a new vector exclude bitmap per cache load operation + // get an iterator over the except bitmap + exceptItr := except.Iterator() + // as we iterate over the except docIDs, get the vector IDs + // for those docIDs and set them in our exclude bitmap + for exceptItr.HasNext() { + docID := exceptItr.Next() + vecs, ok := idMap.vecsForDoc(docID) + if ok && len(vecs) > 0 { + if exclude == nil { + exclude = newBitmap(numVecs) + } + for _, vecID := range vecs { + exclude.set(vecID) + } + } + } + } + return exclude +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache_nosup.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache_nosup.go new file mode 100644 index 0000000000..ff152f95c4 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_cache_nosup.go @@ -0,0 +1,27 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build !vectors +// +build !vectors + +package zap + +type vectorIndexCache struct { +} + +func newVectorIndexCache() *vectorIndexCache { + return nil +} + +func (v *vectorIndexCache) Clear() {} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index.go new file mode 100644 index 0000000000..77f0104bf5 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index.go @@ -0,0 +1,115 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/json" + "errors" + + "github.com/blevesearch/go-faiss" +) + +var ( + errNilConfig error = errors.New("faiss index config is nil") + errNilIndex error = errors.New("faiss index is nil") + errNotSupported error = errors.New("operation not supported") +) + +// faissIndex is the abstract interface over the Faiss vector indices provided by the go-faiss library. +type faissIndex interface { + // adds the given vectors to the index. + add(vecs *vectorSet) error + // closes the index and releases any associated resources. + close() + // returns the dimensionality of the vectors in the index. + dim() int + // returns the metric type used by the index, which determines how distances between vectors are computed during search. + metricType() int + // ntotal returns the total number of vectors currently stored in the index. 
+ ntotal() int64 + // reconstructBatch reconstructs the original vectors for the given vector IDs in the index. + reconstructBatch(vecIDs []int64, prealloc []float32) ([]float32, error) + // performs a search on the index using the provided query vector and retrieves the top K nearest neighbors. + // Optional search constraints can be applied using the selector and additional search parameters. + search(qVector *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) + // writes out the index content into the provided FileWriter using a reusable buffer and + // returns any error encountered during the write process. + write(buf []byte, w *FileWriter) error + // returns the size of the index in bytes. + size() uint64 + // ----------------------------------------------------------------- + // casting methods to access index-specific operations below + // ----------------------------------------------------------------- + // returns this index as a faissIndexIVF if it is an IVF index, + // or nil if it is not; callers must nil-check the result. + castIVF() faissIndexIVF +} + +// faissIndexIVF is the interface for IVF-specific operations on Faiss vector indices. +type faissIndexIVF interface { + faissIndex + // returns the count of the selected vector IDs in each + // cluster of the IVF index, based on the provided selector. + clusterVectorCounts(sel faiss.Selector, nlist int) ([]int64, error) + // returns the top K cardinalities (number of vectors) of the centroids in the IVF index. + centroidCardinalities(limit int, descending bool) ([]uint64, [][]float32, error) + // returns the IVF index parameters, nprobe and nlist from the ivf index. + ivfParams() (nprobe, nlist int) + // performs a search on the flat index quantizer of the IVF index, considering only the + // clusters selected by the centroidSelector and returns the search results. 
+ searchQuantizer(qVector *vectorSet, centroidSelector faiss.Selector, centroidCount int64) ([]int64, []float32, error) + // performs a search on the IVF index by probing the specified clusters and returns the search results. + // We restrict the search to a caller-supplied set of pre-assigned clusters rather than probing internally. + searchClusters(eligibleCentroidIDs []int64, centroidDis []float32, + centroidsToProbe int, qVecSet *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) + // sets the direct map type for the IVF index. The direct map is essential for + // reconstructing vectors based on their sequential vector IDs in future merges. + setDirectMap(directMapType int) error + // sets the number of probes (nprobe) for the IVF index. nprobe determines how many + // inverted lists are probed during search, and is a key parameter that controls the + // trade-off between search accuracy and latency. + setNProbe(nprobe int32) + // trains the IVF index on the provided training data and adds the vectors to + // the trained index. The training step performs k-means clustering to partition + // the data space, which enables efficient non-exhaustive search during query time. + // directMap and nprobe must be set after this call (GPU sync clears them). + trainAndAdd(trainingData *vectorSet, vecsToAdd *vectorSet) error + // sets the quantizers for the IVF index. The quantizer is a separate + // IVF index that is trained on the same data and used to assign vectors + // to clusters in the IVF index. + setQuantizers(trainedIndex faissIndexIVF) error + // returns whether the participating index is eligible for fast merge. + isMergeable() bool + // merges another faiss index into the current IVF index, + // with an offset to adjust vector IDs from the other index. + mergeFrom(other faissIndex, offset int64) error +} + +// faissIndexGPU is implemented by any index type that can reside in GPU memory. 
+type faissIndexGPU interface { + // inGPURam reports whether the index is currently loaded in GPU memory. + inGPURam() bool +} + +// faissQueryBatch is the interface for batched search operations on Faiss vector indices. +type faissQueryBatch interface { + // performs a batch search on the index using the provided query vector and parameters, + // and returns the distances and corresponding vector IDs of the top k results. + // NOTE: only vector search requests with the same `k` are batched together. + batchSearch(qVector *vectorSet, k int64) ([]float32, []int64, error) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_bivf.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_bivf.go new file mode 100644 index 0000000000..596e19d3f6 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_bivf.go @@ -0,0 +1,323 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "encoding/json" + + index "github.com/blevesearch/bleve_index_api" + faiss "github.com/blevesearch/go-faiss" +) + +// --------------------------------- +// Faiss Binary IVF Index +// --------------------------------- +type faissBinaryIndex struct { + cfg *faissIndexConfig + backing *faiss.IndexImpl + binary *faiss.BinaryIndexImpl +} + +func newFaissBinaryIndex(binary *faiss.BinaryIndexImpl, backing *faiss.IndexImpl) (index faissIndex, err error) { + // reject nil inputs: this object is only ever created with valid backing and binary indexes + if binary == nil || backing == nil { + return nil, errNilIndex + } + return &faissBinaryIndex{ + backing: backing, + binary: binary, + }, nil +} + +func newFaissBinaryIndexWithConfig(binary *faiss.BinaryIndexImpl, backing *faiss.IndexImpl, cfg *faissIndexConfig) (index faissIndex, err error) { + if binary == nil || backing == nil { + return nil, errNilIndex + } + if cfg == nil { + return nil, errNilConfig + } + + return &faissBinaryIndex{ + cfg: cfg, + backing: backing, + binary: binary, + }, nil +} + +func (b *faissBinaryIndex) add(vecs *vectorSet) error { + // add float data to backing index, then the binary data to binary index. NOTE(review): no rollback if the binary add fails after the backing add succeeded + err := b.backing.Add(vecs.floatData) + if err != nil { + return err + } + return b.binary.Add(vecs.binaryData) +} + +func (b *faissBinaryIndex) close() { + b.binary.Close() + b.backing.Close() +} + +func (b *faissBinaryIndex) dim() int { + return b.binary.D() +} + +func (b *faissBinaryIndex) metricType() int { + return b.backing.MetricType() +} + +func (b *faissBinaryIndex) ntotal() int64 { + return b.binary.Ntotal() +} + +func (b *faissBinaryIndex) reconstructBatch(vecIDs []int64, prealloc []float32) ([]float32, error) { + // reconstruct vectors from backing index + return b.backing.ReconstructBatch(vecIDs, prealloc) +} + +func (b *faissBinaryIndex) search(qVector *vectorSet, k int64, selector faiss.Selector, params 
json.RawMessage) ([]float32, []int64, error) { + // search the binary index with oversampling and then do a re-ranking on the + // backing index to get the top K results + // first binarize the query vector if not already done + qVector.binarize() + // search the binary index with oversampling (binaryOversampleValue*k) to get a larger set of candidate binary IDs for re-ranking + _, binIDs, err := b.binary.SearchWithOptions(qVector.binaryData, binaryOversampleValue*k, + selector, params) + if err != nil { + return nil, nil, err + } + + // use backing index for re-ranking, compute the distances/scores for the + // retrieved binary IDs and then get the top K results based on those distances/scores. + distances, err := b.backing.DistCompute(qVector.floatData, binIDs) + if err != nil { + return nil, nil, err + } + // quick select algorithm for inplace partial sorting to get top K results + // based on distances/scores + scores, labels := topNIDsByDistance(distances, binIDs, int(k)) + return scores, labels, nil +} + +func (b *faissBinaryIndex) write(buf []byte, w *FileWriter) error { + backingBytes, err := faiss.WriteIndexIntoBuffer(b.backing) + if err != nil { + return err + } + backingBytes = w.process(backingBytes) + + // write the uvarint-encoded length of the serialized backing index bytes + n := binary.PutUvarint(buf, uint64(len(backingBytes))) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + _, err = w.Write(backingBytes) + if err != nil { + return err + } + + binaryBytes, err := faiss.WriteBinaryIndexIntoBuffer(b.binary) + if err != nil { + return err + } + binaryBytes = w.process(binaryBytes) + + // write the uvarint-encoded length of the serialized binary index bytes + n = binary.PutUvarint(buf, uint64(len(binaryBytes))) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + _, err = w.Write(binaryBytes) + if err != nil { + return err + } + return nil +} + +func (b *faissBinaryIndex) size() uint64 { + return b.binary.Size() + b.backing.Size() +} + +// 
----------------------------------------------------------------- +// casting methods to access index-specific operations below +// ----------------------------------------------------------------- +func (b *faissBinaryIndex) castIVF() faissIndexIVF { + if b.binary.IsIVFIndex() { + // return b itself, as faissBinaryIndex implements both the + // faissIndex and the faissIndexIVF interfaces. + return b + } + // not an IVF index, return nil. + return nil +} + +// ----------------------------------------------------------------- +// IVF-Index specific operations +// ----------------------------------------------------------------- + +func (b *faissBinaryIndex) centroidCardinalities(limit int, descending bool) ([]uint64, [][]float32, error) { + cardinalites, bCentroids, err := b.binary.ObtainKCentroidCardinalitiesFromIVFIndex(limit, descending) + if err != nil { + return nil, nil, err + } + centroids := make([][]float32, len(bCentroids)) + for i := range bCentroids { + centroids[i] = make([]float32, len(bCentroids[i])) + for j := range bCentroids[i] { + centroids[i][j] = float32(bCentroids[i][j]) + } + } + return cardinalites, centroids, nil +} + +func (b *faissBinaryIndex) clusterVectorCounts(sel faiss.Selector, nlist int) ([]int64, error) { + return b.binary.ObtainClusterVectorCountsFromIVFIndex(sel, nlist) +} + +func (b *faissBinaryIndex) ivfParams() (nprobe, nlist int) { + return b.binary.IVFParams() +} + +func (b *faissBinaryIndex) searchQuantizer(qVector *vectorSet, centroidSelector faiss.Selector, centroidCount int64) ([]int64, []float32, error) { + // binarize the query vector if not already done + qVector.binarize() + ids, dis, err := b.binary.ObtainClustersWithDistancesFromIVFIndex(qVector.binaryData, centroidSelector, centroidCount) + if err != nil { + return nil, nil, err + } + distances := make([]float32, len(dis)) + for i, d := range dis { + distances[i] = float32(d) + } + return ids, distances, nil +} + +func (b *faissBinaryIndex) 
searchClusters(eligibleCentroidIDs []int64, centroidDis []float32, + centroidsToProbe int, qVector *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) { + // binarize the query vector if not already done + qVector.binarize() + // convert the float distances to int32 distances (truncating conversion) for the binary index search + binaryCentroidDis := make([]int32, len(centroidDis)) + for i, d := range centroidDis { + binaryCentroidDis[i] = int32(d) + } + // search the binary index without oversampling, since we are already searching a + // limited number of centroids specified by centroidsToProbe + _, binIDs, err := b.binary.SearchClustersFromIVFIndex(eligibleCentroidIDs, binaryCentroidDis, + centroidsToProbe, qVector.binaryData, k, selector, params) + if err != nil { + return nil, nil, err + } + + // use backing index for re-ranking, compute the distances/scores for the + // retrieved binary IDs and then get the top K results based on those distances/scores. + // reranking is still necessary since hamming distance has a lot of collisions + distances, err := b.backing.DistCompute(qVector.floatData, binIDs) + if err != nil { + return nil, nil, err + } + // quick select algorithm for inplace partial sorting to get top K results + // based on distances/scores + scores, labels := topNIDsByDistance(distances, binIDs, int(k)) + return scores, labels, nil +} + +func (b *faissBinaryIndex) setDirectMap(directMapType int) error { + return b.binary.SetDirectMap(directMapType) +} + +func (b *faissBinaryIndex) setNProbe(nprobe int32) { + b.binary.SetNProbe(nprobe) +} + +func (b *faissBinaryIndex) trainAndAdd(trainingData *vectorSet, vecsToAdd *vectorSet) error { + // train the backing index with the floatData only if it is an SQ index; the binary index is always trained + var err error + if b.backing.IsSQIndex() { + err = b.backing.Train(trainingData.floatData) + if err != nil { + return err + } + } + + err = b.binary.Train(trainingData.binaryData) + if err != nil { + return err + } + return b.add(vecsToAdd) +} + 
+func (b *faissBinaryIndex) setQuantizers(trainedIndex faissIndexIVF) error { + if idx, ok := trainedIndex.(*faissBinaryIndex); ok { + // always set quantizers for the binary index, and for the backing index only if the trained backing index is an SQ index + var err error + if idx.backing.IsSQIndex() { + err = b.backing.SetQuantizers(idx.backing) + if err != nil { + return err + } + } + err = b.binary.SetQuantizers(idx.binary) + if err != nil { + return err + } + return nil + } + return errNotSupported +} + +func (b *faissBinaryIndex) isMergeable() bool { + if b.cfg != nil { + switch b.cfg.optimizationType { + case index.IndexBIVFWithBackingFlat: + // the flat backing index currently doesn't support merge_from + return false + case index.IndexBIVFWithBackingSQ8: + return b.backing.Ntotal() > ivfThreshold + } + } + return false +} + +func (b *faissBinaryIndex) mergeFrom(other faissIndex, offset int64) error { + if idx, ok := other.(*faissBinaryIndex); ok { + if !idx.isMergeable() { + return errNotSupported + } + // merge the backing and the binary index; idx.isMergeable() above has already + // rejected flat-backed source indexes. the backing add_id is kept to 0 since we will + // be merging the largest set of indexes which will be sequential in the list + // of segments being merged. NOTE(review): the binary merge passes offset instead of 0 - confirm intended. + err := b.backing.MergeFrom(idx.backing, 0) + if err != nil { + return err + } + err = b.binary.MergeFrom(idx.binary, offset) + if err != nil { + return err + } + + return nil + } + return errNotSupported +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_float32.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_float32.go new file mode 100644 index 0000000000..bfcd7267c5 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_float32.go @@ -0,0 +1,199 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "encoding/json" + + index "github.com/blevesearch/bleve_index_api" + faiss "github.com/blevesearch/go-faiss" +) + +// --------------------------------- +// Faiss Float32 Index +// --------------------------------- +type faissFloat32Index struct { + cfg *faissIndexConfig + idx *faiss.IndexImpl +} + +func newFaissFloat32Index(idx *faiss.IndexImpl) (index faissIndex, err error) { + if idx == nil { + return nil, errNilIndex + } + return &faissFloat32Index{ + idx: idx, + }, nil +} + +func newFaissFloat32IndexWithConfig(idx *faiss.IndexImpl, cfg *faissIndexConfig) (index faissIndex, err error) { + if idx == nil { + return nil, errNilIndex + } + if cfg == nil { + return nil, errNilConfig + } + + return &faissFloat32Index{ + idx: idx, + cfg: cfg, + }, nil +} + +func (f *faissFloat32Index) add(vecs *vectorSet) error { + return f.idx.Add(vecs.floatData) +} + +func (f *faissFloat32Index) close() { + f.idx.Close() +} + +func (f *faissFloat32Index) dim() int { + return f.idx.D() +} + +func (f *faissFloat32Index) metricType() int { + return f.idx.MetricType() +} + +func (f *faissFloat32Index) ntotal() int64 { + return f.idx.Ntotal() +} + +func (f *faissFloat32Index) reconstructBatch(vecIDs []int64, prealloc []float32) ([]float32, error) { + return f.idx.ReconstructBatch(vecIDs, prealloc) +} + +func (f *faissFloat32Index) search(qVector *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) { + return 
f.idx.SearchWithOptions(qVector.floatData, k, selector, params) +} + +func (f *faissFloat32Index) write(buf []byte, w *FileWriter) error { + idxBytes, err := faiss.WriteIndexIntoBuffer(f.idx) + if err != nil { + return err + } + idxBytes = w.process(idxBytes) + + // write the uvarint-encoded length of the serialized vector index bytes + n := binary.PutUvarint(buf, uint64(len(idxBytes))) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + _, err = w.Write(idxBytes) + if err != nil { + return err + } + return nil +} + +func (f *faissFloat32Index) size() uint64 { + return f.idx.Size() +} + +// ----------------------------------------------------------------- +// casting methods to access index-specific operations below +// ----------------------------------------------------------------- +func (f *faissFloat32Index) castIVF() faissIndexIVF { + if f.idx.IsIVFIndex() { + // return f itself, as faissFloat32Index implements both the + // faissIndex and the faissIndexIVF interfaces. + return f + } + // not an IVF index, return nil. 
+ return nil +} + +// ----------------------------------------------------------------- +// IVF-Index specific operations +// ----------------------------------------------------------------- +func (f *faissFloat32Index) clusterVectorCounts(sel faiss.Selector, nlist int) ([]int64, error) { + return f.idx.ObtainClusterVectorCountsFromIVFIndex(sel, nlist) +} + +func (f *faissFloat32Index) centroidCardinalities(limit int, descending bool) ([]uint64, [][]float32, error) { + return f.idx.ObtainKCentroidCardinalitiesFromIVFIndex(limit, descending) +} + +func (f *faissFloat32Index) ivfParams() (nprobe, nlist int) { + return f.idx.IVFParams() +} + +func (f *faissFloat32Index) searchQuantizer(qVector *vectorSet, centroidSelector faiss.Selector, centroidCount int64) ([]int64, []float32, error) { + return f.idx.ObtainClustersWithDistancesFromIVFIndex(qVector.floatData, centroidSelector, centroidCount) +} + +func (f *faissFloat32Index) searchClusters(eligibleCentroidIDs []int64, centroidDis []float32, + centroidsToProbe int, qVecSet *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) { + return f.idx.SearchClustersFromIVFIndex(eligibleCentroidIDs, centroidDis, centroidsToProbe, qVecSet.floatData, k, selector, params) +} + +func (f *faissFloat32Index) setDirectMap(directMapType int) error { + return f.idx.SetDirectMap(directMapType) +} + +func (f *faissFloat32Index) setNProbe(nprobe int32) { + f.idx.SetNProbe(nprobe) +} + +func (f *faissFloat32Index) trainAndAdd(trainingData *vectorSet, vecsToAdd *vectorSet) error { + err := f.idx.Train(trainingData.floatData) + if err != nil { + return err + } + return f.add(vecsToAdd) +} + +func (f *faissFloat32Index) setQuantizers(trainedIndex faissIndexIVF) error { + centroidFaissIndex, ok := trainedIndex.(*faissFloat32Index) + if !ok { + // if not a float32 trained index, we cannot set it as the quantizer + // for the current index, return an error. 
+ return errNotSupported + } + return f.idx.SetQuantizers(centroidFaissIndex.idx) +} + +func (f *faissFloat32Index) isMergeable() bool { + if f.cfg != nil { + switch f.cfg.optimizationType { + case index.IndexOptimizedForLatency, index.IndexOptimizedForRecall: + return f.ntotal() > ivfSq8Threshold + case index.IndexOptimizedForMemoryEfficient, index.IndexIVFRaBitQ: + return f.ntotal() > ivfThreshold + default: + return false + } + } + return false +} + +func (f *faissFloat32Index) mergeFrom(other faissIndex, offset int64) error { + otherFaissIndex, ok := other.(*faissFloat32Index) + if !ok { + return errNotSupported + } + + if otherFaissIndex.isMergeable() { + return f.idx.MergeFrom(otherFaissIndex.idx, offset) + } + return errNotSupported +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_gpu_float32.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_gpu_float32.go new file mode 100644 index 0000000000..dfba5cd1ba --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_index_gpu_float32.go @@ -0,0 +1,304 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "encoding/json" + "sync/atomic" + + faiss "github.com/blevesearch/go-faiss" +) + +// gpuState holds all the resources related to gpu vector search, +// the gpu index and the request batcher to the gpu +type gpuState struct { + idx *faiss.GPUIndexImpl + batcher *requestBatcher +} + +// batchSearch implements faissQueryBatch directly on gpuState, so the batcher +// holds a reference to the index without going through the atomic pointer. +func (gs *gpuState) batchSearch(qVector *vectorSet, k int64) ([]float32, []int64, error) { + return gs.idx.Search(qVector.floatData, k) +} + +// --------------------------------- +// Faiss GPU Float32 Index +// --------------------------------- +// faissGPUFloat32Index wraps a CPU float32 index alongside a GPU index. +// The GPU serves unfiltered searches (no selector, empty params) and is tried first by +// add/trainAndAdd; all other operations (filtered searches, IVF cluster searches, +// SQ/IVF operations, serialization, etc.) are delegated to the CPU index. +type faissGPUFloat32Index struct { + cpuIdx *faiss.IndexImpl + + // doneCh is closed when initGPU completes. + doneCh chan struct{} + + // gpu holds both the GPU index and its request batcher as a single + // atomic pointer; a nil load means the GPU is not yet available or has + // been torn down. + gpu atomic.Pointer[gpuState] +} + +// newFaissGPUFloat32Index creates a GPU-backed float32 index. The GPU clone is +// always performed asynchronously; search falls back to CPU until it +// completes. All other GPU-operating methods block on doneCh before proceeding. 
+func newFaissGPUFloat32Index(cpuIdx *faiss.IndexImpl) (faissIndex, error) { + if cpuIdx == nil { + return nil, errNilIndex + } + f := &faissGPUFloat32Index{ + cpuIdx: cpuIdx, + doneCh: make(chan struct{}), + } + go f.initGPU() + return f, nil +} + +// waitGPU blocks until initGPU has completed +func (f *faissGPUFloat32Index) waitGPU() { + <-f.doneCh +} + +// initGPU clones the CPU index to the GPU and sets up the request batcher. +// It always closes doneCh when it returns, signalling completion to waiters. +func (f *faissGPUFloat32Index) initGPU() { + defer close(f.doneCh) + gpuIdx, err := faiss.CloneToGPU(f.cpuIdx) + if err != nil || gpuIdx == nil { + return + } + gs := &gpuState{idx: gpuIdx} + gs.batcher = newRequestBatcher(gs) + f.gpu.Store(gs) +} + +// attempt to add the vectors to the GPU index and sync the result back to the CPU index. +// If either step fails, tear down the GPU and fall back to the CPU index +func (f *faissGPUFloat32Index) add(vecs *vectorSet) error { + f.waitGPU() + gpuState := f.gpu.Load() + if gpuState == nil { + return f.cpuIdx.Add(vecs.floatData) + } + + err := gpuState.idx.Add(vecs.floatData) + if err != nil { + f.teardownGPU() + return f.cpuIdx.Add(vecs.floatData) + } + + err = f.syncGPUToCPU() + if err != nil { + f.teardownGPU() + return f.cpuIdx.Add(vecs.floatData) + } + + return nil +} + +func (f *faissGPUFloat32Index) close() { + f.waitGPU() + f.teardownGPU() + f.cpuIdx.Close() +} + +// teardownGPU swaps the GPU state to nil, then stops the batcher (while the GPU index is +// still live so that the final flush can complete on it), and finally closes the GPU index. +func (f *faissGPUFloat32Index) teardownGPU() { + f.waitGPU() + // Swap to nil first so new searches fall through to CPU immediately. + // The batcher holds a direct reference to gpuState.idx via gpuState.batchSearch, + // so the final flush completes safely without touching f.gpu. 
+ gpuState := f.gpu.Swap(nil) + if gpuState == nil { + return + } + gpuState.batcher.stop() + gpuState.idx.Close() +} + +func (f *faissGPUFloat32Index) dim() int { + return f.cpuIdx.D() +} + +func (f *faissGPUFloat32Index) metricType() int { + return f.cpuIdx.MetricType() +} + +func (f *faissGPUFloat32Index) ntotal() int64 { + return f.cpuIdx.Ntotal() +} + +func (f *faissGPUFloat32Index) reconstructBatch(vecIDs []int64, prealloc []float32) ([]float32, error) { + return f.cpuIdx.ReconstructBatch(vecIDs, prealloc) +} + +func (f *faissGPUFloat32Index) search(qVector *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) { + if selector == nil && len(params) == 0 { + if gpuState := f.gpu.Load(); gpuState != nil { + return gpuState.batcher.search(qVector, k) + } + } + // GPU not ready, filtered search, or non-empty params — fall back to CPU + return f.cpuIdx.SearchWithOptions(qVector.floatData, k, selector, params) +} + +func (f *faissGPUFloat32Index) write(buf []byte, w *FileWriter) error { + idxBytes, err := faiss.WriteIndexIntoBuffer(f.cpuIdx) + if err != nil { + return err + } + idxBytes = w.process(idxBytes) + + // write the uvarint-encoded length of the serialized vector index bytes + n := binary.PutUvarint(buf, uint64(len(idxBytes))) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + _, err = w.Write(idxBytes) + if err != nil { + return err + } + return nil +} + +func (f *faissGPUFloat32Index) size() uint64 { + return f.cpuIdx.Size() +} + +// inGPURam reports if the index is currently running on the GPU. +// returns false if the async clone is not yet done, the clone failed, or the GPU was torn down. 
+func (f *faissGPUFloat32Index) inGPURam() bool { + return f.gpu.Load() != nil +} + +// ----------------------------------------------------------------- +// casting methods to access index-specific operations below +// ----------------------------------------------------------------- +func (f *faissGPUFloat32Index) castIVF() faissIndexIVF { + if f.cpuIdx.IsIVFIndex() { + return f + } + return nil +} + +// ----------------------------------------------------------------- +// IVF-Index specific operations (delegate to CPU index) +// ----------------------------------------------------------------- +func (f *faissGPUFloat32Index) clusterVectorCounts(sel faiss.Selector, nlist int) ([]int64, error) { + return f.cpuIdx.ObtainClusterVectorCountsFromIVFIndex(sel, nlist) +} + +func (f *faissGPUFloat32Index) centroidCardinalities(limit int, descending bool) ([]uint64, [][]float32, error) { + return f.cpuIdx.ObtainKCentroidCardinalitiesFromIVFIndex(limit, descending) +} + +func (f *faissGPUFloat32Index) ivfParams() (nprobe, nlist int) { + return f.cpuIdx.IVFParams() +} + +func (f *faissGPUFloat32Index) searchQuantizer(qVector *vectorSet, centroidSelector faiss.Selector, centroidCount int64) ([]int64, []float32, error) { + return f.cpuIdx.ObtainClustersWithDistancesFromIVFIndex(qVector.floatData, centroidSelector, centroidCount) +} + +func (f *faissGPUFloat32Index) searchClusters(eligibleCentroidIDs []int64, centroidDis []float32, + centroidsToProbe int, qVecSet *vectorSet, k int64, selector faiss.Selector, params json.RawMessage) ([]float32, []int64, error) { + return f.cpuIdx.SearchClustersFromIVFIndex(eligibleCentroidIDs, centroidDis, centroidsToProbe, qVecSet.floatData, k, selector, params) +} + +func (f *faissGPUFloat32Index) setDirectMap(directMapType int) error { + return f.cpuIdx.SetDirectMap(directMapType) +} + +func (f *faissGPUFloat32Index) setNProbe(nprobe int32) { + f.cpuIdx.SetNProbe(nprobe) +} + +// attempt to train and add the vectors to the GPU index. 
If any GPU step (train, add, sync) fails, +// tear down the GPU and redo both train and add on the CPU index +func (f *faissGPUFloat32Index) trainAndAdd(trainingData *vectorSet, vecsToAdd *vectorSet) error { + f.waitGPU() + gpuState := f.gpu.Load() + if gpuState == nil { + return f.trainAndAddCPU(trainingData, vecsToAdd) + } + + err := gpuState.idx.Train(trainingData.floatData) + if err != nil { + f.teardownGPU() + return f.trainAndAddCPU(trainingData, vecsToAdd) + } + + err = gpuState.idx.Add(vecsToAdd.floatData) + if err != nil { + f.teardownGPU() + return f.trainAndAddCPU(trainingData, vecsToAdd) + } + + err = f.syncGPUToCPU() + if err != nil { + f.teardownGPU() + return f.trainAndAddCPU(trainingData, vecsToAdd) + } + + return nil +} + +func (f *faissGPUFloat32Index) trainAndAddCPU(trainingData *vectorSet, vecsToAdd *vectorSet) error { + err := f.cpuIdx.Train(trainingData.floatData) + if err != nil { + return err + } + return f.cpuIdx.Add(vecsToAdd.floatData) +} + +func (f *faissGPUFloat32Index) setQuantizers(trainedIndex faissIndexIVF) error { + return errNotSupported +} + +func (f *faissGPUFloat32Index) isMergeable() bool { + return false +} + +func (f *faissGPUFloat32Index) mergeFrom(other faissIndex, offset int64) error { + return errNotSupported +} + +// syncGPUToCPU clones the current GPU index state back to the CPU index, replacing and closing the old CPU index. +// NOTE(review): f.cpuIdx is swapped without synchronization - confirm no search can run concurrently with add/trainAndAdd. 
+func (f *faissGPUFloat32Index) syncGPUToCPU() error { + gpuState := f.gpu.Load() + if gpuState == nil { + return nil + } + + cpuIdx, err := faiss.CloneToCPU(gpuState.idx) + if err != nil { + return err + } + + oldCPUIdx := f.cpuIdx + f.cpuIdx = cpuIdx + oldCPUIdx.Close() + return nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_unix.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_unix.go new file mode 100644 index 0000000000..116e45a709 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_unix.go @@ -0,0 +1,25 @@ +// Copyright (c) 2024 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors && !windows +// +build vectors,!windows + +package zap + +import faiss "github.com/blevesearch/go-faiss" + +const ( + faissIOFlags = faiss.IOFlagReadMmap | faiss.IOFlagSkipPrefetch + faissIOFlagsReadOnly = faissIOFlags | faiss.IOFlagReadOnly +) diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_win.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_win.go new file mode 100644 index 0000000000..a7bd7d657b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_io_flags_win.go @@ -0,0 +1,25 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors && windows +// +build vectors,windows + +package zap + +import faiss "github.com/blevesearch/go-faiss" + +const ( + faissIOFlags = faiss.IOFlagReadOnly + faissIOFlagsReadOnly = faiss.IOFlagReadOnly +) diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_posting.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_posting.go new file mode 100644 index 0000000000..d974616b68 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_posting.go @@ -0,0 +1,355 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "math" + "reflect" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/roaring64" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizeVecPostingsList int +var reflectStaticSizeVecPostingsIterator int +var reflectStaticSizeVecPosting int + +func init() { + var pl VecPostingsList + reflectStaticSizeVecPostingsList = int(reflect.TypeOf(pl).Size()) + var pi VecPostingsIterator + reflectStaticSizeVecPostingsIterator = int(reflect.TypeOf(pi).Size()) + var p VecPosting + reflectStaticSizeVecPosting = int(reflect.TypeOf(p).Size()) +} + +type VecPosting struct { + docNum uint64 + score float32 +} + +func (vp *VecPosting) Number() uint64 { + return vp.docNum +} + +func (vp *VecPosting) Score() float32 { + return vp.score +} + +func (vp *VecPosting) Size() int { + sizeInBytes := reflectStaticSizePosting + + return sizeInBytes +} + +// ============================================================================= + +// the vector postings list is supposed to store the docNum and its similarity +// score as a vector postings entry in it. +// The way in which is it stored is using a roaring64 bitmap. +// the docNum is stored in high 32 and the lower 32 bits contains the score value. +// the score is actually a float32 value and in order to store it as a uint32 in +// the bitmap, we use the IEEE 754 floating point format. +// +// each entry in the roaring64 bitmap of the vector postings list is a 64 bit +// number which looks like this: +// MSB LSB +// |64 63 62 ... 32| 31 30 ... 
0| +// | | | +type VecPostingsList struct { + // todo: perhaps we don't even need to store a bitmap if there is only + // one similar vector the query, but rather store it as a field value + // in the struct + except *roaring64.Bitmap + postings *roaring64.Bitmap +} + +var emptyVecPostingsIterator = &VecPostingsIterator{} +var emptyVecPostingsList = &VecPostingsList{} + +func (vpl *VecPostingsList) Iterator(prealloc segment.VecPostingsIterator) segment.VecPostingsIterator { + if vpl.postings == nil { + return emptyVecPostingsIterator + } + // tbd: do we check the cardinality of postings and scores? + var preallocPI *VecPostingsIterator + pi, ok := prealloc.(*VecPostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyVecPostingsIterator { + preallocPI = nil + } + + return vpl.iterator(preallocPI) +} + +func (vpl *VecPostingsList) iterator(rv *VecPostingsIterator) *VecPostingsIterator { + if rv == nil { + rv = &VecPostingsIterator{} + } else { + *rv = VecPostingsIterator{} // clear the struct + } + // think on some of the edge cases over here. + if vpl.postings == nil { + return rv + } + rv.postings = vpl + rv.all = vpl.postings.Iterator() + if vpl.except != nil { + rv.ActualBM = roaring64.AndNot(vpl.postings, vpl.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = vpl.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. 
+ } + return rv +} + +func (vpl *VecPostingsList) Size() int { + sizeInBytes := reflectStaticSizeVecPostingsList + SizeOfPtr + + if vpl.except != nil { + sizeInBytes += int(vpl.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (vpl *VecPostingsList) Count() uint64 { + if vpl.postings != nil { + n := vpl.postings.GetCardinality() + var e uint64 + if vpl.except != nil { + e = vpl.postings.AndCardinality(vpl.except) + } + return n - e + } + return 0 +} + +func (vpl *VecPostingsList) ResetBytesRead(val uint64) { + +} + +func (vpl *VecPostingsList) BytesRead() uint64 { + return 0 +} + +func (vpl *VecPostingsList) BytesWritten() uint64 { + return 0 +} + +// ============================================================================= + +type VecPostingsIterator struct { + postings *VecPostingsList + all roaring64.IntPeekable64 + Actual roaring64.IntPeekable64 + ActualBM *roaring64.Bitmap + + next VecPosting // reused across Next() calls +} + +func (vpItr *VecPostingsIterator) nextCodeAtOrAfterClean(atOrAfter uint64) (uint64, bool, error) { + vpItr.Actual.AdvanceIfNeeded(atOrAfter) + + if !vpItr.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return vpItr.Actual.Next(), true, nil +} + +func (vpItr *VecPostingsIterator) nextCodeAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if vpItr.Actual == nil || !vpItr.Actual.HasNext() { + return 0, false, nil + } + + if vpItr.postings == nil || vpItr.postings == emptyVecPostingsList { + // couldn't find anything + return 0, false, nil + } + + if vpItr.postings.postings == vpItr.ActualBM { + return vpItr.nextCodeAtOrAfterClean(atOrAfter) + } + + vpItr.Actual.AdvanceIfNeeded(atOrAfter) + + if !vpItr.Actual.HasNext() || !vpItr.all.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := vpItr.Actual.Next() + allN := vpItr.all.Next() + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, 
move 'all' forwards until they do. + for allN != n { + if !vpItr.all.HasNext() { + return 0, false, nil + } + allN = vpItr.all.Next() + } + + return n, true, nil +} + +// a transformation function which stores both the score and the docNum as a single +// entry which is a uint64 number. +func getVectorCode(docNum uint32, score float32) uint64 { + return uint64(docNum)<<32 | uint64(math.Float32bits(score)) +} + +// Next returns the next posting on the vector postings list, or nil at the end +func (vpItr *VecPostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.VecPosting, error) { + // transform the docNum provided to the vector code format and use that to + // get the next entry. the comparison still happens docNum wise since after + // the transformation, the docNum occupies the upper 32 bits just an entry in + // the postings list + atOrAfter = getVectorCode(uint32(atOrAfter), 0) + code, exists, err := vpItr.nextCodeAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err + } + + vpItr.next = VecPosting{} // clear the struct + rv := &vpItr.next + rv.score = math.Float32frombits(uint32(code)) + rv.docNum = code >> 32 + + return rv, nil +} + +func (vpItr *VecPostingsIterator) Next() (segment.VecPosting, error) { + return vpItr.nextAtOrAfter(0) +} + +func (vpItr *VecPostingsIterator) Advance(docNum uint64) (segment.VecPosting, error) { + return vpItr.nextAtOrAfter(docNum) +} + +func (vpItr *VecPostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + SizeOfPtr + + vpItr.next.Size() + + return sizeInBytes +} + +func (vpItr *VecPostingsIterator) ResetBytesRead(val uint64) { + +} + +func (vpItr *VecPostingsIterator) BytesRead() uint64 { + return 0 +} + +func (vpItr *VecPostingsIterator) BytesWritten() uint64 { + return 0 +} + +// InterpretVectorIndex returns a struct based implementation (vectorIndexWrapper) +// that will allow the caller to - +// (1) search within an attached vector index +// (2) search limited to a subset of 
documents within an attached vector index +// (3) close attached vector index +// (4) get the size of the attached vector index +func (sb *SegmentBase) InterpretVectorIndex(field string, except *roaring.Bitmap) (segment.VectorIndex, error) { + rv := &vectorIndexWrapper{sb: sb} + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 <= 0 { + return rv, nil + } + // adjust to get the actual fieldID + fieldID := fieldIDPlus1 - 1 + rv.fieldID = fieldID + // get the position of the vector section for the field + pos := sb.fieldsSectionsMap[fieldID][SectionFaissVectorIndex] + // check if the field has a vector section in the segment. + if pos <= 0 { + return rv, nil + } + // the below loop loads the following: + // 1. doc values(first 2 iterations) - adhering to the sections format. never + // valid values for vector section + // 2. index optimization type. + for i := 0; i < 3; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + } + + // create the vector index wrapper by loading (or creating) the vector index + // and the vector to docID mapping + useGPU := sb.fieldsOptions[field].UseGPU() + var err error + rv.index, rv.mapping, rv.exclude, err = sb.vecIndexCache.loadOrCreate(fieldID, sb.mem[pos:], uint32(sb.numDocs), except, useGPU, sb.fileReader) + if err != nil { + return nil, err + } + // get the size of the vector index + if rv.index != nil { + rv.vecIndexSize = rv.index.size() + } + + // get the number of nested documents in this segment, if any + // to determine if the wrapper needs to handle nested documents + rv.nestedMode = sb.countNested() > 0 + + return rv, nil +} + +func (sb *SegmentBase) UpdateFieldStats(stats segment.FieldStats) { + for _, fieldName := range sb.fieldsInv { + pos := int(sb.fieldsSectionsMap[sb.fieldsMap[fieldName]-1][SectionFaissVectorIndex]) + if pos == 0 { + continue + } + + for i := 0; i < 3; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + } + numVecs, _ := 
binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + + stats.Store("num_vectors", fieldName, numVecs) + } +} + +func (sb *SegmentBase) UpdateVectorFieldStats(stats segment.FieldStats) { + if sb.vecIndexCache == nil { + return + } + for _, fieldName := range sb.fieldsInv { + pos := int(sb.fieldsSectionsMap[sb.fieldsMap[fieldName]-1][SectionFaissVectorIndex]) + if pos == 0 { + continue + } + fieldID := sb.fieldsMap[fieldName] - 1 + switch sb.vecIndexCache.indexLocation(fieldID) { + case vectorIndexInGPU: + stats.Store("num_vector_indexes_in_gpu", fieldName, 1) + case vectorIndexInCPU: + stats.Store("num_vector_indexes_in_cpu", fieldName, 1) + } + } +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_request_batcher.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_request_batcher.go new file mode 100644 index 0000000000..a5614dd823 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_request_batcher.go @@ -0,0 +1,292 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "errors" + "sync" +) + +var ( + errBatcherStopped error = errors.New("batcher has been stopped") +) + +// The requestBatcher is responsible for batching search requests to a Faiss index. +// It will accumulate incoming search requests and execute them in batches to improve performance. 
+// The batcher will use the provided Faiss index to perform the searches, and it +// will manage the batching logic, including timing and concurrency control. +type requestBatcher struct { + // the coalesce queue that manages the batching of incoming search requests. + cq *coalesceQueue +} + +func newRequestBatcher(idx faissQueryBatch) *requestBatcher { + b := &requestBatcher{ + cq: newCoalesceQueue(idx), + } + return b +} + +// search performs a search on the Faiss index using the provided query vector and k value. +// NOTE: it must be ensured that every query vector passed to this method has the same dimensionality +// as the vectors in the Faiss index, this is considered as an invariant to be upheld by the caller, +// and is not checked within this method for performance reasons. +func (b *requestBatcher) search(qVector *vectorSet, k int64) ([]float32, []int64, error) { + // create a new batch request for this search query. + req, respCh := newBatchRequest(qVector, k) + // check if the batcher has been stopped before processing the search request. + select { + case b.cq.enqueueCh <- req: + case <-b.cq.stopCh: + return nil, nil, errBatcherStopped + } + // wait for the search results to be sent back through the response channel, + // and return those results to the caller. + resp := <-respCh + return resp.distances, resp.ids, resp.err +} + +func (b *requestBatcher) stop() { + b.cq.stop() +} + +// -------------------------------------------------- +// batch request +// -------------------------------------------------- + +type batchRequest struct { + qVector *vectorSet + k int64 + respCh []chan *batchResponse +} + +func newBatchRequest(qVector *vectorSet, k int64) (*batchRequest, chan *batchResponse) { + // response channel for sending the search results back to the requester. 
+ respChan := make(chan *batchResponse, 1) + return &batchRequest{ + qVector: qVector, + k: k, + respCh: []chan *batchResponse{respChan}, + }, respChan +} + +// canMerge checks if this batch request can be merged with another request. +// For now, we can only merge requests that have the same k value. +func (r *batchRequest) canMerge(other *batchRequest) bool { + // for now, we can only merge requests that have the same k value, + // since the Faiss search API requires a single k value for each search. + return r.k == other.k +} + +// mergeWith combines another batch request into this one by concatenating their query vectors and response channels. +// NOTE: must only be called after veryfing that canMerge() returns true for these two requests. +func (r *batchRequest) mergeWith(other *batchRequest) { + // merge the query vectors of the two requests by concatenating them together. + r.qVector.mergeWith(other.qVector) + // append the response channels from the other request to this request, so that when the search results are ready, + // we can send the results back to all requesters that were merged into this batch. + r.respCh = append(r.respCh, other.respCh...) +} + +func (r *batchRequest) sendResponse(distances []float32, ids []int64, err error) { + // we may have multiple batches merged together, so we need to segregate the results for each original request + // and send them back to the appropriate response channels. + if err != nil { + // if there was an error during the search, send the error back to all requesters in this batch. + for _, respCh := range r.respCh { + respCh <- newBatchResponse(nil, nil, err) + close(respCh) + } + return + } + // if the search was successful, we need to split the combined results back into individual responses for each original request. + for i, respCh := range r.respCh { + offset := int64(i) * r.k + // calculate the start and end indices for the results corresponding to this response channel. 
+ curDistances := distances[offset : offset+r.k] + curIDs := ids[offset : offset+r.k] + // send the results back to the requester through the response channel. + respCh <- newBatchResponse(curDistances, curIDs, nil) + // close the response channel to signal that the response has been sent and there will be no more data. + close(respCh) + } +} + +// -------------------------------------------------- +// batch response +// -------------------------------------------------- + +type batchResponse struct { + distances []float32 + ids []int64 + err error +} + +func newBatchResponse(distances []float32, ids []int64, err error) *batchResponse { + return &batchResponse{ + distances: distances, + ids: ids, + err: err, + } +} + +// --------------------------------------------------- +// batch manager +// --------------------------------------------------- +type batchManager struct { + batchPool sync.Pool +} + +func newBatchManager() *batchManager { + return &batchManager{ + batchPool: sync.Pool{ + New: func() any { + return make([]*batchRequest, 0, 16) + }, + }, + } +} + +func (m *batchManager) getBatch() []*batchRequest { + return m.batchPool.Get().([]*batchRequest)[:0] +} + +func (m *batchManager) putBatch(batch []*batchRequest) { + clear(batch) + m.batchPool.Put(batch[:0]) +} + +// -------------------------------------------------- +// coalesceQueue +// -------------------------------------------------- +// Implements Nagle's algorithm for coalescing search requests: +// - The coalesce goroutine continuously receives and coalesces incoming requests. +// - When the flusher is idle, the coalesce goroutine hands off the coalesced batch. +// - While the flusher is busy executing a batch, the coalesce goroutine keeps coalescing new requests. +// - Once the flusher completes, the coalesce goroutine hands off any accumulated requests right away. +type coalesceQueue struct { + // the Faiss index that this coalesce queue will execute search requests against. 
+ idx faissQueryBatch + // channel for enqueuing new batch requests into the queue. + enqueueCh chan *batchRequest + // channel for handing off coalesced batches to the flusher goroutine for execution. + flushCh chan []*batchRequest + // safeguard to ensure that the stop() method is thread-safe and can only be called once, + // preventing multiple close operations on the stopCh. + stopOnce sync.Once + // channel for signaling the batcher to stop processing requests and shut down. + stopCh chan struct{} + // closed when filler goroutine has exited after receiving a stop signal. + fillerDoneCh chan struct{} + // closed when flusher goroutine has exited after receiving a stop signal. + flusherDoneCh chan struct{} + // a sync.Pool for reusing batch slices to reduce allocations and GC overhead. + batchManager *batchManager +} + +func newCoalesceQueue(idx faissQueryBatch) *coalesceQueue { + q := &coalesceQueue{ + idx: idx, + enqueueCh: make(chan *batchRequest), + flushCh: make(chan []*batchRequest), + stopCh: make(chan struct{}), + fillerDoneCh: make(chan struct{}), + flusherDoneCh: make(chan struct{}), + batchManager: newBatchManager(), + } + go q.filler() + go q.flusher() + return q +} + +func (q *coalesceQueue) stop() { + q.stopOnce.Do(func() { + close(q.stopCh) + }) + // wait for all goroutines to exit + <-q.fillerDoneCh + <-q.flusherDoneCh +} + +// filler is the enqueuer goroutine. It receives incoming search requests, +// coalesces them into batches, and hands them off to the flusher when it is idle. 
+func (q *coalesceQueue) filler() { + defer close(q.fillerDoneCh) + var pendingBatch []*batchRequest + for { + if len(pendingBatch) > 0 { + select { + case req := <-q.enqueueCh: + pendingBatch = q.coalesce(pendingBatch, req) + case q.flushCh <- pendingBatch: + pendingBatch = nil + case <-q.stopCh: + q.flushCh <- pendingBatch + return + } + } else { + select { + case req := <-q.enqueueCh: + pendingBatch = q.coalesce(pendingBatch, req) + case <-q.stopCh: + return + } + } + } +} + +// flusher is the background goroutine that executes batches handed off by the monitor. +func (q *coalesceQueue) flusher() { + defer close(q.flusherDoneCh) + for { + select { + case batch := <-q.flushCh: + q.executeBatch(batch) + case <-q.fillerDoneCh: + return + } + } +} + +// coalesce merges req into the queue, either by finding a compatible pending +// request to merge with or by appending a new entry. +func (q *coalesceQueue) coalesce(queue []*batchRequest, req *batchRequest) []*batchRequest { + for _, pendingReq := range queue { + if pendingReq.canMerge(req) { + pendingReq.mergeWith(req) + return queue + } + } + // No compatible request found; clone the query vector so that future + // merges into this entry do not mutate the caller's data. + req.qVector = req.qVector.clone() + if queue == nil { + queue = q.batchManager.getBatch() + } + return append(queue, req) +} + +// executeBatch runs all coalesced requests against the Faiss index and delivers results. 
+func (q *coalesceQueue) executeBatch(batch []*batchRequest) { + for _, req := range batch { + distances, ids, err := q.idx.batchSearch(req.qVector, req.k) + req.sendResponse(distances, ids, err) + } + // recycle the batch slice back into the pool + q.batchManager.putBatch(batch) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/faiss_vector_wrapper.go b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_wrapper.go new file mode 100644 index 0000000000..0e15e295a5 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/faiss_vector_wrapper.go @@ -0,0 +1,1060 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/json" + "fmt" + "math" + "math/bits" + "slices" + + "github.com/RoaringBitmap/roaring/v2/roaring64" + index "github.com/blevesearch/bleve_index_api" + faiss "github.com/blevesearch/go-faiss" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +const ( + // maxMultiVectorDocSearchRetries limits repeated searches when deduplicating + // multi-vector documents. Each retry excludes previously seen vectors to find + // new unique documents. Acts as a safeguard against pathological data distributions. 
+ maxMultiVectorDocSearchRetries = 100 + + // Pre-Filtered IVF Index search: Threshold for when to start increasing: after 2 iterations without + // finding enough documents, we start increasing up to the number of centroidsToProbe + // up to the total number of eligible centroids available + nprobeIncreaseThreshold = 2 + + // binaryOversampleValue is the multiplier used to determine how many additional vectors to retrieve + // from the binary index as an oversampling strategy to improve recall. + binaryOversampleValue = 4 +) + +// vectorIndexWrapper conforms to scorch_segment_api's VectorIndex interface +type vectorIndexWrapper struct { + index faissIndex + mapping *idMapping + exclude *bitmap + fieldID uint16 + vecIndexSize uint64 + + // nestedMode indicates if the vector index is operating in nested document mode. + // if so we have a reusable ancestry slice to help with docID lookups + nestedMode bool + ancestry []index.AncestorID + + sb *SegmentBase +} + +func (v *vectorIndexWrapper) Search(qVector []float32, k int64, params json.RawMessage) (segment.VecPostingsList, error) { + if v.index == nil { + // vector index not found, so return empty postings list + return emptyVecPostingsList, nil + } + if v.index.dim() != len(qVector) { + // dimensionality mismatch, so return empty postings list + return emptyVecPostingsList, nil + } + // check if number of docs or number of vectors is zero + if v.mapping == nil || v.mapping.numVectors() == 0 || v.mapping.numDocuments() == 0 { + // no vectors or no documents indexed, so return empty postings list + return emptyVecPostingsList, nil + } + // check if all the vectors are excluded + if v.exclude != nil && v.exclude.cardinality() == v.mapping.numVectors() { + // all vectors excluded, so return empty postings list + return emptyVecPostingsList, nil + } + // create a vector set using the query vector + qVecSet, err := newVectorSet(len(qVector), qVector) + if err != nil { + return nil, err + } + rs, err := 
v.searchWithoutIDs(qVecSet, k, v.exclude, params) + if err != nil { + return nil, err + } + // populate the postings list from the result set + return getPostingsList(rs), nil +} + +func (v *vectorIndexWrapper) SearchWithFilter(qVector []float32, k int64, + eligibleList index.EligibleDocumentList, params json.RawMessage) ( + segment.VecPostingsList, error) { + // if no eligible documents, return empty postings list + if eligibleList == nil || eligibleList.Count() == 0 { + return emptyVecPostingsList, nil + } + if v.index == nil { + // vector index not found, so return empty postings list + return emptyVecPostingsList, nil + } + if v.index.dim() != len(qVector) { + // dimensionality mismatch, so return empty postings list + return emptyVecPostingsList, nil + } + // check if number of docs or number of vectors is zero + if v.mapping == nil || v.mapping.numVectors() == 0 || v.mapping.numDocuments() == 0 { + // no vectors or no documents indexed, so return empty postings list + return emptyVecPostingsList, nil + } + // if all documents are eligible, do a normal search + if eligibleList.Count() == uint64(v.mapping.numDocuments()) { + return v.Search(qVector, k, params) + } + // get the eligible document iterator + eligibleIterator := eligibleList.Iterator() + // vector IDs corresponding to the local doc numbers to be + // considered for the search + // create a bitmap for the vector IDs to include in the search + includeBM := newBitmap(v.mapping.numVectors()) + includeCardinality := 0 + for { + // get the next eligible document ID + id, ok := eligibleIterator.Next() + if !ok { + // exhausted all eligible document IDs + break + } + // get the vector IDs for this document ID + vecIDs, exists := v.mapping.vecsForDoc(uint32(id)) + if !exists { + continue + } + // since a vector can never belong to multiple documents, we calculate + // the cardinality by simply adding the number of vectors for each document + // we include, without worrying about duplicates and avoiding a 
potential + // costly population count on the bitmap at the end + includeCardinality += len(vecIDs) + for _, vecID := range vecIDs { + // add all vector IDs for this document to the inclusion bitmap + includeBM.set(vecID) + } + } + // In case a doc has invalid vector fields but valid non-vector fields, + // filter hit IDs may be ineligible for the kNN since the document does + // not have any/valid vectors. Also can happen if no documents have vectors + numSelected := uint32(includeCardinality) + if numSelected == 0 { + return emptyVecPostingsList, nil + } + // if we have included all vectors, then we can do a normal search + // with full selectivity (no filtering) + if numSelected == v.mapping.numVectors() { + return v.Search(qVector, k, params) + } + // get a vector set using the query vector + qVecSet, err := newVectorSet(len(qVector), qVector) + if err != nil { + return nil, err + } + // try to cast the index to an IVF index + ivfPtr := v.index.castIVF() + if ivfPtr == nil { + // perform search with included IDs in the bitmap, since + // this is not an IVF index + rs, err := v.searchWithIDs(qVecSet, k, includeBM, params) + if err != nil { + return nil, err + } + // populate the postings list from the result set + return getPostingsList(rs), nil + } + // Getting the IVF index parameters, nprobe and nlist, set at index time. + nprobe, nlist := ivfPtr.ivfParams() + // Create a FAISS selector based on the include bitmap. + includeSelector, err := getIncludeSelector(includeBM) + if err != nil { + return nil, err + } + // Ensure the selector is deleted after use, this does NOT free the inner includeBM bitmap. + // We control its lifecycle in GO. + defer includeSelector.Delete() + // Determining which clusters, identified by centroid ID, + // have at least one eligible vector and hence, ought to be + // probed. 
+ clusterVectorCounts, err := ivfPtr.clusterVectorCounts(includeSelector, nlist) + if err != nil { + return nil, err + } + // Create a bitmap for the eligible centroids to be considered for probing. + centroidBM := newBitmap(uint32(nlist)) + centroidCount := 0 + for centroidID, vectorCount := range clusterVectorCounts { + // Only centroids with at least one eligible vector are considered. + if vectorCount > 0 { + // since we are adding only unique centroid IDs, this is simply an increment + // and we can avoid a population count at the end + centroidCount++ + centroidBM.set(uint32(centroidID)) + } + } + if centroidCount == 0 { + // No centroids have any eligible vectors, so return empty postings list. + return emptyVecPostingsList, nil + } + // create a FAISS selector based on the centroid bitmap + centroidSelector, err := getIncludeSelector(centroidBM) + if err != nil { + return nil, err + } + defer centroidSelector.Delete() + // Search the coarse quantizer to order the centroids based on proximity + // to the query vector. + eligibleCentroidIDs, centroidDistances, err := ivfPtr.searchQuantizer(qVecSet, centroidSelector, int64(centroidCount)) + if err != nil { + return nil, err + } + // Determining the minimum number of centroids to be probed + // to ensure that at least 'k' vectors are collected while + // examining at least 'nprobe' centroids. + // centroidsToProbe range: [nprobe, number of eligible centroids] + var eligibleVecsTillNow int64 + var eligibleCentroidsTillNow int + centroidsToProbe := len(eligibleCentroidIDs) + for i, centroidID := range eligibleCentroidIDs { + // if we get a -1 somehow here, it means no more centroids + // need to reslice the eligibleCentroidIDs and distances + // accordingly, just a safeguard check as this does not + // really happen. FAISS can pad with -1s if there are not enough + // eligible centroids, but we have already counted the cardinality so + // we should not see -1s here. 
+ if centroidID == -1 { + centroidsToProbe = i + // reslice to only valid centroids + eligibleCentroidIDs = eligibleCentroidIDs[:centroidsToProbe] + centroidDistances = centroidDistances[:centroidsToProbe] + break + } + eligibleVecsTillNow += clusterVectorCounts[centroidID] + eligibleCentroidsTillNow = i + 1 + // Stop once we've examined at least 'nprobe' centroids and + // collected at least 'k' vectors. + if eligibleVecsTillNow >= k && eligibleCentroidsTillNow >= nprobe { + centroidsToProbe = eligibleCentroidsTillNow + break + } + } + // Search the clusters specified by 'eligibleCentroidIDs' for + // vectors whose IDs are present in the includeBM bitmap. + // This is done while probing only 'centroidsToProbe' clusters. + // unless overridden dynamically, either by the search parameters + // or by the deduplication logic in searchClustersFromIVFIndex. + rs, err := v.searchClustersFromIVFIndex( + eligibleCentroidIDs, centroidDistances, centroidsToProbe, + qVecSet, k, includeBM, params) + if err != nil { + return nil, err + } + // populate the postings list from the result set + return getPostingsList(rs), nil +} +func (v *vectorIndexWrapper) Close() { + // skipping the closing because the index is cached and it's being + // deferred to a later point of time. 
+ v.sb.vecIndexCache.decRef(v.fieldID) +} + +func (v *vectorIndexWrapper) Size() uint64 { + return v.vecIndexSize +} + +func (v *vectorIndexWrapper) ObtainKCentroidCardinalitiesFromIVFIndex(limit int, descending bool) ( + []index.CentroidCardinality, error) { + if v.index == nil { + return nil, nil + } + var ivfIdx faissIndexIVF + if ivfIdx = v.index.castIVF(); ivfIdx == nil { + return nil, nil + } + cardinalities, centroids, err := ivfIdx.centroidCardinalities(limit, descending) + if err != nil { + return nil, err + } + centroidCardinalities := make([]index.CentroidCardinality, len(cardinalities)) + for i, cardinality := range cardinalities { + centroidCardinalities[i] = index.CentroidCardinality{ + Centroid: centroids[i], + Cardinality: cardinality, + } + } + return centroidCardinalities, nil +} + +// docSearch performs a search on the vector index to retrieve +// top k documents based on the provided search function. +// It handles deduplication of documents that may have multiple +// vectors associated with them. +// The prepareNextIter function is used to set up the state +// for the next iteration, if more searches are needed to find +// k unique documents. The callback recieves the number of iterations +// done so far and the vector ids retrieved in the last search. 
// While preparing the next iteration, prepareNextIter also decides whether the
// search should go on: it returns true to run another search and false to stop
// because no further searches are needed.
func (v *vectorIndexWrapper) docSearch(k int64, numDocs uint64,
	search func() (scores []float32, labels []int64, err error),
	prepareNextIter func(numIter int, labels []int64) bool) (resultSet, error) {
	// create a result set to hold top K docIDs and their scores
	rs := newResultSet(k, numDocs)
	// flag to indicate if we have exhausted the vector index
	var exhausted bool
	// keep track of number of iterations done, we execute the loop more than once only when
	// we have multi-vector documents leading to duplicates in docIDs retrieved
	numIter := 0
	// get the metric type of the index to help with deduplication logic
	metricType := v.index.metricType()
	// we keep searching until we have k unique docIDs or we have exhausted the vector index
	// or we have reached the maximum number of deduplication iterations allowed
	for numIter < maxMultiVectorDocSearchRetries && rs.size() < k && !exhausted {
		// search the vector index; numIter is incremented first, so the
		// callback below sees 1 on the first iteration
		numIter++
		scores, labels, err := search()
		if err != nil {
			return nil, err
		}
		// process the retrieved ids and scores, getting the corresponding docIDs
		// for each vector id retrieved, and storing the best score for each unique docID
		for i, vecID := range labels {
			// a vecID of -1 indicates that all valid vectors in the index have been exhausted,
			// so we set the flag to prevent further iterations. However, the current iteration
			// may still contain valid results, so we process them before stopping.
			if vecID == -1 {
				exhausted = true
				continue
			}
			docID, exists := v.getDocIDForVectorID(vecID)
			if !exists {
				// no document is mapped to this vector id, so skip it
				continue
			}
			score := scores[i]
			prevScore, exists := rs.get(docID)
			if !exists {
				// first time seeing this docID, so just store it
				rs.put(docID, score)
				continue
			}
			// we have seen this docID before, so we must compare scores
			// check the index metric type first to check how we compare distances/scores
			// and store the best score for the docID accordingly
			// for inner product, higher the score, better the match
			// for euclidean distance, lower the score/distance, better the match
			// so we invert the comparison accordingly
			switch metricType {
			case faiss.MetricInnerProduct: // similarity metrics like dot product => higher is better
				if score > prevScore {
					rs.put(docID, score)
				}
			case faiss.MetricL2:
				fallthrough
			default: // distance metrics like euclidean distance => lower is better
				if score < prevScore {
					rs.put(docID, score)
				}
			}
		}
		// if we still have less than k unique docIDs, prepare for the next iteration, provided
		// we have not exhausted the index
		if rs.size() < k && !exhausted {
			// prepare state for next iteration; the callback may veto it
			shouldContinue := prepareNextIter(numIter, labels)
			if !shouldContinue {
				break
			}
		}
	}
	// at this point we either have k unique docIDs or we have exhausted
	// the vector index or we have reached the maximum number of deduplication iterations allowed
	// or the prepareNextIter function decided to break out of the loop
	return rs, nil
}

// searchWithoutIDs performs a search on the vector index to retrieve the top K documents
// while excluding any vector IDs specified in the exclude bitmap.
func (v *vectorIndexWrapper) searchWithoutIDs(qVector *vectorSet, k int64,
	exclude *bitmap, params json.RawMessage) (resultSet, error) {
	return v.docSearch(k, v.sb.numDocs,
		func() ([]float32, []int64, error) {
			// build the FAISS selector based on the exclude bitmap, if any.
			// The exclude bitmap can be nil, indicating no exclusions, in that
			// case we can pass a nil selector to FAISS.
			// NOTE: The bitmap selector is just a wrapper over the exclude bitmap
			// which is shared across the CGO layer.
			sel, err := getExcludeSelector(exclude)
			if err != nil {
				return nil, nil, err
			}
			// NOTE: the selector being freed does NOT free the inner bitmap, as we control
			// its lifecycle in GO, to reuse the bitmap across iterations, if needed, for
			// multi-vector document retrieval.
			if sel != nil {
				// The selector can be nil here as we may not be excluding any vectors
				// in which case we can just pass a nil selector to FAISS.
				defer sel.Delete()
			}
			return v.index.search(qVector, k, sel, params)
		},
		func(numIter int, labels []int64) bool {
			// if this is the first loop iteration and we have < k unique docIDs,
			// we must clone the existing exclude bitmap before modifying it
			// to avoid modifying the original bitmap passed in by the caller
			if numIter == 1 {
				// if we do not have an exclude bitmap yet, create a new one
				if exclude == nil {
					exclude = newBitmap(v.mapping.numVectors())
				} else {
					// clone the existing exclude bitmap
					exclude = exclude.clone()
				}
			}
			// prepare the exclude list for the next iteration by adding
			// the vector ids retrieved in this iteration
			for _, vecID := range labels {
				// should not happen, but just a safeguard, as we catch -1
				// in the main loop
				if vecID == -1 {
					continue
				}
				exclude.set(uint32(vecID))
			}
			// with the exclude bitmap updated, we can proceed to the next iteration.
			// Fast check: if the exclude bitmap has every vector excluded, there is
			// nothing left to find and we can stop searching further
			return exclude.cardinality() != v.mapping.numVectors()
		})
}

// searchWithIDs performs a search on the vector index to retrieve the top K documents while only
// considering the vector IDs specified in the include bitmap.
// NOTE: The include bitmap must NOT be nil and must have at least one vector ID set.
func (v *vectorIndexWrapper) searchWithIDs(vecSet *vectorSet, k int64, include *bitmap, params json.RawMessage) (resultSet, error) {
	return v.docSearch(k, v.sb.numDocs,
		func() ([]float32, []int64, error) {
			// build the FAISS selector based on the include bitmap.
			// NOTE: The bitmap selector is just a wrapper over the include bitmap
			// which is shared across the CGO layer.
			sel, err := getIncludeSelector(include)
			if err != nil {
				return nil, nil, err
			}
			// NOTE: the selector being freed does NOT free the inner bitmap, as we control
			// its lifecycle in GO, to reuse the bitmap across iterations, if needed, for
			// multi-vector document retrieval.
			defer sel.Delete()
			return v.index.search(vecSet, k, sel, params)
		},
		func(numIter int, labels []int64) bool {
			// if this is the first loop iteration and we have < k unique docIDs,
			// we clone the existing include bitmap before modifying it, so the
			// bitmap passed in by the caller is left untouched
			if numIter == 1 {
				if include == nil {
					// should not happen, but just a safeguard
					include = newBitmap(v.mapping.numVectors())
				} else {
					// clone the existing include bitmap
					include = include.clone()
				}
			}
			// removing the vector ids retrieved in this iteration
			// from the include set
			for _, vecID := range labels {
				// should not happen, but just a safeguard, as we catch -1
				// in the main loop
				if vecID == -1 {
					continue
				}
				include.clear(uint32(vecID))
			}
			// only continue searching if we still have vector ids to include
			return !include.isEmpty()
		})
}

// searchClustersFromIVFIndex performs a search on the IVF vector index to retrieve the top K documents
// while including only the vectors present in the include bitmap.
// It takes into account the eligible centroid IDs and ensures that at least centroidsToProbe are probed.
// If after a few iterations we haven't found enough documents, it dynamically increases the number of
// clusters searched (up to the number of eligible centroids) to ensure we can find k unique documents.
func (v *vectorIndexWrapper) searchClustersFromIVFIndex(eligibleCentroidIDs []int64, centroidDis []float32,
	centroidsToProbe int, qVecSet *vectorSet, k int64, include *bitmap, params json.RawMessage) (
	resultSet, error) {
	// get the IVF index pointer; it should not be nil at this point since this method
	// is only called after confirming it's an IVF index
	ivfPtr := v.index.castIVF()
	var totalEligibleCentroids = len(eligibleCentroidIDs)
	return v.docSearch(k, v.sb.numDocs,
		func() ([]float32, []int64, error) {
			// build the FAISS selector based on the include bitmap.
			// NOTE: The bitmap selector is just a wrapper over the include bitmap
			// which is shared across the CGO layer.
			sel, err := getIncludeSelector(include)
			if err != nil {
				return nil, nil, err
			}
			// NOTE: the selector being freed does NOT free the inner bitmap, as we control
			// its lifecycle in GO, to reuse the bitmap across iterations, if needed, for
			// multi-vector document retrieval.
			if sel != nil {
				defer sel.Delete()
			}
			// centroidsToProbe is captured by reference, so a value raised by the
			// callback below takes effect on the next search
			return ivfPtr.searchClusters(eligibleCentroidIDs, centroidDis, centroidsToProbe,
				qVecSet, k, sel, params)
		},
		func(numIter int, labels []int64) bool {
			// if this is the first loop iteration and we have < k unique docIDs,
			// we must clone the existing include bitmap before modifying it to avoid
			// modifying the original bitmap passed in by the caller
			if numIter == 1 {
				if include == nil {
					// should not happen, but just a safeguard
					include = newBitmap(v.mapping.numVectors())
				} else {
					// clone the existing include bitmap
					include = include.clone()
				}
			}
			// if we have iterated at least nprobeIncreaseThreshold times
			// and still have not found enough unique docIDs, we increase
			// the number of centroids to probe for the next iteration
			// to try and find more vectors/documents
			if numIter >= nprobeIncreaseThreshold && centroidsToProbe < totalEligibleCentroids {
				// Calculate how much to increase: increase by 50% of the remaining centroids to probe,
				// but at least by 1 to ensure progress.
				increaseAmount := max((totalEligibleCentroids-centroidsToProbe)/2, 1)
				// Update centroidsToProbe, ensuring it does not exceed the total eligible centroids
				centroidsToProbe = min(centroidsToProbe+increaseAmount, totalEligibleCentroids)
			}
			// removing the vector ids retrieved in this iteration
			// from the include set
			for _, vecID := range labels {
				// should not happen, but just a safeguard, as we catch -1
				// in the main loop
				if vecID == -1 {
					continue
				}
				include.clear(uint32(vecID))
			}
			// only continue searching if we still have vector ids to include
			return !include.isEmpty()
		})
}

// Utility function to get the docID for a given vectorID, used for the
// deduplication logic, to map vectorIDs back to their corresponding docIDs
// if we are in nested mode, this method returns the root docID instead of
// the nested docID, by consulting the edge list. This ensures that kNN searches
// return unique root documents when nested documents are involved.
func (v *vectorIndexWrapper) getDocIDForVectorID(vecID int64) (uint32, bool) {
	docID, exists := v.mapping.docForVec(uint32(vecID))
	if !v.nestedMode || !exists {
		// either not in nested mode, or docID does not exist
		// for the vectorID, so just return the docID as is
		return docID, exists
	}
	// in nested mode and docID exists, so we must get the root docID from the edge list
	// reuse the wrapper's ancestry slice to avoid allocations
	v.ancestry = v.sb.Ancestors(uint64(docID), v.ancestry[:0])
	if len(v.ancestry) == 0 {
		// should not happen, but just in case, return the docID as is
		return docID, exists
	}
	// return the root docID, which is the last element in the ancestry slice
	// in case the docID is a root doc, the ancestry slice would have
	// just one element, which is the docID itself
	return uint32(v.ancestry[len(v.ancestry)-1]), true
}

// ------------------------------------------------------------------------------
// Utility functions not tied to vector index wrapper
// ------------------------------------------------------------------------------

// Utility function to get a faiss.BitmapSelector to include the IDs specified in the bitmap
// The caller must ensure to free the selector by calling selector.Delete() when done using it.
// Only a nil bitmap is rejected here; a non-nil bitmap with no bits set is passed through.
func getIncludeSelector(bm *bitmap) (selector faiss.Selector, err error) {
	if bm == nil {
		// no bitmap provided, so return an error as we expect at least one ID to include
		return nil, fmt.Errorf("include bitmap is nil or empty")
	}
	// create a bitmap inclusion selector
	selector, err = faiss.NewIDSelectorBitmap(bm.bytes())
	if err != nil {
		return nil, err
	}
	return selector, nil
}

// Utility function to get a faiss.BitmapSelector to exclude the IDs specified in the bitmap
// The caller must ensure to free the selector by calling selector.Delete() when done using it.
+func getExcludeSelector(bm *bitmap) (selector faiss.Selector, err error) { + if bm == nil { + // no bitmap provided, so return nil selector indicating no exclusions + return nil, nil + } + // create a bitmap exclusion selector + selector, err = faiss.NewIDSelectorBitmapNot(bm.bytes()) + if err != nil { + return nil, err + } + return selector, nil +} + +// Utility function to create a vector postings list from the corresponding docID and scores for each +// unique docID retrieved from the vector index +func getPostingsList(rs resultSet) segment.VecPostingsList { + // 1. returned postings list (of type PostingsList) has two types of information - docNum and its score. + // 2. both the values can be represented using roaring bitmaps. + // 3. the Iterator (of type VecPostingsIterator) returned would operate in terms of VecPostings. + // 4. VecPostings would just have the docNum and the score. Every call of Next() + // and just returns the next VecPostings. The caller would do a vp.Number() + // and the Score() to get the corresponding values + rv := &VecPostingsList{ + postings: roaring64.New(), + } + rs.iterate(func(docID uint32, score float32) { + // transform the docID and score to vector code format + code := getVectorCode(docID, score) + // add to postings list, this ensures ordered storage + // based on the docID since it occupies the upper 32 bits + rv.postings.Add(code) + }) + return rv +} + +// ------------------------------------------------------------------------------ +// ResultSet +// ------------------------------------------------------------------------------ + +// resultSet is a data structure to hold (docID, score) pairs while ensuring +// that each docID is unique. It supports efficient insertion, retrieval, +// and iteration over the stored pairs. +type resultSet interface { + // Add a (docID, score) pair to the result set. + put(docID uint32, score float32) + // Get the score for a given docID. Returns false if docID not present. 
+ get(docID uint32) (float32, bool) + // Iterate over all (docID, score) pairs in the result set. + iterate(func(docID uint32, score float32)) + // Get the size of the result set. + size() int64 +} + +// resultSetSliceThreshold defines the threshold ratio of k to total documents +// in the index, below which a map-based resultSet is used, and above which +// a slice-based resultSet is used. +// It is derived using the following reasoning: +// +// Let N = total number of documents +// Let K = number of top K documents to retrieve +// +// Memory usage if the Result Set uses a map[uint32]float32 of size K underneath: +// +// ~20 bytes per entry (key + value + map overhead) +// Total ≈ 20 * K bytes +// +// Memory usage if the Result Set uses a slice of float32 of size N underneath: +// +// 4 bytes per entry +// Total ≈ 4 * N bytes +// +// We want the threshold below which a map is more memory-efficient than a slice: +// +// 20K < 4N +// K/N < 4/20 +// +// Therefore, if the ratio of K to N is less than 0.2 (4/20), we use a map-based resultSet. +const resultSetSliceThreshold float64 = 0.2 + +// newResultSet creates a new resultSet +func newResultSet(k int64, numDocs uint64) resultSet { + // if numDocs is zero (empty index), just use map-based resultSet as its a no-op + // else decide based the percent of documents being retrieved. 
If we require + // greater than 20% of total documents, use slice-based resultSet for better memory efficiency + // else use map-based resultSet + if numDocs == 0 || float64(k)/float64(numDocs) < resultSetSliceThreshold { + return newResultSetMap(k) + } + return newResultSetSlice(numDocs) +} + +type resultSetMap struct { + data map[uint32]float32 +} + +func newResultSetMap(k int64) resultSet { + return &resultSetMap{ + data: make(map[uint32]float32, k), + } +} + +func (rs *resultSetMap) put(docID uint32, score float32) { + rs.data[docID] = score +} + +func (rs *resultSetMap) get(docID uint32) (float32, bool) { + score, exists := rs.data[docID] + return score, exists +} + +func (rs *resultSetMap) iterate(f func(docID uint32, score float32)) { + for docID, score := range rs.data { + f(docID, score) + } +} + +func (rs *resultSetMap) size() int64 { + return int64(len(rs.data)) +} + +type resultSetSlice struct { + count int64 + data []float32 +} + +func newResultSetSlice(numDocs uint64) resultSet { + data := make([]float32, numDocs) + // scores can be negative, so initialize to a sentinel value which is NaN + sentinel := float32(math.NaN()) + for i := range data { + data[i] = sentinel + } + return &resultSetSlice{ + count: 0, + data: data, + } +} + +func (rs *resultSetSlice) put(docID uint32, score float32) { + // only increment count if this docID was not already present + if math.IsNaN(float64(rs.data[docID])) { + rs.count++ + } + rs.data[docID] = score +} + +func (rs *resultSetSlice) get(docID uint32) (float32, bool) { + score := rs.data[docID] + if math.IsNaN(float64(score)) { + return 0, false + } + return score, true +} + +func (rs *resultSetSlice) iterate(f func(docID uint32, score float32)) { + for docID, score := range rs.data { + if !math.IsNaN(float64(score)) { + f(uint32(docID), score) + } + } +} + +func (rs *resultSetSlice) size() int64 { + return rs.count +} + +// ----------------------------------------------------------------------------- +// Bitmap +// 
// -----------------------------------------------------------------------------

// bitmap is a simple, fixed-size bitmap backed by a byte slice. Bit i is
// stored in byte i/8 at bit position i%8, counted from the least significant
// bit of that byte.
type bitmap struct {
	bits []byte
	size uint32
}

// newBitmap creates a new bitmap with the given number of bits, all cleared.
func newBitmap(numBits uint32) *bitmap {
	// round up to whole bytes in 64-bit arithmetic so that values of numBits
	// close to the uint32 maximum do not wrap around to a tiny size
	bitsetSize := (uint64(numBits) + 7) / 8
	return &bitmap{
		bits: make([]byte, bitsetSize),
		size: numBits,
	}
}

// set the bit at the given position; positions outside the bitmap are ignored
func (b *bitmap) set(pos uint32) {
	if pos >= b.size {
		return
	}
	// set the bit in the byte slice
	// the byte index is pos / 8, which is equivalent to pos >> 3
	// the bit index within that byte is pos % 8, which is equivalent to pos & 7
	// and is from the LSB side of the byte
	b.bits[pos>>3] |= 1 << (pos & 7)
}

// clear the bit at the given position; positions outside the bitmap are ignored
func (b *bitmap) clear(pos uint32) {
	if pos >= b.size {
		return
	}
	// clear the bit in the byte slice
	// the byte index is pos / 8, which is equivalent to pos >> 3
	// the bit index within that byte is pos % 8, which is equivalent to pos & 7
	// and is from the LSB side of the byte
	b.bits[pos>>3] &^= 1 << (pos & 7)
}

// test if the bit at the given position is set; positions outside the bitmap
// report false
func (b *bitmap) test(pos uint32) bool {
	if pos >= b.size {
		return false
	}
	return (b.bits[pos>>3]>>(pos&7))&1 != 0
}

// return the underlying byte slice (not a copy)
func (b *bitmap) bytes() []byte {
	return b.bits
}

// returns the number of bits currently set
func (b *bitmap) cardinality() uint32 {
	var count int
	for _, byteVal := range b.bits {
		// count the number of set bits in the byte
		count += bits.OnesCount8(byteVal)
	}
	return uint32(count)
}

// isEmpty checks if the bitmap has no bits set
// or if the cardinality (population count) is zero
func (b *bitmap) isEmpty() bool {
	for _, byteVal := range b.bits {
		if byteVal != 0 {
			return false
		}
	}
	return true
}

// creates a clone of the bitmap with its own copy of the bits
func (b *bitmap) clone() *bitmap {
	newB := &bitmap{}
	newB.bits = slices.Clone(b.bits)
	newB.size = b.size
	return newB
}

// -----------------------------------------------------------------------------
// ID Mapping
// -----------------------------------------------------------------------------

// idMapping maintains a bidirectional mapping between vector IDs and document IDs.
// It allows efficient retrieval of document IDs for given vector IDs and vice versa.
// The mapping assumes that vector IDs and document IDs are ordered sequentially starting from 0
// up to numVecs-1 and numDocs-1 respectively.
type idMapping struct {
	vecToDoc []uint32   // vector ID -> document ID (size = numVecs)
	docToVec [][]uint32 // document ID -> vector IDs (size = numDocs)

	// keep track of sizes for convenience
	numVecs uint32
	numDocs uint32
}

// newIDMapping creates a new idMapping with the specified sizes
// numVecs: number of vectors (for vecToDoc mapping)
// numDocs: number of documents (for docToVec mapping)
func newIDMapping(numVecs, numDocs uint32) *idMapping {
	return &idMapping{
		vecToDoc: make([]uint32, numVecs),
		docToVec: make([][]uint32, numDocs),
		numVecs:  numVecs,
		numDocs:  numDocs,
	}
}

// add a mapping from vector ID to document ID and vice versa; IDs outside the
// sizes given to newIDMapping are silently ignored
func (m *idMapping) add(vecID uint32, docID uint32) {
	// safety check to avoid out of bounds access
	if vecID >= m.numVecs || docID >= m.numDocs {
		return
	}
	m.vecToDoc[vecID] = docID
	m.docToVec[docID] = append(m.docToVec[docID], vecID)
}

// return the number of vectors in the mapping
func (m *idMapping) numVectors() uint32 {
	return m.numVecs
}

// return the number of documents in the mapping
func (m *idMapping) numDocuments() uint32 {
	return m.numDocs
}

// retrieve the document ID for a given vector ID; the bool is false when the
// vector ID is out of range
func (m *idMapping) docForVec(vecID uint32) (uint32, bool) {
	if vecID >= m.numVecs {
		return 0, false
	}
	return m.vecToDoc[vecID], true
}

// retrieve the vector IDs for a given document ID; the bool is false when the
// document ID is out of range
func (m *idMapping) vecsForDoc(docID uint32) ([]uint32, bool) {
	if docID >= m.numDocs {
		return nil, false
	}
	return m.docToVec[docID], true
}

// ------------------------------------------------------------------------------
// Quick Select
// ------------------------------------------------------------------------------

// topNIDsByDistance performs an in-place Quickselect on the dist slice (while
// keeping ids aligned with their corresponding distances) to find the N largest
// distances without fully sorting the data. It partitions the array such that
// the element at index len(dist)-n is the pivot separating the top-N largest
// values from the rest, and then returns the last N elements of both dist and
// ids (unordered). Both input slices are reordered, and the returned slices
// alias them. It returns nil, nil when n <= 0 or n > len(dist).
func topNIDsByDistance(dist []float32, ids []int64, n int) ([]float32, []int64) {
	if n <= 0 || n > len(dist) {
		return nil, nil
	}

	// We want the N largest distances
	target := len(dist) - n

	left := 0
	right := len(dist) - 1
	for left < right {
		// partition [left, right] around the last element
		pivotVal := dist[right]
		store := left

		for i := left; i < right; i++ {
			// We want largest distances ⇒ partition small ones left
			if dist[i] < pivotVal {
				dist[i], dist[store] = dist[store], dist[i]
				ids[i], ids[store] = ids[store], ids[i]
				store++
			}
		}

		// move the pivot to its final position
		dist[store], dist[right] = dist[right], dist[store]
		ids[store], ids[right] = ids[right], ids[store]
		if store == target {
			break
		}
		// narrow the range to the side that still contains the target index
		if store < target {
			left = store + 1
		} else {
			right = store - 1
		}
	}

	// Return top-N IDs (unordered)
	return dist[target:], ids[target:]
}

// -----------------------------------------------------------------------------
// vectorSet
// -----------------------------------------------------------------------------
type vectorSet struct {
	// dimensionality of each vector
	dim int
	// number of vectors represented
	nvecs int
	// float vectors stored in row-major format,
	// i.e. for N vectors of D dimensions,
	// the length of this slice is N*D,
	floatData []float32
	// row-major binary representation of the float vectors,
	// where each bit records whether the corresponding
	// float value is >= 0 (see convertToBinary).
	binaryData []uint8
}

// newVectorSet wraps data (row-major, dim floats per vector) in a vectorSet.
// It returns an error when data is empty, dim is not positive, or the data
// length is not a multiple of dim. The data slice is used as is, not copied.
func newVectorSet(dim int, data []float32) (*vectorSet, error) {
	if len(data) == 0 || dim <= 0 || len(data)%dim != 0 {
		return nil, fmt.Errorf("invalid vector data: dims %d, data length %d", dim, len(data))
	}
	nvecs := len(data) / dim
	return &vectorSet{
		dim:       dim,
		nvecs:     nvecs,
		floatData: data,
	}, nil
}

// convertToBinary converts row-major float32 vectors into a row-major binary
// format: one bit per dimension, set when the value is >= 0, packed MSB-first.
// Every vector occupies (dims+7)/8 bytes; when dims is not a multiple of 8
// the unused low bits of a vector's last byte are zero. Trailing values that
// do not form a complete vector are ignored, and a non-positive dims yields
// nil.
func convertToBinary(vecs []float32, dims int) []uint8 {
	if dims <= 0 {
		// nothing sensible to encode; also avoids dividing by zero below
		return nil
	}
	nvecs := len(vecs) / dims
	bytesPerVec := (dims + 7) / 8
	packed := make([]uint8, nvecs*bytesPerVec)
	for i := 0; i < nvecs; i++ {
		row := vecs[i*dims : (i+1)*dims]
		out := packed[i*bytesPerVec : (i+1)*bytesPerVec]
		for j, value := range row {
			// Apply the threshold: values >= 0 map to 1, everything else to 0
			if value >= 0.0 {
				// dimension j goes to byte j/8; within the byte the first
				// dimension takes the most significant bit. Writing straight
				// into the zeroed output means a partial last byte needs no
				// extra shifting and no state carries over to the next vector.
				out[j>>3] |= 0x80 >> (j & 7)
			}
		}
	}
	return packed
}

// binarize populates binaryData from floatData unless it is already set.
func (v *vectorSet) binarize() {
	// if binaryData is already populated, no need to convert again
	if v.binaryData != nil {
		return
	}
	// convert the floatData to binary format and store in binaryData
	v.binaryData = convertToBinary(v.floatData, v.dim)
}

// clone returns a vectorSet with the same dimensions and number of vectors
// and its own copies of the float and binary data.
func (v *vectorSet) clone() *vectorSet {
	cp := &vectorSet{
		dim:        v.dim,
		nvecs:      v.nvecs,
		floatData:  slices.Clone(v.floatData),
		binaryData: slices.Clone(v.binaryData),
	}
	return cp
}

+func (v *vectorSet) mergeWith(other *vectorSet) { + // sanity check to ensure the two vector sets are compatible for merging + if v.dim != other.dim { + return + } + // merge the float data + v.floatData = append(v.floatData, other.floatData...) + v.nvecs += other.nvecs + // invalidate the binary data as the float data has changed + v.binaryData = nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/file_callbacks.go b/vendor/github.com/blevesearch/zapx/v17/file_callbacks.go new file mode 100644 index 0000000000..f3418a5cb0 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/file_callbacks.go @@ -0,0 +1,129 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package zap + +import ( + "fmt" + + index "github.com/blevesearch/bleve_index_api" +) + +// This file provides a mechanism for users of zap to provide callbacks +// that can process data before it is written to disk, and after it is read +// from disk. This can be used for things like encryption, compression, etc. + +// The user is responsible for ensuring that the writer and reader callbacks +// are compatible with each other, and that any state needed by the callbacks +// is managed appropriately. For example, if the writer callback uses a +// unique key or nonce per write, the reader callback must be able to +// determine the correct key or nonce to use for each read. 
+ +// The callbacks are identified by an id string, which is returned by the +// WriterCallbackGetter. The same id string is passed to the ReaderCallbackGetter +// when creating a reader. This allows the reader to determine which +// callback to use for a given file. + +// An example implementation using AES-GCM encryption is provided in +// file_callbacks_test.go within initFileCallbacks(). + +// FileWriter wraps a CountHashWriter and applies a user provided +// writer callback to the data being written. +type FileWriter struct { + id string + c *CountHashWriter + processor func(data []byte) []byte +} + +// creates an empty FileWriter with no callback. Used +// when we are writing data that is not going to be persisted +func NewFileWriterEmpty(c *CountHashWriter) *FileWriter { + rv := &FileWriter{ + c: c, + } + + return rv +} + +// NewFileWriter creates a FileWriter with the provided CountHashWriter and applies +// the writer callback identified by the context. +func NewFileWriter(c *CountHashWriter, context []byte) (*FileWriter, error) { + rv := &FileWriter{ + c: c, + } + + if index.WriterHook != nil { + var err error + rv.id, rv.processor, err = index.WriterHook(context) + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func (w *FileWriter) Write(data []byte) (int, error) { + return w.c.Write(data) +} + +// process applies the writer callback to the data, if one is set +func (w *FileWriter) process(data []byte) []byte { + if w.processor != nil { + return w.processor(data) + } + return data +} + +func (w *FileWriter) Count() int { + return w.c.Count() +} + +func (w *FileWriter) Sum32() uint32 { + return w.c.Sum32() +} + +// FileReader wraps a reader callback to be applied to data read from a file. +type FileReader struct { + id string + processor func(data []byte) ([]byte, error) +} + +// NewFileReader creates a FileReader with the reader callback identified by the context. +// The id is used to identify which callback to use when reading data. 
+func NewFileReader(id string, context []byte) (*FileReader, error) { + rv := &FileReader{ + id: id, + } + + if index.ReaderHook != nil { + var err error + rv.processor, err = index.ReaderHook(id, context) + if err != nil { + return nil, err + } + } else if id != "" { + return nil, fmt.Errorf("reader callback id %s provided but no ReaderHook is set", id) + } + + return rv, nil +} + +// process applies the reader callback to the data, if one is set +func (r *FileReader) process(data []byte) ([]byte, error) { + if r.processor != nil { + return r.processor(data) + } + return data, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/intDecoder.go b/vendor/github.com/blevesearch/zapx/v17/intDecoder.go new file mode 100644 index 0000000000..9c54b28cb7 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/intDecoder.go @@ -0,0 +1,145 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" +) + +type chunkedIntDecoder struct { + startOffset uint64 + dataStartOffset uint64 + chunkOffsets []uint64 + curChunkBytes []byte + data []byte + r *memUvarintReader + fr *FileReader + + bytesRead uint64 +} + +// newChunkedIntDecoder expects an optional or reset chunkedIntDecoder for better reuse. 
+func newChunkedIntDecoder(buf []byte, offset uint64, rv *chunkedIntDecoder, fr *FileReader) *chunkedIntDecoder { + if rv == nil { + rv = &chunkedIntDecoder{startOffset: offset, data: buf} + } else { + rv.startOffset = offset + rv.data = buf + } + + var n, numChunks uint64 + var read int + if offset == termNotEncoded { + numChunks = 0 + } else { + numChunks, read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + } + + n += uint64(read) + if cap(rv.chunkOffsets) >= int(numChunks) { + rv.chunkOffsets = rv.chunkOffsets[:int(numChunks)] + } else { + rv.chunkOffsets = make([]uint64, int(numChunks)) + } + for i := 0; i < int(numChunks); i++ { + rv.chunkOffsets[i], read = binary.Uvarint(buf[offset+n : offset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.bytesRead += n + rv.dataStartOffset = offset + n + rv.fr = fr + return rv +} + +// A util function which fetches the query time +// specific bytes encoded by intcoder (for eg the +// freqNorm and location details of a term in document) +// the loadChunk retrieves the next chunk and the +// number of bytes retrieve in that operation is accounted +func (d *chunkedIntDecoder) getBytesRead() uint64 { + return d.bytesRead +} + +func (d *chunkedIntDecoder) loadChunk(chunk int) error { + if d.startOffset == termNotEncoded { + d.r = newMemUvarintReader([]byte(nil)) + return nil + } + + if chunk >= len(d.chunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(d.chunkOffsets)) + } + + end, start := d.dataStartOffset, d.dataStartOffset + s, e := readChunkBoundary(chunk, d.chunkOffsets) + start += s + end += e + + var err error + d.curChunkBytes, err = d.fr.process(d.data[start:end]) + if err != nil { + return fmt.Errorf("error processing chunk %d: %w", chunk, err) + } + d.bytesRead += end - start + if d.r == nil { + d.r = newMemUvarintReader(d.curChunkBytes) + } else { + d.r.Reset(d.curChunkBytes) + } + + return nil +} + +func (d *chunkedIntDecoder) reset() { + 
d.startOffset = 0 + d.dataStartOffset = 0 + d.chunkOffsets = d.chunkOffsets[:0] + d.curChunkBytes = d.curChunkBytes[:0] + d.bytesRead = 0 + d.data = d.data[:0] + if d.r != nil { + d.r.Reset([]byte(nil)) + } +} + +func (d *chunkedIntDecoder) isNil() bool { + return d.curChunkBytes == nil || len(d.curChunkBytes) == 0 +} + +func (d *chunkedIntDecoder) readUvarint() (uint64, error) { + return d.r.ReadUvarint() +} + +func (d *chunkedIntDecoder) readBytes(start, end int) []byte { + return d.curChunkBytes[start:end] +} + +func (d *chunkedIntDecoder) SkipUvarint() { + d.r.SkipUvarint() +} + +func (d *chunkedIntDecoder) SkipBytes(count int) { + d.r.SkipBytes(count) +} + +func (d *chunkedIntDecoder) Len() int { + return d.r.Len() +} + +func (d *chunkedIntDecoder) remainingLen() int { + return len(d.curChunkBytes) - d.r.Len() +} diff --git a/vendor/github.com/blevesearch/zapx/v17/intcoder.go b/vendor/github.com/blevesearch/zapx/v17/intcoder.go new file mode 100644 index 0000000000..d3d354577b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/intcoder.go @@ -0,0 +1,236 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" +) + +// We can safely use 0 to represent termNotEncoded since 0 +// could never be a valid address for term location information. 
+// (stored field index is always non-empty and earlier in the +// file) +const termNotEncoded = 0 + +type chunkedIntCoder struct { + final []byte + chunkSize uint64 + chunkBuf bytes.Buffer + chunkLens []uint64 + currChunk uint64 + + buf []byte + + bytesWritten uint64 +} + +// newChunkedIntCoder returns a new chunk int coder which packs data into +// chunks based on the provided chunkSize and supports up to the specified +// maxDocNum +func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedIntCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + final: make([]byte, 0, 64), + } + + return rv +} + +// Reset lets you reuse this chunked int coder. buffers are reset and reused +// from previous use. you cannot change the chunk size or max doc num. +func (c *chunkedIntCoder) Reset() { + c.final = c.final[:0] + c.bytesWritten = 0 + c.chunkBuf.Reset() + c.currChunk = 0 + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } +} + +// SetChunkSize changes the chunk size. It is only valid to do so +// with a new chunkedIntCoder, or immediately after calling Reset() +func (c *chunkedIntCoder) SetChunkSize(chunkSize uint64, maxDocNum uint64) { + total := int(maxDocNum/chunkSize + 1) + c.chunkSize = chunkSize + if cap(c.chunkLens) < total { + c.chunkLens = make([]uint64, total) + } else { + c.chunkLens = c.chunkLens[:total] + } +} + +func (c *chunkedIntCoder) incrementBytesWritten(val uint64) { + c.bytesWritten += val +} + +func (c *chunkedIntCoder) getBytesWritten() uint64 { + return c.bytesWritten +} + +// Add encodes the provided integers into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. 
+func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + + for _, val := range vals { + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) + if err != nil { + return err + } + } + + return nil +} + +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + +// Close indicates you are done calling Add() this allows the final chunk +// to be encoded. +func (c *chunkedIntCoder) Close() { + encodingBytes := c.chunkBuf.Bytes() + c.incrementBytesWritten(uint64(len(encodingBytes))) + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close +} + +// Write commits all the encoded chunked integers to the provided writer. +func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { + bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) + if len(c.buf) < bufNeeded { + c.buf = make([]byte, bufNeeded) + } + buf := c.buf + + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + + // process each chunk's data individually and recalculate the chunk + // boundaries if necessary. + if fw, ok := w.(*FileWriter); ok && fw != nil { + var prevOffset int + processedBuf := make([]byte, 0) + for i := 0; i < len(chunkOffsets); i++ { + if chunkOffsets[i] == uint64(prevOffset) { + continue + } + buf := fw.process(c.final[prevOffset:chunkOffsets[i]]) + processedBuf = append(processedBuf, buf...) 
+ prevOffset = int(chunkOffsets[i]) + c.chunkLens[i] = uint64(len(buf)) + } + c.final = processedBuf + chunkOffsets = modifyLengthsToEndOffsets(c.chunkLens) + } + + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) + } + + tw, err := w.Write(buf[:n]) + if err != nil { + return tw, err + } + + // write out the data + nw, err := w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} + +// writeAt commits all the encoded chunked integers to the provided writer +// and returns the starting offset, total bytes written and an error +func (c *chunkedIntCoder) writeAt(w io.Writer) (uint64, int, error) { + startOffset := uint64(termNotEncoded) + if len(c.final) <= 0 { + return startOffset, 0, nil + } + + if fw, ok := w.(*FileWriter); ok && fw != nil { + startOffset = uint64(fw.Count()) + } + + tw, err := c.Write(w) + return startOffset, tw, err +} + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} + +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. 
+// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ + } + return lengths +} + +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = offsets[chunk-1] + } + return start, offsets[chunk] +} diff --git a/vendor/github.com/blevesearch/zapx/v17/inverted_text_cache.go b/vendor/github.com/blevesearch/zapx/v17/inverted_text_cache.go new file mode 100644 index 0000000000..32b9f7bd34 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/inverted_text_cache.go @@ -0,0 +1,107 @@ +// Copyright (c) 2025 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "encoding/binary" + "fmt" + "sync" + + "github.com/blevesearch/vellum" +) + +func newInvertedIndexCache() *invertedIndexCache { + return &invertedIndexCache{ + cache: make(map[uint16]*invertedCacheEntry), + } +} + +type invertedIndexCache struct { + m sync.RWMutex + + cache map[uint16]*invertedCacheEntry +} + +func (sc *invertedIndexCache) Clear() { + sc.m.Lock() + sc.cache = nil + sc.m.Unlock() +} + +// loadOrCreate loads the inverted index cache for the specified fieldID if it is already present, +// or creates it if not. The inverted index cache for a fieldID consists of an FST (Finite State Transducer): +// - A Vellum FST (Finite State Transducer) representing the TermDictionary. +// This function returns the loaded or newly created FST, and the number of bytes read from the provided memory slice, +// if the cache was created. +func (sc *invertedIndexCache) loadOrCreate(fieldID uint16, mem []byte, fr *FileReader) (*vellum.FST, uint64, error) { + sc.m.RLock() + entry, ok := sc.cache[fieldID] + if ok { + sc.m.RUnlock() + return entry.load() + } + + sc.m.RUnlock() + + sc.m.Lock() + defer sc.m.Unlock() + + entry, ok = sc.cache[fieldID] + if ok { + return entry.load() + } + + return sc.createAndCacheLOCKED(fieldID, mem, fr) +} + +// createAndCacheLOCKED creates the inverted index cache for the specified fieldID and caches it. 
+func (sc *invertedIndexCache) createAndCacheLOCKED(fieldID uint16, mem []byte, fr *FileReader) (*vellum.FST, uint64, error) { + var pos uint64 + vellumLen, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if vellumLen == 0 || read <= 0 { + return nil, 0, fmt.Errorf("vellum length is 0") + } + pos += uint64(read) + fstBytes, err := fr.process(mem[pos : pos+vellumLen]) + if err != nil { + return nil, 0, fmt.Errorf("error processing vellum bytes: %v", err) + } + fst, err := vellum.Load(fstBytes) + if err != nil { + return nil, 0, fmt.Errorf("vellum err: %v", err) + } + pos += vellumLen + sc.insertLOCKED(fieldID, fst) + return fst, pos, nil +} + +// insertLOCKED inserts the vellum FST into the cache for the specified fieldID. +func (sc *invertedIndexCache) insertLOCKED(fieldID uint16, fst *vellum.FST) { + _, ok := sc.cache[fieldID] + if !ok { + sc.cache[fieldID] = &invertedCacheEntry{ + fst: fst, + } + } +} + +// invertedCacheEntry is the vellum FST and is the value stored in the invertedIndexCache cache, for a given fieldID. +type invertedCacheEntry struct { + fst *vellum.FST +} + +func (ce *invertedCacheEntry) load() (*vellum.FST, uint64, error) { + return ce.fst, 0, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/memuvarint.go b/vendor/github.com/blevesearch/zapx/v17/memuvarint.go new file mode 100644 index 0000000000..48a57f9c85 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/memuvarint.go @@ -0,0 +1,103 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "fmt" +) + +type memUvarintReader struct { + C int // index of next byte to read from S + S []byte +} + +func newMemUvarintReader(s []byte) *memUvarintReader { + return &memUvarintReader{S: s} +} + +// Len returns the number of unread bytes. +func (r *memUvarintReader) Len() int { + n := len(r.S) - r.C + if n < 0 { + return 0 + } + return n +} + +// ReadUvarint reads an encoded uint64. The original code this was +// based on is at encoding/binary/ReadUvarint(). +func (r *memUvarintReader) ReadUvarint() (uint64, error) { + if r.C >= len(r.S) { + // nothing else to read + return 0, nil + } + + var x uint64 + var s uint + var C = r.C + var S = r.S + + for { + b := S[C] + C++ + + if b < 0x80 { + r.C = C + + // why 63? The original code had an 'i += 1' loop var and + // checked for i > 9 || i == 9 ...; but, we no longer + // check for the i var, but instead check here for s, + // which is incremented by 7. So, 7*9 == 63. + // + // why the "extra" >= check? The normal case is that s < + // 63, so we check this single >= guard first so that we + // hit the normal, nil-error return pathway sooner. + if s >= 63 && (s > 63 || b > 1) { + return 0, fmt.Errorf("memUvarintReader overflow") + } + + return x | uint64(b)<= len(r.S) { + return + } + + b := r.S[r.C] + r.C++ + + if b < 0x80 { + return + } + } +} + +// SkipBytes skips a count number of bytes. +func (r *memUvarintReader) SkipBytes(count int) { + r.C = r.C + count +} + +func (r *memUvarintReader) Reset(s []byte) { + r.C = 0 + r.S = s +} diff --git a/vendor/github.com/blevesearch/zapx/v17/merge.go b/vendor/github.com/blevesearch/zapx/v17/merge.go new file mode 100644 index 0000000000..a029aea0e5 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/merge.go @@ -0,0 +1,821 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "math" + "os" + "sort" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + seg "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var DefaultFileMergerBufferSize = 1024 * 1024 + +const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc + +// Merge takes a slice of segments and bit masks describing which +// documents may be dropped, and creates a new segment containing the +// remaining data. This new segment is built at the specified path. 
+func (z *ZapPlugin) Merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, nil) +} + +func (z *ZapPlugin) MergeUsing(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + return z.merge(segments, drops, path, closeCh, s, config) +} + +func (*ZapPlugin) merge(segments []seg.Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + switch segmentx := segment.(type) { + case *Segment: + segmentBases[segmenti] = &segmentx.SegmentBase + case *SegmentBase: + segmentBases[segmenti] = segmentx + default: + panic(fmt.Sprintf("oops, unexpected segment type: %T", segment)) + } + } + return mergeSegmentBases(segmentBases, drops, path, DefaultChunkMode, closeCh, s, config) +} + +func mergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, + chunkMode uint32, closeCh chan struct{}, s seg.StatsReporter, config map[string]interface{}) ( + [][]uint64, uint64, error) { + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return nil, 0, err + } + + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + + // buffer the output + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriterWithStatsReporter(br, s) + w, err := NewFileWriter(cr, []byte(path)) + if err != nil { + cleanup() + return nil, 0, err + } + + newDocNums, numDocs, storedIndexOffset, _, _, sectionsIndexOffset, err := + mergeToWriter(segmentBases, drops, chunkMode, w, closeCh, config) + if err != nil { + 
cleanup() + return nil, 0, err + } + + err = persistFooter(numDocs, storedIndexOffset, sectionsIndexOffset, chunkMode, cr.Sum32(), w, w.id) + if err != nil { + cleanup() + return nil, 0, err + } + + err = br.Flush() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Sync() + if err != nil { + cleanup() + return nil, 0, err + } + + err = f.Close() + if err != nil { + cleanup() + return nil, 0, err + } + + return newDocNums, uint64(cr.Count()), nil +} + +// Remove fields that have been completely deleted from fieldsInv +func filterFields(fieldsInv []string, fieldInfo map[string]*index.UpdateFieldInfo) []string { + idx := 0 + for _, field := range fieldsInv { + if val, ok := fieldInfo[field]; ok && val.Deleted { + continue + } + fieldsInv[idx] = field + idx++ + } + return fieldsInv[:idx] +} + +// Update field options using updateFieldInfo to override the options +// selected during mergeFields, if needed. This includes removing field +// options for deleted fields and updating options for fields with changes +// that have not yet been propagated because a new segment has not been created. 
+func finalizeFieldOptions(fieldOptions map[string]index.FieldIndexingOptions, + updatedFields map[string]*index.UpdateFieldInfo) map[string]index.FieldIndexingOptions { + for field, opts := range fieldOptions { + if info, ok := updatedFields[field]; ok { + // if field is deleted, remove its options + if info.Deleted { + delete(fieldOptions, field) + continue + } + // otherwise, update options based on info + if info.Index { + // ensure indexing is disabled + opts &^= index.IndexField + } + if info.Store { + // ensure storing is disabled + opts &^= index.StoreField + } + if info.DocValues { + // ensure doc values is disabled + opts &^= index.DocValues + } + fieldOptions[field] = opts + } + } + return fieldOptions +} + +func mergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, + chunkMode uint32, w *FileWriter, closeCh chan struct{}, config map[string]interface{}) ( + newDocNums [][]uint64, numDocs, storedIndexOffset uint64, + fieldsInv []string, fieldsMap map[string]uint16, sectionsIndexOffset uint64, + err error) { + + var fieldsSame bool + var fieldsOptions map[string]index.FieldIndexingOptions + fieldsSame, fieldsInv, fieldsOptions = mergeFields(segments) + updatedFields := mergeUpdatedFields(segments) + fieldsInv = filterFields(fieldsInv, updatedFields) + fieldsMap = mapFields(fieldsInv) + if len(updatedFields) > 0 { + // finalize field options based on updated field info + fieldsOptions = finalizeFieldOptions(fieldsOptions, updatedFields) + // fieldsSame cannot be true if fields were deleted + fieldsSame = false + } + + numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, nil, nil, 0, seg.ErrClosed + } + + // the merge opaque is especially important when it comes to tracking the file + // offset a field of a particular section is at. This will be used to write the + // offsets in the fields section index of the file (the final merged file). 
+ mergeOpaque := map[int]resetable{} + args := map[string]interface{}{ + "chunkMode": chunkMode, + "fieldsSame": fieldsSame, + "fieldsMap": fieldsMap, + "numDocs": numDocs, + "fieldsOptions": fieldsOptions, + "config": config, + } + + if numDocs > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, fieldsOptions, fieldsSame, numDocs, w, closeCh) + if err != nil { + return nil, 0, 0, nil, nil, 0, err + } + + // at this point, ask each section implementation to merge itself + for i, x := range segmentSections { + mergeOpaque[int(i)] = x.InitOpaque(args) + err = x.Merge(mergeOpaque, segments, drops, fieldsInv, newDocNums, w, closeCh) + if err != nil { + return nil, 0, 0, nil, nil, 0, err + } + } + } + + // we can persist the fields section index now, this will point + // to the various indexes (each in different section) available for a field. + sectionsIndexOffset, err = persistFieldsSection(fieldsInv, fieldsOptions, w, mergeOpaque) + if err != nil { + return nil, 0, 0, nil, nil, 0, err + } + + return newDocNums, numDocs, storedIndexOffset, fieldsInv, fieldsMap, sectionsIndexOffset, nil +} + +// mapFields takes the fieldsInv list and returns a map of fieldName +// to fieldID+1 +func mapFields(fields []string) map[string]uint16 { + rv := make(map[string]uint16, len(fields)) + for i, fieldName := range fields { + rv[fieldName] = uint16(i) + 1 + } + return rv +} + +// computeNewDocCount determines how many documents will be in the newly +// merged segment when obsoleted docs are dropped +func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { + var newDocCount uint64 + for segI, segment := range segments { + newDocCount += segment.numDocs + if drops[segI] != nil { + newDocCount -= drops[segI].GetCardinality() + } + } + return newDocCount +} + +func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder 
*chunkedIntCoder, locEncoder *chunkedIntCoder) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := + postItr.nextBytes() + for err == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNums[nextDocNum] + if hitNewDocNum == docDropped { + return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if err != nil { + return 0, 0, 0, err + } + + if len(nextLocBytes) > 0 { + err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err != nil { + return 0, 0, 0, err + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = + postItr.nextBytes() + } + + return lastDocNum, lastFreq, lastNorm, err +} + +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + var nextNorm uint64 + if pi, ok := next.(*Posting); ok { + nextNorm = pi.NormUint64() + } else { + return 0, 0, 0, nil, fmt.Errorf("unexpected posting type %T", next) + } + + locs := next.Locations() + + if nextFreq > 0 { + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + } else { + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0)) + } + if err != nil { + return 0, 0, 0, nil, err + } + + if len(locs) > 0 { + 
numBytesLocs := 0 + for _, loc := range locs { + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(ap)) + args = append(args, ap...) + err = locEncoder.Add(hitNewDocNum, args...) + if err != nil { + return 0, 0, 0, nil, err + } + } + } + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *FileWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + if postings == nil { + return 0, nil + } + + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + var tfOffset uint64 + tfOffset, _, err = tfEncoder.writeAt(w) + if err != nil { + return 0, err + } + + var locOffset uint64 + locOffset, _, err = locEncoder.writeAt(w) + if err != nil { + return 0, err + } + + postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + _, 
err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +type varintEncoder func(uint64) (int, error) + +func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, + fieldsMap map[string]uint16, fieldsInv []string, + fieldsOptions map[string]index.FieldIndexingOptions, + fieldsSame bool, newSegDocCount uint64, + w *FileWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { + var rv [][]uint64 // The remapped or newDocNums for each segment. + + var newDocNum uint64 + + var curr int + var data, compressed []byte + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } + + vals := make([][][]byte, len(fieldsInv)) + typs := make([][]byte, len(fieldsInv)) + poss := make([][][]uint64, len(fieldsInv)) + + // copying data directly is safe only if there are no + // file callbacks that might modify the data in all + // of the involved segments and the current writer + copyFlag := true + for _, segment := range segments { + if segment.fileReader.id != "" { + copyFlag = false + break + } + } + if w.id != "" { + copyFlag = false + } + + var posBuf []uint64 + + docNumOffsets := make([]uint64, newSegDocCount) + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + + // for each segment + for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + + segNewDocNums := make([]uint64, segment.numDocs) + + dropsI := drops[segI] + + // optimize when the field mapping is the same across all + // segments and there are no deletions, via byte-copying + // of stored docs bytes directly to the writer + // cannot copy directly if fields might have been deleted + if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) && 
copyFlag { + err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < segment.numDocs; i++ { + segNewDocNums[i] = newDocNum + newDocNum++ + } + rv = append(rv, segNewDocNums) + + continue + } + + // for each doc num + for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + // TODO: roaring's API limits docNums to 32-bits? + if dropsI != nil && dropsI.Contains(uint32(docNum)) { + segNewDocNums[docNum] = docDropped + continue + } + + segNewDocNums[docNum] = newDocNum + + curr = 0 + metaBuf.Reset() + data = data[:0] + + posTemp := posBuf + + // collect all the data + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = typs[i][:0] + poss[i] = poss[i][:0] + } + err := segment.visitStoredFields(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := int(fieldsMap[field]) - 1 + if fieldID < 0 { + // no entry for field in fieldsMap + return false + } + // early exit if the store is not wanted for this field + if !fieldsOptions[field].IsStored() { + return true + } + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + + // copy array positions to preserve them beyond the scope of this callback + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) + + return true + }) + if err != nil { + return 0, nil, err + } + + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + // early exit if the store is not wanted for this field + if 
!fieldsOptions[fieldsInv[fieldID]].IsStored() { + continue + } + // early exit if no stored values for this field + if len(vals[fieldID]) == 0 { + continue + } + storedFieldValues := vals[fieldID] + + stf := typs[fieldID] + spf := poss[fieldID] + + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncode, data) + if err2 != nil { + return 0, nil, err2 + } + } + + metaBytes := metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) + + bufMeta := w.process(metaBytes) + + // idFieldVal is a pointer to a mem mapped byte slice, so we copy + // before merging it with the compressed data + buf := make([]byte, 0, len(idFieldVal)+len(compressed)) + buf = append(buf, idFieldVal...) + buf = append(buf, compressed...) + + bufCompressed := w.process(buf) + + // write out the meta len and compressed data len + _, err = writeUvarints(w, + uint64(len(bufMeta)), + uint64(len(bufCompressed))) + if err != nil { + return 0, nil, err + } + // now write the meta + _, err = w.Write(bufMeta) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(bufCompressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ + } + + rv = append(rv, segNewDocNums) + } + + // return value is the start of the stored index + storedIndexOffset := uint64(w.Count()) + + // now write out the stored doc index + for _, docNumOffset := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffset) + if err != nil { + return 0, nil, err + } + } + + // calculate new edge list if applicable + var newEdgeList map[uint64]uint64 + + for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + // get the edgeList for this segment + edgeList := segment.EdgeList() + // if no edgeList, nothing to do + 
if edgeList == nil { + continue + } + newSegDocNums := rv[segI] + edgeList.Iterate(func(oldChild uint64, oldParent uint64) bool { + newParent := newSegDocNums[oldParent] + newChild := newSegDocNums[oldChild] + if newParent != docDropped && + newChild != docDropped { + if newEdgeList == nil { + newEdgeList = make(map[uint64]uint64) + } + newEdgeList[newChild] = newParent + } + return true + }) + } + + // write out the new edge list + // first write out the number of entries + // which is also the number of valid subDocs + // in the merged segment + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(newEdgeList))) + _, err := w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + // write the child -> parent edge list + // child and parent are both flattened doc ids + for child, parent := range newEdgeList { + n = binary.PutUvarint(buf, child) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + n = binary.PutUvarint(buf, parent) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + } + + return storedIndexOffset, rv, nil +} + +// copyStoredDocs writes out a segment's stored doc info, optimized by +// using a single Write() call for the entire set of bytes. The +// newDocNumOffsets is filled with the new offsets for each doc. 
+func (sb *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, + w *FileWriter) error { + if sb.numDocs <= 0 { + return nil + } + + indexOffset0, storedOffset0, _, _, _ := + sb.getDocStoredOffsets(0) // the segment's first doc + + indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := + sb.getDocStoredOffsets(sb.numDocs - 1) // the segment's last doc + + storedOffset0New := uint64(w.Count()) + + storedBytes := sb.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] + _, err := w.Write(storedBytes) + if err != nil { + return err + } + + // remap the storedOffset's for the docs into new offsets relative + // to storedOffset0New, filling the given docNumOffsetsOut array + for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { + storedOffset := binary.BigEndian.Uint64(sb.mem[indexOffset : indexOffset+8]) + storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New + newDocNumOffsets[newDocNum] = storedOffsetNew + newDocNum += 1 + } + + return nil +} + +// mergeFields builds a unified list of fields used across all the +// input segments, and computes whether the fields are the same across +// segments (which depends on fields to be sorted in the same way +// across segments) +func mergeFields(segments []*SegmentBase) (bool, []string, map[string]index.FieldIndexingOptions) { + fieldsSame := true + + var segment0Fields []string + if len(segments) > 0 { + segment0Fields = segments[0].Fields() + } + + fieldsExist := map[string]struct{}{} + fieldOptions := map[string]index.FieldIndexingOptions{} + for _, segment := range segments { + fields := segment.Fields() + for fieldi, field := range fields { + fieldsExist[field] = struct{}{} + + if prev, ok := fieldOptions[field]; ok { + // Merge options conservatively: once a field option is disabled (bit cleared) + // in any segment, it remains disabled. This ensures deterministic behavior + // when options can only transition from true -> false. 
+ fieldOptions[field] = prev & segment.fieldsOptions[field] + // check if any bits were cleared + if fieldOptions[field] != prev { + // Some bits were cleared (option changed from true -> false) + fieldsSame = false + } + } else { + // first occurrence of the field + fieldOptions[field] = segment.fieldsOptions[field] + } + if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { + fieldsSame = false + } + } + } + + rv := make([]string, 0, len(fieldsExist)) + // ensure _id stays first + rv = append(rv, "_id") + for k := range fieldsExist { + if k != "_id" { + rv = append(rv, k) + } + } + + sort.Strings(rv[1:]) // leave _id as first + + return fieldsSame, rv, fieldOptions +} + +// Combine updateFieldInfo from all segments +func mergeUpdatedFields(segments []*SegmentBase) map[string]*index.UpdateFieldInfo { + var fieldInfo map[string]*index.UpdateFieldInfo + + for _, segment := range segments { + for field, info := range segment.updatedFields { + if fieldInfo == nil { + fieldInfo = make(map[string]*index.UpdateFieldInfo) + } + // if field not present, add it + if _, ok := fieldInfo[field]; !ok { + fieldInfo[field] = &index.UpdateFieldInfo{ + // mark whether field is deleted in any segment + Deleted: info.Deleted, + Index: info.Index, + Store: info.Store, + DocValues: info.DocValues, + } + } else { + fieldInfo[field].Deleted = fieldInfo[field].Deleted || info.Deleted + fieldInfo[field].Index = fieldInfo[field].Index || info.Index + fieldInfo[field].Store = fieldInfo[field].Store || info.Store + fieldInfo[field].DocValues = fieldInfo[field].DocValues || info.DocValues + } + } + + } + return fieldInfo +} + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/vendor/github.com/blevesearch/zapx/v17/nested_cache.go b/vendor/github.com/blevesearch/zapx/v17/nested_cache.go new file mode 100644 index 0000000000..2d273ba235 --- /dev/null +++ 
b/vendor/github.com/blevesearch/zapx/v17/nested_cache.go @@ -0,0 +1,290 @@ +// Copyright (c) 2026 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "fmt" + "math" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" +) + +type nestedIndexCache struct { + cache *nestedCacheEntry +} + +// newNestedIndexCache creates a new nested index cache +// instance, which contains cached edge list +// for a nested segment +func newNestedIndexCache() *nestedIndexCache { + return &nestedIndexCache{} +} + +// Clear clears the nested index cache, removing the cached edge list +func (nc *nestedIndexCache) Clear() { + nc.cache = nil +} + +func (nc *nestedIndexCache) initialize(numDocs uint64, edgeListOffset uint64, mem []byte) error { + // pos stores the current read position + pos := edgeListOffset + if pos == 0 { + // no edge list + return nil + } + // read number of edges in the edge list + numEdges, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("error reading number of edges in nested edge list") + } + pos += uint64(read) + // if no documents or edges/nested documents, return + if numDocs == 0 || numEdges == 0 { + return nil + } + edgeList := NewEdgeList(numDocs, numEdges) + for i := uint64(0); i < numEdges; i++ { + child, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if read <= 0 { + return 
fmt.Errorf("error reading child doc id in nested edge list") + } + pos += uint64(read) + parent, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("error reading parent doc id in nested edge list") + } + pos += uint64(read) + edgeList.AddEdge(child, parent) + } + nc.cache = &nestedCacheEntry{ + el: edgeList, + } + return nil +} + +type nestedCacheEntry struct { + // edgeList[child] = parent + el EdgeList +} + +func (nc *nestedIndexCache) ancestry(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { + cache := nc.cache + // add self as first ancestor + prealloc = append(prealloc, index.NewAncestorID(docNum)) + if cache == nil || cache.el == nil { + return prealloc + } + current := docNum + for { + parent, ok := cache.el.Parent(current) + if !ok { + break + } + prealloc = append(prealloc, index.NewAncestorID(parent)) + current = parent + } + return prealloc +} + +func (nc *nestedIndexCache) edgeList() EdgeList { + cache := nc.cache + if cache == nil || cache.el == nil { + return nil + } + return cache.el +} + +func (nc *nestedIndexCache) countNested() uint64 { + cache := nc.cache + if cache == nil || cache.el == nil { + return 0 + } + return cache.el.Count() +} + +// countRoot returns the number of root documents in the given bitmap +func (nc *nestedIndexCache) countRoot(bm *roaring.Bitmap) uint64 { + var totalDocs uint64 + if bm == nil { + // if bitmap is empty, return 0 + return totalDocs + } + totalDocs = bm.GetCardinality() + cache := nc.cache + if cache == nil || cache.el == nil { + // if cache is nil, no nested docs, so all docs are root docs + // so just return the cardinality of the bitmap + return totalDocs + } + // count nested documents in the bitmap, a nested doc is one that has a parent in the edge list + var nestedDocCount uint64 + bm.Iterate(func(docNum uint32) bool { + if _, ok := cache.el.Parent(uint64(docNum)); ok { + nestedDocCount++ + } + return true + }) + // root docs = total docs - 
nested docs + if totalDocs < nestedDocCount { + // should not happen, but just in case + return 0 + } + return totalDocs - nestedDocCount +} + +// ------------------------------------------------------- + +// EdgeList provides an interface to access parent of a child document +type EdgeList interface { + // Parent returns the parent of the given child document ID, + // and a boolean indicating if the parent exists. + Parent(child uint64) (uint64, bool) + + // AddEdge adds an edge from child to parent in the edge list. + AddEdge(child uint64, parent uint64) + + // Count returns the number of edges in the edge list. + Count() uint64 + + // Iterate iterates over all edges in the edge list, calling the provided function + // with each child-parent pair. If the function returns false, iteration stops. + Iterate(func(child uint64, parent uint64) bool) +} + +type edgeListMap struct { + edges map[uint64]uint64 +} + +func newEdgeListMap(numEdges uint64) *edgeListMap { + return &edgeListMap{ + edges: make(map[uint64]uint64, numEdges), + } +} + +func (elm *edgeListMap) Parent(child uint64) (uint64, bool) { + parent, ok := elm.edges[child] + return parent, ok +} + +func (elm *edgeListMap) AddEdge(child uint64, parent uint64) { + elm.edges[child] = parent +} + +func (elm *edgeListMap) Count() uint64 { + return uint64(len(elm.edges)) +} + +func (elm *edgeListMap) Iterate(f func(child uint64, parent uint64) bool) { + for child, parent := range elm.edges { + if !f(child, parent) { + return + } + } +} + +type edgeListSlice struct { + count uint64 + sentinel uint64 + edges []uint64 +} + +func newEdgeListSlice(numDocs uint64, numEdges uint64) *edgeListSlice { + var sentinel uint64 = math.MaxUint64 + edges := make([]uint64, numDocs) + for i := range edges { + edges[i] = sentinel + } + return &edgeListSlice{ + count: numEdges, + sentinel: sentinel, + edges: edges, + } +} + +func (els *edgeListSlice) Parent(child uint64) (uint64, bool) { + if child >= uint64(len(els.edges)) { + return 
0, false + } + parent := els.edges[child] + if parent == els.sentinel { + return 0, false + } + return parent, true +} + +func (el *edgeListSlice) AddEdge(child uint64, parent uint64) { + if child >= uint64(len(el.edges)) { + // out of bounds, ignore as this should not happen + return + } + el.edges[child] = parent +} + +func (el *edgeListSlice) Count() uint64 { + return el.count +} + +func (el *edgeListSlice) Iterate(f func(child uint64, parent uint64) bool) { + for child, parent := range el.edges { + if parent != el.sentinel { + if !f(uint64(child), parent) { + return + } + } + } +} + +// nestedCacheRatio defines the threshold ratio of nested documents to total documents. +// It is derived using the following reasoning: +// +// Let N = number of nested documents (i.e., edges in the edge list) +// Let T = total number of documents +// +// Memory usage if the edge list is stored as a map[uint64]uint64: +// +// ~30 bytes per entry (key + value + map overhead) +// Total ≈ 30 * N bytes +// +// Memory usage if the edge list is stored as a []uint64: +// +// 8 bytes per entry +// Total ≈ 8 * T bytes +// +// We want the threshold at which a map becomes more memory-efficient than a slice: +// +// 30N < 8T +// N/T < 8/30 +// +// Therefore, if the ratio of nested documents to total documents is less than 8/30, +// we use a map for the edge list; otherwise, we use a slice. +var edgeListMapThreshold = 8.0 / 30.0 + +// NewEdgeList creates a new EdgeList instance based on the provided +// constants, the total number of documents and the number of nested documents/edges. 
+func NewEdgeList(numDocs uint64, numEdges uint64) EdgeList { + if numDocs == 0 || numEdges == 0 { + // no edges, return nil + return nil + } + ratio := float64(numEdges) / float64(numDocs) + if ratio < edgeListMapThreshold { + // use map representation + return newEdgeListMap(numEdges) + } + // use slice representation + return newEdgeListSlice(numDocs, numEdges) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/new.go b/vendor/github.com/blevesearch/zapx/v17/new.go new file mode 100644 index 0000000000..a1d0b10636 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/new.go @@ -0,0 +1,555 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync" + "sync/atomic" + + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + +// ValidateDocFields can be set by applications to perform additional checks +// on fields in a document being added to a new segment, by default it does +// nothing. +// This API is experimental and may be removed at any time. 
+var ValidateDocFields = func(field index.Field) error { + return nil +} + +// New creates an in-memory zap-encoded SegmentBase from a set of Documents +func (z *ZapPlugin) New(results []index.Document) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, nil) +} + +func (z *ZapPlugin) NewUsing(results []index.Document, config map[string]interface{}) ( + segment.Segment, uint64, error) { + return z.newWithChunkMode(results, DefaultChunkMode, config) +} + +func (*ZapPlugin) newWithChunkMode(results []index.Document, + chunkMode uint32, config map[string]interface{}) (segment.Segment, uint64, error) { + s := interimPool.Get().(*interim) + + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) + } + + var err error + s.results, s.edgeList = flattenNestedDocuments(results, s.edgeList) + s.config = config + s.chunkMode = chunkMode + + s.w = NewFileWriterEmpty(NewCountHashWriter(&br)) + + storedIndexOffset, sectionsIndexOffset, err := s.convert() + if err != nil { + return nil, uint64(0), err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkMode, + uint64(len(s.results)), storedIndexOffset, sectionsIndexOffset, config) + + // get the bytes written before the interim's reset() call + // write it to the newly formed segment base. 
+ totalBytesWritten := s.getBytesWritten() + if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) + sb.setBytesWritten(totalBytesWritten) + interimPool.Put(s) + } + + return sb, uint64(len(br.Bytes())), err +} + +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + +// interim holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []index.Document + + // edge list for nested documents: child -> parent + edgeList map[uint64]uint64 + + chunkMode uint32 + + w *FileWriter + + config map[string]interface{} + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsOptions holds the indexing options for each field + FieldsOptions map[string]index.FieldIndexingOptions + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + metaBuf bytes.Buffer + + tmp0 []byte + tmp1 []byte + + lastNumDocs int + lastOutSize int + + // atomic access to this variable + bytesWritten uint64 + + opaque map[int]resetable +} + +func (s *interim) reset() (err error) { + s.results = nil + s.chunkMode = 0 + s.w = nil + clear(s.edgeList) + clear(s.FieldsMap) + clear(s.FieldsOptions) + s.FieldsInv = s.FieldsInv[:0] + s.metaBuf.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + s.lastOutSize = 0 + + // reset the bytes written stat count + // to avoid leaking of bytesWritten across reuse cycles. 
+ s.setBytesWritten(0) + + if s.opaque != nil { + for _, v := range s.opaque { + err = v.Reset() + } + } else { + s.opaque = map[int]resetable{} + } + + return err +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 + numLocs int +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, uint64, error) { + if s.FieldsMap == nil { + s.FieldsMap = map[string]uint16{} + } + if s.FieldsOptions == nil { + s.FieldsOptions = map[string]index.FieldIndexingOptions{} + } + + s.getOrDefineField("_id") // _id field is fieldID 0 + // special case _id field options: the _id is the canonical document identifier and + // must always be both indexed and stored so that it can be used for lookups/queries + // and retrieved back from the stored fields, regardless of user-specified field options. 
+ s.FieldsOptions["_id"] = index.IndexField | index.StoreField + + var fName string + for _, result := range s.results { + result.VisitComposite(func(field index.CompositeField) { + fName = field.Name() + s.getOrDefineField(fName) + s.FieldsOptions[fName] = field.Options() + }) + result.VisitFields(func(field index.Field) { + fName = field.Name() + s.getOrDefineField(fName) + s.FieldsOptions[fName] = field.Options() + }) + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + args := map[string]interface{}{ + "results": s.results, + "chunkMode": s.chunkMode, + "fieldsMap": s.FieldsMap, + "fieldsInv": s.FieldsInv, + "config": s.config, + "fieldsOptions": s.FieldsOptions, + } + if s.opaque == nil { + s.opaque = map[int]resetable{} + for i, x := range segmentSections { + s.opaque[int(i)] = x.InitOpaque(args) + } + } else { + for k, v := range args { + for _, op := range s.opaque { + op.Set(k, v) + } + } + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, 0, err + } + + // we can persist the various sections at this point. + // the rule of thumb here is that each section must persist field wise. 
+ for _, x := range segmentSections { + err = x.Persist(s.opaque, s.w) + if err != nil { + return 0, 0, err + } + } + + // after persisting the sections to the writer, account corresponding + for _, opaque := range s.opaque { + opaqueIO, ok := opaque.(segment.DiskStatsReporter) + if ok { + s.incrementBytesWritten(opaqueIO.BytesWritten()) + } + } + + // we can persist a new fields section here + // this new fields section will point to the various indexes available + sectionsIndexOffset, err := persistFieldsSection(s.FieldsInv, s.FieldsOptions, s.w, s.opaque) + if err != nil { + return 0, 0, err + } + + return storedIndexOffset, sectionsIndexOffset, nil +} + +func (s *interim) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := s.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[fieldName] = fieldIDPlus1 + s.FieldsInv = append(s.FieldsInv, fieldName) + } + + return int(fieldIDPlus1 - 1) +} + +func (s *interim) processDocuments() { + for docNum, result := range s.results { + s.processDocument(uint32(docNum), result) + } +} + +func (s *interim) processDocument(docNum uint32, + result index.Document) { + // this callback is essentially going to be invoked on each field, + // as part of which preprocessing, cumulation etc. of the doc's data + // will take place. + visitField := func(field index.Field) { + fieldID := uint16(s.getOrDefineField(field.Name())) + + // section specific processing of the field + for _, section := range segmentSections { + section.Process(s.opaque, docNum, field, fieldID) + } + } + + // walk each composite field + result.VisitComposite(func(field index.CompositeField) { + visitField(field) + }) + + // walk each field + result.VisitFields(visitField) + + // given that as part of visiting each field, there may some kind of totalling + // or accumulation that can be updated, it becomes necessary to commit or + // put that totalling/accumulation into effect. 
However, for certain section + // types this particular step need not be valid, in which case it would be a + // no-op in the implmentation of the section's process API. + for _, section := range segmentSections { + section.Process(s.opaque, docNum, nil, math.MaxUint16) + } + +} + +func (s *interim) getBytesWritten() uint64 { + return atomic.LoadUint64(&s.bytesWritten) +} + +func (s *interim) incrementBytesWritten(val uint64) { + atomic.AddUint64(&s.bytesWritten, val) +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncode := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } + + data, compressed := s.tmp0[:0], s.tmp1[:0] + defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + var validationErr error + result.VisitFields(func(field index.Field) { + fieldID := uint16(s.getOrDefineField(field.Name())) + + if field.Options().IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, field.EncodedFieldType()) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + err := ValidateDocFields(field) + if err != nil && validationErr == nil { + validationErr = err + } + }) + if validationErr != nil { + return 0, validationErr + } + + var curr int + + s.metaBuf.Reset() + data = data[:0] + + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncode(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // 
handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { + isf, exists := docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncode, data) + if err != nil { + return 0, err + } + } + } + + metaBytes := s.metaBuf.Bytes() + + compressed = snappy.Encode(compressed[:cap(compressed)], data) + s.incrementBytesWritten(uint64(len(compressed))) + docStoredOffsets[docNum] = uint64(s.w.Count()) + + combined := make([]byte, len(idFieldVal)+len(compressed)) + copy(combined, idFieldVal) + copy(combined[len(idFieldVal):], compressed) + bufMeta := s.w.process(metaBytes) + bufCompressed := s.w.process(combined) + + _, err = writeUvarints(s.w, + uint64(len(bufMeta)), + uint64(len(bufCompressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(bufMeta) + if err != nil { + return 0, err + } + + _, err = s.w.Write(bufCompressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + // write the number of edges in the child -> parent edge list + // this will be zero if there are no nested documents + // and this number also reflects the number of nested documents + // in the segment + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(s.edgeList))) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, err + } + // write the child -> parent edge list + // child and parent are both flattened doc ids + for child, parent := range s.edgeList { + n = binary.PutUvarint(buf, child) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, err + } + n = binary.PutUvarint(buf, parent) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) 
setBytesWritten(val uint64) { + atomic.StoreUint64(&s.bytesWritten, val) +} + +// returns the total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 0x80 { + x >>= 7 + n++ + } + return n + 1 +} + +// flattenNestedDocuments returns a preorder list of the given documents and +// all their nested documents, along with a map mapping each flattened index +// to its parent index (excluding root docs entirely). +// The edge list is represented as a map[child]parent, where both child and +// parent are flattened document indices. +// Root documents (those without a parent) are not included in the edge list, +// as they have no parent. The order of documents in the returned slice is +// such that parents always appear before their children. A reusable edgeList +// can be provided to avoid allocations across multiple calls. 
+func flattenNestedDocuments(docs []index.Document, edgeList map[uint64]uint64) ( + []index.Document, map[uint64]uint64) { + totalCount := 0 + for _, doc := range docs { + totalCount += countNestedDocuments(doc) + } + + if totalCount == len(docs) { + // no nested documents, return early + return docs, nil + } + + flattened := make([]index.Document, 0, totalCount) + if edgeList == nil { + edgeList = make(map[uint64]uint64, totalCount-len(docs)) + } + + var traverse func(doc index.Document, hasParent bool, parentIdx uint64) + traverse = func(d index.Document, hasParent bool, parentIdx uint64) { + curIdx := uint64(len(flattened)) + flattened = append(flattened, d) + + if hasParent { + edgeList[curIdx] = parentIdx + } + + if nestedDoc, ok := d.(index.NestedDocument); ok { + nestedDoc.VisitNestedDocuments(func(child index.Document) { + traverse(child, true, curIdx) + }) + } + } + // Top-level docs have no parent + for _, doc := range docs { + traverse(doc, false, 0) + } + return flattened, edgeList +} + +// countNestedDocuments returns the total number of docs in preorder, +// including the parent and all descendants. +func countNestedDocuments(doc index.Document) int { + count := 1 // include this doc + if nd, ok := doc.(index.NestedDocument); ok { + nd.VisitNestedDocuments(func(child index.Document) { + count += countNestedDocuments(child) + }) + } + return count +} diff --git a/vendor/github.com/blevesearch/zapx/v17/plugin.go b/vendor/github.com/blevesearch/zapx/v17/plugin.go new file mode 100644 index 0000000000..f67297ec2f --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/plugin.go @@ -0,0 +1,27 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +// ZapPlugin implements the Plugin interface of +// the blevesearch/scorch_segment_api pkg +type ZapPlugin struct{} + +func (*ZapPlugin) Type() string { + return Type +} + +func (*ZapPlugin) Version() uint32 { + return Version +} diff --git a/vendor/github.com/blevesearch/zapx/v17/posting.go b/vendor/github.com/blevesearch/zapx/v17/posting.go new file mode 100644 index 0000000000..0e63bb1cd5 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/posting.go @@ -0,0 +1,947 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "encoding/binary" + "fmt" + "math" + "reflect" + + "github.com/RoaringBitmap/roaring/v2" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... +// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. +// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. 
+ +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(1) + +// PostingsList is an in-memory representation of a postings list +type PostingsList struct { + sb *SegmentBase + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 + normBits1Hit uint64 + + chunkSize uint64 + + bytesRead uint64 +} + +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc segment.PostingsIterator) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if 
preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) +} + +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, + rv *PostingsIterator) *PostingsIterator { + if rv == nil { + rv = &PostingsIterator{} + } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.reset() + } + + locReader := rv.locReader + if locReader != nil { + locReader.reset() + } + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf + } + + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm || includeLocs + rv.includeLocs = includeLocs + + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + rv.docNum1Hit = DocNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty + if p.postings == nil { + return rv + } + + // initialize freq chunk reader + if rv.includeFreqNorm { + rv.freqNormReader = newChunkedIntDecoder(p.sb.mem, p.freqOffset, rv.freqNormReader, p.sb.fileReader) + rv.incrementBytesRead(rv.freqNormReader.getBytesRead()) + } + + // initialize the loc chunk reader + if rv.includeLocs { + rv.locReader = newChunkedIntDecoder(p.sb.mem, p.locOffset, rv.locReader, p.sb.fileReader) + rv.incrementBytesRead(rv.locReader.getBytesRead()) + } + + rv.all = p.postings.Iterator() + if p.except != nil { + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() + } else { + rv.ActualBM = p.postings + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. 
+ } + + return rv +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var n, e uint64 + if p.normBits1Hit != 0 { + n = 1 + if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { + e = 1 + } + } else if p.postings != nil { + n = p.postings.GetCardinality() + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } + } + return n - e +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the postings lists stored +// on disk, while querying +func (p *PostingsList) ResetBytesRead(val uint64) { + p.bytesRead = val +} + +func (p *PostingsList) BytesRead() uint64 { + return p.bytesRead +} + +func (p *PostingsList) incrementBytesRead(val uint64) { + p.bytesRead += val +} + +func (p *PostingsList) BytesWritten() uint64 { + return 0 +} + +func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { + rv.postingsOffset = postingsOffset + + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes, err := d.sb.fileReader.process(d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]) + if err != nil { + return err + } + + rv.incrementBytesRead(n + postingsLen) + + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } + _, err = rv.postings.FromBuffer(roaringBytes) + if err != nil { + return fmt.Errorf("error 
loading roaring bitmap: %v", err) + } + + chunkSize, err := getChunkSize(d.sb.chunkMode, + rv.postings.GetCardinality(), d.sb.numDocs) + if err != nil { + return fmt.Errorf("failed to get chunk size: %v", err) + } + + rv.chunkSize = chunkSize + + return nil +} + +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.docNum1Hit = docNum + rv.normBits1Hit = normBits + + return nil +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntPeekable + Actual roaring.IntPeekable + ActualBM *roaring.Bitmap + + currChunk uint32 + freqNormReader *chunkedIntDecoder + locReader *chunkedIntDecoder + + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls + + docNum1Hit uint64 + normBits1Hit uint64 + + buf []byte + + includeFreqNorm bool + includeLocs bool + + bytesRead uint64 +} + +var emptyPostingsIterator = &PostingsIterator{} + +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + SizeOfPtr + + i.next.Size() + // account for freqNormReader, locReader if we start using this. 
+ for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Implements the segment.DiskStatsReporter interface +// The purpose of this implementation is to get +// the bytes read from the disk which includes +// the freqNorm and location specific information +// of a hit +func (i *PostingsIterator) ResetBytesRead(val uint64) { + i.bytesRead = val +} + +func (i *PostingsIterator) BytesRead() uint64 { + return i.bytesRead +} + +func (i *PostingsIterator) incrementBytesRead(val uint64) { + i.bytesRead += val +} + +func (i *PostingsIterator) BytesWritten() uint64 { + return 0 +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + if i.includeFreqNorm { + err := i.freqNormReader.loadChunk(chunk) + if err != nil { + return err + } + + // assign the bytes read at this point, since + // the postingsIterator is tracking only the chunk loaded + // and the cumulation is tracked correctly in the downstream + // intDecoder + i.ResetBytesRead(i.freqNormReader.getBytesRead()) + } + + if i.includeLocs { + err := i.locReader.loadChunk(chunk) + if err != nil { + return err + } + i.ResetBytesRead(i.locReader.getBytesRead()) + } + + i.currChunk = uint32(chunk) + return nil +} + +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + if freq == 0 { + return freq, 0, hasLocs, nil + } + + normBits, err := i.freqNormReader.readUvarint() + if err != nil { + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + } + + return freq, normBits, hasLocs, nil +} + +func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) { + if i.normBits1Hit != 0 { + return false, nil + } + + freqHasLocs, err := i.freqNormReader.readUvarint() + if err 
!= nil { + return false, fmt.Errorf("error reading freqHasLocs: %v", err) + } + + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + if freq == 0 { + return hasLocs, nil + } + + i.freqNormReader.SkipUvarint() // Skip normBits. + + return hasLocs, nil // See decodeFreqHasLocs() / hasLocs. +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs +} + +// readLocation processes all the integers on the stream representing a single +// location. +func (i *PostingsIterator) readLocation(l *Location) error { + // read off field + fieldID, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location field: %v", err) + } + // read off pos + pos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location pos: %v", err) + } + // read off start + start, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location start: %v", err) + } + // read off end + end, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location end: %v", err) + } + // read off num array pos + numArrayPos, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location num array pos: %v", err) + } + + l.field = i.postings.sb.fieldsInv[fieldID] + l.pos = pos + l.start = start + l.end = end + + if cap(l.ap) < int(numArrayPos) { + l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:int(numArrayPos)] + } + + // read off array positions + for k := 0; k < int(numArrayPos); k++ { + ap, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading array position: %v", err) + } + + l.ap[k] = ap + } + + return nil +} + +// Next returns the next 
posting on the postings list, or nil at the end +func (i *PostingsIterator) Next() (segment.Posting, error) { + return i.nextAtOrAfter(0) +} + +// Advance returns the posting at the specified docNum or it is not present +// the next posting, or if the end is reached, nil +func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { + return i.nextAtOrAfter(docNum) +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { + docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) + if err != nil || !exists { + return nil, err + } + + i.next = Posting{} // clear the struct + rv := &i.next + rv.docNum = docNum + + if !i.includeFreqNorm { + return rv, nil + } + + var normBits uint64 + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return nil, err + } + + rv.norm = math.Float32frombits(uint32(normBits)) + + if i.includeLocs && hasLocs { + // prepare locations into reused slices, where we assume + // rv.freq >= "number of locs", since in a composite field, + // some component fields might have their IncludeTermVector + // flags disabled while other component fields are enabled + if rv.freq > 0 { + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] + } else { + i.nextLocs = make([]Location, rv.freq, rv.freq*2) + } + if cap(i.nextSegmentLocs) < int(rv.freq) { + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) + } + rv.locs = i.nextSegmentLocs[:0] + } + + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + j := 0 + var nextLoc *Location + startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { + if len(i.nextLocs) > j { + nextLoc = &i.nextLocs[j] + } else { + nextLoc = &Location{} + } 
+ + err := i.readLocation(nextLoc) + if err != nil { + return nil, err + } + + rv.locs = append(rv.locs, nextLoc) + j++ + } + } + + return rv, nil +} + +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == DocNum1HitFinished { + return 0, false, nil + } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + + if i.Actual == nil || !i.Actual.HasNext() { + return 0, false, nil + } + + if i.postings == nil || i.postings == emptyPostingsList { + // couldn't find anything + return 0, false, nil + } + + if i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + + if i.postings.chunkSize == 0 { + return 0, false, ErrChunkSizeZero + } + + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() || !i.all.HasNext() { + // couldn't find anything + return 0, false, nil + } + + n := i.Actual.Next() + allN := i.all.Next() + nChunk := n / uint32(i.postings.chunkSize) + + // when allN becomes >= to here, then allN is in the same chunk as nChunk. 
+ allNReachesNChunk := nChunk * uint32(i.postings.chunkSize) + + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do + for allN != n { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, err + } + } + + if !i.all.HasNext() { + return 0, false, nil + } + + allN = i.all.Next() + } + + if i.includeFreqNorm && (i.currChunk != nChunk || i.freqNormReader.isNil()) { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + +// nextBytes returns the docNum and the encoded freq & loc bytes for +// the next posting +func (i *PostingsIterator) nextBytes() ( + docNumOut uint64, freq uint64, normBits uint64, + bytesFreqNorm []byte, bytesLoc []byte, err error) { + docNum, exists, err := i.nextDocNumAtOrAfter(0) + if err != nil || !exists { + return 0, 0, 0, nil, nil, err + } + + if i.normBits1Hit != 0 { + if i.buf == nil { + i.buf = make([]byte, binary.MaxVarintLen64*2) + } + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) + n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) + return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil + } + + startFreqNorm := i.freqNormReader.remainingLen() + + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + if err != nil { + return 0, 0, 0, nil, nil, err + } + + endFreqNorm := i.freqNormReader.remainingLen() + bytesFreqNorm = i.freqNormReader.readBytes(startFreqNorm, endFreqNorm) + + if hasLocs { + startLoc := i.locReader.remainingLen() + + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return 0, 0, 0, nil, nil, + fmt.Errorf("error reading location nextBytes numLocs: %v", err) + } + 
+ // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + + endLoc := i.locReader.remainingLen() + bytesLoc = i.locReader.readBytes(startLoc, endLoc) + } + + return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil +} + +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, bool, error) { + if !i.includeFreqNorm { + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { + return 0, false, nil // couldn't find anything + } + + return uint64(i.Actual.Next()), true, nil + } + + if i.postings != nil && i.postings.chunkSize == 0 { + return 0, false, ErrChunkSizeZero + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // # of times we called Next() in the same chunk + n := i.Actual.Next() + nChunk := n / uint32(i.postings.chunkSize) + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / uint32(i.postings.chunkSize) + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + + return uint64(n), true, nil +} + +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.freqNormReader.isNil() { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about 
them + hasLocs, err := i.skipFreqNormReadHasLocs() + if err != nil { + return err + } + + if i.includeLocs && hasLocs { + numLocsBytes, err := i.locReader.readUvarint() + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + i.locReader.SkipBytes(int(numLocsBytes)) + } + + return nil +} + +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// ActualBitmap returns the underlying actual bitmap +// which can be used up the stack for optimizations +func (p *PostingsIterator) ActualBitmap() *roaring.Bitmap { + return p.ActualBM +} + +// ReplaceActual replaces the ActualBM with the provided +// bitmap +func (p *PostingsIterator) ReplaceActual(abm *roaring.Bitmap) { + p.ActualBM = abm + p.Actual = abm.Iterator() +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. +func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. 
+func PostingsIteratorFrom1Hit(docNum1Hit uint64, + includeFreqNorm, includeLocs bool) (segment.PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: NormBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + +// Posting is a single entry in a postings list +type Posting struct { + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location +} + +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequencies of occurrence of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.freq +} + +// Norm returns the normalization factor for this posting +func (p *Posting) Norm() float64 { + return float64(float32(1.0 / math.Sqrt(float64(math.Float32bits(p.norm))))) +} + +// Locations returns the location information for each occurrence +func (p *Posting) Locations() []segment.Location { + return p.locs +} + +// NormUint64 returns the norm value as uint64 +func (p *Posting) NormUint64() uint64 { + return uint64(math.Float32bits(p.norm)) +} + +// Location represents the location of a single occurrence +type Location struct { + field string + pos uint64 + start uint64 + end uint64 + ap []uint64 +} + +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*SizeOfUint64 +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.field +} + +// Start returns the start byte offset of this occurrence +func (l *Location) Start() uint64 { + return l.start +} + +// End returns the end byte offset of this occurrence +func (l *Location) End() uint64 { + 
return l.end +} + +// Pos returns the 1-based phrase position of this occurrence +func (l *Location) Pos() uint64 { + return l.pos +} + +// ArrayPositions returns the array position vector associated with this occurrence +func (l *Location) ArrayPositions() []uint64 { + return l.ap +} diff --git a/vendor/github.com/blevesearch/zapx/v17/read.go b/vendor/github.com/blevesearch/zapx/v17/read.go new file mode 100644 index 0000000000..690705596d --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/read.go @@ -0,0 +1,65 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import "encoding/binary" + +func (sb *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte, error) { + _, storedOffset, n, metaLen, dataLen := sb.getDocStoredOffsets(docNum) + + meta := sb.mem[storedOffset+n : storedOffset+n+metaLen] + data := sb.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen] + + meta, err := sb.fileReader.process(meta) + if err != nil { + return nil, nil, err + } + data, err = sb.fileReader.process(data) + if err != nil { + return nil, nil, err + } + + return meta, data, nil +} + +func (sb *SegmentBase) getDocStoredOffsets(docNum uint64) ( + uint64, uint64, uint64, uint64, uint64) { + indexOffset := sb.storedIndexOffset + (8 * docNum) + + storedOffset := binary.BigEndian.Uint64(sb.mem[indexOffset : indexOffset+8]) + + var n uint64 + + metaLen, read := binary.Uvarint(sb.mem[storedOffset : storedOffset+binary.MaxVarintLen64]) + n += uint64(read) + + dataLen, read := binary.Uvarint(sb.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + return indexOffset, storedOffset, n, metaLen, dataLen +} + +func (sb *SegmentBase) getEdgeListOffset() uint64 { + // if no stored index, then no edge list + if sb.storedIndexOffset == 0 { + return 0 + } + // Edge list comes after the stored fields index (doc stored offsets) + // The stored index offset points to where the doc offsets start + // So edge list starts right after the last document offset + // which is at sb.storedIndexOffset + (8 * sb.numDocs) + // since each doc offset is 8 bytes + return sb.storedIndexOffset + (8 * sb.numDocs) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/section.go b/vendor/github.com/blevesearch/zapx/v17/section.go new file mode 100644 index 0000000000..8d431644f3 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/section.go @@ -0,0 +1,83 @@ +// Copyright (c) 2023 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "sync" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" +) + +type section interface { + // process is essentially parsing of a specific field's content in a specific + // document. any tracking of processed data *specific to this section* should + // be done in opaque which will be passed to the Persist() API. + Process(opaque map[int]resetable, docNum uint32, f index.Field, fieldID uint16) + + // flush the processed data in the opaque to the writer. + Persist(opaque map[int]resetable, w *FileWriter) error + + // this API is used to fetch the file offset of the field for this section. + // this is used during search time to parse the section, and fetch results + // for the specific "index" thats part of the section. + AddrForField(opaque map[int]resetable, fieldID int) int + + // for every field in the fieldsInv (relevant to this section) merge the section + // contents from all the segments into a single section data for the field. + // as part of the merge API, write the merged data to the writer and also track + // the starting offset of this newly merged section data. + Merge(opaque map[int]resetable, segments []*SegmentBase, drops []*roaring.Bitmap, fieldsInv []string, + newDocNumsIn [][]uint64, w *FileWriter, closeCh chan struct{}) error + + // opaque is used to track the data specific to this section. 
its not visible + // to the other sections and is only visible and freely modifiable by this specifc + // section. + InitOpaque(args map[string]interface{}) resetable +} + +type resetable interface { + Reset() error + Set(key string, value interface{}) +} + +// ----------------------------------------------------------------------------- + +const ( + SectionInvertedTextIndex = iota + SectionFaissVectorIndex + SectionSynonymIndex + + // Add new sections above this line. + // NumSections automatically reflects the total number of sections + // and is used to track how many sections can be registered. + NumSections +) + +// ----------------------------------------------------------------------------- + +var ( + segmentSectionsMutex sync.Mutex + // writes to segmentSections within init()s ONLY within lock, + // reads will not require lock access + segmentSections = make(map[uint16]section) +) + +// Method to be invoked within init()s ONLY. +func registerSegmentSection(key uint16, val section) { + segmentSectionsMutex.Lock() + segmentSections[key] = val + segmentSectionsMutex.Unlock() +} diff --git a/vendor/github.com/blevesearch/zapx/v17/section_faiss_vector_index.go b/vendor/github.com/blevesearch/zapx/v17/section_faiss_vector_index.go new file mode 100644 index 0000000000..b3492574d0 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/section_faiss_vector_index.go @@ -0,0 +1,1052 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build vectors +// +build vectors + +package zap + +import ( + "encoding/binary" + "errors" + "fmt" + "math" + "sync/atomic" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + faiss "github.com/blevesearch/go-faiss" + seg "github.com/blevesearch/scorch_segment_api/v2" +) + +func init() { + registerSegmentSection(SectionFaissVectorIndex, &faissVectorIndexSection{}) + invertedTextIndexSectionExclusionChecks = append(invertedTextIndexSectionExclusionChecks, func(field index.Field) bool { + _, ok := field.(index.VectorField) + return ok + }) + faiss.SetOMPThreads(defaultFaissOMPThreads) +} + +const ( + // Set the default number of OMP threads to be used by FAISS + // to 1 since openMP does not support goroutine based threading well. + defaultFaissOMPThreads = 1 + // Divide the estimated nprobe with this value to optimize + // for latency. + nprobeLatencyOptimization = 2 + // The threshold for number of vectors beyond which we start building the ivf class + // of indexes + ivfThreshold = 1000 + // The threshold for number of vectors beyond which we consider fast merging + // using faiss's native merge capabilities, instead of reconstructing and adding + // vectors one by one + ivfSq8Threshold = 10000 +) + +// Vector index types supported. +type faissIndexType uint64 + +const ( + // faissFP32Index represents the standard float32 index in Faiss, + // which stores vectors in 32-bit floating point format. + faissFP32Index faissIndexType = iota + // faissBIVFIndex represents the binary IVF index in Faiss, + // which stores vectors in a 8-bit binary format. + faissBIVFIndex +) + +// Errors for invariant violations related to fast merge and trained index retrieval. 
+var ( + ErrorTrainedIndexNotIVF error = errors.New("trained index is not an IVF index, which is required for fast merge") + ErrorFastMergeIndexNotIVF error = errors.New("fast merge is only supported for IVF indexes") +) + +type faissVectorIndexSection struct { +} + +func (v *faissVectorIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) { + if fieldID == math.MaxUint16 { + return + } + if vf, ok := field.(index.VectorField); ok { + vo := v.getVectorIndexOpaque(opaque) + vo.process(vf, field.Name(), fieldID, docNum) + } +} + +func (v *faissVectorIndexSection) Persist(opaque map[int]resetable, w *FileWriter) error { + vo := v.getVectorIndexOpaque(opaque) + return vo.writeVectorIndexes(w) +} + +func (v *faissVectorIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int { + vo := v.getVectorIndexOpaque(opaque) + return vo.fieldAddrs[uint16(fieldID)] +} + +// vecIndexInfo contains information specific to a vector index, +// including metadata and the faiss index pointer itself. +type vecIndexInfo struct { + startOffset int + indexSize uint64 + vecIds []int64 + indexOptimizedFor string + indexType faissIndexType + index faissIndex +} + +// Merge merges vector indexes from multiple segments into a single index. +func (v *faissVectorIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase, + drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64, w *FileWriter, + closeCh chan struct{}) error { + vo := v.getVectorIndexOpaque(opaque) + // preallocating the space over here, if there are too many fields + // in the segment this will help by avoiding multiple allocation + // calls. 
+ // the segments with valid vector sections in them + vecSegs := make([]*SegmentBase, 0, len(segments)) + // vector index information from those segments + indexes := make([]*vecIndexInfo, 0, len(segments)) + // mapping from vector IDs to docIDs across segments + vecToDocID := make([]uint64, 0, len(segments)) + // for every field, gather the vector indexes from the segments + // that have them, merge them and write them out to the writer. + for fieldID, fieldName := range fieldsInv { + // continue if field is not required to be indexed + if !vo.fieldsOptions[fieldName].IsIndexed() { + continue + } + indexes = indexes[:0] // resizing the slices + vecSegs = vecSegs[:0] + vecToDocID = vecToDocID[:0] + // flag to indicate if there are deleted/updated vectors + // in any of the vector indexes being merged. + var drops bool + for segI, sb := range segments { + if isClosed(closeCh) { + return seg.ErrClosed + } + if _, ok := sb.fieldsMap[fieldName]; !ok { + continue + } + // check if the section address is a valid one for "fieldName" in the + // segment sb. the local fieldID (fetched by the fieldsMap of the sb) + // is to be used while consulting the fieldsSectionsMap + pos := int(sb.fieldsSectionsMap[sb.fieldsMap[fieldName]-1][SectionFaissVectorIndex]) + if pos == 0 { + continue + } + + // loading doc values - adhering to the sections format. never + // valid values for vector section + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + _, n = binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + // read the vector index optimization type represented as an int + indexOptimizationTypeInt, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + // read the number of vectors + numVecs, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + + // track the valid vectors to be reconstructed for this segment + // during the merge operation. 
+ newIndexInfo := &vecIndexInfo{ + indexOptimizedFor: index.VectorIndexOptimizationsReverseLookup[int(indexOptimizationTypeInt)], + vecIds: make([]int64, 0, numVecs), + } + + // read the length of the docID list + listLen, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + buf, err := sb.fileReader.process(sb.mem[pos : pos+int(listLen)]) + if err != nil { + return err + } + pos += int(listLen) + + bufPos := 0 + bufLen := len(buf) + for vecID := 0; vecID < int(numVecs); vecID++ { + docID, n := binary.Uvarint(buf[bufPos:min(bufPos+binary.MaxVarintLen64, bufLen)]) + bufPos += n + // check if this docID is dropped in the new segment + newDocID := newDocNumsIn[segI][uint32(docID)] + if newDocID != docDropped { + // valid docID, track the mapping + vecToDocID = append(vecToDocID, newDocID) + // if the remapped doc ID is valid, track it + // as part of vecs to be reconstructed (for larger indexes). + // This accounts only for valid vector IDs, so deleted + // ones won't be reconstructed in the final index. + newIndexInfo.vecIds = append(newIndexInfo.vecIds, int64(vecID)) + } else { + // some vectors are dropped, so we can't do a fast merge using faiss's + // native merge capabilities, because of the data drift issue. 
+ drops = true + } + } + + if len(newIndexInfo.vecIds) == 0 { + // no valid vectors to be merged from this segment + continue + } + + // read the type of vector index + indexType, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + // read the size of the vector index + indexSize, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + // record the start offset and size of the vector index + newIndexInfo.startOffset = pos + newIndexInfo.indexSize = indexSize + newIndexInfo.indexType = faissIndexType(indexType) + vecSegs = append(vecSegs, sb) + indexes = append(indexes, newIndexInfo) + pos += int(indexSize) + } + // continue if there are absolutely no valid vectors present in the segment + // for this field and crucially don't store the section start offset in it + if len(indexes) == 0 || len(vecToDocID) == 0 { + continue + } + + err := vo.flushSectionMetadata(fieldID, w, vecToDocID, indexes) + if err != nil { + return err + } + + // we're going to use the trained index template regardless of whether there's + // a update/delete in the segments being merged and we let the fast merge + // path handle what exactly to do in such cases. 
+ trainedIndex, err := trainedIndexFromConfig(vo.config, fieldName) + if err != nil { + return err + } + + useGPU := vo.fieldsOptions[fieldName].UseGPU() + err = vo.mergeAndWriteVectorIndexes(trainedIndex, vecSegs, indexes, w, closeCh, useGPU, drops) + if err != nil { + return err + } + } + return nil +} + +func trainedIndexFromConfig(config map[string]interface{}, fieldName string) (faissIndexIVF, error) { + var trainedIndexFor index.TrainedIndexCallbackFn + var training bool + var rv faissIndex + if cb, ok := config[index.TrainedIndexCallback]; ok { + trainedIndexFor = cb.(index.TrainedIndexCallbackFn) + } + if tf, ok := config[index.TrainingKey]; ok { + training = tf.(bool) + } + // if we have a callback registered AND if the training flag is not set: + // - fastmerge is supported for this index + // - we're not in the training phase of index creation where you want to be + // able to reconstruct the vectors for training + if trainedIndexFor != nil && !training { + trainedIndex, err := trainedIndexFor(fieldName) + if err != nil { + return nil, err + } + if trainedIndex != nil { + rv = trainedIndex.(faissIndex) + } + } + if rv == nil { + return nil, nil + } + + trainedIndex := rv.castIVF() + if trainedIndex == nil { + return nil, ErrorTrainedIndexNotIVF + } + return trainedIndex, nil +} + +func (v *vectorIndexOpaque) flushSectionMetadata(fieldID int, w *FileWriter, + vecToDocID []uint64, indexes []*vecIndexInfo) error { + tempBuf := v.grabBuf(binary.MaxVarintLen64) + fieldStart := w.Count() + + // marking the fact that for vector index, doc values are not valid by + // storing fieldNotUninverted values. 
+ n := binary.PutUvarint(tempBuf, fieldNotUninverted) + _, err := w.Write(tempBuf[:n]) + if err != nil { + return err + } + n = binary.PutUvarint(tempBuf, fieldNotUninverted) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // write the index optimization type + n = binary.PutUvarint(tempBuf, uint64(index.SupportedVectorIndexOptimizations[indexes[0].indexOptimizedFor])) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + // write the number of vectors + n = binary.PutUvarint(tempBuf, uint64(len(vecToDocID))) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + buf := make([]byte, binary.MaxVarintLen64*len(vecToDocID)) + bufPos := 0 + for _, docID := range vecToDocID { + n = binary.PutUvarint(buf[bufPos:], docID) + bufPos += n + } + buf = w.process(buf[:bufPos]) + + // write the size of the vector to docID map + n = binary.PutUvarint(tempBuf, uint64(len(buf))) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + // write the vecID -> docID mapping + _, err = w.Write(buf) + if err != nil { + return err + } + + // record the fieldStart value for this section. + v.fieldAddrs[uint16(fieldID)] = fieldStart + return nil +} + +// Calculates the nprobe count, given nlist(number of centroids) based on +// the metric the index is optimized for. 
+func calculateNprobe(nlist int, indexOptimizedFor string) int32 { + nprobe := int32(math.Sqrt(float64(nlist))) + if indexOptimizedFor == index.IndexOptimizedForLatency { + nprobe /= nprobeLatencyOptimization + if nprobe < 1 { + nprobe = 1 + } + } + return nprobe +} + +// todo: need to detect and handle data drift in a more intelligent way +func (v *vectorIndexOpaque) fastMergeIndexes(trainedIndex faissIndexIVF, cfg *faissIndexConfig, + drops bool, vecIndexes []*vecIndexInfo, w *FileWriter, closeCh chan struct{}) error { + // create a faissIndex for merged index using nlist and nprobe from trained index's + // config and we're hitting the fast merge path only if we've not enabled GPU + nprobe, nlist := trainedIndex.ivfParams() + cfg.nlist = nlist + mergedIdx, err := faissIndexFactory(cfg) + if err != nil { + return err + } + defer mergedIdx.close() + + // cast to IVF index to be able to set the quantizer for the fast merge + ivfMergedIdx := mergedIdx.castIVF() + if ivfMergedIdx == nil { + return ErrorFastMergeIndexNotIVF + } + err = ivfMergedIdx.setDirectMap(1) + if err != nil { + return err + } + // setting the same nprobe value in the merged index as the centroid + // index to ensure that we probe the same number of clusters + ivfMergedIdx.setNProbe(int32(nprobe)) + // using the trained index to copy the quantizers and set them in the final + // merged index if possible + err = ivfMergedIdx.setQuantizers(trainedIndex) + if err != nil { + return err + } + + var reconsVecs []float32 + reconsAndAddFrom := func(vecIDs []int64, srcIdx faissIndex) error { + neededReconsLen := len(vecIDs) * cfg.dimension + if cap(reconsVecs) < neededReconsLen { + reconsVecs = make([]float32, neededReconsLen) + } + reconsVecs = reconsVecs[:neededReconsLen] + reconsVecs, err = srcIdx.reconstructBatch(vecIDs, reconsVecs) + if err != nil { + return err + } + + vecSet, err := newVectorSet(cfg.dimension, reconsVecs) + if err != nil { + return err + } + if cfg.indexType == faissBIVFIndex { + 
vecSet.binarize() + } + // add to target index the reconstructed vectors for the valid vector IDs from the source index. + err = mergedIdx.add(vecSet) + if err != nil { + return err + } + reconsVecs = reconsVecs[:0] + return nil + } + + for _, vi := range vecIndexes { + if isClosed(closeCh) { + return seg.ErrClosed + } + childIdx := vi.index + if drops { + // if there are some deletes or updates in the segments being merged, + // we can't say definitely which mutation can cause a data drift solely + // by the vector count - as a fallback mechanism we reconstruct + add + // the vectors in this scenario using the trained template since we can't + // use merge_from. + err = reconsAndAddFrom(vi.vecIds, childIdx) + if err != nil { + return err + } + } else { + if err = ivfMergedIdx.mergeFrom(childIdx, mergedIdx.ntotal()); err != nil { + // either the childIdx isn't compatible for fast merge or merge_from failed + // so, in either case we can fallback to reconstructing and adding the vectors + // one by one from the source index to the target index as a error handling mechanism. + err = reconsAndAddFrom(vi.vecIds, childIdx) + if err != nil { + return err + } + } + } + } + + tempBuf := v.grabBuf(binary.MaxVarintLen64) + // write the type of the vector index + n := binary.PutUvarint(tempBuf, uint64(cfg.indexType)) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + return mergedIdx.write(tempBuf, w) +} + +func (v *vectorIndexOpaque) mergeAndWriteVectorIndexes(trainedIndex faissIndexIVF, sbs []*SegmentBase, + vecIndexes []*vecIndexInfo, w *FileWriter, closeCh chan struct{}, useGPU, drops bool) error { + // safe to assume that all the indexes are of the same config values, given + // that they are extracted from the field mapping info. 
+ var dims, metric, indexDataCap, reconsCap, nvecs int + var indexOptimizedFor string + var indexType faissIndexType + var validMerge bool + + for segI, segBase := range sbs { + // Considering merge operations on vector indexes are expensive, it is + // worth including an early exit if the merge is aborted, saving us + // the resource spikes, even if temporary. + if isClosed(closeCh) { + freeReconstructedIndexes(vecIndexes) + return seg.ErrClosed + } + // track which index we are currently processing + currVecIndex := vecIndexes[segI] + currNumVecs := len(currVecIndex.vecIds) + // if no valid vectors for this index, don't bring it into memory + if currNumVecs == 0 { + continue + } + + // read the serialized index bytes + indexBytes, err := segBase.fileReader.process(segBase.mem[currVecIndex.startOffset : currVecIndex.startOffset+int(currVecIndex.indexSize)]) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + ioFlags := faissIOFlags + if trainedIndex == nil { + ioFlags = faissIOFlagsReadOnly + } + // reconstruct the faiss index from the bytes + faissIndex, err := faiss.ReadIndexFromBuffer(indexBytes, ioFlags) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + + // set the dims and metric values from the constructed index. + dims = faissIndex.D() + // at least one valid index to be merged, mark the merge as valid. + validMerge = true + metric = faissIndex.MetricType() + indexOptimizedFor = currVecIndex.indexOptimizedFor + indexType = currVecIndex.indexType + // update trackers for buffer capacities + indexReconsLen := currNumVecs * dims + if indexReconsLen > reconsCap { + reconsCap = indexReconsLen + } + indexDataCap += indexReconsLen + + // track the reconstruct index for this vector index, which will be used + // to reconstruct the vectors corresponding to the valid vector IDs for this index. 
+ config := newFaissIndexConfig(indexType, indexOptimizedFor, dims, metric, currNumVecs, determineCentroids(currNumVecs), useGPU) + fIndex, err := newFaissFloat32IndexWithConfig(faissIndex, config) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + vecIndexes[segI].index = fIndex + + // load binary index from disk if present + if currVecIndex.indexType == faissBIVFIndex { + // get to the bivf part of the vector index section + pos := currVecIndex.startOffset + int(currVecIndex.indexSize) + binSize, n := binary.Uvarint(segBase.mem[pos : pos+binary.MaxVarintLen64]) + pos += n + indexBytes, err = segBase.fileReader.process(segBase.mem[pos : pos+int(binSize)]) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + + binaryIndex, err := faiss.ReadBinaryIndexFromBuffer(indexBytes, ioFlags) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + vecIndexes[segI].index, err = newFaissBinaryIndexWithConfig(binaryIndex, faissIndex, config) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + } + nvecs += currNumVecs + } + + // not a valid merge operation as there are no valid indexes to merge. 
+ if !validMerge { + return nil + } + // if no valid vectors after merge, nothing to do + if nvecs == 0 { + // no valid vectors for this index, so we don't even have to + // record it in the section + freeReconstructedIndexes(vecIndexes) + return nil + } + + // create the faiss index to hold the merged data, either via fast merge or reconstruction + config := newFaissIndexConfig(indexType, indexOptimizedFor, dims, metric, nvecs, determineCentroids(nvecs), useGPU) + // we perform fast merge if we're not using the GPU and if the trained index + // is compatible to be used for fast merge + if !useGPU && canFastMerge(trainedIndex, indexOptimizedFor, nvecs) { + err := v.fastMergeIndexes(trainedIndex, config, drops, vecIndexes, w, closeCh) + if err != nil { + return err + } + // free the indexes as we won't need them anymore after the fast merge + freeReconstructedIndexes(vecIndexes) + return nil + } + + // Reconstruct Merge Path: + // merging of indexes with reconstruction method. + // the vecIds in each index contain only the valid vectors, + // so we reconstruct only those. + indexData := make([]float32, 0, indexDataCap) + // reusable buffer for reconstruction + recons := make([]float32, 0, reconsCap) + for idx, currVecIndex := range vecIndexes { + if isClosed(closeCh) { + freeReconstructedIndexes(vecIndexes) + return seg.ErrClosed + } + currNumVecs := len(currVecIndex.vecIds) + // reconstruct the vectors only if present, it could be that + // some of the indexes had all of their vectors updated/deleted. + if currNumVecs > 0 && vecIndexes[idx] != nil { + neededReconsLen := currNumVecs * config.dimension + recons = recons[:neededReconsLen] + var err error + fIndex := vecIndexes[idx].index + recons, err = fIndex.reconstructBatch(currVecIndex.vecIds, recons) + if err != nil { + freeReconstructedIndexes(vecIndexes) + return err + } + indexData = append(indexData, recons...) 
+ } + } + + // freeing the reconstructed indexes immediately - waiting till the end + // to do the same is not needed because the following operations don't need + // the reconstructed ones anymore and doing so will hold up memory which can + // be detrimental while creating indexes during introduction. + freeReconstructedIndexes(vecIndexes) + + vecSet, err := newVectorSet(config.dimension, indexData) + if err != nil { + return err + } + return v.writeFaissIndex(vecSet, config, w) +} + +// constructs a faiss on the vectors according to the provided config and writes it out +// the given writer +func (v *vectorIndexOpaque) writeFaissIndex(vecs *vectorSet, config *faissIndexConfig, w *FileWriter) error { + // create the faiss index based on the provided description string, and the metric type. + index, err := faissIndexFactory(config) + if err != nil { + return err + } + // ensure the faiss index is closed after use + defer index.close() + + // binarize the vectors for BIVF indexes + if config.indexType == faissBIVFIndex { + vecs.binarize() + } + // if we are using an IVF index, train and add first, then set the direct map + // and nprobe. The order matters for GPU indexes: CloneToCPU (done inside + // trainAndAdd) clears the direct map and nprobe, so they must be set after. + if ivfIndex := index.castIVF(); ivfIndex != nil { + // train the vector index and add the vectors to it. The training step + // performs k-means clustering to partition the data space such that during + // search time we probe only a subset of vectors (non-exhaustive search). + err = ivfIndex.trainAndAdd(vecs, vecs) + if err != nil { + return err + } + // the direct map maintained in the IVF index is essential for the + // reconstruction of vectors based on the sequential vector IDs in the + // future merges use direct map type 1 -> array based direct map, since + // we have sequential vector IDs starting from 0 to N-1. 
+ err = ivfIndex.setDirectMap(1) + if err != nil { + return err + } + // calculate nprobe using a heuristic. + nprobe := calculateNprobe(config.nlist, config.optimizationType) + ivfIndex.setNProbe(nprobe) + } else { + // add the vectors to the index using sequential vector IDs starting + // from 0 to N-1 + err = index.add(vecs) + if err != nil { + return err + } + } + + // get a temporary buffer for writing out the index + tempBuf := v.grabBuf(binary.MaxVarintLen64) + // write the type of the vector index + n := binary.PutUvarint(tempBuf, uint64(config.indexType)) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // serialize the merged index into a byte slice, and write it out + err = index.write(tempBuf, w) + if err != nil { + return err + } + + return nil +} + +// returns the index description string and index type constant for the binary +// index to be created based on the number of vectors and centroids. +func determineBinaryIndexToUse(nvecs, nlist int) string { + switch { + case nvecs >= ivfThreshold: + return fmt.Sprintf("BIVF%d", nlist) + default: + return "BFlat" + } +} + +// returns the index type constant for the vector index to be created based on the +// index optimization type specified in the field mapping. +func determineIndexTypeFromOptimization(indexOptimizedFor string) faissIndexType { + if index.OptimizationRequiresBinaryIndex(indexOptimizedFor) { + return faissBIVFIndex + } + return faissFP32Index +} + +// freeReconstructedIndexes closes all faiss indexes in the provided slice. +func freeReconstructedIndexes(vecIndexes []*vecIndexInfo) { + for _, entry := range vecIndexes { + if entry != nil && entry.index != nil { + entry.index.close() + } + } +} + +// grabBuf returns a reusable buffer of the given size, allocating a new one if needed. 
+func (v *vectorIndexOpaque) grabBuf(size int) []byte { + buf := v.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + v.tmp0 = buf + } + return buf[:size] +} + +// determineCentroids determines the number of centroids to use for an IVF index. +func determineCentroids(nvecs int) int { + var nlist int + switch { + case nvecs >= 200000: + nlist = int(4 * math.Sqrt(float64(nvecs))) + case nvecs >= ivfThreshold: + // 100 points per cluster is a reasonable default, considering the default + // minimum and maximum points per cluster is 39 and 256 respectively. + // Since it's a recommendation to have a minimum of 10 clusters, 1000(100 * 10) + // was chosen as the lower threshold. + nlist = nvecs / 100 + } + return nlist +} + +// determineFloat32IndexToUse returns a description string for the float32 +// index and quantizer type, and an index type constant. +func determineFloat32IndexToUse(nvecs, nlist int, optimizationType string) string { + if nvecs < ivfThreshold { + return "Flat" + } + switch optimizationType { + case index.IndexBIVFWithBackingFlat: + return "Flat" + case index.IndexBIVFWithBackingSQ8: + return "SQ8" + case index.IndexOptimizedForMemoryEfficient: + return fmt.Sprintf("IVF%d,SQ4", nlist) + case index.IndexIVFRaBitQ: + return fmt.Sprintf("IVF%d,RaBitQ", nlist) + default: + switch { + case nvecs >= ivfSq8Threshold: + return fmt.Sprintf("IVF%d,SQ8", nlist) + default: + return fmt.Sprintf("IVF%d,Flat", nlist) + } + } +} + +func (vo *vectorIndexOpaque) writeVectorIndexes(w *FileWriter) error { + // for every fieldID, contents to store over here are: + // 1. the serialized representation of the dense vector index. + // 2. its constituent metadata like: + // a. number of vectors + // b. dimension of vectors + // c. distance metric + // d. index optimization type + // e. 
vectorID -> docID mapping + tempBuf := vo.grabBuf(binary.MaxVarintLen64) + for fieldID, content := range vo.fieldVectorIndex { + // number of vectors to be indexed for this field + nvecs := len(content.vecDocIDs) + // Set the faiss metric type (default is Euclidean Distance or l2_norm) + metric := faiss.MetricL2 + if content.metric == index.InnerProduct || content.metric == index.CosineSimilarity { + // use the same FAISS metric for inner product and cosine similarity + metric = faiss.MetricInnerProduct + } + + // create a vector set wrapping the vector data + vecSet, err := newVectorSet(content.dimension, content.vectors) + if err != nil { + return err + } + + // record the fieldStart value for this section. + fieldStart := w.Count() + // writing out two offset values to indicate that the current field's + // vector section doesn't have valid doc value content within it. + n := binary.PutUvarint(tempBuf, fieldNotUninverted) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + n = binary.PutUvarint(tempBuf, fieldNotUninverted) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + // write the index optimization type + n = binary.PutUvarint(tempBuf, uint64(index.SupportedVectorIndexOptimizations[content.optimizedFor])) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + // write the number of vectors + n = binary.PutUvarint(tempBuf, uint64(nvecs)) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + + buf := make([]byte, binary.MaxVarintLen64*len(content.vecDocIDs)) + bufPos := 0 + for _, docID := range content.vecDocIDs { + n = binary.PutUvarint(buf[bufPos:], uint64(docID)) + bufPos += n + } + buf = w.process(buf[:bufPos]) + + // write the size of the vector to docID map + n = binary.PutUvarint(tempBuf, uint64(len(buf))) + _, err = w.Write(tempBuf[:n]) + if err != nil { + return err + } + // write the vecID -> docID mapping + _, err = w.Write(buf) + if err != nil { + return err + } + + // determine 
the type of vector index to be created based on the index optimization + // and create the faiss index for the vectors associated with this field and + // write out the index into the segment writer. + indexType := determineIndexTypeFromOptimization(content.optimizedFor) + config := newFaissIndexConfig(indexType, content.optimizedFor, content.dimension, metric, nvecs, determineCentroids(nvecs), false) + err = vo.writeFaissIndex(vecSet, config, w) + if err != nil { + return err + } + + // accounts for whatever data has been written out to the writer. + vo.incrementBytesWritten(uint64(w.Count() - fieldStart)) + vo.fieldAddrs[fieldID] = fieldStart + } + return nil +} + +func (vo *vectorIndexOpaque) process(field index.VectorField, fieldName string, fieldID uint16, docNum uint32) { + if fieldID == math.MaxUint16 { + // doc processing checkpoint - no action needed + return + } + vec := field.Vector() + dim := field.Dims() + metric := field.Similarity() + indexOptimizedFor := field.IndexOptimizedFor() + // caller is supposed to make sure len(vec) is a multiple of dim. + // Not double checking it here to avoid the overhead. + // This accounts for multi-vector fields, where a field can have + // multiple vectors associated with it. In this case we process all + // vectors associated with the field as separate vectors. 
+ numVectors := len(vec) / dim + for i := 0; i < numVectors; i++ { + vector := vec[i*dim : (i+1)*dim] + // check if we have content for this fieldID already + content, ok := vo.fieldVectorIndex[fieldID] + if !ok { + // create an entry for this fieldID as this is the first time + // we are seeing this field + content = &vectorIndexContent{ + dimension: dim, + metric: metric, + optimizedFor: indexOptimizedFor, + vectors: make([]float32, 0, dim*numVectors), + vecDocIDs: make([]uint32, 0, numVectors), + useGPU: vo.fieldsOptions[fieldName].UseGPU(), + } + vo.fieldVectorIndex[fieldID] = content + } + // track the vector data and docIDs + content.vectors = append(content.vectors, vector...) + content.vecDocIDs = append(content.vecDocIDs, docNum) + } +} + +func (v *faissVectorIndexSection) getVectorIndexOpaque(opaque map[int]resetable) *vectorIndexOpaque { + if _, ok := opaque[SectionFaissVectorIndex]; !ok { + opaque[SectionFaissVectorIndex] = v.InitOpaque(nil) + } + return opaque[SectionFaissVectorIndex].(*vectorIndexOpaque) +} + +func (v *faissVectorIndexSection) InitOpaque(args map[string]interface{}) resetable { + rv := &vectorIndexOpaque{ + fieldAddrs: make(map[uint16]int), + fieldVectorIndex: make(map[uint16]*vectorIndexContent), + } + for k, v := range args { + rv.Set(k, v) + } + + return rv +} + +// vectorIndexContent contains the information required to create a vector index for a vector field. +type vectorIndexContent struct { + // vectors stores flattened vectors in a row-major order + vectors []float32 + // vecDocIDs corresponding to each vector + vecDocIDs []uint32 + // dimension is the dimension of all vectors + dimension int + // metric is the distance metric to be used + metric string + // optimizedFor is the optimization type for the index + optimizedFor string + // useGPU indicates whether the index should be created on the GPU + useGPU bool +} + +// vectorIndexOpaque holds the internal state for vector index processing. 
+type vectorIndexOpaque struct { + // external config values passed in, which controls the behavior of vector index creation and merging + config map[string]interface{} + // number of bytes written out for the vector index section, used for metrics and tracking + bytesWritten uint64 + // fieldAddrs maps fieldID to the address of its vector section + fieldAddrs map[uint16]int + // fieldVectorIndex maps fieldID to its vector index content + fieldVectorIndex map[uint16]*vectorIndexContent + // fieldsOptions contains field indexing options + fieldsOptions map[string]index.FieldIndexingOptions + // tmp0 is a reusable buffer + tmp0 []byte +} + +func (vo *vectorIndexOpaque) incrementBytesWritten(val uint64) { + atomic.AddUint64(&vo.bytesWritten, val) +} + +func (vo *vectorIndexOpaque) BytesWritten() uint64 { + return atomic.LoadUint64(&vo.bytesWritten) +} + +func (vo *vectorIndexOpaque) BytesRead() uint64 { + return 0 +} + +func (vo *vectorIndexOpaque) ResetBytesRead(uint64) { +} + +// Reset clears all state in the vectorIndexOpaque for reuse. 
+func (vo *vectorIndexOpaque) Reset() error { + clear(vo.fieldAddrs) + clear(vo.fieldVectorIndex) + vo.tmp0 = vo.tmp0[:0] + vo.fieldsOptions = nil + vo.config = nil + atomic.StoreUint64(&vo.bytesWritten, 0) + return nil +} + +func (v *vectorIndexOpaque) Set(key string, val interface{}) { + switch key { + case "fieldsOptions": + v.fieldsOptions = val.(map[string]index.FieldIndexingOptions) + case "config": + v.config = val.(map[string]interface{}) + } +} + +// --------------------------------- +// Faiss Index Factory +// --------------------------------- +type faissIndexConfig struct { + indexType faissIndexType + dimension int + metricType int + numVecs int + optimizationType string + nlist int + useGPU bool +} + +func newFaissIndexConfig(idxType faissIndexType, optimizationType string, dimension, metricType, numVecs, nlist int, useGPU bool) *faissIndexConfig { + return &faissIndexConfig{ + indexType: idxType, + dimension: dimension, + metricType: metricType, + numVecs: numVecs, + nlist: nlist, + optimizationType: optimizationType, + useGPU: useGPU, + } +} + +// Factory function to create a faissIndex for the given index config. +func faissIndexFactory(cfg *faissIndexConfig) (faissIndex, error) { + switch cfg.indexType { + case faissFP32Index: + description := determineFloat32IndexToUse(cfg.numVecs, cfg.nlist, cfg.optimizationType) + idx, err := faiss.IndexFactory(cfg.dimension, description, cfg.metricType) + if err != nil { + return nil, err + } + // we restrict GPU to IVF indexes only; flat and SQ indexes do not get a noticeable speedup + // when run on GPU, and the GPU overhead can actually make them slower than CPU. 
+ if cfg.useGPU && idx.IsIVFIndex() { + return newFaissGPUFloat32Index(idx) + } + return newFaissFloat32Index(idx) + case faissBIVFIndex: + description := determineBinaryIndexToUse(cfg.numVecs, cfg.nlist) + binaryIdx, err := faiss.BinaryIndexFactory(cfg.dimension, description) + if err != nil { + return nil, err + } + + description = determineFloat32IndexToUse(cfg.numVecs, cfg.nlist, cfg.optimizationType) + backingIdx, err := faiss.IndexFactory(cfg.dimension, description, cfg.metricType) + if err != nil { + return nil, err + } + return newFaissBinaryIndex(binaryIdx, backingIdx) + default: + return nil, errNotSupported + } +} + +// canFastMerge determines whether we can use the fast merge capabilities of faiss based on +// - the presence of a trained index +// - the optimization type of the index. +// - the total number of vectors being merged. +func canFastMerge(trainedIndex faissIndexIVF, opt string, totalVecs int) bool { + // if the trained index isn't IVF or not available, fallback to naive merge + if trainedIndex == nil { + return false + } + + var minVecsForFastMerge int + switch opt { + case index.IndexBIVFWithBackingFlat, index.IndexBIVFWithBackingSQ8: + fallthrough + case index.IndexOptimizedForMemoryEfficient: + fallthrough + case index.IndexIVFRaBitQ: + minVecsForFastMerge = ivfThreshold + default: + minVecsForFastMerge = ivfSq8Threshold + } + return trainedIndex.ntotal() > int64(minVecsForFastMerge) && totalVecs > minVecsForFastMerge +} diff --git a/vendor/github.com/blevesearch/zapx/v17/section_inverted_text_index.go b/vendor/github.com/blevesearch/zapx/v17/section_inverted_text_index.go new file mode 100644 index 0000000000..a2a82de861 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/section_inverted_text_index.go @@ -0,0 +1,1121 @@ +// Copyright (c) 2023 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + "sync/atomic" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + seg "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" +) + +func init() { + registerSegmentSection(SectionInvertedTextIndex, &invertedTextIndexSection{}) +} + +type invertedTextIndexSection struct { +} + +// This function checks whether the inverted text index section should avoid processing +// a particular field, preventing unnecessary work if another section will handle it. +// +// NOTE: The exclusion check is applicable only to the InvertedTextIndexSection +// because it serves as a catch-all section. This section processes every field +// unless explicitly excluded, similar to a "default" case in a switch statement. +// Other sections, such as VectorSection and SynonymSection, rely on inclusion +// checks to process only specific field types (e.g., index.VectorField or +// index.SynonymField). Any new section added in the future must define its +// special field type and inclusion logic explicitly. 
+var isFieldExcludedFromInvertedTextIndexSection = func(field index.Field) bool { + for _, excludeField := range invertedTextIndexSectionExclusionChecks { + if excludeField(field) { + // atleast one section has agreed to exclude this field + // from inverted text index section processing and has + // agreed to process it independently + return true + } + } + // no section has excluded this field from inverted index processing + // so it should be processed by the inverted index section + return false +} + +// List of checks to determine if a field is excluded from the inverted text index section +var invertedTextIndexSectionExclusionChecks = make([]func(field index.Field) bool, 0) + +func (i *invertedTextIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) { + if !isFieldExcludedFromInvertedTextIndexSection(field) { + io := i.getInvertedIndexOpaque(opaque) + io.process(field, fieldID, docNum) + } +} + +func (i *invertedTextIndexSection) Persist(opaque map[int]resetable, w *FileWriter) error { + io := i.getInvertedIndexOpaque(opaque) + return io.writeDicts(w) +} + +func (i *invertedTextIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int { + io := i.getInvertedIndexOpaque(opaque) + return io.fieldAddrs[fieldID] +} + +func mergeAndPersistInvertedSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, fieldsOptions map[string]index.FieldIndexingOptions, + fieldsSame bool, newDocNumsIn [][]uint64, newSegDocCount uint64, chunkMode uint32, w *FileWriter, + closeCh chan struct{}) (map[int]int, error) { + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 + + var postings *PostingsList + var postItr *PostingsIterator + + fieldAddrs := make(map[int]int) + dictOffsets := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) + + // copying data 
directly is safe only if there are no + // file callbacks that might modify the data in all + // of the involved segments and the current writer + copyFlag := true + for _, segment := range segments { + if segment.fileReader.id != "" { + copyFlag = false + break + } + } + if w.id != "" { + copyFlag = false + } + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + + var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, err + } + + newRoaring := roaring.NewBitmap() + newDocNums := make([][]uint64, 0, len(segments)) + drops := make([]*roaring.Bitmap, 0, len(segments)) + dicts := make([]*Dictionary, 0, len(segments)) + itrs := make([]vellum.Iterator, 0, len(segments)) + segmentsInFocus := make([]*SegmentBase, 0, len(segments)) + // for each field + for fieldID, fieldName := range fieldsInv { + // collect FST iterators from all active segments for this field + newDocNums = newDocNums[:0] + drops = drops[:0] + dicts = dicts[:0] + itrs = itrs[:0] + segmentsInFocus = segmentsInFocus[:0] + for segmentI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, seg.ErrClosed + } + // early exit if the field's index option is false + if !fieldsOptions[fieldName].IsIndexed() { + continue + } + + dict, err2 := segment.dictionary(fieldName) + if err2 != nil { + return nil, err2 + } + if dict != nil && dict.fst != nil { + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = 
append(drops, nil) + } + dicts = append(dicts, dict) + itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) + } + } + } + + var prevTerm []byte + + newRoaring.Clear() + + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + return true, docNum, lastNorm + } + } + return false, 0, 0 + } + + finishTerm := func(term []byte) error { + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } + + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + + newRoaring.Clear() + + tfEncoder.Reset() + locEncoder.Reset() + + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, err + } + } + if !bytes.Equal(prevTerm, term) || prevTerm == nil { + // compute cardinality of field-term in new seg + var newCard uint64 + lowItrIdxs, lowItrVals := enumerator.GetLowIdxsAndValues() + for i, idx := range lowItrIdxs { + pl, err := dicts[idx].postingsListFromOffset(lowItrVals[i], drops[idx], nil) + if err != nil { + return nil, err + } + newCard += pl.Count() + } + // 
compute correct chunk size with this + chunkSize, err := getChunkSize(chunkMode, newCard, newSegDocCount) + if err != nil { + return nil, err + } + // update encoders chunk + tfEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + locEncoder.SetChunkSize(chunkSize, newSegDocCount-1) + } + + postings, err = dicts[itrI].postingsListFromOffset( + postingsOffset, drops[itrI], postings) + if err != nil { + return nil, err + } + + postItr = postings.iterator(true, true, true, postItr) + + // can only safely copy data if all segments have same fields and all have an empty + // writer id (i.e. no callbacks) + if fieldsSame && copyFlag { + // can optimize by copying freq/norm/loc bytes directly + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder) + } else { + lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, + tfEncoder, locEncoder, bufLoc) + } + if err != nil { + return nil, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) 
+ + err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, err + } + // close the enumerator to free the underlying iterators + err = enumerator.Close() + if err != nil { + return nil, err + } + + err = finishTerm(prevTerm) + if err != nil { + return nil, err + } + + dictOffset := uint64(w.Count()) + + err = newVellum.Close() + if err != nil { + return nil, err + } + vellumData := w.process(vellumBuf.Bytes()) + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, err + } + + dictOffsets[fieldID] = dictOffset + + fieldDvLocsStart[fieldID] = uint64(w.Count()) + + // update the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return nil, err + } + if fieldsOptions[fieldName].SkipDVChunking() { + chunkSize = 1 + } + fdvEncoder := newChunkedContentCoder(chunkSize, newSegDocCount-1, w, true, fieldsOptions[fieldName].SkipDVCompression()) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + var dvIter *docValueReader + for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, seg.ErrClosed + } + // early exit if docvalues are not wanted for this field + if !fieldsOptions[fieldName].IncludeDocValues() { + continue + } + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + dvIter = segment.fieldDvReaders[SectionInvertedTextIndex][fieldIDPlus1-1] + if dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := 
fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) + if err != nil { + return nil, err + } + } + } + + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, err + } + + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } + + fieldStart := w.Count() + + n = binary.PutUvarint(bufMaxVarintLen64, fieldDvLocsStart[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, fieldDvLocsEnd[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, dictOffsets[fieldID]) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, err + } + + fieldAddrs[fieldID] = fieldStart + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, err + } + } + return fieldAddrs, nil +} + +func (i *invertedTextIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase, + drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64, + w *FileWriter, closeCh chan struct{}) error { + io := i.getInvertedIndexOpaque(opaque) + fieldAddrs, err := mergeAndPersistInvertedSection(segments, drops, fieldsInv, + io.FieldsMap, io.FieldsOptions, io.fieldsSame, newDocNumsIn, io.numDocs, io.chunkMode, w, closeCh) + if err != nil { + return err + } + + io.fieldAddrs = fieldAddrs + return nil +} + +func (i *invertedIndexOpaque) grabBuf(size int) []byte { + buf := i.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + i.tmp0 = buf + } + return buf[:size] +} + +func (i *invertedIndexOpaque) 
incrementBytesWritten(bytes uint64) { + i.bytesWritten += bytes +} + +func (i *invertedIndexOpaque) BytesWritten() uint64 { + return i.bytesWritten +} + +func (i *invertedIndexOpaque) BytesRead() uint64 { + return 0 +} + +func (i *invertedIndexOpaque) ResetBytesRead(uint64) {} + +func (io *invertedIndexOpaque) writeDicts(w *FileWriter) error { + if len(io.results) == 0 { + return nil + } + + dictOffsets := make([]uint64, len(io.FieldsInv)) + var err error + + fdvOffsetsStart := make([]uint64, len(io.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(io.FieldsInv)) + + buf := io.grabBuf(binary.MaxVarintLen64) + + // these int coders are initialized with chunk size 1024 + // however this will be reset to the correct chunk size + // while processing each individual field-term section + tfEncoder := newChunkedIntCoder(1024, uint64(len(io.results)-1)) + locEncoder := newChunkedIntCoder(1024, uint64(len(io.results)-1)) + + var docTermMap [][]byte + + if io.builder == nil { + io.builder, err = vellum.New(&io.builderBuf, nil) + if err != nil { + return err + } + } + + for fieldID, terms := range io.DictKeys { + if cap(docTermMap) < len(io.results) { + docTermMap = make([][]byte, len(io.results)) + } else { + docTermMap = docTermMap[:len(io.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := io.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := io.Postings[pid] + + freqNorms := io.FreqNorms[pid] + freqNormOffset := 0 + + locs := io.Locs[pid] + locOffset := 0 + + var cardinality uint64 + if postingsBS != nil { + cardinality = postingsBS.GetCardinality() + } + chunkSize, err := getChunkSize(io.chunkMode, cardinality, uint64(len(io.results))) + if err != nil { + return err + } + tfEncoder.SetChunkSize(chunkSize, uint64(len(io.results)-1)) + locEncoder.SetChunkSize(chunkSize, uint64(len(io.results)-1)) + + postingsItr := 
postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + // check if freq/norm is enabled + if freqNorm.freq > 0 { + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), + uint64(math.Float32bits(freqNorm.norm))) + } else { + // if disabled, then skip the norm part + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0)) + } + if err != nil { + return err + } + + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return err + } + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) 
+ if err != nil { + return err + } + } + locOffset += freqNorm.numLocs + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + index.DocValueTermSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + io.incrementBytesWritten(locEncoder.getBytesWritten()) + io.incrementBytesWritten(tfEncoder.getBytesWritten()) + + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, w, buf) + if err != nil { + return err + } + + if postingsOffset > uint64(0) { + err = io.builder.Insert([]byte(term), postingsOffset) + if err != nil { + return err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = io.builder.Close() + if err != nil { + return err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(w.Count()) + + vellumData := w.process(io.builderBuf.Bytes()) + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + io.incrementBytesWritten(uint64(len(vellumData))) + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return err + } + + // reset vellum for reuse + io.builderBuf.Reset() + + err = io.builder.Reset(&io.builderBuf) + if err != nil { + return err + } + + // write the field doc values + // NOTE: doc values continue to use legacy chunk mode + chunkSize, err := getChunkSize(LegacyChunkMode, 0, 0) + if err != nil { + return err + } + if io.FieldsOptions[io.FieldsInv[fieldID]].SkipDVChunking() { + chunkSize = 1 + } + fdvEncoder := newChunkedContentCoder(chunkSize, uint64(len(io.results)-1), w, false, io.FieldsOptions[io.FieldsInv[fieldID]].SkipDVCompression()) + if io.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if fieldTermMap, ok := io.extraDocValues[docNum]; ok { + if sTerms, ok := fieldTermMap[uint16(fieldID)]; ok { + for _, sTerm := range sTerms { + docTerms = append(append(docTerms, 
sTerm...), index.DocValueTermSeparator) + } + } + } + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return err + } + + io.incrementBytesWritten(fdvEncoder.getBytesWritten()) + + fdvOffsetsStart[fieldID] = uint64(w.Count()) + + _, err = fdvEncoder.Write() + if err != nil { + return err + } + + fdvOffsetsEnd[fieldID] = uint64(w.Count()) + fdvEncoder.Reset() + } else { + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted + } + + fieldStart := w.Count() + + n = binary.PutUvarint(buf, fdvOffsetsStart[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + n = binary.PutUvarint(buf, fdvOffsetsEnd[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + n = binary.PutUvarint(buf, dictOffsets[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return err + } + + io.fieldAddrs[fieldID] = fieldStart + } + + return nil +} + +func (io *invertedIndexOpaque) process(field index.Field, fieldID uint16, docNum uint32) { + if !io.init && io.results != nil { + io.realloc() + io.init = true + } + + // if the fieldID is MaxUint16, it's mainly indicated that the caller has + // finished invoking the process() for every field on that doc. 
+ if fieldID == math.MaxUint16 { + for fid, tfs := range io.reusableFieldTFs { + dict := io.Dicts[fid] + norm := math.Float32frombits(uint32(io.reusableFieldLens[fid])) + + for term, tf := range tfs { + pid := dict[term] - 1 + bs := io.Postings[pid] + bs.Add(uint32(docNum)) + + io.FreqNorms[pid] = append(io.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + numLocs: len(tf.Locations), + }) + + if len(tf.Locations) > 0 { + locs := io.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fid) + if loc.Field != "" { + locf = uint16(io.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + io.Locs[pid] = locs + } + } + } + for i := 0; i < len(io.FieldsInv); i++ { // clear these for reuse + io.reusableFieldLens[i] = 0 + io.reusableFieldTFs[i] = nil + } + return + } + + io.reusableFieldLens[fieldID] += field.AnalyzedLength() + existingFreqs := io.reusableFieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(field.Name(), field.AnalyzedTokenFrequencies()) + } else { + io.reusableFieldTFs[fieldID] = field.AnalyzedTokenFrequencies() + } +} + +func (i *invertedIndexOpaque) initDictsAndKeysFromFields() { + numFields := len(i.FieldsInv) + + // Resize or allocate Dicts + if cap(i.Dicts) >= numFields { + i.Dicts = i.Dicts[:numFields] + } else { + i.Dicts = make([]map[string]uint64, numFields) + } + + // Resize or allocate DictKeys + if cap(i.DictKeys) >= numFields { + i.DictKeys = i.DictKeys[:numFields] + } else { + i.DictKeys = make([][]string, numFields) + } + + for idx := 0; idx < numFields; idx++ { + // --- Dicts --- + if i.Dicts[idx] == nil { + i.Dicts[idx] = make(map[string]uint64) + } else { + clear(i.Dicts[idx]) + } + + // --- DictKeys --- + if i.DictKeys[idx] != nil { 
+ i.DictKeys[idx] = i.DictKeys[idx][:0] + } else { + i.DictKeys[idx] = nil + } + } +} + +func (i *invertedIndexOpaque) realloc() { + var pidNext int + + var totTFs int + var totLocs int + + // initialize dicts and dict keys from fieldsMap + i.initDictsAndKeysFromFields() + + visitField := func(field index.Field, docNum int) { + fieldID := uint16(i.getOrDefineField(field.Name())) + + dict := i.Dicts[fieldID] + dictKeys := i.DictKeys[fieldID] + + tfs := field.AnalyzedTokenFrequencies() + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + i.numTermsPerPostingsList = append(i.numTermsPerPostingsList, 0) + i.numLocsPerPostingsList = append(i.numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + i.numTermsPerPostingsList[pid] += 1 + i.numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + i.DictKeys[fieldID] = dictKeys + if field.Options().IncludeDocValues() { + i.IncludeDocValues[fieldID] = true + } + + if f, ok := field.(index.GeoShapeField); ok { + if _, exists := i.extraDocValues[docNum]; !exists { + i.extraDocValues[docNum] = make(map[uint16][][]byte) + } + i.extraDocValues[docNum][fieldID] = append(i.extraDocValues[docNum][fieldID], f.EncodedShape()) + } + } + + if cap(i.IncludeDocValues) >= len(i.FieldsInv) { + i.IncludeDocValues = i.IncludeDocValues[:len(i.FieldsInv)] + } else { + i.IncludeDocValues = make([]bool, len(i.FieldsInv)) + } + + if i.extraDocValues == nil { + i.extraDocValues = map[int]map[uint16][][]byte{} + } + + for docNum, result := range i.results { + // walk each composite field + result.VisitComposite(func(field index.CompositeField) { + visitField(field, docNum) + }) + + // walk each field + result.VisitFields(func(field index.Field) { + visitField(field, docNum) + }) + } + + numPostingsLists := pidNext + + if cap(i.Postings) >= numPostingsLists { + 
i.Postings = i.Postings[:numPostingsLists] + } else { + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, i.Postings[:cap(i.Postings)]) + for i := 0; i < numPostingsLists; i++ { + if postings[i] == nil { + postings[i] = roaring.New() + } + } + i.Postings = postings + } + + if cap(i.FreqNorms) >= numPostingsLists { + i.FreqNorms = i.FreqNorms[:numPostingsLists] + } else { + i.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(i.freqNormsBacking) >= totTFs { + i.freqNormsBacking = i.freqNormsBacking[:totTFs] + } else { + i.freqNormsBacking = make([]interimFreqNorm, totTFs) + } + + freqNormsBacking := i.freqNormsBacking + for pid, numTerms := range i.numTermsPerPostingsList { + i.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + if cap(i.Locs) >= numPostingsLists { + i.Locs = i.Locs[:numPostingsLists] + } else { + i.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(i.locsBacking) >= totLocs { + i.locsBacking = i.locsBacking[:totLocs] + } else { + i.locsBacking = make([]interimLoc, totLocs) + } + + locsBacking := i.locsBacking + for pid, numLocs := range i.numLocsPerPostingsList { + i.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } + + for _, dict := range i.DictKeys { + sort.Strings(dict) + } + + if cap(i.reusableFieldTFs) >= len(i.FieldsInv) { + i.reusableFieldTFs = i.reusableFieldTFs[:len(i.FieldsInv)] + } else { + i.reusableFieldTFs = make([]index.TokenFrequencies, len(i.FieldsInv)) + } + + if cap(i.reusableFieldLens) >= len(i.FieldsInv) { + i.reusableFieldLens = i.reusableFieldLens[:len(i.FieldsInv)] + } else { + i.reusableFieldLens = make([]int, len(i.FieldsInv)) + } +} + +func (i *invertedTextIndexSection) getInvertedIndexOpaque(opaque map[int]resetable) *invertedIndexOpaque { + if _, ok := opaque[SectionInvertedTextIndex]; !ok { + opaque[SectionInvertedTextIndex] = i.InitOpaque(nil) + } + return 
opaque[SectionInvertedTextIndex].(*invertedIndexOpaque) +} + +func (i *invertedIndexOpaque) getOrDefineField(fieldName string) int { + fieldIDPlus1, exists := i.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(i.FieldsInv) + 1) + i.FieldsMap[fieldName] = fieldIDPlus1 + i.FieldsInv = append(i.FieldsInv, fieldName) + + i.Dicts = append(i.Dicts, make(map[string]uint64)) + + n := len(i.DictKeys) + if n < cap(i.DictKeys) { + i.DictKeys = i.DictKeys[:n+1] + i.DictKeys[n] = i.DictKeys[n][:0] + } else { + i.DictKeys = append(i.DictKeys, []string(nil)) + } + } + + return int(fieldIDPlus1 - 1) +} + +func (i *invertedTextIndexSection) InitOpaque(args map[string]interface{}) resetable { + rv := &invertedIndexOpaque{ + fieldAddrs: map[int]int{}, + } + for k, v := range args { + rv.Set(k, v) + } + + return rv +} + +type invertedIndexOpaque struct { + bytesWritten uint64 // atomic access to this variable, moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS. + + results []index.Document + + chunkMode uint32 + + // indicates whethere the following structs are initialized + init bool + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Field indexing options + // field name -> options + FieldsOptions map[string]index.FieldIndexingOptions + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> freq/norm's, one for each docNum in postings + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm + + // postings id -> locs, one for each freq 
+ Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id + + // store terms that are unnecessary for the term dictionaries but needed in doc values + // eg - encoded geoshapes + // docNum -> fieldID -> terms + extraDocValues map[int]map[uint16][][]byte + + builder *vellum.Builder + builderBuf bytes.Buffer + + // reusable stuff for processing fields etc. + reusableFieldLens []int + reusableFieldTFs []index.TokenFrequencies + + tmp0 []byte + + fieldAddrs map[int]int + + fieldsSame bool + numDocs uint64 +} + +func (io *invertedIndexOpaque) Reset() (err error) { + // cleanup stuff over here + io.results = nil + io.init = false + io.chunkMode = 0 + io.FieldsMap = nil + io.FieldsOptions = nil + io.FieldsInv = nil + for i := range io.Dicts { + io.Dicts[i] = nil + } + io.Dicts = io.Dicts[:0] + for i := range io.DictKeys { + io.DictKeys[i] = io.DictKeys[i][:0] + } + io.DictKeys = io.DictKeys[:0] + for i := range io.IncludeDocValues { + io.IncludeDocValues[i] = false + } + io.IncludeDocValues = io.IncludeDocValues[:0] + for _, idn := range io.Postings { + idn.Clear() + } + io.Postings = io.Postings[:0] + io.FreqNorms = io.FreqNorms[:0] + for i := range io.freqNormsBacking { + io.freqNormsBacking[i] = interimFreqNorm{} + } + io.freqNormsBacking = io.freqNormsBacking[:0] + io.Locs = io.Locs[:0] + for i := range io.locsBacking { + io.locsBacking[i] = interimLoc{} + } + io.locsBacking = io.locsBacking[:0] + io.numTermsPerPostingsList = io.numTermsPerPostingsList[:0] + io.numLocsPerPostingsList = io.numLocsPerPostingsList[:0] + io.builderBuf.Reset() + if io.builder != nil { + err = io.builder.Reset(&io.builderBuf) + } + + io.reusableFieldLens = io.reusableFieldLens[:0] + io.reusableFieldTFs = io.reusableFieldTFs[:0] + + io.tmp0 = io.tmp0[:0] + io.extraDocValues = nil + atomic.StoreUint64(&io.bytesWritten, 0) + io.fieldsSame = false + io.numDocs = 0 + + 
clear(io.fieldAddrs) + + return err +} +func (i *invertedIndexOpaque) Set(key string, val interface{}) { + switch key { + case "results": + i.results = val.([]index.Document) + case "chunkMode": + i.chunkMode = val.(uint32) + case "fieldsSame": + i.fieldsSame = val.(bool) + case "fieldsMap": + i.FieldsMap = val.(map[string]uint16) + case "fieldsOptions": + i.FieldsOptions = val.(map[string]index.FieldIndexingOptions) + case "fieldsInv": + i.FieldsInv = val.([]string) + case "numDocs": + i.numDocs = val.(uint64) + } +} diff --git a/vendor/github.com/blevesearch/zapx/v17/section_synonym_index.go b/vendor/github.com/blevesearch/zapx/v17/section_synonym_index.go new file mode 100644 index 0000000000..e15b2e17a1 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/section_synonym_index.go @@ -0,0 +1,795 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "sort" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/roaring64" + index "github.com/blevesearch/bleve_index_api" + seg "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" +) + +func init() { + registerSegmentSection(SectionSynonymIndex, &synonymIndexSection{}) + invertedTextIndexSectionExclusionChecks = append(invertedTextIndexSectionExclusionChecks, func(field index.Field) bool { + _, ok := field.(index.SynonymField) + return ok + }) +} + +// ----------------------------------------------------------------------------- + +type synonymIndexOpaque struct { + results []index.Document + + // indicates whether the following structs are initialized + init bool + + // FieldsMap maps field name to field id and must be set in + // the index opaque using the key "fieldsMap" + // used for ensuring accurate mapping between fieldID and + // thesaurusID + // name -> field id + 1 + FieldsMap map[string]uint16 + + // ThesaurusMap adds 1 to thesaurus id to avoid zero value issues + // name -> thesaurus id + 1 + ThesaurusMap map[string]uint16 + + // ThesaurusMapInv is the inverse of ThesaurusMap + // thesaurus id + 1 -> name + ThesaurusInv []string + + // Thesaurus for each thesaurus ID + // thesaurus id -> LHS term -> synonym postings list id + 1 + Thesauri []map[string]uint64 + + // LHS Terms for each thesaurus ID, where terms are sorted ascending + // thesaurus id -> []term + ThesaurusKeys [][]string + + // FieldIDtoThesaurusID maps the field id to the thesaurus id + // field id -> thesaurus id + FieldIDtoThesaurusID map[uint16]int + + // SynonymIDtoTerm maps synonym id to term for each thesaurus + // thesaurus id -> synonym id -> term + SynonymTermToID []map[string]uint32 + + // SynonymTermToID maps term to synonym id for each thesaurus + // thesaurus id -> term -> synonym id + // this is the inverse of SynonymIDtoTerm for each thesaurus + 
SynonymIDtoTerm []map[uint32]string + + // synonym postings list -> synonym bitmap + Synonyms []*roaring64.Bitmap + + // A reusable vellum FST builder that will be stored in the synonym opaque + // and reused across multiple document batches during the persist phase + // of the synonym index section, the FST builder is used to build the + // FST for each thesaurus, which maps terms to their corresponding synonym bitmaps. + builder *vellum.Builder + + // A reusable buffer for the vellum FST builder. It streams data written + // into the builder into a byte slice. The final byte slice represents + // the serialized vellum FST, which will be written to disk + builderBuf bytes.Buffer + + // A reusable buffer for temporary use within the synonym index opaque + tmp0 []byte + + // A map linking thesaurus IDs to their corresponding thesaurus' file offsets + thesaurusAddrs map[int]int +} + +// Set the fieldsMap and results in the synonym index opaque before the section processes a synonym field. +func (so *synonymIndexOpaque) Set(key string, value interface{}) { + switch key { + case "results": + so.results = value.([]index.Document) + case "fieldsMap": + so.FieldsMap = value.(map[string]uint16) + } +} + +// Reset the synonym index opaque after a batch of documents have been processed into a segment. 
// Slices are truncated and maps cleared in place so their storage can be
// reused by the next batch; results and FieldsMap are dropped and must be
// provided again through Set. The returned error is the one reported by
// resetting the vellum builder, if a builder has been created.
func (so *synonymIndexOpaque) Reset() (err error) {
	// drop the per-batch inputs and force realloc to run again on next use
	so.results = nil
	so.init = false
	so.FieldsMap = nil
	clear(so.ThesaurusMap)
	so.ThesaurusInv = so.ThesaurusInv[:0]
	for i := range so.Thesauri {
		so.Thesauri[i] = nil
	}
	so.Thesauri = so.Thesauri[:0]
	// keep the per-thesaurus key slices so getOrDefineThesaurus can reuse them
	for i := range so.ThesaurusKeys {
		so.ThesaurusKeys[i] = so.ThesaurusKeys[i][:0]
	}
	so.ThesaurusKeys = so.ThesaurusKeys[:0]
	// empty the bitmaps but keep them allocated for reuse by realloc
	for _, idn := range so.Synonyms {
		idn.Clear()
	}
	so.Synonyms = so.Synonyms[:0]
	so.builderBuf.Reset()
	if so.builder != nil {
		err = so.builder.Reset(&so.builderBuf)
	}
	clear(so.FieldIDtoThesaurusID)
	so.SynonymTermToID = so.SynonymTermToID[:0]
	so.SynonymIDtoTerm = so.SynonymIDtoTerm[:0]
	clear(so.thesaurusAddrs)

	so.tmp0 = so.tmp0[:0]
	return err
}

// process records the synonyms of one synonym field of document docNum. For
// every (term, synonyms) pair of the field it adds one encodeSynonym code per
// synonym to the postings bitmap of that term. The lookup tables it reads are
// built by realloc on the first call of a batch.
func (so *synonymIndexOpaque) process(field index.SynonymField, fieldID uint16, docNum uint32) {
	// if this is the first time we are processing a synonym field in this batch
	// we need to allocate memory for the thesauri and related data structures
	if !so.init {
		so.realloc()
		so.init = true
	}

	// get the thesaurus id for this field
	tid := so.FieldIDtoThesaurusID[fieldID]

	// get the thesaurus for this field
	thesaurus := so.Thesauri[tid]

	termSynMap := so.SynonymTermToID[tid]

	field.IterateSynonyms(func(term string, synonyms []string) {
		// the thesaurus stores postings list id + 1, see realloc
		pid := thesaurus[term] - 1

		bs := so.Synonyms[pid]

		for _, syn := range synonyms {
			code := encodeSynonym(termSynMap[syn], docNum)
			bs.Add(code)
		}
	})
}

// a one-time call to allocate memory for the thesauri and synonyms which takes
// all the documents in the result batch and the fieldsMap and predetermines the
// size of the data structures in the synonymIndexOpaque
//
// Postings list ids (pidNext) and synonym ids (sidNext) are each drawn from a
// single counter shared by all thesauri of the batch.
func (so *synonymIndexOpaque) realloc() {
	var pidNext int
	var sidNext uint32

	// count the number of unique thesauri from the batch of documents
	for _, result := range so.results {
		if synDoc, ok := result.(index.SynonymDocument); ok {
			synDoc.VisitSynonymFields(func(synField index.SynonymField) {
				fieldIDPlus1 := so.FieldsMap[synField.Name()]
				so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name())
			})
		}
	}

	// second pass: assign a postings list id to every new LHS term and a
	// synonym id to every new synonym term
	for _, result := range so.results {
		if synDoc, ok := result.(index.SynonymDocument); ok {
			synDoc.VisitSynonymFields(func(synField index.SynonymField) {
				fieldIDPlus1 := so.FieldsMap[synField.Name()]
				thesaurusID := so.getOrDefineThesaurus(fieldIDPlus1-1, synField.Name())

				thesaurus := so.Thesauri[thesaurusID]
				thesaurusKeys := so.ThesaurusKeys[thesaurusID]

				synTermMap := so.SynonymIDtoTerm[thesaurusID]

				termSynMap := so.SynonymTermToID[thesaurusID]

				// iterate over all the term-synonyms pair from the field
				synField.IterateSynonyms(func(term string, synonyms []string) {
					_, exists := thesaurus[term]
					if !exists {
						// incremented before use, so the stored value is
						// the postings list id + 1
						pidNext++
						pidPlus1 := uint64(pidNext)

						thesaurus[term] = pidPlus1
						thesaurusKeys = append(thesaurusKeys, term)
					}
					for _, syn := range synonyms {
						_, exists := termSynMap[syn]
						if !exists {
							termSynMap[syn] = sidNext
							synTermMap[sidNext] = syn
							sidNext++
						}
					}
				})
				// store the possibly re-allocated key slice back
				so.ThesaurusKeys[thesaurusID] = thesaurusKeys
			})
		}
	}

	numSynonymsLists := pidNext

	// reuse the bitmaps kept from earlier batches when there are enough of
	// them, otherwise grow the slice and allocate only the missing bitmaps
	if cap(so.Synonyms) >= numSynonymsLists {
		so.Synonyms = so.Synonyms[:numSynonymsLists]
	} else {
		synonyms := make([]*roaring64.Bitmap, numSynonymsLists)
		copy(synonyms, so.Synonyms[:cap(so.Synonyms)])
		for i := 0; i < numSynonymsLists; i++ {
			if synonyms[i] == nil {
				synonyms[i] = roaring64.New()
			}
		}
		so.Synonyms = synonyms
	}

	// writeThesauri relies on the terms of each thesaurus being sorted
	for _, thes := range so.ThesaurusKeys {
		sort.Strings(thes)
	}
}

// getOrDefineThesaurus returns the thesaurus id for the given field id and thesaurus name.
+func (so *synonymIndexOpaque) getOrDefineThesaurus(fieldID uint16, thesaurusName string) int { + thesaurusIDPlus1, exists := so.ThesaurusMap[thesaurusName] + if !exists { + // need to create a new thesaurusID for this thesaurusName and + thesaurusIDPlus1 = uint16(len(so.ThesaurusInv) + 1) + so.ThesaurusMap[thesaurusName] = thesaurusIDPlus1 + so.ThesaurusInv = append(so.ThesaurusInv, thesaurusName) + + so.Thesauri = append(so.Thesauri, make(map[string]uint64)) + + so.SynonymIDtoTerm = append(so.SynonymIDtoTerm, make(map[uint32]string)) + + so.SynonymTermToID = append(so.SynonymTermToID, make(map[string]uint32)) + + // map the fieldID to the thesaurusID + so.FieldIDtoThesaurusID[fieldID] = int(thesaurusIDPlus1 - 1) + + n := len(so.ThesaurusKeys) + if n < cap(so.ThesaurusKeys) { + so.ThesaurusKeys = so.ThesaurusKeys[:n+1] + so.ThesaurusKeys[n] = so.ThesaurusKeys[n][:0] + } else { + so.ThesaurusKeys = append(so.ThesaurusKeys, []string(nil)) + } + } + + return int(thesaurusIDPlus1 - 1) +} + +// grabBuf returns a reusable buffer of the given size from the synonymIndexOpaque. 
// The scratch slice tmp0 is replaced by a new allocation only when its
// capacity is smaller than size; the returned slice always has length size
// and aliases tmp0.
func (so *synonymIndexOpaque) grabBuf(size int) []byte {
	buf := so.tmp0
	if cap(buf) < size {
		buf = make([]byte, size)
		so.tmp0 = buf
	}
	return buf[:size]
}

// writeThesauri writes every thesaurus of the batch to w and records the
// resulting addresses in thesaurusAddrs. It writes nothing when the batch has
// no results. Per thesaurus, the bytes are written in this order:
//
//   - one postings list per LHS term (see writeSynonyms)
//   - the length of the vellum FST followed by the FST itself, which maps
//     each LHS term to the offset of its postings list
//   - the synonym id -> term map (see writeSynTermMap)
//   - two fieldNotUninverted markers and the offset of the FST length
//
// The address recorded for a thesaurus is the offset of the first marker.
func (so *synonymIndexOpaque) writeThesauri(w *FileWriter) error {

	if len(so.results) == 0 {
		return nil
	}

	thesOffsets := make([]uint64, len(so.ThesaurusInv))
	var err error

	buf := so.grabBuf(binary.MaxVarintLen64)

	// the builder is created lazily and kept for later batches
	if so.builder == nil {
		so.builder, err = vellum.New(&so.builderBuf, nil)
		if err != nil {
			return err
		}
	}

	for thesaurusID, terms := range so.ThesaurusKeys {
		thes := so.Thesauri[thesaurusID]
		for _, term := range terms { // terms are already sorted
			// the thesaurus stores postings list id + 1
			pid := thes[term] - 1
			postingsBS := so.Synonyms[pid]
			postingsOffset, err := writeSynonyms(postingsBS, w, buf)
			if err != nil {
				return err
			}

			// writeSynonyms reports offset 0 for an empty postings list;
			// such terms are left out of the FST
			if postingsOffset > uint64(0) {
				err = so.builder.Insert([]byte(term), postingsOffset)
				if err != nil {
					return err
				}
			}
		}

		err = so.builder.Close()
		if err != nil {
			return err
		}

		// the FST length written next is where this thesaurus starts
		thesOffsets[thesaurusID] = uint64(w.Count())

		vellumData := w.process(so.builderBuf.Bytes())

		// write out the length of the vellum data
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = w.Write(buf[:n])
		if err != nil {
			return err
		}

		// write this vellum to disk
		_, err = w.Write(vellumData)
		if err != nil {
			return err
		}

		// reset vellum for reuse
		so.builderBuf.Reset()

		err = so.builder.Reset(&so.builderBuf)
		if err != nil {
			return err
		}

		// write out the synTermMap for this thesaurus
		err = writeSynTermMap(so.SynonymIDtoTerm[thesaurusID], w, buf)
		if err != nil {
			return err
		}

		thesaurusStart := w.Count()

		// two fieldNotUninverted markers precede the thesaurus offset; the
		// readers in this package skip two uvarints before reading it
		n = binary.PutUvarint(buf, fieldNotUninverted)
		_, err = w.Write(buf[:n])
		if err != nil {
			return err
		}

		n = binary.PutUvarint(buf, fieldNotUninverted)
		_, err = w.Write(buf[:n])
		if err != nil {
			return err
		}

		// the offset from which the vellum data of this thesaurus starts
		n = binary.PutUvarint(buf, thesOffsets[thesaurusID])
		_, err = w.Write(buf[:n])
		if err != nil {
			return err
		}
		so.thesaurusAddrs[thesaurusID] = thesaurusStart
	}
	return nil
}

// -----------------------------------------------------------------------------

// synonymIndexSection is the stateless section registered under
// SectionSynonymIndex; all of its working state lives in the
// synonymIndexOpaque kept in the opaque map.
type synonymIndexSection struct {
}

// getSynonymIndexOpaque returns the synonym index opaque stored in the opaque
// map under SectionSynonymIndex, creating and storing a fresh one first when
// the key is absent.
func (s *synonymIndexSection) getSynonymIndexOpaque(opaque map[int]resetable) *synonymIndexOpaque {
	if _, ok := opaque[SectionSynonymIndex]; !ok {
		opaque[SectionSynonymIndex] = s.InitOpaque(nil)
	}
	return opaque[SectionSynonymIndex].(*synonymIndexOpaque)
}

// Implementations of the Section interface for the synonym index section.
// InitOpaque initializes the synonym index opaque, which sets the FieldsMap and
// results in the opaque before the section processes a synonym field.
// Every entry of args is forwarded to Set, so only the keys "results" and
// "fieldsMap" have an effect.
func (s *synonymIndexSection) InitOpaque(args map[string]interface{}) resetable {
	rv := &synonymIndexOpaque{
		ThesaurusMap:         map[string]uint16{},
		FieldIDtoThesaurusID: map[uint16]int{},
		thesaurusAddrs:       map[int]int{},
	}
	for k, v := range args {
		rv.Set(k, v)
	}

	return rv
}

// Process processes a synonym field by adding the synonyms to the thesaurus
// pointed to by the fieldID, implements the Process API for the synonym index section.
// Calls with fieldID == math.MaxUint16 and fields that do not implement
// index.SynonymField are ignored.
func (s *synonymIndexSection) Process(opaque map[int]resetable, docNum uint32, field index.Field, fieldID uint16) {
	if fieldID == math.MaxUint16 {
		return
	}
	if sf, ok := field.(index.SynonymField); ok {
		so := s.getSynonymIndexOpaque(opaque)
		so.process(sf, fieldID, docNum)
	}
}

// Persist serializes and writes the thesauri processed to the writer, along
// with the synonym postings lists, and the synonym term map. Implements the
// Persist API for the synonym index section.
+func (s *synonymIndexSection) Persist(opaque map[int]resetable, w *FileWriter) error { + so := s.getSynonymIndexOpaque(opaque) + return so.writeThesauri(w) +} + +// AddrForField returns the file offset of the thesaurus for the given fieldID, +// it uses the FieldIDtoThesaurusID map to translate the fieldID to the thesaurusID, +// and returns the corresponding thesaurus offset from the thesaurusAddrs map. +// Implements the AddrForField API for the synonym index section. +func (s *synonymIndexSection) AddrForField(opaque map[int]resetable, fieldID int) int { + so := s.getSynonymIndexOpaque(opaque) + if so == nil || so.FieldIDtoThesaurusID == nil { + return 0 + } + tid, exists := so.FieldIDtoThesaurusID[uint16(fieldID)] + if !exists { + return 0 + } + return so.thesaurusAddrs[tid] +} + +// Merge merges the thesauri, synonym postings lists and synonym term maps from +// the segments into a single thesaurus and serializes and writes the merged +// thesaurus and associated data to the writer. Implements the Merge API for the +// synonym index section. +func (s *synonymIndexSection) Merge(opaque map[int]resetable, segments []*SegmentBase, + drops []*roaring.Bitmap, fieldsInv []string, newDocNumsIn [][]uint64, + w *FileWriter, closeCh chan struct{}) error { + so := s.getSynonymIndexOpaque(opaque) + thesaurusAddrs, fieldIDtoThesaurusID, err := mergeAndPersistSynonymSection(segments, drops, fieldsInv, newDocNumsIn, w, closeCh) + if err != nil { + return err + } + + so.thesaurusAddrs = thesaurusAddrs + so.FieldIDtoThesaurusID = fieldIDtoThesaurusID + return nil +} + +// ----------------------------------------------------------------------------- + +// encodeSynonym encodes a synonymID and a docID into a single uint64 value. 
+// The encoding format splits the 64 bits as follows: +// +// 63 32 31 0 +// +-----------+----------+ +// | synonymID | docNum | +// +-----------+----------+ +// +// The upper 32 bits (63-32) store the synonymID, and the lower 32 bits (31-0) store the docID. +// +// Parameters: +// +// synonymID - A 32-bit unsigned integer representing the ID of the synonym. +// docID - A 32-bit unsigned integer representing the document ID. +// +// Returns: +// +// A 64-bit unsigned integer that combines the synonymID and docID. +func encodeSynonym(synonymID uint32, docID uint32) uint64 { + return uint64(synonymID)<<32 | uint64(docID) +} + +// writeSynonyms serilizes and writes the synonym postings list to the writer, by first +// serializing the postings list to a byte slice and then writing the length +// of the byte slice followed by the byte slice itself. +func writeSynonyms(postings *roaring64.Bitmap, w *FileWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + postingsOffset := uint64(w.Count()) + + buf, err := postings.ToBytes() + if err != nil { + return 0, err + } + buf = w.process(buf) + + // write out the length + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(buf))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + // write out the roaring bytes + _, err = w.Write(buf) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + +// writeSynTermMap serializes and writes the synonym term map to the writer, by first +// writing the length of the map followed by the map entries, where each entry +// consists of the synonym ID, the length of the term, and the term itself. 
+func writeSynTermMap(synTermMap map[uint32]string, w *FileWriter, bufMaxVarintLen64 []byte) error { + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(synTermMap))) + _, err := w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return err + } + + lenTerms := 0 + for _, term := range synTermMap { + lenTerms += len(term) + } + + buf := make([]byte, lenTerms+binary.MaxVarintLen64*(2*len(synTermMap))) + bufPos := 0 + for sid, term := range synTermMap { + bufPos += binary.PutUvarint(buf[bufPos:], uint64(sid)) + bufPos += binary.PutUvarint(buf[bufPos:], uint64(len(term))) + copy(buf[bufPos:], term) + bufPos += len(term) + } + buf = w.process(buf[:bufPos]) + + // write out the length of the map + n = binary.PutUvarint(bufMaxVarintLen64, uint64(len(buf))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return err + } + + _, err = w.Write(buf) + if err != nil { + return err + } + + return nil +} + +func mergeAndPersistSynonymSection(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, newDocNumsIn [][]uint64, w *FileWriter, + closeCh chan struct{}) (map[int]int, map[uint16]int, error) { + + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + + var synonyms *SynonymsList + var synItr *SynonymsIterator + + thesaurusAddrs := make(map[int]int) + + var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, nil, err + } + + newRoaring := roaring64.NewBitmap() + + newDocNums := make([][]uint64, 0, len(segments)) + + drops := make([]*roaring.Bitmap, 0, len(segments)) + + thesauri := make([]*Thesaurus, 0, len(segments)) + + itrs := make([]vellum.Iterator, 0, len(segments)) + + fieldIDtoThesaurusID := make(map[uint16]int) + + var thesaurusID int + var newSynonymID uint32 + + // for each field + for fieldID, fieldName := range fieldsInv { + // collect FST iterators from all active segments for this field + newDocNums = newDocNums[:0] + drops = drops[:0] + thesauri = thesauri[:0] + 
itrs = itrs[:0] + newSynonymID = 0 + synTermMap := make(map[uint32]string) + termSynMap := make(map[string]uint32) + + for segmentI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, nil, seg.ErrClosed + } + + thes, err2 := segment.thesaurus(fieldName) + if err2 != nil { + return nil, nil, err2 + } + if thes != nil && thes.fst != nil { + itr, err2 := thes.fst.Iterator(nil, nil) + if err2 != nil && err2 != vellum.ErrIteratorDone { + return nil, nil, err2 + } + if itr != nil { + newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } + thesauri = append(thesauri, thes) + itrs = append(itrs, itr) + } + } + } + + // if no iterators, skip this field + if len(itrs) == 0 { + continue + } + + var prevTerm []byte + + newRoaring.Clear() + + finishTerm := func(term []byte) error { + postingsOffset, err := writeSynonyms(newRoaring, w, bufMaxVarintLen64) + if err != nil { + return err + } + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) + if err != nil { + return err + } + } + newRoaring.Clear() + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if prevTerm != nil && !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, nil, seg.ErrClosed + } + + // if the term changed, write out the info collected + // for the previous term + err = finishTerm(prevTerm) + if err != nil { + return nil, nil, err + } + } + + synonyms, err = thesauri[itrI].synonymsListFromOffset( + postingsOffset, drops[itrI], synonyms) + if err != nil { + return nil, nil, err + } + synItr = synonyms.iterator(synItr) + + var next seg.Synonym + next, err = synItr.Next() + for next != nil && err == nil { + synNewDocNum := 
newDocNums[itrI][next.Number()] + if synNewDocNum == docDropped { + return nil, nil, fmt.Errorf("see hit with dropped docNum") + } + nextTerm := next.Term() + var synNewID uint32 + if synID, ok := termSynMap[nextTerm]; ok { + synNewID = synID + } else { + synNewID = newSynonymID + termSynMap[nextTerm] = newSynonymID + synTermMap[newSynonymID] = nextTerm + newSynonymID++ + } + synNewCode := encodeSynonym(synNewID, uint32(synNewDocNum)) + newRoaring.Add(synNewCode) + next, err = synItr.Next() + } + if err != nil { + return nil, nil, err + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) + err = enumerator.Next() + } + if err != vellum.ErrIteratorDone { + return nil, nil, err + } + // close the enumerator to free the underlying iterators + err = enumerator.Close() + if err != nil { + return nil, nil, err + } + + if prevTerm != nil { + err = finishTerm(prevTerm) + if err != nil { + return nil, nil, err + } + } + + err = newVellum.Close() + if err != nil { + return nil, nil, err + } + vellumData := w.process(vellumBuf.Bytes()) + + thesOffset := uint64(w.Count()) + + // write out the length of the vellum data + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, nil, err + } + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return nil, nil, err + } + + // write out the synTermMap for this thesaurus + err = writeSynTermMap(synTermMap, w, bufMaxVarintLen64) + if err != nil { + return nil, nil, err + } + + thesStart := w.Count() + + // the synonym index section does not have any doc value data + // so we write two special entries to indicate that + // the field is not uninverted and the thesaurus offset + n = 
binary.PutUvarint(bufMaxVarintLen64, fieldNotUninverted) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, fieldNotUninverted) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // write out the thesaurus offset from which the vellum data starts + n = binary.PutUvarint(bufMaxVarintLen64, thesOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return nil, nil, err + } + + // if we have a new thesaurus, add it to the thesaurus map + fieldIDtoThesaurusID[uint16(fieldID)] = thesaurusID + thesaurusAddrs[thesaurusID] = thesStart + thesaurusID++ + } + + return thesaurusAddrs, fieldIDtoThesaurusID, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/segment.go b/vendor/github.com/blevesearch/zapx/v17/segment.go new file mode 100644 index 0000000000..21d7bedd2a --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/segment.go @@ -0,0 +1,932 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "os" + "sync" + "sync/atomic" + "unsafe" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + mmap "github.com/blevesearch/mmap-go" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/golang/snappy" +) + +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) +} + +// OpenUsing returns a zap impl of a segment which tracks some config values during +// the its lifetime. +func (z *ZapPlugin) OpenUsing(path string, config map[string]interface{}) (segment.Segment, error) { + return z.open(path, config) +} + +// Open returns a zap impl of a segment +func (z *ZapPlugin) Open(path string) (segment.Segment, error) { + return z.open(path, nil) +} + +func (*ZapPlugin) open(path string, config map[string]interface{}) (segment.Segment, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + mm, err := mmap.Map(f, mmap.RDONLY, 0) + if err != nil { + // mmap failed, try to close the file + _ = f.Close() + return nil, err + } + + rv := &Segment{ + SegmentBase: SegmentBase{ + fieldsMap: make(map[string]uint16), + fieldsOptions: make(map[string]index.FieldIndexingOptions), + invIndexCache: newInvertedIndexCache(), + vecIndexCache: newVectorIndexCache(), + synIndexCache: newSynonymIndexCache(), + nstIndexCache: newNestedIndexCache(), + fieldDvReaders: make([][]*docValueReader, len(segmentSections)), + config: config, + }, + f: f, + mm: mm, + path: path, + refs: 1, + } + rv.SegmentBase.updateSize() + + err = rv.loadConfig() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadFields() + if err != nil { + _ = rv.Close() + return nil, err + } + + err = rv.loadDvReaders() + if err != nil { + _ = rv.Close() + return nil, err + } + + // initialize any of the caches if needed + err = rv.nstIndexCache.initialize(rv.numDocs, 
rv.getEdgeListOffset(), rv.mem) + if err != nil { + _ = rv.Close() + return nil, err + } + + return rv, nil +} + +// SegmentBase is a memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + // atomic access to these variables, moved to top to correct alignment issues on ARM, 386 and 32-bit MIPS. + bytesRead uint64 + bytesWritten uint64 + + mem []byte + memCRC uint32 + chunkMode uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsOptions map[string]index.FieldIndexingOptions // fieldName -> fieldOptions + fieldsInv []string // fieldID -> fieldName + fieldsSectionsMap [][]uint64 // fieldID -> section -> address + numDocs uint64 + storedIndexOffset uint64 + sectionsIndexOffset uint64 + fieldDvReaders [][]*docValueReader // naive chunk cache per field; section->fieldID->reader + fieldDvNames []string // field names cached in fieldDvReaders + size uint64 + + // file reader initialised with the writer callback id used by the segment + fileReader *FileReader + + // index update specific tracking + updatedFields map[string]*index.UpdateFieldInfo + config map[string]interface{} // config for the segment + + // section-specific caches + invIndexCache *invertedIndexCache + vecIndexCache *vectorIndexCache + synIndexCache *synonymIndexCache + nstIndexCache *nestedIndexCache +} + +func (sb *SegmentBase) Size() int { + return int(sb.size) +} + +func (sb *SegmentBase) updateSize() { + sizeInBytes := reflectStaticSizeSegmentBase + + cap(sb.mem) + + // fieldsMap + for k := range sb.fieldsMap { + sizeInBytes += (len(k) + SizeOfString) + SizeOfUint16 + } + + // fieldsOptions + for k := range sb.fieldsOptions { + sizeInBytes += (len(k) + SizeOfString) + SizeOfUint64 + } + + // fieldsInv + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + SizeOfString + } + + // fieldDvReaders + for _, secDvReaders := range sb.fieldDvReaders { + for _, v := range secDvReaders { + 
sizeInBytes += SizeOfUint16 + SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + } + + sb.size = uint64(sizeInBytes) +} + +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { + sb.invIndexCache.Clear() + sb.vecIndexCache.Clear() + sb.synIndexCache.Clear() + sb.nstIndexCache.Clear() + return nil +} + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase + + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 + + m sync.Mutex // Protects the fields that follow. + refs int64 +} + +func (s *Segment) Size() int { + // 8 /* size of file pointer */ + // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + SizeOfString) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err +} + +func (s *Segment) loadConfig() error { + // read offsets of 32 bit values - crc, ver, chunk + crcOffset := len(s.mm) - 4 + verOffset := crcOffset - 4 + chunkOffset := verOffset - 4 + + // read offsets of 64 bit values - sectionsIndexOffset, storedIndexOffset, numDocsOffset + sectionsIndexOffset := chunkOffset - 8 + storedIndexOffset := sectionsIndexOffset - 8 + numDocsOffset := storedIndexOffset - 8 + + // read offsets for the writer id length + idLenOffset := numDocsOffset - 4 + + // read 32-bit crc + s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + + // read 32-bit version + s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) + if s.version != Version { + return 
fmt.Errorf("unsupported version %d != %d", s.version, Version) + } + + // read 32-bit chunk mode + s.chunkMode = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) + + // read 64-bit sections index offset + s.sectionsIndexOffset = binary.BigEndian.Uint64(s.mm[sectionsIndexOffset : sectionsIndexOffset+8]) + + // read 64-bit stored index offset + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) + + // read 64-bit num docs + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) + + // read the length of the id + idLen := binary.BigEndian.Uint32(s.mm[idLenOffset : idLenOffset+4]) + idOffset := idLenOffset - int(idLen) + + // read the file writer callback id and initialize the file reader with the same id + fileWriterID := string(s.mm[idOffset : idOffset+int(idLen)]) + var err error + s.fileReader, err = NewFileReader(fileWriterID, []byte(s.path)) + if err != nil { + return err + } + + footerSize := FooterSize + int(idLen) + s.incrementBytesRead(uint64(footerSize)) + s.SegmentBase.mem = s.mm[:len(s.mm)-footerSize] + return nil +} + +// Implements the segment.DiskStatsReporter interface +// Only the persistedSegment type implments the +// interface, as the intention is to retrieve the bytes +// read from the on-disk segment as part of the current +// query. 
+func (s *Segment) ResetBytesRead(val uint64) { + atomic.StoreUint64(&s.SegmentBase.bytesRead, val) +} + +func (s *Segment) BytesRead() uint64 { + return atomic.LoadUint64(&s.bytesRead) +} + +func (s *Segment) BytesWritten() uint64 { + return 0 +} + +func (s *Segment) incrementBytesRead(val uint64) { + atomic.AddUint64(&s.bytesRead, val) +} + +func (sb *SegmentBase) BytesWritten() uint64 { + return atomic.LoadUint64(&sb.bytesWritten) +} + +func (sb *SegmentBase) setBytesWritten(val uint64) { + atomic.AddUint64(&sb.bytesWritten, val) +} + +func (sb *SegmentBase) BytesRead() uint64 { + return 0 +} + +func (sb *SegmentBase) ResetBytesRead(val uint64) {} + +func (sb *SegmentBase) incrementBytesRead(val uint64) { + atomic.AddUint64(&sb.bytesRead, val) +} + +func (sb *SegmentBase) loadFields() error { + pos := sb.sectionsIndexOffset + + if pos == 0 { + return fmt.Errorf("no sections index present") + } + + seek := pos + binary.MaxVarintLen64 + if seek > uint64(len(sb.mem)) { + // handling a buffer overflow case. + // a rare case where the backing buffer is not large enough to be read directly via + // a pos+binary.MaxVarintLen64 seek. For eg, this can happen when there is only + // one field to be indexed in the entire batch of data and while writing out + // these fields metadata, you write 1 + 8 bytes whereas the MaxVarintLen64 = 10. + seek = uint64(len(sb.mem)) + } + + // read the number of fields + numFields, sz := binary.Uvarint(sb.mem[pos:seek]) + // here, the pos is incremented by the valid number bytes read from the buffer + // so in the edge case pointed out above the numFields = 1, the sz = 1 as well. + pos += uint64(sz) + sb.incrementBytesRead(uint64(sz)) + + // the following loop will be executed only once in the edge case pointed out above + // since there is only field's offset store which occupies 8 bytes. 
+ // the pointer then seeks to a position preceding the sectionsIndexOffset, at + // which point the responsibility of handling the out-of-bounds cases shifts to + // the specific section's parsing logic. + var fieldID uint64 + for fieldID < numFields { + addr := binary.BigEndian.Uint64(sb.mem[pos : pos+8]) + sb.incrementBytesRead(8) + + fieldSectionMap, err := sb.loadField(uint16(fieldID), addr) + if err != nil { + return err + } + + sb.fieldsSectionsMap = append(sb.fieldsSectionsMap, fieldSectionMap) + + fieldID++ + pos += 8 + } + + return nil +} + +// loadField loads the field metadata for the given fieldID at the given position +func (sb *SegmentBase) loadField(fieldID uint16, pos uint64) ([]uint64, error) { + if pos == 0 { + // there is no indexing structure present for this field/section + return nil, nil + } + + fieldStartPos := pos // to track the number of bytes read + fieldNameLen, sz := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + + fieldName, err := sb.fileReader.process(sb.mem[pos : pos+fieldNameLen]) + if err != nil { + return nil, err + } + pos += fieldNameLen + + // read field options + fieldOptions, sz := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + + sb.fieldsInv = append(sb.fieldsInv, string(fieldName)) + sb.fieldsMap[string(fieldName)] = fieldID + 1 + sb.fieldsOptions[string(fieldName)] = index.FieldIndexingOptions(fieldOptions) + + fieldNumSections, sz := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(sz) + // create an address mapping array for each of the segment sections + // if the field has a valid section index, then the address will be non-zero + // else it will be zero. 
+ fieldSectionMap := make([]uint64, NumSections) + for sectionIdx := uint64(0); sectionIdx < fieldNumSections; sectionIdx++ { + // read section id + fieldSectionType := binary.BigEndian.Uint16(sb.mem[pos : pos+2]) + pos += 2 + fieldSectionAddr := binary.BigEndian.Uint64(sb.mem[pos : pos+8]) + pos += 8 + fieldSectionMap[fieldSectionType] = fieldSectionAddr + } + + // account the bytes read while parsing the sections field index. + sb.incrementBytesRead((pos - uint64(fieldStartPos)) + fieldNameLen) + return fieldSectionMap, nil +} + +// Dictionary returns the term dictionary for the specified field +func (sb *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { + dict, err := sb.dictionary(field) + if err == nil && dict == nil { + return emptyDictionary, nil + } + return dict, err +} + +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 == 0 { + return nil, nil + } + pos := sb.fieldsSectionsMap[fieldIDPlus1-1][SectionInvertedTextIndex] + if pos > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } + // skip the doc value offsets to get to the dictionary portion + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + } + dictLoc, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + fst, bytesRead, err := sb.invIndexCache.loadOrCreate(rv.fieldID, sb.mem[dictLoc:], sb.fileReader) + if err != nil { + return nil, fmt.Errorf("dictionary for field %s err: %v", field, err) + } + rv.fst = fst + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary for field %s, vellum reader err: %v", field, err) + } + rv.bytesRead += bytesRead + } + + return rv, nil +} + +// Thesaurus returns the thesaurus with the specified name, or an empty thesaurus if not found. 
+func (sb *SegmentBase) Thesaurus(name string) (segment.Thesaurus, error) { + thesaurus, err := sb.thesaurus(name) + if err == nil && thesaurus == nil { + return emptyThesaurus, nil + } + return thesaurus, err +} + +func (sb *SegmentBase) thesaurus(name string) (rv *Thesaurus, err error) { + fieldIDPlus1 := sb.fieldsMap[name] + if fieldIDPlus1 == 0 { + return nil, nil + } + pos := sb.fieldsSectionsMap[fieldIDPlus1-1][SectionSynonymIndex] + if pos > 0 { + rv = &Thesaurus{ + sb: sb, + name: name, + fieldID: fieldIDPlus1 - 1, + } + // skip the doc value offsets as doc values are not supported in thesaurus + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + } + thesLoc, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + fst, synTermMap, bytesRead, err := sb.synIndexCache.loadOrCreate(rv.fieldID, sb.mem[thesLoc:], sb.fileReader) + if err != nil { + return nil, fmt.Errorf("thesaurus name %s err: %v", name, err) + } + rv.fst = fst + rv.synIDTermMap = synTermMap + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("thesaurus name %s vellum reader err: %v", name, err) + } + rv.bytesRead += bytesRead + } + return rv, nil +} + +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + return reuse + }, +} + +// VisitStoredFields invokes the StoredFieldValueVisitor for each stored field +// for the specified doc number +func (sb *SegmentBase) VisitStoredFields(num uint64, visitor segment.StoredFieldValueVisitor) error { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return sb.visitStoredFields(vdc, num, visitor) +} + +func (sb *SegmentBase) 
visitStoredFields(vdc *visitDocumentCtx, num uint64, + visitor segment.StoredFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < sb.numDocs { + meta, compressed, err := sb.getDocStoredMetaAndCompressed(num) + if err != nil { + return err + } + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + idFieldVal := compressed[:idFieldValLen] + + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + if err != nil { + return err + } + + for keepGoing { + field, err := binary.ReadUvarint(&vdc.reader) + if err == io.EOF { + break + } + if err != nil { + return err + } + typ, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + offset, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + l, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + numap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + var arrayPos []uint64 + if numap > 0 { + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] + for i := 0; i < int(numap); i++ { + ap, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return err + } + arrayPos[i] = ap + } + } + value := uncompressed[offset : offset+l] + keepGoing = visitor(sb.fieldsInv[field], byte(typ), value, arrayPos) + } + + vdc.buf = uncompressed + } + return nil +} + +// DocID returns the value of the _id field for the given docNum +func (sb *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= sb.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed, err := 
sb.getDocStoredMetaAndCompressed(num) + if err != nil { + return nil, err + } + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + +// Count returns the number of documents in this segment. +func (sb *SegmentBase) Count() uint64 { + return sb.numDocs +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (sb *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(sb.fieldsMap) > 0 { + idDict, err := sb.dictionary("_id") + if err != nil { + return nil, err + } + + postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + for _, id := range ids { + if id <= sMaxStr { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } + } + } + + return rv, nil +} + +// Fields returns the field names used in this segment +func (sb *SegmentBase) Fields() []string { + return sb.fieldsInv +} + +// Path returns the path of this segment on disk +func (s *Segment) Path() string { + return s.path +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { + // clear contents from all caches before un-mmapping + s.invIndexCache.Clear() + s.vecIndexCache.Clear() + s.synIndexCache.Clear() + s.nstIndexCache.Clear() + + if s.mm != nil { + err = s.mm.Unmap() + } + // try to close file even if unmap failed + if s.f != nil { + err2 := s.f.Close() + if err == nil { + // try to return first error + err = err2 + } + } + + return +} + +// some helpers i started adding for the command-line utility + +// 
Data returns the underlying mmaped data slice +func (s *Segment) Data() []byte { + return s.mm +} + +// CRC returns the CRC value stored in the file footer +func (s *Segment) CRC() uint32 { + return s.crc +} + +// Version returns the file version in the file footer +func (s *Segment) Version() uint32 { + return s.version +} + +// ChunkFactor returns the chunk factor in the file footer +func (s *Segment) ChunkMode() uint32 { + return s.chunkMode +} + +// SectionsIndexOffset returns the sections index offset in the file footer +func (s *Segment) SectionsIndexOffset() uint64 { + return s.sectionsIndexOffset +} + +// StoredIndexOffset returns the stored value index offset in the file footer +func (s *Segment) StoredIndexOffset() uint64 { + return s.storedIndexOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. +func (s *Segment) DictAddr(field string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + dictStart := s.fieldsSectionsMap[fieldIDPlus1-1][SectionInvertedTextIndex] + if dictStart == 0 { + return 0, fmt.Errorf("no dictionary for field '%s'", field) + } + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(s.mem[dictStart : dictStart+binary.MaxVarintLen64]) + dictStart += uint64(n) + } + dictLoc, _ := binary.Uvarint(s.mem[dictStart : dictStart+binary.MaxVarintLen64]) + return dictLoc, nil +} + +// VectorAddr is a helper function to compute the file offset where the +// vector index is stored for the specified field. 
+func (s *Segment) VectorAddr(name string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[name] + if !ok { + return 0, fmt.Errorf("no such field '%s'", name) + } + vectorStart := s.fieldsSectionsMap[fieldIDPlus1-1][SectionFaissVectorIndex] + if vectorStart == 0 { + return 0, fmt.Errorf("no vector index for field '%s'", name) + } + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(s.mem[vectorStart : vectorStart+binary.MaxVarintLen64]) + vectorStart += uint64(n) + } + vectorLoc, _ := binary.Uvarint(s.mem[vectorStart : vectorStart+binary.MaxVarintLen64]) + return vectorLoc, nil +} + +// ThesaurusAddr is a helper function to compute the file offset where the +// thesaurus is stored with the specified name. +func (s *Segment) ThesaurusAddr(name string) (uint64, error) { + fieldIDPlus1, ok := s.fieldsMap[name] + if !ok { + return 0, fmt.Errorf("no such thesaurus '%s'", name) + } + thesaurusStart := s.fieldsSectionsMap[fieldIDPlus1-1][SectionSynonymIndex] + if thesaurusStart == 0 { + return 0, fmt.Errorf("no such thesaurus '%s'", name) + } + for i := 0; i < 2; i++ { + _, n := binary.Uvarint(s.mem[thesaurusStart : thesaurusStart+binary.MaxVarintLen64]) + thesaurusStart += uint64(n) + } + thesLoc, _ := binary.Uvarint(s.mem[thesaurusStart : thesaurusStart+binary.MaxVarintLen64]) + return thesLoc, nil +} + +// EdgeListAddr is the exported helper function to compute the +// file offset where the edge list is stored. 
+func (s *Segment) EdgeListAddr() (uint64, error) { + return s.getEdgeListOffset(), nil +} + +func (sb *SegmentBase) loadDvReaders() error { + if sb.numDocs == 0 { + return nil + } + for fieldID, sections := range sb.fieldsSectionsMap { + for secID, secOffset := range sections { + if secOffset > 0 { + pos := secOffset + var read uint64 + fieldLocStart, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %v", sb.fieldsInv[fieldID]) + } + pos += uint64(n) + read += uint64(n) + fieldLocEnd, n := binary.Uvarint(sb.mem[pos : pos+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %v", sb.fieldsInv[fieldID]) + } + pos += uint64(n) + read += uint64(n) + + sb.incrementBytesRead(read) + + fieldDvReader, err := sb.loadFieldDocValueReader(sb.fieldsInv[fieldID], fieldLocStart, fieldLocEnd) + if err != nil { + return err + } + if fieldDvReader != nil { + if sb.fieldDvReaders[secID] == nil { + sb.fieldDvReaders[secID] = make([]*docValueReader, len(sb.fieldsInv)) + } + sb.fieldDvReaders[secID][uint16(fieldID)] = fieldDvReader + sb.fieldDvNames = append(sb.fieldDvNames, sb.fieldsInv[fieldID]) + } + } + } + } + + return nil +} + +// Getter method to retrieve updateFieldInfo within segment base +func (s *SegmentBase) GetUpdatedFields() map[string]*index.UpdateFieldInfo { + return s.updatedFields +} + +// Setter method to store updateFieldInfo within segment base +func (s *SegmentBase) SetUpdatedFields(updatedFields map[string]*index.UpdateFieldInfo) { + s.updatedFields = updatedFields +} + +// Ancestors returns a slice of document numbers representing the ancestors of the +// specified document (docNum) within the segment. If the document has no ancestors, +// a slice containing only the document number itself is returned. 
The prealloc +// parameter allows for reusing a preallocated slice to avoid additional allocations. +func (sb *SegmentBase) Ancestors(docNum uint64, prealloc []index.AncestorID) []index.AncestorID { + return sb.nstIndexCache.ancestry(docNum, prealloc) +} + +// CountRoot returns the number of root documents in the segment, excluding any +// documents that are marked as deleted in the provided bitmap. The deleted bitmap +// may contain both root and sub-document numbers, and the method ensures that +// only root documents are counted. +func (sb *SegmentBase) CountRoot(deleted *roaring.Bitmap) uint64 { + // the formula is as follows: + // Total Docs (T) = Root Docs (R) + Sub Docs (S) + // R = T - S + // Now if we have D deleted docs, some of which may be sub-docs, we need to exclude + // those from the root doc count. Let D = dR + dS, where dR is the number of deleted + // root docs and dS is the number of deleted sub docs. + // dR = D - dS + // Therefore, the count of root docs excluding deleted ones is: + // R - dR = (T - S) - (D - dS) + return (sb.Count() - sb.countNested()) - (sb.nstIndexCache.countRoot(deleted)) +} + +// AddNestedDocuments returns a bitmap containing the original document numbers in drops, +// plus any descendant document numbers for each dropped document. The drops +// parameter represents a set of document numbers to be dropped, and the returned +// bitmap includes both the original drops and all their descendants (if any). +func (sb *SegmentBase) AddNestedDocuments(drops *roaring.Bitmap) *roaring.Bitmap { + // If no drops or no subDocs, nothing to do + if drops == nil || drops.GetCardinality() == 0 || sb.countNested() == 0 { + return drops + } + // Get the edge list for this segment + el := sb.EdgeList() + // Algorithm => iterate through each child->parent mapping in the edge list, + // and for each pair, check if the parent is in the drops bitmap. 
+ // If it is, and the child is also not already in the drops bitmap, + // add the child to the drops. Repeat this process until no + // new additions are made in an iteration. + changed := true + for changed { + changed = false + el.Iterate(func(child uint64, parent uint64) bool { + if drops.Contains(uint32(parent)) && !drops.Contains(uint32(child)) { + drops.Add(uint32(child)) + changed = true + } + return true + }) + } + return drops +} + +// EdgeList returns an EdgeList interface representing the parent-child relationships between documents in the segment. +// The EdgeList interface allows iteration over child-parent document pairs, enabling navigation of document hierarchies. +// The underlying implementation may use a map or a slice, but callers should rely on the interface methods. +func (sb *SegmentBase) EdgeList() EdgeList { + return sb.nstIndexCache.edgeList() +} + +// Utility method to count the number of nested documents in the segment, not exported. +func (sb *SegmentBase) countNested() uint64 { + return sb.nstIndexCache.countNested() +} + +func (sb *SegmentBase) CallbackId() string { + return sb.fileReader.id +} diff --git a/vendor/github.com/blevesearch/zapx/v17/sizes.go b/vendor/github.com/blevesearch/zapx/v17/sizes.go new file mode 100644 index 0000000000..34166ea330 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/sizes.go @@ -0,0 +1,59 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "reflect" +) + +func init() { + var b bool + SizeOfBool = int(reflect.TypeOf(b).Size()) + var f32 float32 + SizeOfFloat32 = int(reflect.TypeOf(f32).Size()) + var f64 float64 + SizeOfFloat64 = int(reflect.TypeOf(f64).Size()) + var i int + SizeOfInt = int(reflect.TypeOf(i).Size()) + var m map[int]int + SizeOfMap = int(reflect.TypeOf(m).Size()) + var ptr *int + SizeOfPtr = int(reflect.TypeOf(ptr).Size()) + var slice []int + SizeOfSlice = int(reflect.TypeOf(slice).Size()) + var str string + SizeOfString = int(reflect.TypeOf(str).Size()) + var u8 uint8 + SizeOfUint8 = int(reflect.TypeOf(u8).Size()) + var u16 uint16 + SizeOfUint16 = int(reflect.TypeOf(u16).Size()) + var u32 uint32 + SizeOfUint32 = int(reflect.TypeOf(u32).Size()) + var u64 uint64 + SizeOfUint64 = int(reflect.TypeOf(u64).Size()) +} + +var SizeOfBool int +var SizeOfFloat32 int +var SizeOfFloat64 int +var SizeOfInt int +var SizeOfMap int +var SizeOfPtr int +var SizeOfSlice int +var SizeOfString int +var SizeOfUint8 int +var SizeOfUint16 int +var SizeOfUint32 int +var SizeOfUint64 int diff --git a/vendor/github.com/blevesearch/zapx/v17/synonym_cache.go b/vendor/github.com/blevesearch/zapx/v17/synonym_cache.go new file mode 100644 index 0000000000..4c0e872e37 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/synonym_cache.go @@ -0,0 +1,141 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "encoding/binary" + "fmt" + "sync" + + "github.com/blevesearch/vellum" +) + +func newSynonymIndexCache() *synonymIndexCache { + return &synonymIndexCache{ + cache: make(map[uint16]*synonymCacheEntry), + } +} + +type synonymIndexCache struct { + m sync.RWMutex + + cache map[uint16]*synonymCacheEntry +} + +// Clear clears the synonym cache which would mean tha the termID to term map would no longer be available. +func (sc *synonymIndexCache) Clear() { + sc.m.Lock() + sc.cache = nil + sc.m.Unlock() +} + +// loadOrCreate loads the synonym index cache for the specified fieldID if it is already present, +// or creates it if not. The synonym index cache for a fieldID consists of a tuple: +// - A Vellum FST (Finite State Transducer) representing the thesaurus. +// - A map associating synonym IDs to their corresponding terms. +// This function returns the loaded or newly created tuple (FST and map). +func (sc *synonymIndexCache) loadOrCreate(fieldID uint16, mem []byte, r *FileReader) (*vellum.FST, map[uint32][]byte, uint64, error) { + sc.m.RLock() + entry, ok := sc.cache[fieldID] + if ok { + sc.m.RUnlock() + return entry.load() + } + + sc.m.RUnlock() + + sc.m.Lock() + defer sc.m.Unlock() + + entry, ok = sc.cache[fieldID] + if ok { + return entry.load() + } + + return sc.createAndCacheLOCKED(fieldID, mem, r) +} + +// createAndCacheLOCKED creates the synonym index cache for the specified fieldID and caches it. 
+func (sc *synonymIndexCache) createAndCacheLOCKED(fieldID uint16, mem []byte, r *FileReader) (*vellum.FST, map[uint32][]byte, uint64, error) { + var pos uint64 + vellumLen, read := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + if vellumLen == 0 || read <= 0 { + return nil, nil, 0, fmt.Errorf("vellum length is 0") + } + pos += uint64(read) + fstBytes, err := r.process(mem[pos : pos+vellumLen]) + if err != nil { + return nil, nil, 0, err + } + fst, err := vellum.Load(fstBytes) + if err != nil { + return nil, nil, 0, fmt.Errorf("vellum err: %v", err) + } + pos += vellumLen + numSyns, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + if numSyns == 0 { + return nil, nil, 0, fmt.Errorf("no synonyms found") + } + mapLen, n := binary.Uvarint(mem[pos : pos+binary.MaxVarintLen64]) + pos += uint64(n) + if mapLen == 0 { + return nil, nil, 0, fmt.Errorf("synonym term map length is 0") + } + buf, err := r.process(mem[pos : pos+mapLen]) + if err != nil { + return nil, nil, 0, err + } + pos += mapLen + bufLen := uint64(len(buf)) + var bufPos uint64 + synTermMap := make(map[uint32][]byte, numSyns) + for i := 0; i < int(numSyns); i++ { + synID, n := binary.Uvarint(buf[bufPos:min(bufPos+binary.MaxVarintLen64, bufLen)]) + bufPos += uint64(n) + termLen, n := binary.Uvarint(buf[bufPos:min(bufPos+binary.MaxVarintLen64, bufLen)]) + bufPos += uint64(n) + if termLen == 0 { + return nil, nil, 0, fmt.Errorf("term length is 0") + } + term := buf[bufPos : bufPos+uint64(termLen)] + bufPos += uint64(termLen) + synTermMap[uint32(synID)] = term + } + sc.insertLOCKED(fieldID, fst, synTermMap) + return fst, synTermMap, pos, nil +} + +// insertLOCKED inserts the vellum FST and the map of synonymID to term into the cache for the specified fieldID. 
+func (sc *synonymIndexCache) insertLOCKED(fieldID uint16, fst *vellum.FST, synTermMap map[uint32][]byte) { + _, ok := sc.cache[fieldID] + if !ok { + sc.cache[fieldID] = &synonymCacheEntry{ + fst: fst, + synTermMap: synTermMap, + } + } +} + +// synonymCacheEntry is a tuple of the vellum FST and the map of synonymID to term, +// and is the value stored in the synonym cache, for a given fieldID. +type synonymCacheEntry struct { + fst *vellum.FST + synTermMap map[uint32][]byte +} + +func (ce *synonymCacheEntry) load() (*vellum.FST, map[uint32][]byte, uint64, error) { + return ce.fst, ce.synTermMap, 0, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/synonym_posting.go b/vendor/github.com/blevesearch/zapx/v17/synonym_posting.go new file mode 100644 index 0000000000..9424dd5643 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/synonym_posting.go @@ -0,0 +1,242 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + + "github.com/RoaringBitmap/roaring/v2" + "github.com/RoaringBitmap/roaring/v2/roaring64" + segment "github.com/blevesearch/scorch_segment_api/v2" +) + +var reflectStaticSizeSynonymsList int +var reflectStaticSizeSynonymsIterator int +var reflectStaticSizeSynonym int + +func init() { + var sl SynonymsList + reflectStaticSizeSynonymsList = int(reflect.TypeOf(sl).Size()) + var si SynonymsIterator + reflectStaticSizeSynonymsIterator = int(reflect.TypeOf(si).Size()) + var s Synonym + reflectStaticSizeSynonym = int(reflect.TypeOf(s).Size()) +} + +// SynonymsList represents a list of synonyms for a term, stored in a Roaring64 bitmap. +type SynonymsList struct { + sb *SegmentBase + synonymsOffset uint64 + synonyms *roaring64.Bitmap + except *roaring.Bitmap + + synIDTermMap map[uint32][]byte + + buffer *bytes.Reader +} + +// immutable, empty synonyms list +var emptySynonymsList = &SynonymsList{} + +func (p *SynonymsList) Size() int { + sizeInBytes := reflectStaticSizeSynonymsList + SizeOfPtr + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + +// Iterator creates and returns a SynonymsIterator for the SynonymsList. +// If the synonyms bitmap is nil, it returns an empty iterator. +func (s *SynonymsList) Iterator(prealloc segment.SynonymsIterator) segment.SynonymsIterator { + if s.synonyms == nil { + return emptySynonymsIterator + } + + var preallocSI *SynonymsIterator + pi, ok := prealloc.(*SynonymsIterator) + if ok && pi != nil { + preallocSI = pi + } + if preallocSI == emptySynonymsIterator { + preallocSI = nil + } + + return s.iterator(preallocSI) +} + +// iterator initializes a SynonymsIterator for the SynonymsList and returns it. +// If a preallocated iterator is provided, it resets and reuses it; otherwise, it creates a new one. 
+func (s *SynonymsList) iterator(rv *SynonymsIterator) *SynonymsIterator { + if rv == nil { + rv = &SynonymsIterator{} + } else { + *rv = SynonymsIterator{} // clear the struct + } + rv.synonyms = s + rv.except = s.except + rv.Actual = s.synonyms.Iterator() + rv.ActualBM = s.synonyms + rv.synIDTermMap = s.synIDTermMap + return rv +} + +// read initializes a SynonymsList by reading data from the given synonymsOffset in the Thesaurus. +// It reads and parses the Roaring64 bitmap that represents the synonyms. +func (rv *SynonymsList) read(synonymsOffset uint64, t *Thesaurus) error { + rv.synonymsOffset = synonymsOffset + + var n uint64 + var read int + + var synonymsLen uint64 + synonymsLen, read = binary.Uvarint(t.sb.mem[synonymsOffset+n : synonymsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes, err := t.sb.fileReader.process(t.sb.mem[synonymsOffset+n : synonymsOffset+n+synonymsLen]) + if err != nil { + return err + } + + if rv.synonyms == nil { + rv.synonyms = roaring64.NewBitmap() + } + + rv.buffer.Reset(roaringBytes) + + _, err = rv.synonyms.ReadFrom(rv.buffer) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + return nil +} + +// ----------------------------------------------------------------------------- + +// SynonymsIterator provides a way to iterate through the synonyms list. +type SynonymsIterator struct { + synonyms *SynonymsList + except *roaring.Bitmap + + Actual roaring64.IntPeekable64 + ActualBM *roaring64.Bitmap + + synIDTermMap map[uint32][]byte + nextSyn Synonym +} + +// immutable, empty synonyms iterator +var emptySynonymsIterator = &SynonymsIterator{} + +func (i *SynonymsIterator) Size() int { + sizeInBytes := reflectStaticSizeSynonymsIterator + SizeOfPtr + + i.nextSyn.Size() + + return sizeInBytes +} + +// Next returns the next Synonym in the iteration or an error if the end is reached. 
+func (i *SynonymsIterator) Next() (segment.Synonym, error) { + return i.next() +} + +// next retrieves the next synonym from the iterator, populates the nextSyn field, +// and returns it. If no valid synonym is found, it returns an error. +func (i *SynonymsIterator) next() (segment.Synonym, error) { + synID, docNum, exists, err := i.nextSynonym() + if err != nil || !exists { + return nil, err + } + + if i.synIDTermMap == nil { + return nil, fmt.Errorf("synIDTermMap is nil") + } + + // If the synonymID is not found in the map, return an error + term, exists := i.synIDTermMap[synID] + if !exists { + return nil, fmt.Errorf("synonymID %d not found in map", synID) + } + + i.nextSyn = Synonym{} // clear the struct + rv := &i.nextSyn + rv.term = string(term) + rv.docNum = docNum + + return rv, nil +} + +// nextSynonym decodes the next synonym from the roaring bitmap iterator, +// ensuring it is not in the "except" set. Returns the synonymID, docNum, +// and a flag indicating success. +func (i *SynonymsIterator) nextSynonym() (uint32, uint32, bool, error) { + // If no synonyms are available, return early + if i.Actual == nil || i.synonyms == nil || i.synonyms == emptySynonymsList { + return 0, 0, false, nil + } + + var code uint64 + var docNum uint32 + var synID uint32 + + // Loop to find the next valid docNum, checking against the except + for i.Actual.HasNext() { + code = i.Actual.Next() + synID, docNum = decodeSynonym(code) + + // If docNum is not in the 'except' set, it's a valid result + if i.except == nil || !i.except.Contains(docNum) { + return synID, docNum, true, nil + } + } + + // If no valid docNum is found, return false + return 0, 0, false, nil +} + +// Synonym represents a single synonym, containing the term, synonymID, and document number. +type Synonym struct { + term string + docNum uint32 +} + +// Size returns the memory size of the Synonym, including the length of the term string. 
+func (p *Synonym) Size() int { + sizeInBytes := reflectStaticSizeSynonym + SizeOfPtr + + len(p.term) + + return sizeInBytes +} + +// Term returns the term of the Synonym. +func (s *Synonym) Term() string { + return s.term +} + +// Number returns the document number of the Synonym. +func (s *Synonym) Number() uint32 { + return s.docNum +} + +// decodeSynonym decodes a synonymCode into its synonymID and document ID components. +func decodeSynonym(synonymCode uint64) (synonymID uint32, docID uint32) { + return uint32(synonymCode >> 32), uint32(synonymCode) +} diff --git a/vendor/github.com/blevesearch/zapx/v17/thesaurus.go b/vendor/github.com/blevesearch/zapx/v17/thesaurus.go new file mode 100644 index 0000000000..f97aaf293b --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/thesaurus.go @@ -0,0 +1,161 @@ +// Copyright (c) 2024 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "fmt" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" + segment "github.com/blevesearch/scorch_segment_api/v2" + "github.com/blevesearch/vellum" +) + +// Thesaurus is the zap representation of a Thesaurus +type Thesaurus struct { + sb *SegmentBase + name string + fieldID uint16 + synIDTermMap map[uint32][]byte + fst *vellum.FST + + fstReader *vellum.Reader + + bytesRead uint64 +} + +// represents an immutable, empty Thesaurus +var emptyThesaurus = &Thesaurus{} + +// SynonymsList returns the synonyms list for the specified term +func (t *Thesaurus) SynonymsList(term []byte, except *roaring.Bitmap, prealloc segment.SynonymsList) (segment.SynonymsList, error) { + var preallocSL *SynonymsList + sl, ok := prealloc.(*SynonymsList) + if ok && sl != nil { + preallocSL = sl + } + return t.synonymsList(term, except, preallocSL) +} + +func (t *Thesaurus) synonymsList(term []byte, except *roaring.Bitmap, rv *SynonymsList) (*SynonymsList, error) { + if t.fstReader == nil { + if rv == nil || rv == emptySynonymsList { + return emptySynonymsList, nil + } + return t.synonymsListInit(rv, except), nil + } + + synonymsOffset, exists, err := t.fstReader.Get(term) + + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if !exists { + if rv == nil || rv == emptySynonymsList { + return emptySynonymsList, nil + } + return t.synonymsListInit(rv, except), nil + } + + return t.synonymsListFromOffset(synonymsOffset, except, rv) +} + +func (t *Thesaurus) synonymsListFromOffset(synonymsOffset uint64, except *roaring.Bitmap, rv *SynonymsList) (*SynonymsList, error) { + rv = t.synonymsListInit(rv, except) + + err := rv.read(synonymsOffset, t) + if err != nil { + return nil, err + } + + return rv, nil +} + +func (t *Thesaurus) synonymsListInit(rv *SynonymsList, except *roaring.Bitmap) *SynonymsList { + if rv == nil || rv == emptySynonymsList { + rv = &SynonymsList{} + rv.buffer = bytes.NewReader(nil) 
+ } else { + synonyms := rv.synonyms + buf := rv.buffer + if synonyms != nil { + synonyms.Clear() + } + if buf != nil { + buf.Reset(nil) + } + + *rv = SynonymsList{} // clear the struct + + rv.synonyms = synonyms + rv.buffer = buf + } + rv.sb = t.sb + rv.except = except + rv.synIDTermMap = t.synIDTermMap + return rv +} + +func (t *Thesaurus) Contains(key []byte) (bool, error) { + if t.fst != nil { + return t.fst.Contains(key) + } + return false, nil +} + +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (t *Thesaurus) AutomatonIterator(a segment.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.ThesaurusIterator { + if t.fst != nil { + rv := &ThesaurusIterator{ + t: t, + } + + itr, err := t.fst.Search(a, startKeyInclusive, endKeyExclusive) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + + return rv + } + return emptyThesaurusIterator +} + +var emptyThesaurusIterator = &ThesaurusIterator{} + +// ThesaurusIterator is an iterator for term dictionary +type ThesaurusIterator struct { + t *Thesaurus + itr vellum.Iterator + err error + entry index.ThesaurusEntry +} + +// Next returns the next entry in the dictionary +func (i *ThesaurusIterator) Next() (*index.ThesaurusEntry, error) { + if i.err != nil && i.err != vellum.ErrIteratorDone { + return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } + term, _ := i.itr.Current() + i.entry.Term = string(term) + i.err = i.itr.Next() + return &i.entry, nil +} diff --git a/vendor/github.com/blevesearch/zapx/v17/write.go b/vendor/github.com/blevesearch/zapx/v17/write.go new file mode 100644 index 0000000000..10065134ac --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/write.go @@ -0,0 +1,192 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "encoding/binary" + "io" + + "github.com/RoaringBitmap/roaring/v2" + index "github.com/blevesearch/bleve_index_api" +) + +// writes out the length of the roaring bitmap in bytes as varint +// then writes out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() + if err != nil { + return 0, err + } + + var tw int + + if fw, ok := w.(*FileWriter); ok && fw != nil { + buf = fw.process(buf) + } + + // write out the length + n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) + nw, err := w.Write(reuseBufVarint[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the roaring bytes + nw, err = w.Write(buf) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistFieldsSection(fieldsInv []string, + fieldsOptions map[string]index.FieldIndexingOptions, w *FileWriter, + opaque map[int]resetable) (uint64, error) { + var rv uint64 + fieldsOffsets := make([]uint64, 0, len(fieldsInv)) + + for fieldID, fieldName := range fieldsInv { + // record start of this field + fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) + fieldOpts := fieldsOptions[fieldName] + fieldName := w.process([]byte(fieldName)) + + // write field name length + _, err := writeUvarints(w, uint64(len(fieldName))) + if err != nil { + return 0, err + } + + // write out the 
field name + _, err = w.Write(fieldName) + if err != nil { + return 0, err + } + + // write out the field options + _, err = writeUvarints(w, uint64(fieldOpts)) + if err != nil { + return 0, err + } + + // write out the number of field-specific indexes + _, err = writeUvarints(w, uint64(len(segmentSections))) + if err != nil { + return 0, err + } + + // now write pairs of index section ids, and start addresses for each field + // which has a specific section's data. this serves as the starting point + // using which a field's section data can be read and parsed. + for segmentSectionType, segmentSectionImpl := range segmentSections { + binary.Write(w, binary.BigEndian, segmentSectionType) + binary.Write(w, binary.BigEndian, uint64(segmentSectionImpl.AddrForField(opaque, fieldID))) + } + } + + rv = uint64(w.Count()) + // write out number of fields + _, err := writeUvarints(w, uint64(len(fieldsInv))) + if err != nil { + return 0, err + } + // now write out the fields index + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +// FooterSize is the size of the footer record in bytes +// crc + id length + ver + chunk + sectionsIndexOffset + stored offset + num docs +// Does not include the length of the id because it is variable length +const FooterSize = 4 + 4 + 4 + 4 + 8 + 8 + 8 + +func persistFooter(numDocs, storedIndexOffset, sectionsIndexOffset uint64, + chunkMode, crcBeforeFooter uint32, writerIn io.Writer, fileWriterID string) error { + w := NewCountHashWriter(writerIn) + w.crc = crcBeforeFooter + + // Write the writer id + _, err := w.Write([]byte(fileWriterID)) + if err != nil { + return err + } + + // Write the length of the writer id + err = binary.Write(w, binary.BigEndian, uint32(len(fileWriterID))) + if err != nil { + return err + } + + // write out the number of docs + err = binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return 
err + } + + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + + // write out the sections index location + err = binary.Write(w, binary.BigEndian, sectionsIndexOffset) + if err != nil { + return err + } + + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkMode) + if err != nil { + return err + } + + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, Version) + if err != nil { + return err + } + + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.crc) + if err != nil { + return err + } + return nil +} + +func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { + buf := make([]byte, binary.MaxVarintLen64) + for _, val := range vals { + n := binary.PutUvarint(buf, val) + var nw int + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + return tw, err +} diff --git a/vendor/github.com/blevesearch/zapx/v17/zap.md b/vendor/github.com/blevesearch/zapx/v17/zap.md new file mode 100644 index 0000000000..f082dc8d60 --- /dev/null +++ b/vendor/github.com/blevesearch/zapx/v17/zap.md @@ -0,0 +1,338 @@ +# ZAP File Format + +## Legend + +### File Sections + + |========| + | | file section + |========| + +### Fixed-size fields + + |--------| |----| |--| |-| + | | uint64 | | uint32 | | uint16 | | uint8 + |--------| |----| |--| |-| + +### Varints + + |~~~~~~~~| + | | varint(up to uint64) + |~~~~~~~~| + +### Arbitrary-length fields + + |--------...---| + | | arbitrary-length field (string, vellum, roaring bitmap) + |--------...---| + +### Chunked data + + [--------] + [ ] + [--------] + +## Overview + +Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. 
+ + +=================================================================+ + | Stored Fields | + |=================================================================| + +-----> | Stored Fields Index | + | |=================================================================| + | | Inverted Text Index Section | + | |=================================================================| + | | Vector Index Section | + | |=================================================================| + | | Synonym Index Section | + | |=================================================================| + | | Sections Info | + | |=================================================================| + | +-> | Sections Index | + | | |==..==+=======+======+======+=====+======+=====+======+==========| + | | | ID | IDL | D# | SF | S | CF | V | CC | (Footer) | + | | +==..==+=======+======+======+=====+======+=====+======+==========+ + | | | | + +---------------------------------+ | + | | + +-----------------------------------+ + + ID. ID of the Writer Used. + IDL. Length of the Writer ID. + D#. Number of Docs. + SF. Stored Fields Index Offset. + S. Sections Index Offset + CF. Chunk Factor. + V. Version. + CC. CRC32. + +## Stored Fields + +Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. +We also store the EdgeList for nested documents, if present in the segment, to preserve hierarchical relationships. +If there are NE edges, it means there are NE nested or sub-documents, with each edge representing a child -> parent relationship. + + 0 [SF] [SF + D# * 8] + | Stored Fields | Stored Fields Index | Edge List Information | + |================================|==================================|========================================================================| + | | | | + | |--------------------| ||--------|--------|. . .|--------|||~~~~~~~~|~~~~~~~~|~~~~~~~~|~~~~~~~~|~~~~~~~~|. . 
.|~~~~~~~~~|~~~~~~~~~|| + | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 ||| NE | C1 | P1 | C2 | P2 | | CNE | PNE || + | | |--------------------| ||--------|----|---|. . .|--------|||~~~~~~~~|~~~~~~~~|~~~~~~~~|~~~~~~~~|~~~~~~~~|. . .|~~~~~~~~~|~~~~~~~~~|| + | | | | | | + |===|============================|==============|===================|========================================================================| + + NE. Number of edges in the edge list. + Ci. Child Document Number for edge i. + Pi. Parent Document Number for edge i. + +Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. + + Stored Fields Data + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + | MDS | CDS | MD | CD | + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + + MDS. Metadata size. + CDS. Compressed data size. + MD. Metadata. + CD. Snappy-compressed data. + +## Index Sections + +Sections Index is a set of NF uint64 addresses (0 through F# - 1) each of which are offsets to the records in the Sections Info. Inside the sections info, we have further offsets to specific type of index section for that particular field in the segment file. For example, field 0 may correspond to Vector Indexing and its records would have offsets to the Vector Index Section whereas a field 1 may correspond to Text Indexing and its records would rather point to somewhere within the Inverted Text Index Section. + + (...) [F] [F + F#] + + Sections Info + Sections Index + + |===========================================================================|=================================| + | | | + | +--------+------+---+----+---------+---------+~~~~~+--+...+--+~~~~~~~~~+ | +------+------+...+------+----+ | + +---->| Length | Name | O | NS | S1 Type | S1 Addr | ... 
| Sn Type | Sn Addr | | | 0 | 1 | | F#-1 | NF | | + | | +--------+------+---+----+---------+---------+~~~~~+--+...+--+~~~~~~~~~+ | +------+----+-+...+------+----+ | + | | | | | + | +===========================================================================+=============|===================+ + | | + +--------------------------------------------------------------------------------------------+ + + NF. Number of fields + NS. Number of index sections + O. Field Indexing Options + Sn. nth index section + +## Inverted Text Index Section + +Each field has its own types of indexes in separate sections as indicated above. This can be a vector index or inverted text index. + +In case of inverted text index, the dictionary is encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. + + +================================================================+- Inverted Text + | | Index Section + | | + | Freq/Norm (chunked) | + | [~~~~~~+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | +->[ Freq | Norm (float32 under varint) ] | + | | [~~~~~~+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | | | + | +------------------------------------------------------------+ | + | Location Details (chunked) | | + | [~~~~~~+~~~~~+~~~~~~~+~~~~~+~~~~~~+~~~~~~~~+~~~~~] | | + | +->[ Size | Pos | Start | End | Arr# | ArrPos | ... 
] | | + | | [~~~~~~+~~~~~+~~~~~~~+~~~~~+~~~~~~+~~~~~~~~+~~~~~] | | + | | | | + | +----------------------+ | | + | Postings List | | | + | +~~~~~~~~+~~~~~+~~+~~~~~~~~+----------+...+-+ | | + | +->+ F/N | LD | Length | ROARING BITMAP | | | + | | +~~~~~+~~|~~~~~~~~|~~~~~~~~+----------+...+-+ | | + | | +----------------------------------------------+ | + | +-------------------------------------------------+ | + | | | + | Dictionary | | + | +~~~~~~~~~~+~~~~~~~+~~~~~~~~+--------------------------+-...-+ | + +-----> DV Start | DV End| Length | VELLUM DATA : (TERM -> OFFSET) | | + | | +~~~~~~~~~~+~~~~~~~+~~~~~~~~+----------------------------...-+ | + | | | + | | | + | |================================================================+- Vector Index Section + | | | + | +================================================================+- Synonym Index Section + | | | + | |================================================================+- Sections Info + +-----------------------------+ | + | | | + | +-------+-----+-----+------+~~~~~~~~+~~~~~~~~+--+...+--+ | + | | ... | ITI | ITI ADDR | NS | Length | Name | | + | +-------+-----+------------+~~~~~~~~+~~~~~~~~+--+...+--+ | + +================================================================+ + + + ITI - Inverted Text Index + +## Vector Index Section + +In a vector index, each vector is assigned a unique, monotonically increasing ID ranging from `0` to `N-1`, where `N` is the total number of vectors in the index. This ID is used internally by the [Faiss](https://github.com/blevesearch/faiss) index. Each vector ID maps to a document ID within the segment, and this mapping is stored as an array of size `N`. 
+ + |================================================================+- Inverted Text Index Section + | | + |================================================================+- Vector Index Section + | | + | +~~~~~~~~~~+~~~~~~~~+~~~~~+~~~~~~+~~~~~~+ | + +-------> DV Start | DV End | VIO | NVEC | ML | | + | | +~~~~~~~~~~+~~~~~~~~+~~~~~+~~~~~~+~~~~~~+ | + | | | + | | +~~~~~~~~~~~~~+ | + | | | DocID_1 | | + | | +~~~~~~~~~~~~~+ | + | | | DocID_2 | | + | | +~~~~~~~~~~~~~+ | + | | | ... | | + | | +~~~~~~~~~~~~~+ | + | | | DocID_N | | + | | +~~~~~~~~~~~~~+ | + | | | + | | +~~~~~~~~~~~~~+ | + | | | INDEX TYPE | | + | | +~~~~~~~~~~~~~+ | + | | +~~~~~~~~~~~~~+ | + | | | INDEX DATA | | + | | +~~~~~~~~~~~~~+ | + | | | + | |================================================================+- Synonym Index Section + | | | + | |================================================================+- Sections Info + +-----------------------------+ | + | | | + | +-------+-----+-----+------+~~~~~~~~+~~~~~~~~+--+...+--+ | + | | ... | VI | VI ADDR | NS | Length | Name | | + | +-------+-----+------------+~~~~~~~~+~~~~~~~~+--+...+--+ | + +================================================================+ + + VI - Vector Index + VIO - Vector Index Optimized for + NVEC - Number of vectors + ML - Length of the vector to document ID map + INDEX TYPE - Type of the vector index + INDEX DATA - Vector index data + +### Vector Index Type - FP32 + +FP32 vector indexes stores a singular FAISS index to perform search + + | +~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | FAISS LEN | | + | +~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | + | +----------+...+-----------+ | + | | SERIALIZED FAISS INDEX | | + | +----------+...+-----------+ | + + FAISS LEN - Length of the serialized faiss index + +### Vector Index Type - Binary + +Binary vector indexes stores two separate FAISS indexes to perform search. 
The first is a primary binary index and the second is a backing FP32 index + + | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | PRIMARY FAISS LEN | | + | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | + | +---------------+...+--------------+ | + | | SERIALIZED PRIMARY FAISS INDEX | | + | +---------------+...+--------------+ | + | | + | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | BACKING FAISS LEN | | + | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~+ | + | | + | +---------------+...+--------------+ | + | | SERIALIZED BACKING FAISS INDEX | | + | +---------------+...+--------------+ | + + PRIMARY FAISS LEN - Length of the serialized primary faiss index + BACKING FAISS LEN - Length of the serialized backing faiss index + +## Synonym Index Section + +In a synonyms index, the relationship between a term and its synonyms is represented using a Thesaurus. The Thesaurus is encoded in the [Vellum](https://github.com/couchbase/vellum) format and consists of pairs in the form `(term, offset)`. Here, the offset specifies the position of the postings list containing the synonyms for the given term. The postings list is stored as a Roaring64 bitmap, with each entry representing an encoded synonym for the term. 
+ + |================================================================+- Inverted Text Index Section + | | + |================================================================+- Vector Index Section + | | + +================================================================+- Synonym Index Section + | | + | (Offset) +~~~~~+----------+...+---+ | + | +--------->| RL | ROARING64 BITMAP | | + | | +~~~~~+----------+...+---+ +------------------------+ + | |(Term -> Offset) | + | | | + | +--------+ | + | | Term ID to Term map (NST Entries) | + | +~~~~+~~~~+~~~~~+~~~~[{~~~~~+~~~~+~~~~~~}{~~~~~+~~~~+~~~~~~}...{~~~~~+~~~~+~~~~~~}] | + | +->| VL | VD | NST | ML || TID | TL | Term || TID | TL | Term | | TID | TL | Term | | + | | +~~~~+~~~~+~~~~~+~~~~[{~~~~~+~~~~+~~~~~~}{~~~~~+~~~~+~~~~~~}...{~~~~~+~~~~+~~~~~~}] | + | | | + | +----------------------------+ | + | | | + | +~~~~~~~~~~+~~~~~~~~+~~~~~~~~~~~~~~~~~+ | + +-----> DV Start | DV End | ThesaurusOffset | | + | | +~~~~~~~~~~+~~~~~~~~+~~~~~~~~~~~~~~~~~+ +------------------------+ + | | | + | | | + | |================================================================+- Sections Info + +-----------------------------+ | + | | | + | +-------+-----+-----+------+~~~~~~~~+~~~~~~~~+--+...+--+ | + | | ... | SI | SI ADDR | NS | Length | Name | | + | +-------+-----+------------+~~~~~~~~+~~~~~~~~+--+...+--+ | + +================================================================+ + + SI - Synonym Index + VL - Vellum Length + VD - Vellum Data (Term -> Offset) + RL - Roaring64 Length + NST - Number of entries in the term ID to term map + ML - Length of the term ID to term map + TID - Term ID (32-bit) + TL - Term Length + +### Synonym Encoding + + ROARING64 BITMAP + + Each 64-bit entry consists of two parts: the first 32 bits represent the Term ID (TID), + and the next 32 bits represent the Document Number (DN). 
+ + [{~~~~~+~~~~}{~~~~~+~~~~}...{~~~~~+~~~~}] + | TID | DN || TID | DN | | TID | DN | + [{~~~~~+~~~~}{~~~~~+~~~~}...{~~~~~+~~~~}] + + TID - Term ID (32-bit) + DN - Document Number (32-bit) + +## Doc Values + +DocValue start and end offsets are stored within the section content of each field. This allows each field having its own type of index to choose whether to store the doc values or not. For example, it may not make sense to store doc values for vector indexing and so, the offsets can be invalid ones for it whereas the fields having text indexing may have valid doc values offsets. + + +================================================================+ + | +------...--+ | + | +->+ DocValues +<-+ | + | | +------...--+ | | + |==|=================|===========================================+- Inverted Text + ++~+~~~~~~~~~+~~~~~~~+~~+~~~~~~~~+-----------------------...--+ | Index Section + || DV START | DV END | LENGTH | VELLUM DATA: TERM -> OFFSET| | + ++~~~~~~~~~~~+~~~~~~~~~~+~~~~~~~~+-----------------------...--+ | + +================================================================+ + +DocValues is chunked Snappy-compressed values for each document and field. + + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ] + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + +Last 16 bytes are description of chunks. + + |~~~~~~~~~~~~...~|----------------|----------------| + | Chunk Sizes | Chunk Size Arr | Chunk# | + |~~~~~~~~~~~~...~|----------------|----------------| diff --git a/vendor/github.com/golang/snappy/README b/vendor/github.com/golang/snappy/README index cea12879a0..fd191f78c7 100644 --- a/vendor/github.com/golang/snappy/README +++ b/vendor/github.com/golang/snappy/README @@ -1,8 +1,13 @@ The Snappy compression format in the Go programming language. 
-To download and install from source: +To use as a library: $ go get github.com/golang/snappy +To use as a binary: +$ go install github.com/golang/snappy/cmd/snappytool@latest +$ cat decoded | ~/go/bin/snappytool -e > encoded +$ cat encoded | ~/go/bin/snappytool -d > decoded + Unless otherwise noted, the Snappy-Go source files are distributed under the BSD-style license found in the LICENSE file. diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s index f8d54adfc5..f0c876a248 100644 --- a/vendor/github.com/golang/snappy/encode_arm64.s +++ b/vendor/github.com/golang/snappy/encode_arm64.s @@ -27,7 +27,7 @@ // The unusual register allocation of local variables, such as R10 for the // source pointer, matches the allocation used at the call site in encodeBlock, // which makes it easier to manually inline this function. -TEXT ·emitLiteral(SB), NOSPLIT, $32-56 +TEXT ·emitLiteral(SB), NOSPLIT, $40-56 MOVD dst_base+0(FP), R8 MOVD lit_base+24(FP), R10 MOVD lit_len+32(FP), R3 @@ -261,7 +261,7 @@ extendMatchEnd: // "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An // extra 64 bytes, to call other functions, and an extra 64 bytes, to spill // local variables (registers) during calls gives 32768 + 64 + 64 = 32896. 
-TEXT ·encodeBlock(SB), 0, $32896-56 +TEXT ·encodeBlock(SB), 0, $32904-56 MOVD dst_base+0(FP), R8 MOVD src_base+24(FP), R7 MOVD src_len+32(FP), R14 diff --git a/vendor/modules.txt b/vendor/modules.txt index 78eb406786..9e62916c8a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -76,8 +76,8 @@ github.com/ProtonMail/go-crypto/openpgp/packet github.com/ProtonMail/go-crypto/openpgp/s2k github.com/ProtonMail/go-crypto/openpgp/x25519 github.com/ProtonMail/go-crypto/openpgp/x448 -# github.com/RoaringBitmap/roaring/v2 v2.4.5 -## explicit; go 1.15 +# github.com/RoaringBitmap/roaring/v2 v2.14.5 +## explicit; go 1.24.0 github.com/RoaringBitmap/roaring/v2 github.com/RoaringBitmap/roaring/v2/internal github.com/RoaringBitmap/roaring/v2/roaring64 @@ -115,11 +115,11 @@ github.com/beorn7/perks/quantile # github.com/bitly/go-simplejson v0.5.0 ## explicit github.com/bitly/go-simplejson -# github.com/bits-and-blooms/bitset v1.22.0 +# github.com/bits-and-blooms/bitset v1.24.2 ## explicit; go 1.16 github.com/bits-and-blooms/bitset -# github.com/blevesearch/bleve/v2 v2.5.7 -## explicit; go 1.23 +# github.com/blevesearch/bleve/v2 v2.6.0 +## explicit; go 1.25.0 github.com/blevesearch/bleve/v2 github.com/blevesearch/bleve/v2/analysis github.com/blevesearch/bleve/v2/analysis/analyzer/custom @@ -161,18 +161,18 @@ github.com/blevesearch/bleve/v2/search/scorer github.com/blevesearch/bleve/v2/search/searcher github.com/blevesearch/bleve/v2/size github.com/blevesearch/bleve/v2/util -# github.com/blevesearch/bleve_index_api v1.2.11 -## explicit; go 1.21 +# github.com/blevesearch/bleve_index_api v1.3.11 +## explicit; go 1.24 github.com/blevesearch/bleve_index_api -# github.com/blevesearch/geo v0.2.4 -## explicit; go 1.21.0 +# github.com/blevesearch/geo v0.2.5 +## explicit; go 1.24 github.com/blevesearch/geo/geojson github.com/blevesearch/geo/r1 github.com/blevesearch/geo/r2 github.com/blevesearch/geo/r3 github.com/blevesearch/geo/s1 github.com/blevesearch/geo/s2 -# 
github.com/blevesearch/go-faiss v1.0.26 +# github.com/blevesearch/go-faiss v1.1.0 ## explicit; go 1.21 github.com/blevesearch/go-faiss # github.com/blevesearch/go-porterstemmer v1.0.3 @@ -181,11 +181,11 @@ github.com/blevesearch/go-porterstemmer # github.com/blevesearch/gtreap v0.1.1 ## explicit; go 1.13 github.com/blevesearch/gtreap -# github.com/blevesearch/mmap-go v1.0.4 -## explicit; go 1.13 +# github.com/blevesearch/mmap-go v1.2.0 +## explicit; go 1.24.0 github.com/blevesearch/mmap-go -# github.com/blevesearch/scorch_segment_api/v2 v2.3.13 -## explicit; go 1.21 +# github.com/blevesearch/scorch_segment_api/v2 v2.4.7 +## explicit; go 1.24.0 github.com/blevesearch/scorch_segment_api/v2 # github.com/blevesearch/segment v0.9.1 ## explicit; go 1.18 @@ -197,30 +197,33 @@ github.com/blevesearch/snowballstem/english # github.com/blevesearch/upsidedown_store_api v1.0.2 ## explicit; go 1.18 github.com/blevesearch/upsidedown_store_api -# github.com/blevesearch/vellum v1.1.0 -## explicit; go 1.21 +# github.com/blevesearch/vellum v1.2.0 +## explicit; go 1.24.0 github.com/blevesearch/vellum github.com/blevesearch/vellum/levenshtein github.com/blevesearch/vellum/regexp github.com/blevesearch/vellum/utf8 -# github.com/blevesearch/zapx/v11 v11.4.2 +# github.com/blevesearch/zapx/v11 v11.4.3 ## explicit; go 1.21 github.com/blevesearch/zapx/v11 -# github.com/blevesearch/zapx/v12 v12.4.2 +# github.com/blevesearch/zapx/v12 v12.4.3 ## explicit; go 1.21 github.com/blevesearch/zapx/v12 -# github.com/blevesearch/zapx/v13 v13.4.2 +# github.com/blevesearch/zapx/v13 v13.4.3 ## explicit; go 1.21 github.com/blevesearch/zapx/v13 -# github.com/blevesearch/zapx/v14 v14.4.2 +# github.com/blevesearch/zapx/v14 v14.4.3 ## explicit; go 1.21 github.com/blevesearch/zapx/v14 -# github.com/blevesearch/zapx/v15 v15.4.2 +# github.com/blevesearch/zapx/v15 v15.4.3 ## explicit; go 1.21 github.com/blevesearch/zapx/v15 -# github.com/blevesearch/zapx/v16 v16.2.8 -## explicit; go 1.23 +# 
github.com/blevesearch/zapx/v16 v16.3.4 +## explicit; go 1.24 github.com/blevesearch/zapx/v16 +# github.com/blevesearch/zapx/v17 v17.1.2 +## explicit; go 1.25.0 +github.com/blevesearch/zapx/v17 # github.com/bluele/gcache v0.0.2 ## explicit; go 1.15 github.com/bluele/gcache @@ -700,7 +703,7 @@ github.com/golang/groupcache/lru github.com/golang/protobuf/jsonpb github.com/golang/protobuf/proto github.com/golang/protobuf/ptypes/empty -# github.com/golang/snappy v0.0.4 +# github.com/golang/snappy v1.0.0 ## explicit github.com/golang/snappy # github.com/google/go-cmp v0.7.0