-
Notifications
You must be signed in to change notification settings - Fork 709
Open
Labels
bug: Something isn't working
Description
Severity
P0 - Critical breaking issue or missing functionality
Current Behavior
Deeplake produces a segmentation fault when queried with the clause WHERE (NOT ((f9['e1'] IS NOT NULL))).
Steps to Reproduce
The following test case reproduces the error: deeplake_bug_trigger.py.txt. The core logic of the script is shown below.
import deeplake as deeplake_lib
from deeplake import types as dl_types
import numpy as np
import faulthandler
# Reproduction script: build an in-memory Deep Lake dataset, insert rows,
# index a few columns, then run a query whose WHERE clause tests a Dict
# sub-key with NOT (... IS NOT NULL).

# Dump the Python traceback if the process dies on a fatal signal
# (e.g. SIGSEGV), so the crashing call is visible.
faulthandler.enable()

# create collection, reset old collections if any
path = "mem://test_collection"
try:
    deeplake_lib.delete(path)
except Exception:
    # Best-effort reset: a failed delete (presumably because the dataset
    # does not exist yet) is fine. Catching Exception instead of using a
    # bare except keeps KeyboardInterrupt/SystemExit from being swallowed.
    pass
ds = deeplake_lib.create(path)

# define schema
ds.add_column("f0", dl_types.Bool())
ds.add_column("f1", dl_types.Int64())
ds.add_column("f2", dl_types.Embedding(35, dtype='float32'))
ds.add_column("f3", dl_types.Text())
ds.add_column("f4", dl_types.Float32())
ds.add_column("f5", dl_types.Dict())
ds.add_column("f6", dl_types.Array("float32", 1))
ds.add_column("f7", dl_types.Text())
ds.add_column("f8", dl_types.Dict())
ds.add_column("f9", dl_types.Dict())
ds.add_column("f10", dl_types.Embedding(95, dtype='float32'))

# the data to be inserted
# NOTE: the row data is elided in this excerpt. As written this is a list
# holding a single Ellipsis, so it must be replaced with the real list of
# row dicts (presumably from the attached deeplake_bug_trigger.py.txt)
# before the script can run.
data_list = [ ... ]

# insert data
pkey_name = 'f7'
# Maps primary-key value -> row index, so repeated keys update in place.
pk_to_idx = {}
if pkey_name is not None and len(ds) > 0:
    # Seed the map from rows already present in the dataset.
    col = ds[pkey_name].numpy().flatten()
    for i, v in enumerate(col):
        pk_to_idx[v] = i
for data in data_list:
    pk = data.get(pkey_name) if pkey_name is not None else None
    idx = pk_to_idx.get(pk) if pk is not None else None
    if idx is None:
        # Unseen key (or no key): append as a new single-row batch.
        row = {k: [v] for k, v in data.items()}
        ds.append(row)
        if pk is not None:
            # The row just appended is the last one.
            pk_to_idx[pk] = len(ds) - 1
    else:
        # Key already inserted: overwrite the known columns of that row.
        for field, v in data.items():
            if field in ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10']:
                ds[field][idx] = v
ds.commit()

# create index
ds["f1"].create_index(dl_types.Inverted)
ds["f3"].create_index(dl_types.Inverted)
ds["f4"].create_index(dl_types.Inverted)
ds["f8"].create_index(dl_types.Inverted)

# build query
# target_vector, anns_field and limit record the query parameters; the same
# values are already inlined in the literal query string below.
target_vector = [0, -0.73974, -0.56952, 0.32329, 0, 0.18876, 0, -0.48297, -0.76431, -0.561, 0, 0.38881, 0, -0.51161, 0, 0.3196, -0.50481, 0.36672, 0.7916, 0.72589, -0.15253, -0.81857, 0.54581, 0.54685, 0.53233, 0.87906, 0.46351, 0.80606, 0.02326, -0.99057, 0.01479, 0.74169, 0.72541, 0.48778, 0.19673]
anns_field = 'f2'
limit = 8875
q = "SELECT * FROM (SELECT *, COSINE_SIMILARITY(f2, ARRAY[0.0,-0.73974,-0.56952,0.32329,0.0,0.18876,0.0,-0.48297,-0.76431,-0.561,0.0,0.38881,0.0,-0.51161,0.0,0.3196,-0.50481,0.36672,0.7916,0.72589,-0.15253,-0.81857,0.54581,0.54685,0.53233,0.87906,0.46351,0.80606,0.02326,-0.99057,0.01479,0.74169,0.72541,0.48778,0.19673]) AS distance_score ) WHERE (NOT ((f9['e1'] IS NOT NULL))) ORDER BY distance_score DESC LIMIT 8875"
# this query should trigger the segmentation fault
view = ds.query(q)
Expected/Desired Behavior
ds.query(q) should return the matching data.
Python Version
3.10
OS
Ubuntu 22.04
IDE
No response
Packages
No response
Additional Context
No response
Possible Solution
No response
Are you willing to submit a PR?
- I'm willing to submit a PR (Thank you!)
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug: Something isn't working