diff --git a/.gitignore b/.gitignore index c470b97..90e0a0a 100644 --- a/.gitignore +++ b/.gitignore @@ -91,7 +91,8 @@ kolibrie/src/cuda/cudajoin.lib python/.venv/ # Some other directories -benchmark_dataset/ +**/benchmark_results/ +**/benchmark_dataset/ # IntelliJ .idea/ diff --git a/Cargo.lock b/Cargo.lock index 344531c..f8b8a4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1202,6 +1202,7 @@ dependencies = [ "serde", "serde_json", "shared", + "sysinfo", "url", "winapi", ] @@ -1338,6 +1339,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "num" version = "0.4.3" @@ -2173,6 +2183,21 @@ dependencies = [ "walkdir", ] +[[package]] +name = "sysinfo" +version = "0.29.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd727fc423c2060f6c92d9534cef765c65a6ed3f428a03d7def74a8c4348e666" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "winapi", +] + [[package]] name = "target-lexicon" version = "0.12.16" diff --git a/datalog/src/reasoning.rs b/datalog/src/reasoning.rs index 4581e67..a1a76a7 100644 --- a/datalog/src/reasoning.rs +++ b/datalog/src/reasoning.rs @@ -15,9 +15,12 @@ pub mod repairs; pub mod helpers; use shared::dictionary::Dictionary; +use shared::rule::FilterCondition; +use shared::terms::Term; +use shared::terms::TriplePattern; use shared::triple::Triple; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use shared::index_manager::*; +use shared::index_manager::TripleIndex; use shared::rule_index::RuleIndex; use shared::rule::Rule; use shared::provenance::Provenance; @@ -33,8 +36,7 @@ use crate::reasoning::rules::join_rule; pub struct Reasoner { pub dictionary: Arc>, pub rules: Vec, // List of dynamic rules - - pub index_manager: 
UnifiedIndex, + pub index_manager: Box, pub rule_index: RuleIndex, pub constraints: Vec, pub probability_seeds: HashMap, // Input probabilities for provenance seeding @@ -55,10 +57,14 @@ pub fn convert_string_binding_to_u32( impl Reasoner { pub fn new() -> Self { + Self::with_index(Box::new(shared::index_manager::HexastoreIndex::new())) + } + + pub fn with_index(index: Box) -> Self { Self { dictionary: Arc::new(RwLock::new(Dictionary::new())), rules: Vec::new(), - index_manager: UnifiedIndex::new(), + index_manager: index, rule_index: RuleIndex::new(), constraints: Vec::new(), probability_seeds: HashMap::new(), @@ -185,3 +191,319 @@ impl Reasoner { repairs } } + +fn unify_patterns( + pattern1: &TriplePattern, + pattern2: &TriplePattern, + bindings: &HashMap, +) -> Option> { + let mut new_bindings = bindings.clone(); + + if !unify_terms(&pattern1.0, &pattern2.0, &mut new_bindings) { + return None; + } + if !unify_terms(&pattern1.1, &pattern2.1, &mut new_bindings) { + return None; + } + if !unify_terms(&pattern1.2, &pattern2.2, &mut new_bindings) { + return None; + } + + Some(new_bindings) +} + +fn unify_terms(term1: &Term, term2: &Term, bindings: &mut HashMap) -> bool { + let term1 = resolve_term(term1, bindings); + let term2 = resolve_term(term2, bindings); + + match (&term1, &term2) { + (Term::Constant(c1), Term::Constant(c2)) => c1 == c2, + (Term::Variable(v), Term::Constant(c)) | (Term::Constant(c), Term::Variable(v)) => { + bindings.insert(v.clone(), Term::Constant(*c)); + true + } + (Term::Variable(v1), Term::Variable(v2)) => { + if v1 != v2 { + bindings.insert(v1.clone(), Term::Variable(v2.clone())); + } + true + } + (Term::Variable(_), Term::QuotedTriple(_)) => todo!(), + (Term::Constant(_), Term::QuotedTriple(_)) => todo!(), + (Term::QuotedTriple(_), Term::Variable(_)) => todo!(), + (Term::QuotedTriple(_), Term::Constant(_)) => todo!(), + (Term::QuotedTriple(_), Term::QuotedTriple(_)) => todo!(), + } +} + +pub fn resolve_term<'a>(term: &'a Term, bindings: 
&'a HashMap) -> Term { + match term { + Term::Variable(v) => { + if let Some(bound_term) = bindings.get(v) { + resolve_term(bound_term, bindings) + } else { + term.clone() + } + } + _ => term.clone(), + } +} + +fn substitute(pattern: &TriplePattern, bindings: &HashMap) -> TriplePattern { + let s = substitute_term(&pattern.0, bindings); + let p = substitute_term(&pattern.1, bindings); + let o = substitute_term(&pattern.2, bindings); + (s, p, o) +} + +fn substitute_term(term: &Term, bindings: &HashMap) -> Term { + match term { + Term::Variable(var_name) => { + if let Some(bound_term) = bindings.get(var_name) { + substitute_term(bound_term, bindings) + } else { + Term::Variable(var_name.clone()) + } + } + Term::Constant(value) => Term::Constant(*value), + Term::QuotedTriple(_) => todo!(), + } +} + +fn triple_to_pattern(triple: &Triple) -> TriplePattern { + ( + Term::Constant(triple.subject), + Term::Constant(triple.predicate), + Term::Constant(triple.object), + ) +} + +fn rename_rule_variables(rule: &Rule, counter: &mut usize) -> Rule { + let mut var_map = HashMap::new(); + + fn rename_term( + term: &Term, + var_map: &mut HashMap, + counter: &mut usize, + ) -> Term { + match term { + Term::Variable(v) => { + if let Some(new_v) = var_map.get(v) { + Term::Variable(new_v.clone()) + } else { + let new_v = format!("v{}", *counter); + *counter += 1; + var_map.insert(v.clone(), new_v.clone()); + Term::Variable(new_v) + } + } + Term::Constant(c) => Term::Constant(*c), + Term::QuotedTriple(_) => todo!(), + } + } + + let mut new_premise = Vec::new(); + for p in &rule.premise { + let s = rename_term(&p.0, &mut var_map, counter); + let p_term = rename_term(&p.1, &mut var_map, counter); + let o = rename_term(&p.2, &mut var_map, counter); + new_premise.push((s, p_term, o)); + } + + let mut new_negative_premise = Vec::new(); + for p in &rule.negative_premise { + let s = rename_term(&p.0, &mut var_map, counter); + let p_term = rename_term(&p.1, &mut var_map, counter); + let o = 
rename_term(&p.2, &mut var_map, counter); + new_negative_premise.push((s, p_term, o)); + } + + // Rename all conclusions + let mut new_conclusions = Vec::new(); + for conclusion in &rule.conclusion { + let conclusion_s = rename_term(&conclusion.0, &mut var_map, counter); + let conclusion_p = rename_term(&conclusion.1, &mut var_map, counter); + let conclusion_o = rename_term(&conclusion.2, &mut var_map, counter); + new_conclusions.push((conclusion_s, conclusion_p, conclusion_o)); + } + + Rule { + premise: new_premise, + negative_premise: new_negative_premise, + conclusion: new_conclusions, + filters: rule.filters.clone(), + } +} + +/// Construct a new Triple from a conclusion pattern and bound variables +pub fn construct_triple( + conclusion: &TriplePattern, + vars: &HashMap, + dict: &mut Dictionary +) -> Triple { + let subject = match &conclusion.0 { + Term::Variable(v) => { + vars.get(v).copied().unwrap_or_else(|| { + eprintln!("Warning: Variable '{}' not found in bindings. Available variables: {:?}", v, vars.keys().collect::>()); + 0 + }) + }, + Term::Constant(c) => *c, + Term::QuotedTriple(_) => todo!(), + }; + + let predicate = match &conclusion.1 { + Term::Variable(v) => { + vars.get(v).copied().unwrap_or_else(|| { + eprintln!("Warning: Variable '{}' not found in bindings. 
Available variables: {:?}", v, vars.keys().collect::>()); + 0 + }) + }, + Term::Constant(c) => *c, + Term::QuotedTriple(_) => todo!(), + }; + + let object = match &conclusion.2 { + Term::Variable(v) => { + // Check if this variable is bound in the current context + if let Some(&bound_value) = vars.get(v) { + bound_value + } else { + // If not bound, create a new placeholder in the dictionary + dict.encode(&format!("ml_output_placeholder_{}", v)) + } + }, + Term::Constant(c) => *c, + Term::QuotedTriple(_) => todo!(), + }; + + Triple { + subject, + predicate, + object, + } +} + +pub fn matches_rule_pattern( + pattern: &TriplePattern, + fact: &Triple, + variable_bindings: &mut HashMap, +) -> bool { + // Create a copy of bindings to test against (rollback on failure) + let mut temp_bindings = variable_bindings.clone(); + + // Subject + let s_ok = match &pattern.0 { + Term::Variable(v) => { + if let Some(&bound) = temp_bindings.get(v) { + bound == fact.subject + } else { + temp_bindings.insert(v.clone(), fact.subject); + true + } + } + Term::Constant(c) => *c == fact.subject, + Term::QuotedTriple(_) => todo!(), + }; + if !s_ok { + return false; // Don't modify original bindings on failure + } + + // Predicate + let p_ok = match &pattern.1 { + Term::Variable(v) => { + if let Some(&bound) = temp_bindings.get(v) { + bound == fact.predicate + } else { + temp_bindings.insert(v.clone(), fact.predicate); + true + } + } + Term::Constant(c) => *c == fact.predicate, + Term::QuotedTriple(_) => todo!(), + }; + if !p_ok { + return false; // Don't modify original bindings on failure + } + + // Object + let o_ok = match &pattern.2 { + Term::Variable(v) => { + if let Some(&bound) = temp_bindings.get(v) { + bound == fact.object + } else { + temp_bindings.insert(v.clone(), fact.object); + true + } + } + Term::Constant(c) => *c == fact.object, + Term::QuotedTriple(_) => todo!(), + }; + + // Only if ALL parts match, commit the bindings + if s_ok && p_ok && o_ok { + *variable_bindings = 
temp_bindings; + true + } else { + false + } +} + +fn evaluate_filters( + bindings: &HashMap, + filters: &Vec, + dict: &Dictionary +) -> bool { + for filter in filters { + if let Some(&value_code) = bindings.get(&filter.variable) { + let value_str = dict.decode(value_code).unwrap_or(""); + // Try to parse both the bound value and the filter's value as numbers. + let bound_num: f64 = value_str.parse().unwrap_or(0.0); + let filter_num: f64 = filter.value.parse().unwrap_or(0.0); + match filter.operator.as_str() { + ">" if bound_num <= filter_num => return false, + "<" if bound_num >= filter_num => return false, + ">=" if bound_num < filter_num => return false, + "<=" if bound_num > filter_num => return false, + "=" if (bound_num - filter_num).abs() > std::f64::EPSILON => return false, + "!=" if (bound_num - filter_num).abs() <= std::f64::EPSILON => return false, + _ => {} + } + } + } + true +} + +/// Given a rule, a set of all facts, and a binding that matches some premise +fn join_remaining( + rule: &Rule, + changed_idx: usize, + all_facts: &HashSet, + binding: HashMap, +) -> Vec> { + let mut results = vec![binding]; + let n = rule.premise.len(); + + // For each other premise j (order can be arbitrary) + for j in 0..n { + if j == changed_idx { + continue; + } + let mut new_results = Vec::new(); + // For every binding so far + for partial_binding in results.into_iter() { + // And for every fact in all_facts + for fact in all_facts.iter() { + let mut b = partial_binding.clone(); + if matches_rule_pattern(&rule.premise[j], fact, &mut b) { + new_results.push(b); + } + } + } + results = new_results; + if results.is_empty() { + break; + } + } + results +} diff --git a/datalog/src/reasoning/materialisation/semi_naive_with_repairs.rs b/datalog/src/reasoning/materialisation/semi_naive_with_repairs.rs index 06fbecc..b58cbe0 100644 --- a/datalog/src/reasoning/materialisation/semi_naive_with_repairs.rs +++ b/datalog/src/reasoning/materialisation/semi_naive_with_repairs.rs @@ 
-1,5 +1,5 @@ use std::collections::HashSet; -use shared::index_manager::UnifiedIndex; +use shared::index_manager::HexastoreIndex; use shared::triple::Triple; use crate::reasoning::materialisation::replace_variables_with_bound_values; use crate::reasoning::Reasoner; @@ -17,7 +17,7 @@ impl Reasoner { let repairs = self.compute_repairs(&all_facts); if let Some(best_repair) = repairs.into_iter().max_by_key(|r| r.len()) { // Clear index manager and reinsert repaired facts - self.index_manager = UnifiedIndex::new(); + self.index_manager = Box::new(HexastoreIndex::new()); for fact in &best_repair { self.index_manager.insert(fact); } diff --git a/datalog/src/reasoning_experimental.rs b/datalog/src/reasoning_experimental.rs index 82f449f..ef314f0 100644 --- a/datalog/src/reasoning_experimental.rs +++ b/datalog/src/reasoning_experimental.rs @@ -10,6 +10,7 @@ use shared::rule::Rule; use shared::triple::Triple; +use shared::index_manager::TripleIndex; use crate::reasoning::Reasoner; use std::collections::{BTreeMap, HashMap, HashSet}; use shared::terms::Term; diff --git a/kolibrie/Cargo.toml b/kolibrie/Cargo.toml index a04586c..51128ab 100644 --- a/kolibrie/Cargo.toml +++ b/kolibrie/Cargo.toml @@ -17,6 +17,7 @@ exclude = ["target/"] build = "build.rs" [dependencies] +sysinfo = "0.29" quick-xml = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -324,6 +325,10 @@ path = "examples/real_scenario/mqtt_example.rs" name = "mqtt_real_scenario" path = "examples/real_scenario/mqtt_real_scenario.rs" +[[example]] +name = "synthetic_stream_benchmark" +path = "examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs" + [[example]] name = "fraud_detection_system" path = "examples/real_scenario/fraud_detection_system.rs" diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs index cb07245..7136feb 100644 --- 
a/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/contradictions.rs @@ -8,6 +8,7 @@ * you can obtain one at https://mozilla.org/MPL/2.0/. */ +use shared::index_manager::TripleIndex; use shared::terms::Term; use shared::rule::Rule; use datalog::reasoning::Reasoner; @@ -153,4 +154,4 @@ fn print_all_facts(kg: &Reasoner) { dict.decode(fact.object).unwrap_or("unknown") ); } -} +} \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs index 50f55ac..c1c51b5 100644 --- a/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/deep_taxonomy.rs @@ -1,6 +1,7 @@ use datalog::reasoning::Reasoner; use shared::terms::Term; use shared::rule::Rule; +use shared::index_manager::TripleIndex; use kolibrie::sparql_database::SparqlDatabase; use std::fs; use std::time::Instant; diff --git a/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs b/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs index c9b8858..39a9fe4 100644 --- a/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs +++ b/kolibrie/examples/sparql_syntax/knowledge_graph/knowledge_graph.rs @@ -11,6 +11,7 @@ use shared::terms::Term; use shared::rule::Rule; use datalog::reasoning::*; +use shared::index_manager::TripleIndex; use datalog::parser_n3_logic::parse_n3_rule; use datalog::reasoning::backward_chaining::resolve_term; @@ -288,5 +289,4 @@ fn main() { test2(); println!("======================================="); inconsistency(); -} - +} \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs index dbd23e2..ea2e956 100644 --- a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs +++ 
b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M.rs @@ -1,108 +1,41 @@ /* - * Copyright © 2025 Volodymyr Kadzhaia - * Copyright © 2025 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - * - * - * - * NOTE 1: We are using the benchmark dataset from: - * Waterloo SPARQL Diversity Test Suite (WatDiv) v0.6 - * Source: https://dsg.uwaterloo.ca/watdiv/ - * - * NOTE 2: Before running with the 10M-triple dataset, ensure you have: - * 1) Downloaded `watdiv.10M.nt` into a `benchmark_dataset` directory - * at the project root. - * 2) Created the `benchmark_dataset` directory next to `kolibrie/`. - * (e.g., `mkdir benchmark_dataset && mv watdiv.10M.nt benchmark_dataset/`) - * - * NOTE 3: The watdiv.10M.nt file is approximately 1.5 GB in size. - * - */ +* Copyright © 2025 Volodymyr Kadzhaia +* Copyright © 2025 Pieter Bonte +* KU Leuven — Stream Intelligence Lab, Belgium +* +* This Source Code Form is subject to the terms of the Mozilla Public +* License, v. 2.0. If a copy of the MPL was not distributed with this file, +* you can obtain one at https://mozilla.org/MPL/2.0/. +* +* +* +* NOTE 1: We are using the benchmark dataset from: +* Waterloo SPARQL Diversity Test Suite (WatDiv) v0.6 +* Source: https://dsg.uwaterloo.ca/watdiv/ +* +* NOTE 2: Before running with the 10M-triple dataset, ensure you have: +* 1) Downloaded `watdiv.10M.nt` into a `benchmark_dataset` directory +* at the project root. +* 2) Created the `benchmark_dataset` directory next to `kolibrie/`. +* (e.g., `mkdir benchmark_dataset && mv watdiv.10M.nt benchmark_dataset/`) +* +* NOTE 3: The watdiv.10M.nt file is approximately 1.5 GB in size. 
+* +*/ use kolibrie::execute_query::*; use kolibrie::sparql_database::*; -use std::fs::File; -use std::io::{BufRead, BufReader}; +use shared::index_manager::*; +use std::collections::{BTreeMap, HashSet}; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader, Write}; +use std::path::Path; use std::time::Instant; -fn parse_large_ntriples_file( - file_path: &str, -) -> Result> { - println!("Starting to parse N-Triples file: {}", file_path); - let start_time = Instant::now(); - - let mut db = SparqlDatabase::new(); - - // Much smaller buffer and more aggressive memory management - let file = File::open(file_path)?; - let reader = BufReader::with_capacity(64 * 1024, file); // Reduced buffer size - - let mut line_count = 0; - let mut batch_lines = Vec::new(); - const BATCH_SIZE: usize = 10_000; // Much smaller batch size - - for line_result in reader.lines() { - let line = line_result?; - - if line.trim().is_empty() || line.starts_with('#') { - continue; - } - - batch_lines.push(line); - line_count += 1; - - if batch_lines.len() >= BATCH_SIZE { - // Process batch immediately - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); - - // Aggressive cleanup - batch_lines.clear(); - batch_lines.shrink_to_fit(); - - // Progress info every 100k triples - if line_count % 100_000 == 0 { - println!("Processed {} triples", line_count); - std::hint::black_box(()); - - // Optional: small delay to let the system breathe - std::thread::sleep(std::time::Duration::from_millis(10)); - } - } - } +type QuerySpec = (&'static str, &'static str); - // Process remaining batch - if !batch_lines.is_empty() { - let batch_data = batch_lines.join("\n"); - db.parse_ntriples_and_add(&batch_data); - } - db.get_or_build_stats(); - - println!( - "Finished parsing {} triples in {:.2} seconds", - line_count, - start_time.elapsed().as_secs_f64() - ); - - // Build indexes after parsing - this is where the magic happens - println!("Building indexes..."); - let 
index_start = Instant::now(); - db.build_all_indexes(); - println!("Indexes built in {:.2} seconds", index_start.elapsed().as_secs_f64()); - - Ok(db) -} - -fn run_all_queries(db: &mut SparqlDatabase) { - const ITERATIONS: usize = 20; - - // (name, query) - let queries: &[(&str, &str)] = &[ - // C1 +fn workload_queries() -> Vec { + vec![ ( "C1", r#"PREFIX wsdbm: @@ -567,20 +500,191 @@ fn run_all_queries(db: &mut SparqlDatabase) { } "#, ), - ]; + ] +} + +fn queries_for_index_manager(workload: &[QuerySpec]) -> Vec { + workload.iter().map(|(_, q)| q.trim().to_string()).collect() +} + +fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| "hexastore".to_string()) + .to_lowercase(); + + let config = match index_type.as_str() { + "hexastore" | "" => IndexConfig::Hexastore, + "spo" => IndexConfig::SPO, + "pos" => IndexConfig::POS, + "osp" => IndexConfig::OSP, + "pso" => IndexConfig::PSO, + "ops" => IndexConfig::OPS, + "sop" => IndexConfig::SOP, + "table" => IndexConfig::SingleTable, + "partial_hexastore" => IndexConfig::PartialHexastore { queries }, + "buckets" => IndexConfig::Buckets { queries }, + other => { + eprintln!( + "WARNING: Unknown INDEX_TYPE '{}', falling back to hexastore.", + other + ); + IndexConfig::Hexastore + } + }; + + (index_type, config) +} + +fn parse_large_ntriples_file( + file_path: &str, + workload: &[QuerySpec], +) -> Result> { + let (index_name, config) = make_config_from_env(queries_for_index_manager(workload)); + println!("INDEX_TYPE = {}", index_name); + println!("Starting to parse N-Triples file: {}", file_path); + + let start_time = Instant::now(); + let mut db = SparqlDatabase::with_config(config); - for (name, query) in queries.iter() { + let file = File::open(file_path)?; + let reader = BufReader::with_capacity(64 * 1024, file); + + let mut line_count = 0; + let mut batch_lines = Vec::new(); + const BATCH_SIZE: usize = 10_000; + + for line_result in 
reader.lines() { + let line = line_result?; + + if line.trim().is_empty() || line.starts_with('#') { + continue; + } + + batch_lines.push(line); + line_count += 1; + + if batch_lines.len() >= BATCH_SIZE { + let batch_data = batch_lines.join("\n"); + db.parse_ntriples_and_add(&batch_data); + + batch_lines.clear(); + batch_lines.shrink_to_fit(); + + if line_count % 100_000 == 0 { + println!("Processed {} triples", line_count); + std::thread::sleep(std::time::Duration::from_millis(10)); + } + } + } + + if !batch_lines.is_empty() { + let batch_data = batch_lines.join("\n"); + db.parse_ntriples_and_add(&batch_data); + } + + db.get_or_build_stats(); + + println!( + "Finished parsing {} triples in {:.2} seconds", + line_count, + start_time.elapsed().as_secs_f64() + ); + + println!("Building indexes..."); + let index_start = Instant::now(); + db.build_all_indexes(); + println!( + "Indexes built in {:.2} seconds", + index_start.elapsed().as_secs_f64() + ); + + Ok(db) +} + +/// Helper function to serialize result sets into deterministic, sorted text format +fn serialize_results(results: &[Vec]) -> Vec { + let mut lines = Vec::with_capacity(results.len()); + for row in results { + // Filter out empty rows just in case the engine returns an unpopulated tuple + if row.iter().all(|s| s.is_empty()) { + continue; + } + lines.push(row.join("|")); + } + + lines.sort_unstable(); + lines +} + +fn run_all_queries(db: &mut SparqlDatabase, workload: &[QuerySpec]) { + const ITERATIONS: usize = 10; + let dir_path = Path::new("../benchmark_dataset"); + + for (name, query) in workload.iter() { println!("=============================================="); - println!("Running query {} ({} iterations)...", name, ITERATIONS); + println!("Running query {}...", name); - let mut total_time = 0.0; - // let mut last_result:Vec> = Vec::new(); + // Run one validation loop to cache/verify results + if *name != "C3" { + let initial_run_start = Instant::now(); + let validation_results = 
execute_query_rayon_parallel2_volcano(query, db); + println!( + "Validation run completed in {:.4} seconds", + initial_run_start.elapsed().as_secs_f64() + ); + + let ground_truth_file = dir_path.join(format!("ground_truth_{}.txt", name)); + let serialized_current = serialize_results(&validation_results); + + if ground_truth_file.exists() { + println!( + "[VALIDATION] Checking results against ground truth: {:?}", + ground_truth_file + ); + let file = File::open(&ground_truth_file).unwrap(); + let reader = BufReader::new(file); + let mut cached_lines = Vec::new(); + for line in reader.lines() { + if let Ok(l) = line { + if !l.trim().is_empty() { + cached_lines.push(l); + } + } + } + + let current_set: HashSet<_> = serialized_current.into_iter().collect(); + let cached_set: HashSet<_> = cached_lines.into_iter().collect(); + + if current_set != cached_set { + let missing: Vec<_> = cached_set.difference(¤t_set).collect(); + let extra: Vec<_> = current_set.difference(&cached_set).collect(); + panic!( + "[FATAL] Query '{}' produced INVALID results!\nMissing {} lines.\nExtra {} lines.\nFirst few missing: {:?}\nFirst few extra: {:?}", + name, missing.len(), extra.len(), missing.iter().take(5).collect::>(), extra.iter().take(5).collect::>() + ); + } + println!("[✓] Validation passed for {}!", name); + } else { + println!( + "[VALIDATION] Ground truth does not exist. Caching results to {:?}", + ground_truth_file + ); + let mut file = File::create(&ground_truth_file).unwrap(); + for line in &serialized_current { + writeln!(file, "{}", line).unwrap(); + } + println!("Results cached. 
Note: Make sure the first run uses the 'hexastore' INDEX_TYPE!"); + } + } + + // Run the timed benchmark loop + println!("Running {} timed iterations...", ITERATIONS); + let mut total_time = 0.0; for _ in 0..ITERATIONS { let start = Instant::now(); let _ = execute_query_rayon_parallel2_volcano(query, db); - let elapsed = start.elapsed().as_secs_f64(); - total_time += elapsed; + total_time += start.elapsed().as_secs_f64(); } let avg = total_time / (ITERATIONS as f64); @@ -589,23 +693,20 @@ fn run_all_queries(db: &mut SparqlDatabase) { } fn main() { - // Set current directory to the root of the project std::env::set_current_dir(std::path::Path::new(env!("CARGO_MANIFEST_DIR"))) - .expect("Failed to set project root as current directory"); + .expect("Failed to set project root as current directory"); let file_path = "../benchmark_dataset/watdiv.10M.nt"; + let workload = workload_queries(); - match parse_large_ntriples_file(file_path) { + match parse_large_ntriples_file(file_path, &workload) { Ok(mut db) => { println!("Successfully processed N-Triples file"); - run_all_queries(&mut db); + run_all_queries(&mut db, &workload); } Err(e) => { eprintln!("Error processing file '{}': {}", file_path, e); - println!( - "File not found or error occurred. \ -Make sure ../benchmark_dataset/watdiv.10M.nt exists." - ); + println!("Make sure ../benchmark_dataset/watdiv.10M.nt exists."); } } } diff --git a/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh new file mode 100755 index 0000000..a45c408 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/n_triples_data/n_triple_10M_all_indexes.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# +# run_all_indexes.sh — Run the n_triple_10M benchmark for every index type +# and save all output to a specified directory. 
+# +# Usage: +# ./run_all_indexes.sh [output_dir] +# +# If output_dir is not specified, defaults to ./benchmark_results +# + +set -euo pipefail + +OUTPUT_DIR="${1:-./benchmark_results}" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +RESULT_DIR="${OUTPUT_DIR}/${TIMESTAMP}" + +INDEX_TYPES=( + "buckets" + "buckets" + "pso" + "partial_hexastore" + "hexastore" + "ops" + "osp" + "pos" + "sop" + "spo" + "table" +) + +echo "==============================================" +echo " Kolibrie Index Benchmark Runner" +echo "==============================================" +echo "Output directory: ${RESULT_DIR}" +echo "Index types: ${INDEX_TYPES[*]}" +echo "==============================================" + +mkdir -p "${RESULT_DIR}" + +echo "" +echo "[BUILD] Compiling in release mode..." +cargo build --release --example n_triple_10M 2>&1 | tee "${RESULT_DIR}/build.log" +echo "[BUILD] Done." +echo "" + +for idx_type in "${INDEX_TYPES[@]}"; do + OUTPUT_FILE="${RESULT_DIR}/${idx_type}.txt" + + echo "==============================================" + echo "[RUN] INDEX_TYPE=${idx_type}" + echo " Output: ${OUTPUT_FILE}" + echo "==============================================" + + INDEX_TYPE="${idx_type}" \ + cargo run --release --example n_triple_10M \ + 2>&1 | tee "${OUTPUT_FILE}" + + echo "" + echo "[DONE] ${idx_type} -> ${OUTPUT_FILE}" + echo "" +done + +echo "==============================================" +echo " All benchmarks complete!" 
+echo " Results in: ${RESULT_DIR}" +echo "==============================================" \ No newline at end of file diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py new file mode 100644 index 0000000..4ab6924 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/generate_synthetic_stream.py @@ -0,0 +1,103 @@ +import random +import json +import argparse + +def generate_synthetic_data(num_triples, num_subjects, num_predicates, num_objects, window_size, slide_size, output_nt, output_queries): + subjects = [f"" for i in range(num_subjects)] + predicates = [f"" for i in range(num_predicates)] + objects = [f"" for i in range(num_objects)] + + # 1. Generate the triple stream + with open(output_nt, 'w') as f: + for _ in range(num_triples): + s = random.choice(subjects) + p = random.choice(predicates) + o = random.choice(objects) + f.write(f"{s} {p} {o} .\n") + + def get_distinct(pool, k): + return random.sample(pool, k) + + # 2. Pick specific constants for queries to ensure they match generated data + p_vars = get_distinct(predicates, 9) + o_vars = get_distinct(objects, 5) + s_vars = get_distinct(subjects, 4) + p_q6 = get_distinct(predicates, 20) + + # 3. Formulate standard SPARQL SELECT queries + # NOTE: we are not using RSPQL window functions as for some reason these are insanely slow, obscuring any performance difference due to indexing strategy + # Q6 helper: matches a subject with 20 distinct properties + q6_where = " ".join([f"?s {p_q6[i]} ?o{i} ." for i in range(20)]) + q6_select = " ".join([f"?o{i}" for i in range(20)]) + + # Generate 500 distinct predicates to force BucketIndex to create 500 buckets + p_q9 = get_distinct(predicates, 500) + + q9_where = " ".join([f"?s {p_q9[i]} ?o{i} ." 
for i in range(500)]) + q9_select = " ".join([f"?o{i}" for i in range(500)]) + + p_q10 = get_distinct(predicates, 500) + s_q10 = get_distinct(subjects, 5) + o_q10 = get_distinct(objects, 5) + + # 2. Build the Q10 WHERE clause explicitly + q10_where_clauses = [ + # --- SAVE TABLE --- + # 1. S _ _ : Highly selective subject bound (~20 matches). Table does 1 scan. + f"{s_q10[0]} ?p_start ?v_core .", + + # 2. _ P O : The 20 matches join against this. Probability of a match is near zero. + # The join collapses immediately. + f"?v_core {p_q10[0]} {o_q10[0]} .", + + # --- KILL PARTIAL HEXASTORE --- + # Force the dynamic indexer to build all remaining permutations statically + f"{s_q10[1]} {p_q10[1]} ?v_core .", # S P _ + f"{s_q10[2]} ?p_mid {o_q10[1]} .", # S _ O + f"?v_core ?p_end {o_q10[2]} .", # _ _ O + f"{s_q10[3]} {p_q10[2]} {o_q10[3]} .", # S P O + ] + + # --- KILL BUCKETS --- + # Add 490+ distinct _ P _ patterns to force massive bucket allocation + for i in range(3, 500): + q10_where_clauses.append(f"?v_core {p_q10[i]} ?v_ext_{i} .") + + q10_where = " ".join(q10_where_clauses) + + queries = { + "Q1": f"SELECT ?s WHERE {{ ?s {p_vars[0]} {o_vars[0]} . }}", + "Q2": f"SELECT ?s ?o2 ?o3 WHERE {{ ?s {p_vars[1]} ?o2 . ?s {p_vars[2]} ?o3 . }}", + "Q3": f"SELECT * WHERE {{ {s_vars[0]} {p_vars[3]} {o_vars[1]} . }}", + "Q4": f"SELECT ?v1 ?v5 WHERE {{ ?v1 {p_vars[4]} ?v2 . ?v2 {p_vars[5]} ?v3 . ?v4 {p_vars[6]} ?v3 . ?v4 {p_vars[7]} ?v5 . }}", + "Q5": f"SELECT ?p WHERE {{ {s_vars[1]} ?p {o_vars[2]} . {s_vars[2]} ?p {o_vars[3]} . }}", + "Q6": f"SELECT ?s {q6_select} WHERE {{ {q6_where} }}", + "Q7": f"SELECT ?s4 ?p5 ?o6 WHERE {{ {s_vars[3]} ?p1 ?o1 . ?s2 {p_vars[8]} ?o2 . ?s3 ?p3 {o_vars[4]} . ?s4 ?p1 ?o4 . ?s5 ?p5 ?o2 . ?s3 ?p6 ?o6 . ?s4 ?p5 ?o6 }}", + "Q8": f"SELECT * WHERE {{ ?n1 ?e12 ?n2 . ?n2 ?e23 ?n3 . ?n3 ?e34 ?n4 . ?n4 ?e41 ?n1 . ?n1 ?e13 ?n3 . ?n3 ?e31 ?n1 . ?n2 ?e24 ?n4 . ?n4 ?e42 ?n2 . ?n2 ?e21 ?n1 . ?n4 ?e43 ?n3 . 
}}", + "Q9": f"SELECT ?s {q9_select} WHERE {{ {q9_where} }}", + "Q10": f"SELECT * WHERE {{ {q10_where} }}" + } + + with open(output_queries, 'w') as f: + json.dump(queries, f, indent=4) + + print(f"Generated {num_triples} triples in {output_nt}") + print(f"Generated {len(queries)} SPARQL queries in {output_queries}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--triples", type=int, default=100_000) + parser.add_argument("--subjects", type=int, default=5_000) + parser.add_argument("--predicates", type=int, default=50) + parser.add_argument("--objects", type=int, default=30_000) + parser.add_argument("--window_size", type=int, default=5000) + parser.add_argument("--slide_size", type=int, default=1000) + parser.add_argument("--output_nt", type=str, default="benchmark_dataset/synthetic_1M.nt") + parser.add_argument("--output_queries", type=str, default="benchmark_dataset/synthetic_queries.json") + args = parser.parse_args() + + generate_synthetic_data( + args.triples, args.subjects, args.predicates, args.objects, + args.window_size, args.slide_size, + args.output_nt, args.output_queries + ) diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs new file mode 100644 index 0000000..4c54e7d --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.rs @@ -0,0 +1,164 @@ +use kolibrie::execute_query::*; +use kolibrie::sparql_database::*; +use shared::index_manager::*; +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::io::{BufRead, BufReader, Write, Seek, SeekFrom}; +use std::time::Instant; + +fn load_queries(path: &str) -> HashMap { + let file = File::open(path).expect("Failed to open queries JSON file"); + let reader = BufReader::new(file); + serde_json::from_reader(reader).expect("Failed to parse JSON queries") +} + 
+fn make_config_from_env(queries: Vec) -> (String, IndexConfig) { + let index_type = std::env::var("INDEX_TYPE") + .unwrap_or_else(|_| "hexastore".to_string()) + .to_lowercase(); + + let config = match index_type.as_str() { + "hexastore" | "" => IndexConfig::Hexastore, + "spo" => IndexConfig::SPO, + "pos" => IndexConfig::POS, + "pso" => IndexConfig::PSO, + "sop" => IndexConfig::SOP, + "ops" => IndexConfig::OPS, + "osp" => IndexConfig::OSP, + "table" => IndexConfig::SingleTable, + "partial_hexastore" => IndexConfig::PartialHexastore { queries }, + "buckets" => IndexConfig::Buckets { queries }, + other => { + eprintln!("Unknown INDEX_TYPE '{}', falling back to hexastore.", other); + IndexConfig::Hexastore + } + }; + + (index_type, config) +} + +fn main() { + let window_size_str = std::env::var("WINDOW_SIZE").unwrap_or_else(|_| "50000".to_string()); + let window_size: usize = window_size_str.parse().unwrap(); + let slide_size_str = std::env::var("SLIDE_SIZE").unwrap_or_else(|_| "10000".to_string()); + let slide_size: usize = slide_size_str.parse().unwrap(); + + // Setup CSV file for output + let csv_path = "./benchmark_dataset/benchmark_results.csv"; + let mut csv_file = OpenOptions::new() + .create(true) + .append(true) + .open(csv_path) + .expect("Failed to open CSV file"); + + // Write CSV header if the file is new/empty + if csv_file.metadata().expect("Failed to get metadata").len() == 0 { + writeln!(csv_file, "Index_Type,Query_ID,Window_Size,Slide_Size,Total_Windows,Insertion_Time_s,Deletion_Time_s,Query_Time_s,Throughput_win_per_s") + .expect("Failed to write CSV header"); + } + + // 1. Load queries generated by Python + let query_map = load_queries("./benchmark_dataset/synthetic_queries.json"); + + // Sort queries to run them in a consistent order (Q1, Q2, etc.) + let mut query_keys: Vec<_> = query_map.keys().cloned().collect(); + query_keys.sort(); + + // 2. 
Load all triples into memory once to avoid disk I/O bottlenecks during the benchmark + let file = File::open("./benchmark_dataset/synthetic_1M.nt").expect("Run Python script first"); + let reader = BufReader::new(file); + + let mut all_triples = Vec::new(); + for line in reader.lines() { + if let Ok(l) = line { + if !l.trim().is_empty() { + all_triples.push(l); + } + } + } + println!("Loaded {} triples from disk.", all_triples.len()); + + // 3. Iterate over each query individually + for query_id in query_keys { + + let query_string = query_map.get(&query_id).unwrap().clone(); + + // Pass ONLY the current query to the config so partial indexes are built specifically for it + let (index_name, config) = make_config_from_env(vec![query_string.clone()]); + + if ((query_id == "Q8" || query_id == "Q9") && index_name != "hexastore" && index_name != "partial_hexastore" && index_name != "buckets") { + continue; + } + + println!("\n=========================================================="); + println!("INDEX_TYPE = {} | QUERY = {}", index_name, query_id); + println!("WINDOW_SIZE = {} | SLIDE_SIZE = {}", window_size, slide_size); + println!("=========================================================="); + + // Initialize a fresh database for this query run + let mut db = SparqlDatabase::with_config(config); + let mut current_window = Vec::new(); + + let mut total_insertion_time = 0.0; + let mut total_deletion_time = 0.0; + let mut total_query_time = 0.0; + let mut window_count = 0; + let mut first = true; + + for chunk in all_triples.chunks(slide_size) { + // --- SLIDE IN --- + let insert_start = Instant::now(); + let batch_data = chunk.join("\n"); + db.parse_ntriples_and_add(&batch_data); + + // Indexes have to be built on first window + if first { + db.get_or_build_stats(); + db.build_all_indexes(); + first = false; + } + + current_window.extend_from_slice(chunk); + total_insertion_time += insert_start.elapsed().as_secs_f64(); + + // --- SLIDE OUT (Manual Window Management) 
--- + if current_window.len() > window_size { + let overflow = current_window.len() - window_size; + let to_remove: Vec = current_window.drain(0..overflow).collect(); + let batch_delete_data = to_remove.join("\n"); + + let delete_start = Instant::now(); + db.parse_ntriples_and_remove(&batch_delete_data); + total_deletion_time += delete_start.elapsed().as_secs_f64(); + } + + // --- EXECUTE ONLY CURRENT QUERY --- + let query_start = Instant::now(); + let _results = execute_query_rayon_parallel2_volcano(&query_string, &mut db); + total_query_time += query_start.elapsed().as_secs_f64(); + + window_count += 1; + + if window_count % 5 == 0 { + println!(" Window {} processed. Active Triples: {}", window_count, current_window.len()); + } + } + + let total_time = total_insertion_time + total_deletion_time + total_query_time; + let throughput = (window_count as f64) / total_time; + + println!("\n--- Final Benchmark Results: {} on {} ---", query_id, index_name); + println!("Total Window Slide Operations: {}", window_count); + println!("Total Insertion Time: {:.4} s", total_insertion_time); + println!("Total Deletion Time: {:.4} s", total_deletion_time); + println!("Total Query Time: {:.4} s", total_query_time); + println!("Overall Throughput: {:.2} windows/sec", throughput); + + // Append to CSV + writeln!( + csv_file, + "{},{},{},{},{},{:.4},{:.4},{:.4},{:.2}", + index_name, query_id, window_size, slide_size, window_count, total_insertion_time, total_deletion_time, total_query_time, throughput + ).expect("Failed to write to CSV file"); + } +} diff --git a/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh new file mode 100755 index 0000000..d742ff0 --- /dev/null +++ b/kolibrie/examples/sparql_syntax/synthetic_stream_benchmark/synthetic_stream_benchmark.sh @@ -0,0 +1,49 @@ +#!//usr/bin/env bash + +set -e + +# Configuration +TRIPLES=1000000 
+WINDOW_SIZE=100000
+SLIDE_SIZE=50000
+SUBJECTS=300000
+PREDICATES=1000
+OBJECTS=100000
+
+# Every path below (generator script, dataset directory, release binary) is
+# relative to this script's directory, so switch there first instead of
+# depending on the caller's working directory.
+# NOTE(review): assumes generate_synthetic_stream.py lives next to this script - confirm.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+echo "Building Kolibrie benchmark..."
+cargo build --release --example synthetic_stream_benchmark
+
+# Setup Data Directory
+mkdir -p benchmark_dataset
+
+# Clear previous CSV results
+rm -f benchmark_dataset/benchmark_results.csv
+
+echo "Generating native Kolibrie synthetic dataset and rules..."
+python generate_synthetic_stream.py \
+  --triples "$TRIPLES" \
+  --subjects "$SUBJECTS" \
+  --predicates "$PREDICATES" \
+  --objects "$OBJECTS" \
+  --window_size "$WINDOW_SIZE" \
+  --slide_size "$SLIDE_SIZE" \
+  --output_nt benchmark_dataset/synthetic_1M.nt \
+  --output_queries benchmark_dataset/synthetic_queries.json
+
+INDEXES=("buckets" "partial_hexastore" "hexastore" "pso" "spo" "pos" "table")
+
+# The window parameters are the same for every index; export them once.
+export SLIDE_SIZE WINDOW_SIZE
+
+for IDX in "${INDEXES[@]}"; do
+  echo "=========================================================="
+  echo "Running Stream Benchmark for Index Type: $IDX"
+  echo "Window Size: $WINDOW_SIZE | Slide Size: $SLIDE_SIZE"
+  echo "=========================================================="
+
+  # The benchmark binary selects its index from INDEX_TYPE.
+  export INDEX_TYPE="$IDX"
+
+  "../../../.././target/release/examples/synthetic_stream_benchmark"
+
+  echo "Finished $IDX"
+  echo ""
+done
diff --git a/kolibrie/src/disk_storage/lsm_tree.rs b/kolibrie/src/disk_storage/lsm_tree.rs
index 75b60d7..83f7240 100644
--- a/kolibrie/src/disk_storage/lsm_tree.rs
+++ b/kolibrie/src/disk_storage/lsm_tree.rs
@@ -9,7 +9,7 @@
  */
 
 use shared::triple::Triple;
-use shared::index_manager::UnifiedIndex;
+use shared::index_manager::*;
 use std::collections::VecDeque;
 use std::sync::{Arc, RwLock, Mutex};
 use std::path::PathBuf;
@@ -480,16 +480,21 @@ impl LSMTree {
     }
 
     /// Build UnifiedIndex from all data in LSM-Tree
-    pub fn build_unified_index(&self) -> UnifiedIndex {
-        let mut index = UnifiedIndex::new();
+    pub fn build_unified_index(&self) -> HexastoreIndex {
+        let mut index = HexastoreIndex::new();
         let all_triples =
self.get_all_triples(); index.build_from_triples(&all_triples); index } - /// Export to UnifiedIndex for use in SparqlDatabase - pub fn export_to_unified_index(&self) -> UnifiedIndex { - self.build_unified_index() + /// Export as Box for use in SparqlDatabase + pub fn export_to_trait_index(&self) -> Box { + Box::new(self.build_unified_index()) + } + + /// Keep old name for backward compat, now returns boxed + pub fn export_to_unified_index(&self) -> Box { + self.export_to_trait_index() } } diff --git a/kolibrie/src/disk_storage/sstable.rs b/kolibrie/src/disk_storage/sstable.rs index cab3138..f5cedef 100644 --- a/kolibrie/src/disk_storage/sstable.rs +++ b/kolibrie/src/disk_storage/sstable.rs @@ -9,7 +9,7 @@ */ use shared::triple::Triple; -use shared::index_manager::UnifiedIndex; +use shared::index_manager::*; use std::path::{Path, PathBuf}; use std::fs::File; use serde::{Serialize, Deserialize}; @@ -23,7 +23,7 @@ pub struct SSTable { /// Level in LSM tree (0, 1, 2, ...) pub level: usize, /// UnifiedIndex containing all 6 permutations - pub index: UnifiedIndex, + pub index: HexastoreIndex, /// Min and max keys for range queries (optimization) pub min_key: Triple, pub max_key: Triple, @@ -43,7 +43,7 @@ impl SSTable { memtable: &MemTable, data_dir: &Path, ) -> Result { - let mut index = UnifiedIndex::new(); + let mut index = HexastoreIndex::new(); let mut triples: Vec = Vec::new(); // Only include non-deleted triples @@ -90,7 +90,7 @@ impl SSTable { sstables: Vec<&SSTable>, data_dir: &Path, ) -> Result { - let mut merged_index = UnifiedIndex::new(); + let mut merged_index = HexastoreIndex::new(); // Merge all indexes for sstable in &sstables { diff --git a/kolibrie/src/query_engine.rs b/kolibrie/src/query_engine.rs index 5366545..967ff9c 100644 --- a/kolibrie/src/query_engine.rs +++ b/kolibrie/src/query_engine.rs @@ -112,7 +112,7 @@ impl QueryEngine { // Extract the encoded triples let triples = self.storage_manager.get_memory_database() - .index_manager + 
.index_manager.as_ref().expect("Cannot query index before building it") .query(None, None, None); // Insert into LSM-Tree @@ -121,7 +121,7 @@ impl QueryEngine { // Clear memory database self.storage_manager.get_memory_database_mut().triples.clear(); self.storage_manager.get_memory_database_mut().index_manager = - shared::index_manager::UnifiedIndex::new(); + Some(Box::new(shared::index_manager::HexastoreIndex::new())); // Build statistics self.storage_manager.get_memory_database_mut().get_or_build_stats(); diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index 9e0aa11..56c4618 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -1,24 +1,25 @@ /* - * Copyright © 2025 Volodymyr Kadzhaia - * Copyright © 2025 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - */ +* Copyright © 2025 Volodymyr Kadzhaia +* Copyright © 2025 Pieter Bonte +* KU Leuven — Stream Intelligence Lab, Belgium +* +* This Source Code Form is subject to the terms of the Mozilla Public +* License, v. 2.0. If a copy of the MPL was not distributed with this file, +* you can obtain one at https://mozilla.org/MPL/2.0/. 
+*/ +#[cfg(feature = "cuda")] +use crate::cuda::cuda_join::*; use shared::dictionary::Dictionary; use shared::query::{FilterExpression, ModelDecl, NeuralRelationDecl, TrainNeuralRelationDecl}; use shared::quoted_triple_store::{QuotedTripleStore, is_quoted_triple_id}; use shared::triple::Triple; use crate::parser; +use crate::parser::convert_triple_pattern; +use crate::query_builder::QueryBuilder; +use crate::streamertail_optimizer::DatabaseStats; use crate::utils; use crate::utils::ClonableFn; -#[cfg(feature = "cuda")] -use crate::cuda::cuda_join::*; -use shared::index_manager::UnifiedIndex; -use crate::query_builder::QueryBuilder; use crossbeam::channel::unbounded; use crossbeam::scope; use percent_encoding::percent_decode; @@ -26,15 +27,22 @@ use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::Reader; use rayon::prelude::*; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use std::arch::x86_64::*; +use shared::index_manager::TripleIndex; +use shared::index_manager::{ + BucketIndex, HexastoreIndex, IndexConfig, OPSSingleIndex, + OSPSingleIndex, POSSingleIndex, PSOSingleIndex, SOPSingleIndex, SPOSingleIndex, + SingleTableIndex, PartialHexastoreIndex +}; +use shared::terms::TriplePattern; #[cfg(target_arch = "aarch64")] use std::arch::aarch64::*; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use std::arch::x86_64::*; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::sync::{Mutex, RwLock}; +use sysinfo::{ProcessExt, System, SystemExt}; use url::Url; -use crate::streamertail_optimizer::DatabaseStats; const MIN_CHUNK_SIZE: usize = 1024; const HASHMAP_INITIAL_CAPACITY: usize = 4096; @@ -48,7 +56,7 @@ pub struct SparqlDatabase { pub dictionary: Arc>, pub prefixes: HashMap, pub udfs: HashMap, - pub index_manager: UnifiedIndex, + pub index_manager: Option>, pub rule_map: HashMap, pub model_decls: HashMap, pub neural_relation_decls: HashMap, @@ -58,18 +66,24 @@ pub struct SparqlDatabase { pub 
ml_predict_materialized_triples: HashMap>, pub probability_seeds: HashMap, pub cached_stats: Option>, + pub index_config: IndexConfig, pub quoted_triple_store: Arc>, } #[allow(dead_code)] impl SparqlDatabase { pub fn new() -> Self { + Self::with_config(IndexConfig::Hexastore) + } + + /// Creates a new database with a user-chosen indexing strategy. + pub fn with_config(config: IndexConfig) -> Self { Self { triples: BTreeSet::new(), dictionary: Arc::new(RwLock::new(Dictionary::new())), prefixes: HashMap::new(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: None, rule_map: HashMap::new(), model_decls: HashMap::new(), neural_relation_decls: HashMap::new(), @@ -79,9 +93,30 @@ impl SparqlDatabase { ml_predict_materialized_triples: HashMap::new(), probability_seeds: HashMap::new(), cached_stats: None, + index_config: config, quoted_triple_store: Arc::new(RwLock::new(QuotedTripleStore::new())), } } + pub fn set_prefixes(&mut self, prefixes: HashMap) { + self.prefixes = prefixes; + } + + fn make_initial_index(config: &IndexConfig) -> Box { + match config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + // Pattern-dependent indexes start as hexastore; + // `build_all_indexes` will swap them out. + IndexConfig::Buckets { .. } => Box::new(HexastoreIndex::new()), + IndexConfig::PartialHexastore { .. } => Box::new(HexastoreIndex::new()), + } + } /// Encode a term that may be a quoted triple `<< s p o >>` (recursive). /// Returns the u32 ID for the term. 
@@ -197,20 +232,377 @@ impl SparqlDatabase {
         }
     }
 
-    pub fn set_prefixes(&mut self, prefixes: HashMap<String, String>){
-        self.prefixes=prefixes;
+    /// Plans each query in `raw_queries` and lists the triple-pattern accesses
+    /// of the resulting physical plans.
+    ///
+    /// Every scanned pattern is reported with three flags telling which of its
+    /// positions are bound (a constant, or a variable bound earlier in the
+    /// plan) at the point where the scan would run. Where a pattern may be
+    /// evaluated both independently and with bindings from a sibling (star
+    /// joins, parallel joins) both variants are reported. Queries that fail
+    /// to parse are skipped.
+    fn resolve_planned_access_patterns(
+        &mut self,
+        raw_queries: &[String],
+    ) -> Vec<shared::query::PlannedAccessPattern> {
+        use crate::streamertail_optimizer::operators::PhysicalOperator;
+        use crate::streamertail_optimizer::utils::build_logical_plan;
+        use crate::streamertail_optimizer::Streamertail;
+        use shared::query::PlannedAccessPattern;
+        use shared::terms::Term;
+        use std::collections::HashSet;
+
+        /// Prefixes `name` with `?` unless it already starts with one.
+        fn as_variable_name(name: String) -> String {
+            if name.starts_with('?') {
+                name
+            } else {
+                format!("?{}", name)
+            }
+        }
+
+        /// A term is bound when it is a constant or a variable already in `bound`.
+        /// Quoted triples are never treated as bound.
+        fn is_bound(term: &Term, bound: &HashSet<String>) -> bool {
+            match term {
+                Term::Constant(_) => true,
+                Term::Variable(name) => bound.contains(name),
+                Term::QuotedTriple(_) => false,
+            }
+        }
+
+        /// Bound flags for (subject, predicate, object) of `pattern`.
+        fn bound_mask(pattern: &TriplePattern, bound: &HashSet<String>) -> (bool, bool, bool) {
+            let (s, p, o) = pattern;
+            (is_bound(s, bound), is_bound(p, bound), is_bound(o, bound))
+        }
+
+        /// Packs a pattern and its bound flags into a `PlannedAccessPattern`.
+        fn planned(pattern: &TriplePattern, mask: (bool, bool, bool)) -> PlannedAccessPattern {
+            let (bound_subject, bound_predicate, bound_object) = mask;
+            PlannedAccessPattern {
+                pattern: pattern.clone(),
+                bound_subject,
+                bound_predicate,
+                bound_object,
+            }
+        }
+
+        /// Marks every variable occurring in `pattern` as bound.
+        fn bind_variables(pattern: &TriplePattern, bound: &mut HashSet<String>) {
+            let (s, p, o) = pattern;
+            for &term in [s, p, o].iter() {
+                if let Term::Variable(name) = term {
+                    bound.insert(name.clone());
+                }
+            }
+        }
+
+        /// Number of constant positions in `pattern` (0 to 3).
+        fn constant_count(pattern: &TriplePattern) -> usize {
+            let (s, p, o) = pattern;
+            [s, p, o]
+                .iter()
+                .filter(|&&term| matches!(term, Term::Constant(_)))
+                .count()
+        }
+
+        /// Walks the physical plan, appending one entry per possible scan to
+        /// `out` and growing `bound_vars` with the variables each operator binds.
+        fn traverse_physical(
+            op: &PhysicalOperator,
+            bound_vars: &mut HashSet<String>,
+            out: &mut Vec<PlannedAccessPattern>,
+        ) {
+            match op {
+                PhysicalOperator::TableScan { pattern }
+                | PhysicalOperator::IndexScan { pattern } => {
+                    out.push(planned(pattern, bound_mask(pattern, bound_vars)));
+                    bind_variables(pattern, bound_vars);
+                }
+                PhysicalOperator::StarJoin {
+                    join_var: _,
+                    patterns,
+                } => {
+                    // Patterns with the most constants come first (stable sort).
+                    let mut ordered = patterns.clone();
+                    ordered.sort_by_key(|p| std::cmp::Reverse(constant_count(p)));
+
+                    let before_join = bound_vars.clone();
+
+                    for (position, pattern) in ordered.iter().enumerate() {
+                        // The pattern evaluated independently (Hash Join path fallback)
+                        let independent = bound_mask(pattern, &before_join);
+                        out.push(planned(pattern, independent));
+
+                        // After the first pattern it might also be executed as a
+                        // Bind Join; report that variant only if it differs.
+                        if position > 0 {
+                            let accumulated = bound_mask(pattern, bound_vars);
+                            if accumulated != independent {
+                                out.push(planned(pattern, accumulated));
+                            }
+                        }
+
+                        // Update accumulated bound_vars
+                        bind_variables(pattern, bound_vars);
+                    }
+                }
+                PhysicalOperator::ParallelJoin { left, right } => {
+                    // Left executes independently
+                    let mut left_vars = bound_vars.clone();
+                    traverse_physical(left, &mut left_vars, out);
+
+                    // Right can execute independently (Hash/Merge join) OR dependently (Bind join)
+                    let mut right_vars_unbound = bound_vars.clone();
+                    traverse_physical(right, &mut right_vars_unbound, out);
+
+                    let mut right_vars_bound = left_vars.clone();
+                    traverse_physical(right, &mut right_vars_bound, out);
+
+                    bound_vars.extend(left_vars);
+                    bound_vars.extend(right_vars_unbound);
+                }
+                PhysicalOperator::NestedLoopJoin { left, right }
+                | PhysicalOperator::HashJoin { left, right }
+                | PhysicalOperator::OptimizedHashJoin { left, right } => {
+                    // Both sides evaluate independently using ONLY the pre-join bounds
+                    let mut left_vars = bound_vars.clone();
+                    let mut right_vars = bound_vars.clone();
+                    traverse_physical(left, &mut left_vars, out);
+                    traverse_physical(right, &mut right_vars, out);
+
+                    bound_vars.extend(left_vars);
+                    bound_vars.extend(right_vars);
+                }
+                PhysicalOperator::Filter { input, .. }
+                | PhysicalOperator::Projection { input, .. } => {
+                    traverse_physical(input, bound_vars, out);
+                }
+                PhysicalOperator::Subquery { inner, .. } => {
+                    traverse_physical(inner, bound_vars, out);
+                }
+                PhysicalOperator::Bind {
+                    input,
+                    output_variable,
+                    ..
+                }
+                | PhysicalOperator::MLPredict {
+                    input,
+                    output_variable,
+                    ..
+                } => {
+                    traverse_physical(input, bound_vars, out);
+                    bound_vars.insert(output_variable.clone());
+                }
+                PhysicalOperator::Values { variables, .. } => {
+                    for var in variables {
+                        bound_vars.insert(as_variable_name(var.clone()));
+                    }
+                }
+                PhysicalOperator::InMemoryBuffer { .. } => {}
+            }
+        }
+
+        let mut planned_patterns = Vec::new();
+
+        for query_str in raw_queries {
+            let parsed = match crate::parser::parse_sparql_query(query_str) {
+                Ok((_, parsed)) => parsed,
+                Err(_) => continue,
+            };
+            let (
+                _insert_clause,
+                variables,
+                patterns,
+                filters,
+                _group_vars,
+                parsed_prefixes,
+                values_clause,
+                binds,
+                _subqueries,
+                _limit,
+                _,
+                _order_conditions,
+            ) = parsed;
+
+            // Prefixes declared in the query override the database-wide ones.
+            let mut prefixes = self.prefixes.clone();
+            prefixes.extend(parsed_prefixes);
+
+            let logical_plan = build_logical_plan(
+                variables.iter().map(|(t, v, _)| (*t, *v)).collect(),
+                patterns,
+                filters.clone(),
+                &prefixes,
+                self,
+                &binds,
+                values_clause.as_ref(),
+            );
+
+            let stats = self.get_or_build_stats();
+            let mut optimizer = Streamertail::with_cached_stats(stats.clone());
+            let optimized_plan = optimizer.find_best_plan(&logical_plan);
+
+            // Variables supplied by a VALUES clause are bound from the start.
+            let mut bound_vars = HashSet::new();
+            if let Some(vc) = values_clause {
+                for var in &vc.variables {
+                    bound_vars.insert(as_variable_name(var.to_string()));
+                }
+            }
+
+            traverse_physical(&optimized_plan, &mut bound_vars, &mut planned_patterns);
+        }
+
+        planned_patterns
+    }
+
+    fn resolve_query_patterns(&self, raw_queries: &[String]) -> Vec<TriplePattern> {
+        let mut patterns = Vec::new();
+
+        for query_str in raw_queries {
+            // parse_sparql_query returns a big tuple; field index 2
+            // is the Vec of raw (&str, &str, &str) triple patterns,
+            // field index 5 is the HashMap of prefixes.
+ if let Ok((_rest, parsed)) = crate::parser::parse_sparql_query(query_str) { + let raw_patterns = parsed.2; // Vec<(&str, &str, &str)> + let query_prefixes = parsed.5; // HashMap + + // Merge query prefixes with database prefixes + let mut all_prefixes = self.prefixes.clone(); + for (k, v) in query_prefixes { + all_prefixes.insert(k, v); + } + + let mut dict = self.dictionary.write().unwrap(); + for triple in raw_patterns { + patterns.push(convert_triple_pattern(triple, &mut dict, &all_prefixes)); + } + } + } + + patterns + } + + pub fn build_all_indexes(&mut self) { + // Memory usage logging + let mut sys = System::new_all(); + let pid = sysinfo::get_current_pid().unwrap(); + sys.refresh_process(pid); + let mem_before = sys.process(pid).unwrap().memory(); + + let triples: Vec = self.triples.iter().cloned().collect(); + + // Clone the config to avoid holding an immutable borrow of `self` + let config = self.index_config.clone(); + + let mut index: Box = match config { + IndexConfig::Hexastore => Box::new(HexastoreIndex::new()), + IndexConfig::SPO => Box::new(SPOSingleIndex::new()), + IndexConfig::POS => Box::new(POSSingleIndex::new()), + IndexConfig::OSP => Box::new(OSPSingleIndex::new()), + IndexConfig::PSO => Box::new(PSOSingleIndex::new()), + IndexConfig::OPS => Box::new(OPSSingleIndex::new()), + IndexConfig::SOP => Box::new(SOPSingleIndex::new()), + IndexConfig::SingleTable => Box::new(SingleTableIndex::new()), + + IndexConfig::Buckets { queries } => { + let patterns = self.resolve_planned_access_patterns(&queries); + Box::new(BucketIndex::new(patterns)) + } + + IndexConfig::PartialHexastore { queries } => { + let parsed_patterns = self.resolve_planned_access_patterns(&queries); + Box::new(PartialHexastoreIndex::new(parsed_patterns)) + } + + // Future index types go here: + // IndexConfig::YourNewIndex { some_param, queries } => { + // let patterns = self.resolve_query_patterns(&queries); + // Box::new(YourNewIndex::new(patterns, some_param)) + // } + }; + + 
index.build_from_triples(&triples); + index.optimize(); + + // Memory usage logging + sys.refresh_process(pid); + let mem_after = sys.process(pid).unwrap().memory(); + println!( + "[Memory Debug] Index Build memory cost: {} MB", + (mem_after - mem_before) / 1024 / 1024 + ); + + self.index_manager = Some(index); + } + + /// Get a reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. + pub fn index(&self) -> &dyn TripleIndex { + self.index_manager + .as_deref() + .expect("index not built — call build_all_indexes() first") + } + + /// Get a mutable reference to the index. + /// Panics if `build_all_indexes()` hasn't been called yet. + pub fn index_mut(&mut self) -> &mut dyn TripleIndex { + self.index_manager + .as_deref_mut() + .expect("index not built — call build_all_indexes() first") } pub fn get_or_build_stats(&mut self) -> Arc { if let Some(stats) = &self.cached_stats { - return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats + return stats.clone(); // ← Clone the Arc (cheap), not the DatabaseStats } - + let stats = Arc::new(DatabaseStats::gather_stats_fast(self)); self.cached_stats = Some(stats.clone()); stats } - + pub fn invalidate_stats_cache(&mut self) { self.cached_stats = None; } @@ -221,13 +613,17 @@ impl SparqlDatabase { pub fn add_triple(&mut self, triple: Triple) { self.triples.insert(triple.clone()); - self.index_manager.insert(&triple); + if let Some(ref mut idx) = self.index_manager { + idx.insert(&triple); + } } - + pub fn delete_triple(&mut self, triple: &Triple) -> bool { let removed = self.triples.remove(triple); if removed { - self.index_manager.delete(triple); + if let Some(ref mut idx) = self.index_manager { + idx.delete(triple); + } } removed } @@ -280,7 +676,7 @@ impl SparqlDatabase { let mut xml = String::new(); xml.push_str("\n"); xml.push_str("\n"); - + // Group triples by subject let dict = self.dictionary.read().unwrap(); let mut subjects: BTreeMap> = BTreeMap::new(); @@ -300,10 +696,13 
@@ impl SparqlDatabase { let subject = dict.decode(triple.subject); let predicate = dict.decode(triple.predicate); let object = dict.decode(triple.object); - subjects.entry(subject.unwrap().to_string()).or_default().push((predicate.unwrap().to_string(), object.unwrap().to_string())); + subjects + .entry(subject.unwrap().to_string()) + .or_default() + .push((predicate.unwrap().to_string(), object.unwrap().to_string())); } drop(dict); - + // For each subject, create an element. for (subject, po_pairs) in subjects { xml.push_str(&format!(" \n", subject)); @@ -312,7 +711,7 @@ impl SparqlDatabase { } xml.push_str(" \n"); } - + xml.push_str("\n"); xml } @@ -518,7 +917,9 @@ impl SparqlDatabase { // Skip empty or whitespace-only text if !trimmed_object.is_empty() { if let Ok(subject_str) = std::str::from_utf8(¤t_subject) { - if let Ok(predicate_str) = std::str::from_utf8(¤t_predicate) { + if let Ok(predicate_str) = + std::str::from_utf8(¤t_predicate) + { let resolved_predicate = self.resolve_term(predicate_str); // Lock the dictionary for encoding let mut dict = dictionary.write().unwrap(); @@ -908,16 +1309,18 @@ impl SparqlDatabase { pub fn parse_n3(&mut self, n3_data: &str) { let lines: Vec = n3_data.lines().map(|l| l.trim().to_string()).collect(); let chunk_size = 1000; - let chunks: Vec> = lines - .chunks(chunk_size) - .map(|c| c.to_vec()) - .collect(); - - let partial_results: Vec<(BTreeSet, Arc>, HashMap)> = - chunks.par_iter().map(|chunk| { + let chunks: Vec> = lines.chunks(chunk_size).map(|c| c.to_vec()).collect(); + + let partial_results: Vec<( + BTreeSet, + Arc>, + HashMap, + )> = chunks + .par_iter() + .map(|chunk| { let mut local_db = SparqlDatabase::new(); let mut statement = String::new(); - + for raw_line in chunk { let mut line = raw_line.as_str(); if let Some(comment_start) = line.find('#') { @@ -932,7 +1335,10 @@ impl SparqlDatabase { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let prefix = 
parts[0].trim_end_matches(':').to_string(); - let uri = parts[1].trim_start_matches('<').trim_end_matches('>').to_string(); + let uri = parts[1] + .trim_start_matches('<') + .trim_end_matches('>') + .to_string(); local_db.prefixes.insert(prefix, uri); } else { eprintln!("Invalid prefix declaration: {}", line); @@ -946,10 +1352,11 @@ impl SparqlDatabase { } } } - + (local_db.triples, local_db.dictionary, local_db.prefixes) - }).collect(); - + }) + .collect(); + for (triples, dict_arc, pref) in partial_results { for t in triples { self.triples.insert(t); @@ -970,11 +1377,21 @@ impl SparqlDatabase { let partial_results = self.parse_ntriples(ntriples_data); let encoded_triples = self.encode_triples(partial_results); - for encoded_triple in encoded_triples{ + for encoded_triple in encoded_triples { self.add_triple(encoded_triple); } } + // Parse_ntriples and remove from DB function + pub fn parse_ntriples_and_remove(&mut self, ntriples_data: &str) { + let partial_results = self.parse_ntriples(ntriples_data); + + let encoded_triples = self.encode_triples(partial_results); + for encoded_triple in encoded_triples { + self.delete_triple(&encoded_triple); + } + } + // Parses ntriples pub fn parse_ntriples(&mut self, ntriples_data: &str) -> Vec> { let lines: Vec<&str> = ntriples_data.lines().collect(); @@ -1004,7 +1421,9 @@ impl SparqlDatabase { let line_without_dot = &line[..line.len() - 1].trim(); // Parse the triple - if let Some((subject, predicate, object)) = self.parse_ntriples_line(line_without_dot) { + if let Some((subject, predicate, object)) = + self.parse_ntriples_line(line_without_dot) + { local_triples.push((subject, predicate, object)); } } @@ -1031,7 +1450,7 @@ impl SparqlDatabase { encoded_triples } - pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec{ + pub fn parse_and_encode_ntriples(&mut self, ntriples_data: &str) -> Vec { let partial_results = self.parse_ntriples(ntriples_data); self.encode_triples(partial_results) @@ -1187,7 +1606,11 
@@ impl SparqlDatabase { let object = self.clean_ntriples_term(&parts[2]); Some((subject, predicate, object)) } else { - eprintln!("Invalid N-Triples line (expected 3 parts, got {}): {}", parts.len(), line); + eprintln!( + "Invalid N-Triples line (expected 3 parts, got {}): {}", + parts.len(), + line + ); None } } @@ -1203,9 +1626,9 @@ impl SparqlDatabase { // Handle URIs if term.starts_with('<') && term.ends_with('>') { - return term[1..term.len()-1].to_string(); + return term[1..term.len() - 1].to_string(); } - + // Handle literals (keep quotes and datatype/language info) if term.starts_with('"') { if let Some(close_quote_pos) = term[1..].find('"') { @@ -1221,7 +1644,7 @@ impl SparqlDatabase { } } } - + // Return as-is for other cases term.to_string() } @@ -1335,7 +1758,7 @@ impl SparqlDatabase { pub fn register_prefixes_from_query(&mut self, query: &str) { // Simple regex to extract PREFIX declarations let prefix_pattern = regex::Regex::new(r"PREFIX\s+([a-zA-Z0-9_]+):\s*<([^>]+)>").unwrap(); - + for captures in prefix_pattern.captures_iter(query) { if captures.len() >= 3 { let prefix = captures[1].to_string(); @@ -1344,7 +1767,7 @@ impl SparqlDatabase { } } } - + // Method to ensure prefixes are properly shared between components pub fn share_prefixes_with(&self, prefixes: &mut HashMap) { for (prefix, uri) in &self.prefixes { @@ -1370,11 +1793,11 @@ impl SparqlDatabase { let mut parts = term.splitn(2, ':'); let prefix = parts.next().unwrap(); let local_name = parts.next().unwrap_or(""); - + // First check the passed prefixes map if let Some(uri) = prefixes.get(prefix) { format!("{}{}", uri, local_name) - } + } // Then check the database's own prefixes map as a fallback else if let Some(uri) = self.prefixes.get(prefix) { format!("{}{}", uri, local_name) @@ -1399,16 +1822,20 @@ impl SparqlDatabase { match filter_expr { FilterExpression::Comparison(var, operator, value) => { // Check if either side contains arithmetic operations - let has_arithmetic = 
var.contains('+') || var.contains('-') || - var.contains('*') || var.contains('/') || - value.contains('+') || value.contains('-') || - value.contains('*') || value.contains('/'); - + let has_arithmetic = var.contains('+') + || var.contains('-') + || var.contains('*') + || var.contains('/') + || value.contains('+') + || value.contains('-') + || value.contains('*') + || value.contains('/'); + if has_arithmetic { // Use the non-SIMD arithmetic expression evaluator for complex expressions let left_result = self.evaluate_arithmetic_string(result, var); let right_result = self.evaluate_arithmetic_string(result, value); - + match (left_result, right_result) { (Ok(left_val), Ok(right_val)) => { // Both sides are numeric, perform comparison @@ -1421,8 +1848,8 @@ impl SparqlDatabase { "<=" => left_val <= right_val, _ => false, } - }, - _ => false // At least one expression couldn't be evaluated + } + _ => false, // At least one expression couldn't be evaluated } } else { // For simple expressions without arithmetic operators, use the SIMD approach @@ -1430,12 +1857,12 @@ impl SparqlDatabase { // First, try parsing both values as numbers let var_value_num = var_value_str.parse::(); let filter_value_num = value.parse::(); - + if var_value_num.is_ok() && filter_value_num.is_ok() { // Both values are numeric, perform SIMD numeric comparison let var_value = var_value_num.unwrap(); let filter_value = filter_value_num.unwrap(); - + // On x86 (SSE2) or x86_64 (SSE2) use SIMD intrinsics #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { @@ -1444,37 +1871,51 @@ impl SparqlDatabase { let var_simd = _mm_set1_epi32(var_value); let filter_simd = _mm_set1_epi32(filter_value); return match *operator { - "=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, - "!=" => _mm_movemask_epi8(_mm_cmpeq_epi32( - var_simd, - filter_simd, - )) != 0xFFFF, - ">" => _mm_movemask_epi8(_mm_cmpgt_epi32( - var_simd, - filter_simd, - )) == 0xFFFF, + "=" => { + 
_mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) == 0xFFFF + } + "!=" => { + _mm_movemask_epi8(_mm_cmpeq_epi32( + var_simd, + filter_simd, + )) != 0xFFFF + } + ">" => { + _mm_movemask_epi8(_mm_cmpgt_epi32( + var_simd, + filter_simd, + )) == 0xFFFF + } ">=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let gt = _mm_cmpgt_epi32(var_simd, filter_simd); - _mm_movemask_epi8(_mm_or_si128(eq, gt)) == 0xFFFF + let eq = + _mm_cmpeq_epi32(var_simd, filter_simd); + let gt = + _mm_cmpgt_epi32(var_simd, filter_simd); + _mm_movemask_epi8(_mm_or_si128(eq, gt)) + == 0xFFFF + } + "<" => { + _mm_movemask_epi8(_mm_cmpgt_epi32( + filter_simd, + var_simd, + )) == 0xFFFF } - "<" => _mm_movemask_epi8(_mm_cmpgt_epi32( - filter_simd, - var_simd, - )) == 0xFFFF, "<=" => { - let eq = _mm_cmpeq_epi32(var_simd, filter_simd); - let lt = _mm_cmpgt_epi32(filter_simd, var_simd); - _mm_movemask_epi8(_mm_or_si128(eq, lt)) == 0xFFFF + let eq = + _mm_cmpeq_epi32(var_simd, filter_simd); + let lt = + _mm_cmpgt_epi32(filter_simd, var_simd); + _mm_movemask_epi8(_mm_or_si128(eq, lt)) + == 0xFFFF } _ => false, }; } } - + // On ARM (aarch64) use NEON intrinsics #[cfg(target_arch = "aarch64")] { @@ -1485,54 +1926,72 @@ impl SparqlDatabase { "=" => { let cmp = vceqq_s32(var_neon, filter_neon); (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) } "!=" => { let cmp = vceqq_s32(var_neon, filter_neon); !((vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF)) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF)) } ">" => { let cmp = 
vcgtq_s32(var_neon, filter_neon); (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) } ">=" => { let eq = vceqq_s32(var_neon, filter_neon); let gt = vcgtq_s32(var_neon, filter_neon); let cmp = vorrq_u32(eq, gt); (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) } "<" => { let cmp = vcgtq_s32(filter_neon, var_neon); (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) } "<=" => { let eq = vceqq_s32(var_neon, filter_neon); let lt = vcgtq_s32(filter_neon, var_neon); let cmp = vorrq_u32(eq, lt); (vgetq_lane_u32(cmp, 0) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 1) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 2) == 0xFFFFFFFF) - && (vgetq_lane_u32(cmp, 3) == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 1) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 2) + == 0xFFFFFFFF) + && (vgetq_lane_u32(cmp, 3) + == 0xFFFFFFFF) } _ => false, - } + }; } } - + // Fallback (or if compiled for a non‐SIMD platform) #[cfg(not(any( target_arch = "x86", @@ -1554,10 +2013,10 @@ impl SparqlDatabase { // At least one value is a string, perform string comparison let var_bytes = var_value_str.as_bytes(); let filter_bytes = value.as_bytes(); - + let var_len = var_bytes.len(); let filter_len = filter_bytes.len(); - + // If lengths differ, they can't be 
equal if var_len != filter_len { return match *operator { @@ -1566,19 +2025,20 @@ impl SparqlDatabase { _ => false, // Other operators are not supported for strings }; } - + let mut i = 0; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { unsafe { while i + 16 <= var_len { - let var_chunk = _mm_loadu_si128( - var_bytes[i..].as_ptr() as *const __m128i, - ); - let filter_chunk = _mm_loadu_si128( - filter_bytes[i..].as_ptr() as *const __m128i, - ); - let cmp = _mm_cmpeq_epi8(var_chunk, filter_chunk); + let var_chunk = + _mm_loadu_si128(var_bytes[i..].as_ptr() + as *const __m128i); + let filter_chunk = + _mm_loadu_si128(filter_bytes[i..].as_ptr() + as *const __m128i); + let cmp = + _mm_cmpeq_epi8(var_chunk, filter_chunk); let mask = _mm_movemask_epi8(cmp); if mask != 0xFFFF { return match *operator { @@ -1591,15 +2051,18 @@ impl SparqlDatabase { } } } - + #[cfg(target_arch = "aarch64")] { unsafe { while i + 16 <= var_len { - let var_chunk = vld1q_u8(var_bytes[i..].as_ptr()); - let filter_chunk = vld1q_u8(filter_bytes[i..].as_ptr()); + let var_chunk = + vld1q_u8(var_bytes[i..].as_ptr()); + let filter_chunk = + vld1q_u8(filter_bytes[i..].as_ptr()); let cmp = vceqq_u8(var_chunk, filter_chunk); - let cmp_arr: [u8; 16] = std::mem::transmute(cmp); + let cmp_arr: [u8; 16] = + std::mem::transmute(cmp); if cmp_arr.iter().any(|&lane| lane != 0xFF) { return match *operator { "=" => false, @@ -1611,7 +2074,7 @@ impl SparqlDatabase { } } } - + // Handle remaining bytes if i < var_len { for j in i..var_len { @@ -1624,7 +2087,7 @@ impl SparqlDatabase { } } } - + // Strings are equal match *operator { "=" => true, @@ -1636,15 +2099,15 @@ impl SparqlDatabase { false } } - }, + } FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) - }, + self.evaluate_filter_expression(result, left) + && self.evaluate_filter_expression(result, right) + } FilterExpression::Or(left, right) => { - 
self.evaluate_filter_expression(result, left) || - self.evaluate_filter_expression(result, right) - }, + self.evaluate_filter_expression(result, left) + || self.evaluate_filter_expression(result, right) + } FilterExpression::Not(expr) => { !self.evaluate_filter_expression(result, expr) }, @@ -1681,7 +2144,7 @@ impl SparqlDatabase { fn evaluate_arithmetic_expression<'a>( &self, result: &BTreeMap<&'a str, String>, - expr: &shared::query::ArithmeticExpression<'a> + expr: &shared::query::ArithmeticExpression<'a>, ) -> Result { match expr { shared::query::ArithmeticExpression::Operand(operand) => { @@ -1689,39 +2152,48 @@ impl SparqlDatabase { if operand.starts_with('?') { if let Some(var_value) = result.get(*operand) { // Parse the variable value as a number - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) } else { Err(format!("Variable '{}' not found", operand)) } - } + } // Check if it's a numeric literal else if operand.chars().all(|c| c.is_digit(10) || c == '.') { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) - } + operand + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", operand)) + } // Check if it's a string literal else if operand.starts_with('"') && operand.ends_with('"') { - Err(format!("Cannot perform arithmetic on string literal '{}'", operand)) - } + Err(format!( + "Cannot perform arithmetic on string literal '{}'", + operand + )) + } // Parse it as a number else { - operand.parse::().map_err(|_| format!("Cannot parse '{}' as a number", operand)) + operand + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", operand)) } - }, + } shared::query::ArithmeticExpression::Add(left, right) => { let left_val = self.evaluate_arithmetic_expression(result, left)?; let right_val = self.evaluate_arithmetic_expression(result, right)?; Ok(left_val + right_val) - }, + } 
shared::query::ArithmeticExpression::Subtract(left, right) => { let left_val = self.evaluate_arithmetic_expression(result, left)?; let right_val = self.evaluate_arithmetic_expression(result, right)?; Ok(left_val - right_val) - }, + } shared::query::ArithmeticExpression::Multiply(left, right) => { let left_val = self.evaluate_arithmetic_expression(result, left)?; let right_val = self.evaluate_arithmetic_expression(result, right)?; Ok(left_val * right_val) - }, + } shared::query::ArithmeticExpression::Divide(left, right) => { let left_val = self.evaluate_arithmetic_expression(result, left)?; let right_val = self.evaluate_arithmetic_expression(result, right)?; @@ -1738,38 +2210,48 @@ impl SparqlDatabase { fn evaluate_arithmetic_string<'a>( &self, result: &BTreeMap<&'a str, String>, - expr_str: &'a str + expr_str: &'a str, ) -> Result { // Check for parenthesized expressions and remove them if needed let expr_to_parse = if expr_str.starts_with('(') && expr_str.ends_with(')') { - &expr_str[1..expr_str.len()-1] + &expr_str[1..expr_str.len() - 1] } else { expr_str }; - - if expr_to_parse.contains('+') || expr_to_parse.contains('-') || - expr_to_parse.contains('*') || expr_to_parse.contains('/') { + + if expr_to_parse.contains('+') + || expr_to_parse.contains('-') + || expr_to_parse.contains('*') + || expr_to_parse.contains('/') + { // Parse the expression string into an ArithmeticExpression match parser::parse_arithmetic_expression(expr_to_parse) { Ok((_, arithmetic_expr)) => { // Evaluate the parsed expression self.evaluate_arithmetic_expression(result, &arithmetic_expr) - }, + } Err(e) => { // Print the error - eprintln!("Failed to parse arithmetic expression '{}': {:?}", expr_to_parse, e); - + eprintln!( + "Failed to parse arithmetic expression '{}': {:?}", + expr_to_parse, e + ); + // If parsing fails, try to treat it as a simple operand if expr_to_parse.starts_with('?') { // It's a variable if let Some(var_value) = result.get(expr_to_parse) { - 
var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) } else { Err(format!("Variable '{}' not found", expr_to_parse)) } } else { // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + expr_to_parse + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) } } } @@ -1778,13 +2260,17 @@ impl SparqlDatabase { if expr_to_parse.starts_with('?') { // It's a variable if let Some(var_value) = result.get(expr_to_parse) { - var_value.parse::().map_err(|_| format!("Cannot parse '{}' as a number", var_value)) + var_value + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", var_value)) } else { Err(format!("Variable '{}' not found", expr_to_parse)) } } else { // Parse as a number - expr_to_parse.parse::().map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) + expr_to_parse + .parse::() + .map_err(|_| format!("Cannot parse '{}' as a number", expr_to_parse)) } } } @@ -1793,14 +2279,14 @@ impl SparqlDatabase { fn evaluate_filter_expression<'a>( &self, result: &BTreeMap<&'a str, String>, - filter_expr: &FilterExpression<'a> + filter_expr: &FilterExpression<'a>, ) -> bool { match filter_expr { FilterExpression::Comparison(left, operator, right) => { // Evaluate both sides as arithmetic expressions let left_result = self.evaluate_arithmetic_string(result, left); let right_result = self.evaluate_arithmetic_string(result, right); - + match (left_result, right_result) { (Ok(left_val), Ok(right_val)) => { // Both sides are numeric, perform numeric comparison @@ -1813,7 +2299,7 @@ impl SparqlDatabase { "<=" => left_val <= right_val, _ => false, } - }, + } _ => { let left_str = if left.starts_with('?') { // Fix for the type mismatch error - convert to string @@ -1824,7 +2310,7 @@ impl SparqlDatabase { } else { left }; - + let right_str = if 
right.starts_with('?') { // Fix for the type mismatch error - convert to string match result.get(right) { @@ -1834,7 +2320,7 @@ impl SparqlDatabase { } else { right }; - + match *operator { "=" => left_str == right_str, "!=" => left_str != right_str, @@ -1842,11 +2328,11 @@ impl SparqlDatabase { } } } - }, + } FilterExpression::And(left, right) => { - self.evaluate_filter_expression(result, left) && - self.evaluate_filter_expression(result, right) - }, + self.evaluate_filter_expression(result, left) + && self.evaluate_filter_expression(result, right) + } FilterExpression::Or(left, right) => { self.evaluate_filter_expression(result, left) || self.evaluate_filter_expression(result, right) @@ -1890,10 +2376,8 @@ impl SparqlDatabase { // Re-encode triples from the other database using the merged dictionary let mut re_encoded_triples = BTreeSet::new(); for triple in &other.triples { - let subject = - merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); - let predicate = - merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); + let subject = merged_dictionary.encode(other_dict.decode(triple.subject).unwrap()); + let predicate = merged_dictionary.encode(other_dict.decode(triple.predicate).unwrap()); let object = merged_dictionary.encode(other_dict.decode(triple.object).unwrap()); re_encoded_triples.insert(Triple { subject, @@ -1918,7 +2402,7 @@ impl SparqlDatabase { dictionary: Arc::new(RwLock::new(merged_dictionary)), prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: Some(self.index().clone_empty()), rule_map: HashMap::new(), model_decls: self.model_decls.clone(), neural_relation_decls: self.neural_relation_decls.clone(), @@ -1928,6 +2412,7 @@ impl SparqlDatabase { ml_predict_materialized_triples: self.ml_predict_materialized_triples.clone(), probability_seeds: merged_seeds, cached_stats: None, + index_config: self.index_config.clone(), quoted_triple_store: 
Arc::clone(&self.quoted_triple_store), } } @@ -1995,7 +2480,7 @@ impl SparqlDatabase { dictionary: Arc::clone(&self.dictionary), prefixes: self.prefixes.clone(), udfs: HashMap::new(), - index_manager: UnifiedIndex::new(), + index_manager: Some(self.index().clone_empty()), rule_map: HashMap::new(), model_decls: self.model_decls.clone(), neural_relation_decls: self.neural_relation_decls.clone(), @@ -2005,6 +2490,7 @@ impl SparqlDatabase { ml_predict_materialized_triples: self.ml_predict_materialized_triples.clone(), probability_seeds: HashMap::new(), cached_stats: None, + index_config: self.index_config.clone(), quoted_triple_store: Arc::clone(&self.quoted_triple_store), } } @@ -2235,7 +2721,8 @@ impl SparqlDatabase { let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); // Partition final_results into groups based on variable bindings. - let mut both_vars_bound: HashMap<(String, String), Vec>> = HashMap::new(); + let mut both_vars_bound: HashMap<(String, String), Vec>> = + HashMap::new(); let mut subject_var_bound: HashMap>> = HashMap::new(); let mut object_var_bound: HashMap>> = HashMap::new(); let mut neither_var_bound: Vec> = Vec::new(); @@ -2252,10 +2739,16 @@ impl SparqlDatabase { .push(result); } (Some(subj_val), None) => { - subject_var_bound.entry(subj_val.clone()).or_default().push(result); + subject_var_bound + .entry(subj_val.clone()) + .or_default() + .push(result); } (None, Some(obj_val)) => { - object_var_bound.entry(obj_val.clone()).or_default().push(result); + object_var_bound + .entry(obj_val.clone()) + .or_default() + .push(result); } (None, None) => { neither_var_bound.push(result); @@ -2681,13 +3174,13 @@ impl SparqlDatabase { // Preallocate with capacity estimation to avoid rehashing let estimated_capacity = (final_results.len() / 4).max(HASHMAP_INITIAL_CAPACITY); - + // Use with_capacity to preallocate hashmap space - let mut both_vars_bound: HashMap<(String, String), Vec> = + let mut both_vars_bound: HashMap<(String, 
String), Vec> = HashMap::with_capacity(estimated_capacity); - let mut subject_var_bound: HashMap> = + let mut subject_var_bound: HashMap> = HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = + let mut object_var_bound: HashMap> = HashMap::with_capacity(estimated_capacity); let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); @@ -2730,29 +3223,33 @@ impl SparqlDatabase { // Calculate optimal chunk size based on available processors and dataset size let chunk_size = (triples.len() / rayon::current_num_threads()).max(MIN_CHUNK_SIZE); - + // Process triples in chunks for better cache locality and load balancing let results = triples .par_chunks(chunk_size) .flat_map(|triple_chunk| { // Preallocate result vector for this chunk based on estimated hit rate let mut local_results = Vec::with_capacity(triple_chunk.len() / 4); - + // Process each triple in the chunk for triple in triple_chunk { // Step 1: Quick predicate check first (early filter) let pred_opt = dictionary.decode(triple.predicate); - if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { + if pred_opt.is_none() + || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes + { continue; } - + // Step 2: Filter check if needed if let Some(filter_bytes) = &literal_filter_bytes { let obj_opt = dictionary.decode(triple.object); - if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { + if obj_opt.is_none() + || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes + { continue; } - + // Decode subject only if predicate and object pass filters if let Some(subj) = dictionary.decode(triple.subject) { process_join( @@ -2772,7 +3269,7 @@ impl SparqlDatabase { // No filter - decode both subject and object let subj_opt = dictionary.decode(triple.subject); let obj_opt = dictionary.decode(triple.object); - + if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { process_join( &subj, @@ -2789,7 +3286,7 @@ impl 
SparqlDatabase { } } } - + local_results }) .collect(); @@ -2817,12 +3314,12 @@ impl SparqlDatabase { let literal_filter_bytes = literal_filter.as_ref().map(|s| s.as_bytes()); let estimated_capacity = (final_results.len() / 3).max(HASHMAP_INITIAL_CAPACITY1); - - let mut both_vars_bound: HashMap<(String, String), Vec> = - HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller - let mut subject_var_bound: HashMap> = + + let mut both_vars_bound: HashMap<(String, String), Vec> = + HashMap::with_capacity(estimated_capacity / 2); // This tends to be smaller + let mut subject_var_bound: HashMap> = HashMap::with_capacity(estimated_capacity); - let mut object_var_bound: HashMap> = + let mut object_var_bound: HashMap> = HashMap::with_capacity(estimated_capacity); let mut neither_var_bound: Vec = Vec::with_capacity(final_results.len() / 2); @@ -2863,12 +3360,13 @@ impl SparqlDatabase { let object_var_bound_arc = Arc::new(object_var_bound); let neither_var_bound_arc = Arc::new(neither_var_bound); - let chunk_size = ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); - + let chunk_size = + ((triples.len() / rayon::current_num_threads()) * 3 / 2).max(MIN_CHUNK_SIZE1); + let results = triples .par_chunks(chunk_size) .fold( - || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size + || Vec::with_capacity(chunk_size / 4), // Local vector capacity based on chunk size |mut local_results, triple_chunk| { // Create a local result buffer process_triple_chunk( @@ -2885,7 +3383,7 @@ impl SparqlDatabase { &mut local_results, dictionary, ); - + local_results }, ) @@ -2898,7 +3396,7 @@ impl SparqlDatabase { if chunk.is_empty() { return acc; } - + // Pre-allocate to avoid reallocation during append if acc.capacity() < acc.len() + chunk.len() { acc.reserve(chunk.len()); @@ -3223,45 +3721,17 @@ impl SparqlDatabase { self.udfs.insert(name.to_string(), ClonableFn::new(f)); } - /// Rebuild all indexes from the current 
state of `self.triples`. - pub fn build_all_indexes(&mut self) { - // Clear existing indexes - self.index_manager.clear(); - - // Get all triples as a vector for parallel processing - let triples: Vec = self.triples.iter().cloned().collect(); - - // Calculate optimal chunk size based on available cores and data size - let num_threads = rayon::current_num_threads(); - let chunk_size = (triples.len() / num_threads).max(1000); - - // Build indexes in parallel chunks - let partial_indexes: Vec<_> = triples - .par_chunks(chunk_size) - .map(|chunk| { - let mut local_index = shared::index_manager::UnifiedIndex::new(); - for triple in chunk { - local_index.insert(triple); - } - local_index - }) - .collect(); - - // Merge all partial indexes - for partial_index in partial_indexes { - self.index_manager.merge_from(partial_index); - } - - // Optimize the final merged index - self.index_manager.optimize(); - } - /// Triple to string pub fn triple_to_string(&self, triple: &Triple, dict: &Dictionary) -> String { let subject = dict.decode(triple.subject); let predicate = dict.decode(triple.predicate); let object = dict.decode(triple.object); - format!("{} {} {}", subject.unwrap(), predicate.unwrap(), object.unwrap()) + format!( + "{} {} {}", + subject.unwrap(), + predicate.unwrap(), + object.unwrap() + ) } pub fn decode_triple(&self, triple: &Triple) -> Option<(String, String, String)> { @@ -3270,12 +3740,15 @@ impl SparqlDatabase { let predicate = dict.decode(triple.predicate)?.to_string(); let object = dict.decode(triple.object)?.to_string(); drop(dict); - + Some((subject, predicate, object)) } } -#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), target_feature(enable = "sse2"))] +#[cfg_attr( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature(enable = "sse2") +)] #[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))] pub unsafe fn simd_eq(a: &[u8], b: &[u8]) -> bool { if a.len() != b.len() { @@ -3439,7 +3912,7 @@ fn process_join<'a>( 
// Process neither_var_bound - least restrictive case last for &idx in neither_var_bound.iter() { let base_result = &final_results_arc[idx]; - + // Check both consistency constraints let subject_consistent = base_result .get(subject_var) @@ -3450,7 +3923,7 @@ fn process_join<'a>( if subject_consistent && object_consistent { let mut extended_result = base_result.clone(); - + // Only insert if not already present if !base_result.contains_key(subject_var) { extended_result.insert(subject_var, subject.to_string()); @@ -3458,7 +3931,7 @@ fn process_join<'a>( if !base_result.contains_key(object_var) { extended_result.insert(object_var, object.to_string()); } - + local_results.push(extended_result); } } @@ -3485,13 +3958,13 @@ fn process_triple_chunk<'a>( if pred_opt.is_none() || pred_opt.as_ref().unwrap().as_bytes() != predicate_bytes { continue; } - + if let Some(filter_bytes) = literal_filter_bytes { let obj_opt = dictionary.decode(triple.object); if obj_opt.is_none() || obj_opt.as_ref().unwrap().as_bytes() != *filter_bytes { continue; } - + if let Some(subj) = dictionary.decode(triple.subject) { process_join_efficiently( &subj, @@ -3509,7 +3982,7 @@ fn process_triple_chunk<'a>( } else { let subj_opt = dictionary.decode(triple.subject); let obj_opt = dictionary.decode(triple.object); - + if let (Some(subj), Some(obj)) = (subj_opt, obj_opt) { process_join_efficiently( &subj, @@ -3528,7 +4001,6 @@ fn process_triple_chunk<'a>( } } - #[inline(always)] fn process_join_efficiently<'a>( subject: &str, @@ -3587,7 +4059,7 @@ fn process_join_efficiently<'a>( // Process least restrictive case - neither var bound for &idx in neither_var_bound.iter() { let base_result = &final_results_arc[idx]; - + // Check both consistency constraints let subject_consistent = base_result .get(subject_var) @@ -3598,7 +4070,7 @@ fn process_join_efficiently<'a>( if subject_consistent && object_consistent { let mut extended_result = base_result.clone(); - + // Only insert if not already present if 
!base_result.contains_key(subject_var) { extended_result.insert(subject_var, subject.to_string()); @@ -3606,7 +4078,7 @@ fn process_join_efficiently<'a>( if !base_result.contains_key(object_var) { extended_result.insert(object_var, object.to_string()); } - + local_results.push(extended_result); } } diff --git a/kolibrie/src/storage_manager.rs b/kolibrie/src/storage_manager.rs index 6b6b1f3..ff5a61f 100644 --- a/kolibrie/src/storage_manager.rs +++ b/kolibrie/src/storage_manager.rs @@ -13,6 +13,7 @@ use crate::disk_storage::lsm_tree::{LSMTree, LSMConfig}; use crate::storage_trait::{StorageTrait, QueryAnalysis, QueryAnalyzer, StorageMode}; use crate::execute_query::{execute_query, execute_query_rayon_parallel2_volcano}; use shared::triple::Triple; +use shared::index_manager::TripleIndex; /// Storage backend type - determines where data is physically stored #[derive(Debug, Clone, PartialEq, Eq)] @@ -160,7 +161,7 @@ impl StorageManager { match self.current_backend { StorageBackend::Memory => { // Query the UnifiedIndex in memory database - self.memory_database.index_manager.query(s, p, o) + self.memory_database.index().query(s, p, o) } StorageBackend::Disk => { if let Some(ref lsm) = self.disk_database { @@ -187,7 +188,7 @@ impl StorageManager { // Create temporary database with the disk's index let mut temp_db = SparqlDatabase::new(); - temp_db.index_manager = unified_index; + temp_db.index_manager = Some(unified_index); // Share dictionary and prefixes from memory database temp_db.dictionary = self.memory_database.dictionary.clone(); @@ -221,7 +222,7 @@ impl StorageManager { } // Get all triples from memory's UnifiedIndex - let triples = self.memory_database.index_manager.query(None, None, None); + let triples = self.memory_database.index().query(None, None, None); // Insert into disk if let Some(ref lsm) = self.disk_database { @@ -258,7 +259,7 @@ impl StorageManager { /// Get statistics about current storage pub fn get_storage_stats(&self) -> StorageStats { - let 
memory_triples = self.memory_database.index_manager.query(None, None, None).len(); + let memory_triples = self.memory_database.index().query(None, None, None).len(); let disk_triples = if let Some(ref lsm) = self.disk_database { lsm.get_all_triples().len() } else { @@ -446,4 +447,4 @@ mod tests { // Cleanup std::fs::remove_dir_all("./test_storage_manager_migrate").ok(); } -} \ No newline at end of file +} diff --git a/kolibrie/src/streamertail_optimizer/execution/engine.rs b/kolibrie/src/streamertail_optimizer/execution/engine.rs index 367b51d..24b7619 100644 --- a/kolibrie/src/streamertail_optimizer/execution/engine.rs +++ b/kolibrie/src/streamertail_optimizer/execution/engine.rs @@ -1245,17 +1245,11 @@ impl ExecutionEngine { match pattern { // FULLY BOUND (3 constants) - just check if triple exists (Term::Constant(s), Term::Constant(p), Term::Constant(o)) => { - // Use SPO index to check existence - if let Some(pred_map) = database.index_manager.spo.get(s) { - if let Some(objects) = pred_map.get(p) { - if objects.contains(o) { - // Triple exists - return empty binding (no variables to bind) - return vec![HashMap::new()]; - } - } + if !database.index().query(Some(*s), Some(*p), Some(*o)).is_empty() { + return vec![HashMap::new()]; + } else { + return Vec::new(); } - // Triple doesn't exist - Vec::new() } // TWO BOUNDS (2 constants, 1 variable) @@ -1282,7 +1276,7 @@ impl ExecutionEngine { // FULLY UNBOUND (0 constants, 3 variables) - table scan is appropriate (Term::Variable(s), Term::Variable(p), Term::Variable(o)) => { - println!("INFO: Full table scan for fully unbound pattern (? {}, ?{}, ?{})", s, p, o); + //println!("INFO: Full table scan for fully unbound pattern (? {}, ?{}, ?{})", s, p, o); Self::execute_table_scan_with_ids(database, pattern) } @@ -1299,22 +1293,25 @@ impl ExecutionEngine { predicate: u32, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(pred_map) = database.index_manager.spo.get(&subject) { - if let Some(objects) = pred_map.get(&predicate) { - // Use pre-compute the key - objects.iter().map(|&object| { - let mut result = HashMap::with_capacity(1); // Pre-size - result.insert(object_var.clone(), object); // Still need clone in closure - result - }).collect() - } else { - Vec::new() - } + // Try efficient two-key scan first + if let Some(objects) = database.index().scan_sp(subject, predicate) { + objects.iter().map(|&object| { + let mut result = HashMap::with_capacity(1); + result.insert(object_var.clone(), object); + result + }).collect() } else { - Vec::new() + // Fallback: query(Some(s), Some(p), None) + database.index().query(Some(subject), Some(predicate), None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(1); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } } @@ -1325,22 +1322,24 @@ impl ExecutionEngine { object: u32, predicate_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - if let Some(obj_map) = database.index_manager.sop.get(&subject) { - if let Some(predicates) = obj_map.get(&object) { - // Use iterator with pre-sized HashMap - predicates.iter().map(|&predicate| { + if let Some(predicates) = database.index().scan_so(subject, object) { + predicates.iter().map(|&predicate| { + let mut result = HashMap::with_capacity(1); + result.insert(predicate_var.clone(), predicate); + result + }).collect() + } else { + // Fallback: query(Some(s), None, Some(o)) + database.index().query(Some(subject), None, Some(object)) + .into_iter() + .map(|triple| { let mut result = HashMap::with_capacity(1); - result.insert(predicate_var.clone(), predicate); + result.insert(predicate_var.clone(), triple.predicate); result - }).collect() - } else { - Vec::new() - } - } else { - Vec::new() + }) + .collect() } } @@ -1351,22 +1350,24 @@ impl ExecutionEngine { object: u32, subject_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable name let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); - if let Some(obj_map) = database.index_manager.pos.get(&predicate) { - if let Some(subjects) = obj_map.get(&object) { - // Use iterator with pre-sized HashMap - subjects.iter().map(|&subject| { + if let Some(subjects) = database.index().scan_po(predicate, object) { + subjects.iter().map(|&subject| { + let mut result = HashMap::with_capacity(1); + result.insert(subject_var.clone(), subject); + result + }).collect() + } else { + // Fallback: query(None, Some(p), Some(o)) + database.index().query(None, Some(predicate), Some(object)) + .into_iter() + .map(|triple| { let mut result = HashMap::with_capacity(1); - result.insert(subject_var.clone(), subject); + result.insert(subject_var.clone(), triple.subject); result - }).collect() - } else { - Vec::new() - } - } else { - Vec::new() + }) + .collect() } } @@ -1377,25 +1378,18 @@ impl ExecutionEngine { predicate_var: String, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(pred_map) = database.index_manager.spo.get(&subject) { - // Clone variable names once before flat_map - pred_map.iter().flat_map(|(&predicate, objects)| { - let predicate_var = predicate_var.clone(); - let object_var = object_var.clone(); - objects.iter().map(move |&object| { - let mut result = HashMap::with_capacity(2); - result.insert(predicate_var.clone(), predicate); - result.insert(object_var.clone(), object); - result - }) - }).collect() - } else { - Vec::new() - } + database.index().query(Some(subject), None, None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(predicate_var.clone(), triple.predicate); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } /// Scans P index (Predicate -> (Subject, Object)) @@ -1405,25 +1399,18 @@ impl ExecutionEngine { subject_var: String, object_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let object_var = object_var.strip_prefix('?').unwrap_or(&object_var).to_string(); - if let Some(obj_map) = database.index_manager.pos.get(&predicate) { - // Clone variable names once before flat_map - obj_map.iter().flat_map(|(&object, subjects)| { - let subject_var = subject_var.clone(); - let object_var = object_var.clone(); - subjects.iter().map(move |&subject| { - let mut result = HashMap::with_capacity(2); - result.insert(subject_var.clone(), subject); - result.insert(object_var.clone(), object); - result - }) - }).collect() - } else { - Vec::new() - } + database.index().query(None, Some(predicate), None) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(subject_var.clone(), triple.subject); + result.insert(object_var.clone(), triple.object); + result + }) + .collect() } /// Scans O index (Object -> (Subject, Predicate)) @@ -1433,24 +1420,17 @@ impl ExecutionEngine { subject_var: String, predicate_var: String, ) -> Vec> { - // Strip '?' 
prefix from variable names let subject_var = subject_var.strip_prefix('?').unwrap_or(&subject_var).to_string(); let predicate_var = predicate_var.strip_prefix('?').unwrap_or(&predicate_var).to_string(); - if let Some(subj_map) = database.index_manager.osp.get(&object) { - // Clone variable names once before flat_map - subj_map.iter().flat_map(|(&subject, predicates)| { - let subject_var = subject_var.clone(); - let predicate_var = predicate_var.clone(); - predicates.iter().map(move |&predicate| { - let mut result = HashMap::with_capacity(2); - result.insert(subject_var.clone(), subject); - result.insert(predicate_var.clone(), predicate); - result - }) - }).collect() - } else { - Vec::new() - } + database.index().query(None, None, Some(object)) + .into_iter() + .map(|triple| { + let mut result = HashMap::with_capacity(2); + result.insert(subject_var.clone(), triple.subject); + result.insert(predicate_var.clone(), triple.predicate); + result + }) + .collect() } -} \ No newline at end of file +} diff --git a/kolibrie/tests/rdf_star_test.rs b/kolibrie/tests/rdf_star_test.rs index 998c0a0..89cbc4e 100644 --- a/kolibrie/tests/rdf_star_test.rs +++ b/kolibrie/tests/rdf_star_test.rs @@ -299,7 +299,7 @@ fn test_bind_subject_predicate_object() { "#; db.parse_ntriples_and_add(ntriples); let triples_vec: Vec<_> = db.triples.iter().cloned().collect(); - db.index_manager.build_from_triples(&triples_vec); + Option::expect(db.index_manager.as_mut(), "index should not be None").build_from_triples(&triples_vec); db.get_or_build_stats(); // Query using BIND(SUBJECT(?t) AS ?s) @@ -324,7 +324,7 @@ fn test_bind_triple_constructor() { "#; db.parse_ntriples_and_add(ntriples); let triples_vec: Vec<_> = db.triples.iter().cloned().collect(); - db.index_manager.build_from_triples(&triples_vec); + Option::expect(db.index_manager.as_mut(), "index should not be None").build_from_triples(&triples_vec); db.get_or_build_stats(); // Use BIND(TRIPLE(...) 
AS ?t) to construct a quoted triple @@ -405,7 +405,7 @@ fn test_delete_where() { "#; db.parse_ntriples_and_add(ntriples); let triples_vec: Vec<_> = db.triples.iter().cloned().collect(); - db.index_manager.build_from_triples(&triples_vec); + Option::expect(db.index_manager.as_mut(), "index should not be None").build_from_triples(&triples_vec); db.get_or_build_stats(); assert_eq!(db.triples.len(), 3, "Should start with 3 triples"); diff --git a/shared/src/index_manager/buckets.rs b/shared/src/index_manager/buckets.rs new file mode 100644 index 0000000..887e0bb --- /dev/null +++ b/shared/src/index_manager/buckets.rs @@ -0,0 +1,698 @@ +use std::collections::{HashMap, HashSet}; + +use crate::index_manager::*; +use crate::query::PlannedAccessPattern; +use crate::terms::*; +use crate::triple::Triple; + +#[derive(Debug, Clone)] +pub enum BucketStore { + // 0 Dynamic Variables + D0_F0(bool), + D0_F1(HashSet), + D0_F2(HashSet<[u32; 2]>), + D0_F3(HashSet<[u32; 3]>), + + // >0 Dynamic Variables, 0 Free Variables (Existence checks) + D1_F0(HashSet), + D2_F0(HashSet<[u32; 2]>), + D3_F0(HashSet<[u32; 3]>), + + // >0 Dynamic Variables, >0 Free Variables (Map lookups) + D1_F1(HashMap>), + D1_F2(HashMap>), + D2_F1(HashMap<[u32; 2], HashSet>), +} + +#[derive(Debug, Clone)] +pub struct DirectedBucket { + pub pattern: TriplePattern, + pub c_positions: Vec, // Constants + pub d_positions: Vec, // Dynamic (Pipeline-bound) + pub f_positions: Vec, // Free (Unbound) + pub c_values: Vec, // The actual constant values + pub data: BucketStore, +} + +impl DirectedBucket { + pub fn new(planned: PlannedAccessPattern) -> Self { + let mut c_positions = Vec::new(); + let mut d_positions = Vec::new(); + let mut f_positions = Vec::new(); + let mut c_values = Vec::new(); + + let mut check_pos = |term: &Term, is_bound: bool, pos: usize| match term { + Term::Constant(c) => { + c_positions.push(pos); + c_values.push(*c); + } + Term::Variable(_) => { + if is_bound { + d_positions.push(pos); + } else { + 
f_positions.push(pos); + } + } + Term::QuotedTriple(_) => { + if is_bound { + d_positions.push(pos); + } else { + f_positions.push(pos); + } + } + }; + + check_pos(&planned.pattern.0, planned.bound_subject, 0); + check_pos(&planned.pattern.1, planned.bound_predicate, 1); + check_pos(&planned.pattern.2, planned.bound_object, 2); + + let data = match (d_positions.len(), f_positions.len()) { + (0, 0) => BucketStore::D0_F0(false), + (0, 1) => BucketStore::D0_F1(HashSet::new()), + (0, 2) => BucketStore::D0_F2(HashSet::new()), + (0, 3) => BucketStore::D0_F3(HashSet::new()), + (1, 0) => BucketStore::D1_F0(HashSet::new()), + (2, 0) => BucketStore::D2_F0(HashSet::new()), + (3, 0) => BucketStore::D3_F0(HashSet::new()), + (1, 1) => BucketStore::D1_F1(HashMap::new()), + (1, 2) => BucketStore::D1_F2(HashMap::new()), + (2, 1) => BucketStore::D2_F1(HashMap::new()), + _ => unreachable!("Invalid number of variables in triple"), + }; + + Self { + pattern: planned.pattern, + c_positions, + d_positions, + f_positions, + c_values, + data, + } + } + + #[inline(always)] + fn get_triple_field(triple: &Triple, pos: usize) -> u32 { + match pos { + 0 => triple.subject, + 1 => triple.predicate, + 2 => triple.object, + _ => unreachable!(), + } + } + + pub fn matches(&self, triple: &Triple) -> bool { + for (i, &pos) in self.c_positions.iter().enumerate() { + if Self::get_triple_field(triple, pos) != self.c_values[i] { + return false; + } + } + true + } + + pub fn insert(&mut self, triple: &Triple) -> bool { + let get_val = |pos| Self::get_triple_field(triple, pos); + + match &mut self.data { + BucketStore::D0_F0(b) => { + let old = *b; + *b = true; + !old + } + BucketStore::D0_F1(s) => s.insert(get_val(self.f_positions[0])), + BucketStore::D0_F2(s) => { + s.insert([get_val(self.f_positions[0]), get_val(self.f_positions[1])]) + } + BucketStore::D0_F3(s) => s.insert([ + get_val(self.f_positions[0]), + get_val(self.f_positions[1]), + get_val(self.f_positions[2]), + ]), + BucketStore::D1_F0(s) => 
s.insert(get_val(self.d_positions[0])), + BucketStore::D2_F0(s) => { + s.insert([get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + } + BucketStore::D3_F0(s) => s.insert([ + get_val(self.d_positions[0]), + get_val(self.d_positions[1]), + get_val(self.d_positions[2]), + ]), + BucketStore::D1_F1(m) => m + .entry(get_val(self.d_positions[0])) + .or_default() + .insert(get_val(self.f_positions[0])), + BucketStore::D1_F2(m) => m + .entry(get_val(self.d_positions[0])) + .or_default() + .insert([get_val(self.f_positions[0]), get_val(self.f_positions[1])]), + BucketStore::D2_F1(m) => m + .entry([get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + .or_default() + .insert(get_val(self.f_positions[0])), + } + } + + pub fn remove(&mut self, triple: &Triple) -> bool { + let get_val = |pos| Self::get_triple_field(triple, pos); + + match &mut self.data { + BucketStore::D0_F0(b) => { + let old = *b; + *b = false; + old + } + BucketStore::D0_F1(s) => s.remove(&get_val(self.f_positions[0])), + BucketStore::D0_F2(s) => { + s.remove(&[get_val(self.f_positions[0]), get_val(self.f_positions[1])]) + } + BucketStore::D0_F3(s) => s.remove(&[ + get_val(self.f_positions[0]), + get_val(self.f_positions[1]), + get_val(self.f_positions[2]), + ]), + BucketStore::D1_F0(s) => s.remove(&get_val(self.d_positions[0])), + BucketStore::D2_F0(s) => { + s.remove(&[get_val(self.d_positions[0]), get_val(self.d_positions[1])]) + } + BucketStore::D3_F0(s) => s.remove(&[ + get_val(self.d_positions[0]), + get_val(self.d_positions[1]), + get_val(self.d_positions[2]), + ]), + BucketStore::D1_F1(m) => { + let k = get_val(self.d_positions[0]); + if let Some(set) = m.get_mut(&k) { + let removed = set.remove(&get_val(self.f_positions[0])); + if set.is_empty() { + m.remove(&k); + } + removed + } else { + false + } + } + BucketStore::D1_F2(m) => { + let k = get_val(self.d_positions[0]); + if let Some(set) = m.get_mut(&k) { + let removed = + set.remove(&[get_val(self.f_positions[0]), 
get_val(self.f_positions[1])]); + if set.is_empty() { + m.remove(&k); + } + removed + } else { + false + } + } + BucketStore::D2_F1(m) => { + let k = [get_val(self.d_positions[0]), get_val(self.d_positions[1])]; + if let Some(set) = m.get_mut(&k) { + let removed = set.remove(&get_val(self.f_positions[0])); + if set.is_empty() { + m.remove(&k); + } + removed + } else { + false + } + } + } + } + + pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + // Extract constants and queried dynamics once per query call + let mut t_base = [0u32; 3]; + for (i, &pos) in self.c_positions.iter().enumerate() { + t_base[pos] = self.c_values[i]; + } + for &pos in &self.d_positions { + t_base[pos] = match pos { + 0 => s.unwrap(), + 1 => p.unwrap(), + 2 => o.unwrap(), + _ => unreachable!(), + }; + } + + // Inline macro/closure to quickly instantiate the triple without inner loops + let mut push_res = |f_vals: &[u32]| { + let mut t = t_base; + for (i, &pos) in self.f_positions.iter().enumerate() { + t[pos] = f_vals[i]; + } + results.push(Triple { + subject: t[0], + predicate: t[1], + object: t[2], + }); + }; + + match &self.data { + BucketStore::D0_F0(b) => { + if *b { + push_res(&[]); + } + } + BucketStore::D0_F1(set) => { + for &f in set { + push_res(&[f]); + } + } + BucketStore::D0_F2(set) => { + for &f in set { + push_res(&f); + } + } + BucketStore::D0_F3(set) => { + for &f in set { + push_res(&f); + } + } + BucketStore::D1_F0(set) => { + if set.contains(&t_base[self.d_positions[0]]) { + push_res(&[]); + } + } + BucketStore::D2_F0(set) => { + if set.contains(&[t_base[self.d_positions[0]], t_base[self.d_positions[1]]]) { + push_res(&[]); + } + } + BucketStore::D3_F0(set) => { + if set.contains(&[ + t_base[self.d_positions[0]], + t_base[self.d_positions[1]], + t_base[self.d_positions[2]], + ]) { + push_res(&[]); + } + } + BucketStore::D1_F1(map) => { + if let Some(set) = map.get(&t_base[self.d_positions[0]]) { + for &f in set { + 
push_res(&[f]); + } + } + } + BucketStore::D1_F2(map) => { + if let Some(set) = map.get(&t_base[self.d_positions[0]]) { + for &f in set { + push_res(&f); + } + } + } + BucketStore::D2_F1(map) => { + if let Some(set) = + map.get(&[t_base[self.d_positions[0]], t_base[self.d_positions[1]]]) + { + for &f in set { + push_res(&[f]); + } + } + } + } + + // Return immediately without the slow `.into_iter().filter().collect()` + results + } + + pub fn get_all_triples(&self) -> Vec { + let mut results = Vec::new(); + + let reconstruct = |d_vals: &[u32], f_vals: &[u32]| { + let mut t = [0; 3]; + for (i, &pos) in self.c_positions.iter().enumerate() { + t[pos] = self.c_values[i]; + } + for (i, &pos) in self.d_positions.iter().enumerate() { + t[pos] = d_vals[i]; + } + for (i, &pos) in self.f_positions.iter().enumerate() { + t[pos] = f_vals[i]; + } + Triple { + subject: t[0], + predicate: t[1], + object: t[2], + } + }; + + match &self.data { + BucketStore::D0_F0(b) => { + if *b { + results.push(reconstruct(&[], &[])); + } + } + BucketStore::D0_F1(set) => { + for &f in set { + results.push(reconstruct(&[], &[f])); + } + } + BucketStore::D0_F2(set) => { + for &f in set { + results.push(reconstruct(&[], &f)); + } + } + BucketStore::D0_F3(set) => { + for &f in set { + results.push(reconstruct(&[], &f)); + } + } + BucketStore::D1_F0(set) => { + for &d in set { + results.push(reconstruct(&[d], &[])); + } + } + BucketStore::D2_F0(set) => { + for &d in set { + results.push(reconstruct(&d, &[])); + } + } + BucketStore::D3_F0(set) => { + for &d in set { + results.push(reconstruct(&d, &[])); + } + } + BucketStore::D1_F1(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&[d], &[f])); + } + } + } + BucketStore::D1_F2(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&[d], &f)); + } + } + } + BucketStore::D2_F1(map) => { + for (&d, set) in map { + for &f in set { + results.push(reconstruct(&d, &[f])); + } + } + } + } + results + } + + 
pub fn clear(&mut self) { + match &mut self.data { + BucketStore::D0_F0(b) => *b = false, + BucketStore::D0_F1(s) => s.clear(), + BucketStore::D0_F2(s) => s.clear(), + BucketStore::D0_F3(s) => s.clear(), + BucketStore::D1_F0(s) => s.clear(), + BucketStore::D2_F0(s) => s.clear(), + BucketStore::D3_F0(s) => s.clear(), + BucketStore::D1_F1(m) => m.clear(), + BucketStore::D1_F2(m) => m.clear(), + BucketStore::D2_F1(m) => m.clear(), + } + } + + pub fn shrink_to_fit(&mut self) { + match &mut self.data { + BucketStore::D0_F0(_) => {} + BucketStore::D0_F1(s) => s.shrink_to_fit(), + BucketStore::D0_F2(s) => s.shrink_to_fit(), + BucketStore::D0_F3(s) => s.shrink_to_fit(), + BucketStore::D1_F0(s) => s.shrink_to_fit(), + BucketStore::D2_F0(s) => s.shrink_to_fit(), + BucketStore::D3_F0(s) => s.shrink_to_fit(), + BucketStore::D1_F1(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } + BucketStore::D1_F2(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } + BucketStore::D2_F1(m) => { + for v in m.values_mut() { + v.shrink_to_fit(); + } + m.shrink_to_fit(); + } + } + } +} + +#[derive(Debug, Clone)] +pub struct BucketIndex { + pub buckets: Vec, +} + +impl BucketIndex { + pub fn new(patterns: Vec) -> Self { + println!("[Bucket Debug] --- BucketIndex Initialization ---"); + println!("[Bucket Debug] Requested planned patterns: {}", patterns.len()); + + let mut seen_configs = HashSet::new(); + let mut unique_buckets = Vec::new(); + + for planned in patterns { + // 1. 
Determine positions exactly as DirectedBucket::new would + let mut c_pos_vals = Vec::new(); + let mut d_positions = Vec::new(); + + let mut check = |term: &Term, is_bound: bool, pos: usize| match term { + Term::Constant(c) => c_pos_vals.push((pos, *c)), + Term::Variable(_) => { + if is_bound { + d_positions.push(pos); + } + } + Term::QuotedTriple(_) => { + if is_bound { + d_positions.push(pos); + } + } + }; + + check(&planned.pattern.0, planned.bound_subject, 0); + check(&planned.pattern.1, planned.bound_predicate, 1); + check(&planned.pattern.2, planned.bound_object, 2); + + // 2. Create a unique key for this storage configuration + // We sort D because the internal logic of DirectedBucket + // usually processes them in positional order. + d_positions.sort_unstable(); + let config_key = (c_pos_vals, d_positions); + + // 3. Only create the bucket if we haven't seen this storage config yet + if seen_configs.insert(config_key) { + let b = DirectedBucket::new(planned); + println!( + "[Bucket Debug] Created Bucket [{}]: C={:?}, D={:?}, F={:?}", + unique_buckets.len(), b.c_positions, b.d_positions, b.f_positions + ); + unique_buckets.push(b); + } else { + println!("[Bucket Debug] Pruned duplicate pattern: {:?}", planned.pattern); + } + } + + println!("[Bucket Debug] Final unique bucket count: {}", unique_buckets.len()); + Self { buckets: unique_buckets } + } + fn bucket_covers_query( + bucket_pat: &TriplePattern, + q_s: Option, + q_p: Option, + q_o: Option, + ) -> bool { + let (b_s, b_p, b_o) = bucket_pat; + let s_safe = match b_s { + Variable(_) => true, + Constant(c) => q_s == Some(*c), + QuotedTriple(_) => true, + }; + let p_safe = match b_p { + Variable(_) => true, + Constant(c) => q_p == Some(*c), + QuotedTriple(_) => true, + }; + let o_safe = match b_o { + Variable(_) => true, + Constant(c) => q_o == Some(*c), + QuotedTriple(_) => true, + }; + s_safe && p_safe && o_safe + } +} + +impl TripleIndex for BucketIndex { + fn clone_empty(&self) -> Box { + let mut 
patterns = Vec::new(); + for b in &self.buckets { + let mut bound_subject = false; + let mut bound_predicate = false; + let mut bound_object = false; + + for &pos in &b.d_positions { + match pos { + 0 => bound_subject = true, + 1 => bound_predicate = true, + 2 => bound_object = true, + _ => {} + } + } + + patterns.push(PlannedAccessPattern { + pattern: b.pattern.clone(), + bound_subject, + bound_predicate, + bound_object, + }); + } + Box::new(BucketIndex::new(patterns)) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + let mut unique: HashSet = HashSet::new(); + for bucket in &self.buckets { + unique.extend(bucket.get_all_triples()); + } + unique.len() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, + so: false, + po: false, + ps: false, + os: false, + op: false, + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let mut inserted_anywhere = false; + for bucket in &mut self.buckets { + if bucket.matches(triple) && bucket.insert(triple) { + inserted_anywhere = true; + } + } + inserted_anywhere + } + + fn delete(&mut self, triple: &Triple) -> bool { + let mut deleted_anywhere = false; + for bucket in &mut self.buckets { + if bucket.matches(triple) && bucket.remove(triple) { + deleted_anywhere = true; + } + } + deleted_anywhere + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + self.clear(); + for triple in triples { + self.insert(triple); + } + self.optimize(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let provided_positions = { + let mut pos = Vec::new(); + if s.is_some() { + pos.push(0); + } + if p.is_some() { + pos.push(1); + } + if o.is_some() { + pos.push(2); + } + pos + }; + + for b in self.buckets.iter() { + if Self::bucket_covers_query(&b.pattern, s, p, o) { + // Check if lengths match first to avoid allocating and sorting unless necessary + if b.c_positions.len() + b.d_positions.len() == 
provided_positions.len() { + let mut expected_provided = b.c_positions.clone(); + expected_provided.extend(&b.d_positions); + expected_provided.sort_unstable(); + + if expected_provided == provided_positions { + return b.query(s, p, o); + } + } + } + } + + panic!( + "[FATAL] NO EXACT MATCH FOUND! Query cannot be satisfied optimally by any bucket.\n\ + Query required: s={:?}, p={:?}, o={:?}\n\ + Provided Positions: {:?}", + s, p, o, provided_positions + ); + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + for bucket in &mut self.buckets { + bucket.clear(); + } + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { + None + } + + fn optimize(&mut self) { + for bucket in &mut self.buckets { + bucket.shrink_to_fit(); + } + } +} diff --git a/shared/src/index_manager.rs b/shared/src/index_manager/hexastore.rs similarity index 87% rename from shared/src/index_manager.rs rename to shared/src/index_manager/hexastore.rs index 974fdec..856d21b 100644 --- a/shared/src/index_manager.rs +++ b/shared/src/index_manager/hexastore.rs @@ -1,541 +1,552 @@ -/* - * Copyright © 2024 Volodymyr Kadzhaia - * Copyright © 2024 Pieter Bonte - * KU Leuven — Stream Intelligence Lab, Belgium - * - * This Source Code Form is subject to 
the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this file, - * you can obtain one at https://mozilla.org/MPL/2.0/. - */ - -use serde::{Serialize, Deserialize}; -use std::collections::{HashMap, HashSet}; -use crate::terms::*; -use crate::terms::Term::*; -use crate::triple::Triple; - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct UnifiedIndex { - // The six permutations, using HashMap of HashMap of HashSet. - pub spo: HashMap>>, - pub pos: HashMap>>, - pub osp: HashMap>>, - pub pso: HashMap>>, - pub ops: HashMap>>, - pub sop: HashMap>>, -} - -impl UnifiedIndex { - pub fn new() -> Self { - Self { - spo: HashMap::new(), - pos: HashMap::new(), - osp: HashMap::new(), - pso: HashMap::new(), - ops: HashMap::new(), - sop: HashMap::new(), - } - } - - /// Insert a single triple into all six indexes - pub fn insert(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - if let Some(pred_map) = self.spo.get(&s) { - if let Some(objects) = pred_map.get(&p) { - if objects.contains(&o) { - return false; // triple already stored - } - } - } - self.spo.entry(s).or_default().entry(p).or_default().insert(o); - self.pos.entry(p).or_default().entry(o).or_default().insert(s); - self.osp.entry(o).or_default().entry(s).or_default().insert(p); - self.pso.entry(p).or_default().entry(s).or_default().insert(o); - self.ops.entry(o).or_default().entry(p).or_default().insert(s); - self.sop.entry(s).or_default().entry(o).or_default().insert(p); - true - } - - /// Delete a single triple from all six indexes - pub fn delete(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - - let exists = self.spo - .get(&s) - .and_then(|pred_map| pred_map.get(&p)) - .map_or(false, |objects| objects.contains(&o)); - - if !exists { - return false; // triple doesn't exist - } - - // Remove from all six indexes using helper function - remove_from_index(&mut 
self.spo, s, p, o); - remove_from_index(&mut self.pos, p, o, s); - remove_from_index(&mut self.osp, o, s, p); - remove_from_index(&mut self.pso, p, s, o); - remove_from_index(&mut self.ops, o, p, s); - remove_from_index(&mut self.sop, s, o, p); - true - } - - /// Bulk-build the index from a list of triples - pub fn build_from_triples(&mut self, triples: &[Triple]) { - use rayon::prelude::*; - - self.clear(); - - if triples.is_empty() { - return; - } - - // Pre-allocate with capacity estimates - let capacity = triples.len() / 100; - - self.spo.reserve(capacity); - self.pos.reserve(capacity); - self.osp.reserve(capacity); - self.pso.reserve(capacity); - self.ops.reserve(capacity); - self.sop.reserve(capacity); - - // Build indexes in parallel by creating partial indexes and merging - let num_threads = rayon::current_num_threads(); - let chunk_size = (triples.len() / num_threads).max(10_000); - - let partial_indexes: Vec = triples - .par_chunks(chunk_size) - .map(|chunk| { - let mut local_index = UnifiedIndex::new(); - - // Pre-allocate local index - let local_capacity = chunk.len() / 50; - local_index.spo.reserve(local_capacity); - local_index.pos.reserve(local_capacity); - local_index.osp.reserve(local_capacity); - local_index.pso.reserve(local_capacity); - local_index.ops.reserve(local_capacity); - local_index.sop.reserve(local_capacity); - - // Insert triples into local index - for triple in chunk { - local_index.insert_optimized(triple); - } - - local_index - }) - .collect(); - - // Sequentially merge partial indexes - for partial_index in partial_indexes { - self.merge_from(partial_index); - } - - // Optimize memory layout after building - self.optimize_post_build(); - } - - #[inline] - fn insert_optimized(&mut self, triple: &Triple) -> bool { - let Triple { subject: s, predicate: p, object: o } = *triple; - - // Check for duplicates only in SPO index (most selective) - if let Some(pred_map) = self.spo.get(&s) { - if let Some(objects) = pred_map.get(&p) { - if 
objects.contains(&o) { - return false; - } - } - } - - // Batch insert into all indexes - self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(8)) - .entry(p).or_insert_with(|| HashSet::with_capacity(16)) - .insert(o); - - self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) - .entry(o).or_insert_with(|| HashSet::with_capacity(8)) - .insert(s); - - self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(8)) - .entry(s).or_insert_with(|| HashSet::with_capacity(16)) - .insert(p); - - self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) - .entry(s).or_insert_with(|| HashSet::with_capacity(8)) - .insert(o); - - self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16)) - .entry(p).or_insert_with(|| HashSet::with_capacity(8)) - .insert(s); - - self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(8)) - .entry(o).or_insert_with(|| HashSet::with_capacity(16)) - .insert(p); - - true - } - - fn optimize_post_build(&mut self) { - use rayon::prelude::*; - - // Parallelize the optimization of each index - rayon::scope(|s| { - s.spawn(|_| { - // SPO index - self.spo.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.shrink_to_fit(); - pred_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - }); - self.spo.shrink_to_fit(); - }); - - s.spawn(|_| { - // POS index - self.pos.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.shrink_to_fit(); - obj_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - }); - self.pos.shrink_to_fit(); - }); - - s.spawn(|_| { - // OSP index - self.osp.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.shrink_to_fit(); - subj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - }); - self.osp.shrink_to_fit(); - }); - - s.spawn(|_| { - // PSO index - self.pso.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.shrink_to_fit(); - subj_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - }); - 
self.pso.shrink_to_fit(); - }); - - s.spawn(|_| { - // OPS index - self.ops.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.shrink_to_fit(); - pred_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - }); - self.ops.shrink_to_fit(); - }); - - s.spawn(|_| { - // SOP index - self.sop.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.shrink_to_fit(); - obj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - }); - self.sop.shrink_to_fit(); - }); - }); - } - - /// Query the index - pub fn query(&self, s: Option, p: Option, o: Option) -> Vec { - let mut results = Vec::new(); - - match (s, p, o) { - // Fully bound - (Some(ss), Some(pp), Some(oo)) => { - if let Some(pred_map) = self.spo.get(&ss) { - if let Some(objects) = pred_map.get(&pp) { - if objects.contains(&oo) { - results.push(Triple { subject: ss, predicate: pp, object: oo }); - } - } - } - } - // (S, P, -) - (Some(ss), Some(pp), None) => { - if let Some(pred_map) = self.spo.get(&ss) { - if let Some(objects) = pred_map.get(&pp) { - for &obj in objects { - results.push(Triple { subject: ss, predicate: pp, object: obj }); - } - } - } - } - // (S, -, O) - (Some(ss), None, Some(oo)) => { - if let Some(obj_map) = self.sop.get(&ss) { - if let Some(predicates) = obj_map.get(&oo) { - for &pred in predicates { - results.push(Triple { subject: ss, predicate: pred, object: oo }); - } - } - } - } - // (-, P, O) - (None, Some(pp), Some(oo)) => { - if let Some(obj_map) = self.pos.get(&pp) { - if let Some(subjects) = obj_map.get(&oo) { - for &subj in subjects { - results.push(Triple { subject: subj, predicate: pp, object: oo }); - } - } - } - } - // (S, -, -) - (Some(ss), None, None) => { - if let Some(pred_map) = self.spo.get(&ss) { - for (&pred, objects) in pred_map { - for &obj in objects { - results.push(Triple { subject: ss, predicate: pred, object: obj }); - } - } - } - } - // (-, P, -) - (None, Some(pp), None) => { - if let Some(obj_map) = self.pso.get(&pp) 
{ - for (&subj, objects) in obj_map { - for &obj in objects { - results.push(Triple { subject: subj, predicate: pp, object: obj }); - } - } - } - } - // (-, -, O) - (None, None, Some(oo)) => { - if let Some(pred_map) = self.ops.get(&oo) { - for (&pred, subjects) in pred_map { - for &subj in subjects { - results.push(Triple { subject: subj, predicate: pred, object: oo }); - } - } - } - } - // (-, -, -) => all - (None, None, None) => { - for (&subj, pred_map) in &self.spo { - for (&pred, objects) in pred_map { - for &obj in objects { - results.push(Triple { subject: subj, predicate: pred, object: obj }); - } - } - } - } - } - - results - } - - /// Return all triples that match a given `TriplePattern` - pub fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { - let (s, p, o) = pattern; - let sub = match s { - Constant(x) => Some(*x), - Variable(_) | QuotedTriple(_) => None, - }; - let pre = match p { - Constant(x) => Some(*x), - Variable(_) | QuotedTriple(_) => None, - }; - let obj = match o { - Constant(x) => Some(*x), - Variable(_) | QuotedTriple(_) => None, - }; - - self.query(sub, pre, obj) - } - - /// Clear all data in the indexes - pub fn clear(&mut self) { - self.spo.clear(); - self.pos.clear(); - self.osp.clear(); - self.pso.clear(); - self.ops.clear(); - self.sop.clear(); - } - - /// Scan using the Subject-Predicate index (spo) - pub fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { - self.spo - .get(&s) - .and_then(|pred_map| pred_map.get(&p)) - } - - /// Scan using the Subject-Object index (sop) - pub fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { - self.sop - .get(&s) - .and_then(|obj_map| obj_map.get(&o)) - } - - /// Scan using the Predicate-Object index (pos) - pub fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { - self.pos - .get(&p) - .and_then(|obj_map| obj_map.get(&o)) - } - - pub fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { - self.pso - .get(&p) - .and_then(|subj_map| subj_map.get(&s)) - } - - pub fn 
scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { - self.osp - .get(&o) - .and_then(|subj_map| subj_map.get(&s)) - } - - pub fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { - self.ops - .get(&o) - .and_then(|pred_map| pred_map.get(&p)) - } - - /// Efficiently merge another index into this one using parallel processing where possible - pub fn merge_from(&mut self, other: UnifiedIndex) { - // Merge SPO index - for (s, pred_map) in other.spo { - let entry = self.spo.entry(s).or_insert_with(HashMap::new); - for (p, obj_set) in pred_map { - entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); - } - } - - // Merge PSO index - for (p, subj_map) in other.pso { - let entry = self.pso.entry(p).or_insert_with(HashMap::new); - for (s, obj_set) in subj_map { - entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); - } - } - - // Merge OPS index - for (o, pred_map) in other.ops { - let entry = self.ops.entry(o).or_insert_with(HashMap::new); - for (p, subj_set) in pred_map { - entry.entry(p).or_insert_with(HashSet::new).extend(subj_set); - } - } - - // Merge POS index - for (p, obj_map) in other.pos { - let entry = self.pos.entry(p).or_insert_with(HashMap::new); - for (o, subj_set) in obj_map { - entry.entry(o).or_insert_with(HashSet::new).extend(subj_set); - } - } - - // Merge OSP index - for (o, subj_map) in other.osp { - let entry = self.osp.entry(o).or_insert_with(HashMap::new); - for (s, pred_set) in subj_map { - entry.entry(s).or_insert_with(HashSet::new).extend(pred_set); - } - } - - // Merge SOP index - for (s, obj_map) in other.sop { - let entry = self.sop.entry(s).or_insert_with(HashMap::new); - for (o, pred_set) in obj_map { - entry.entry(o).or_insert_with(HashSet::new).extend(pred_set); - } - } - } - - pub fn optimize(&mut self) { - use rayon::prelude::*; - - // Optimize SPO index - self.spo.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - pred_map.shrink_to_fit(); 
- }); - self.spo.shrink_to_fit(); - - // Optimize PSO index - self.pso.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.par_iter_mut().for_each(|(_, obj_set)| { - obj_set.shrink_to_fit(); - }); - subj_map.shrink_to_fit(); - }); - self.pso.shrink_to_fit(); - - // Optimize OPS index - self.ops.par_iter_mut().for_each(|(_, pred_map)| { - pred_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - pred_map.shrink_to_fit(); - }); - self.ops.shrink_to_fit(); - - // Optimize POS index - self.pos.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.par_iter_mut().for_each(|(_, subj_set)| { - subj_set.shrink_to_fit(); - }); - obj_map.shrink_to_fit(); - }); - self.pos.shrink_to_fit(); - - // Optimize OSP index - self.osp.par_iter_mut().for_each(|(_, subj_map)| { - subj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - subj_map.shrink_to_fit(); - }); - self.osp.shrink_to_fit(); - - // Optimize SOP index - self.sop.par_iter_mut().for_each(|(_, obj_map)| { - obj_map.par_iter_mut().for_each(|(_, pred_set)| { - pred_set.shrink_to_fit(); - }); - obj_map.shrink_to_fit(); - }); - self.sop.shrink_to_fit(); - } -} - -/// Helper function to remove a triple from a nested index structure and clean up empty collections -#[inline] -fn remove_from_index( - index: &mut HashMap>>, - key1: u32, - key2: u32, - value: u32, -) { - if let Some(inner_map) = index.get_mut(&key1) { - if let Some(set) = inner_map.get_mut(&key2) { - set.remove(&value); - // Clean up empty inner set - if set.is_empty() { - inner_map.remove(&key2); - } - } - // Clean up empty inner map - if inner_map.is_empty() { - index.remove(&key1); - } - } -} +/* + * Copyright © 2024 Volodymyr Kadzhaia + * Copyright © 2024 Pieter Bonte + * KU Leuven — Stream Intelligence Lab, Belgium + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this file, + * you can obtain one at https://mozilla.org/MPL/2.0/. + */ + +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HexastoreIndex { + // The six permutations, using HashMap of HashMap of HashSet. + pub spo: HashMap>>, + pub pos: HashMap>>, + pub osp: HashMap>>, + pub pso: HashMap>>, + pub ops: HashMap>>, + pub sop: HashMap>>, +} + +impl TripleIndex for HexastoreIndex { + fn clone_empty(&self) -> Box { + Box::new(HexastoreIndex::new()) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + // Efficient: count directly from SPO index + self.spo.values() + .map(|pred_map| pred_map.values().map(|objs| objs.len()).sum::()) + .sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: true, so: true, po: true, + ps: true, os: true, op: true + } + } + + /// Insert a single triple into all six indexes + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; // triple already stored + } + } + } + self.spo.entry(s).or_default().entry(p).or_default().insert(o); + self.pos.entry(p).or_default().entry(o).or_default().insert(s); + self.osp.entry(o).or_default().entry(s).or_default().insert(p); + self.pso.entry(p).or_default().entry(s).or_default().insert(o); + self.ops.entry(o).or_default().entry(p).or_default().insert(s); + self.sop.entry(s).or_default().entry(o).or_default().insert(p); + true + } + + /// Delete a single triple from all six indexes + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + let 
exists = self.spo
+            .get(&s)
+            .and_then(|pred_map| pred_map.get(&p))
+            .map_or(false, |objects| objects.contains(&o));
+
+        if !exists {
+            return false; // triple doesn't exist
+        }
+
+        // Remove from all six indexes using helper function
+        remove_from_index(&mut self.spo, s, p, o);
+        remove_from_index(&mut self.pos, p, o, s);
+        remove_from_index(&mut self.osp, o, s, p);
+        remove_from_index(&mut self.pso, p, s, o);
+        remove_from_index(&mut self.ops, o, p, s);
+        remove_from_index(&mut self.sop, s, o, p);
+        true
+    }
+
+    /// Bulk-build the index from a list of triples, replacing any existing contents
+    fn build_from_triples(&mut self, triples: &[Triple]) {
+        // No sequential pre-insert pass here: `clear()` below empties the index
+        // before the parallel rebuild, so inserting every triple first would be
+        // work that is thrown away immediately.
+        use rayon::prelude::*;
+
+        self.clear();
+
+        if triples.is_empty() {
+            return;
+        }
+
+        // Pre-allocate with capacity estimates
+        let capacity = triples.len() / 100;
+
+        self.spo.reserve(capacity);
+        self.pos.reserve(capacity);
+        self.osp.reserve(capacity);
+        self.pso.reserve(capacity);
+        self.ops.reserve(capacity);
+        self.sop.reserve(capacity);
+
+        // Build indexes in parallel by creating partial indexes and merging
+        let num_threads = rayon::current_num_threads();
+        let chunk_size = (triples.len() / num_threads).max(10_000);
+
+        let partial_indexes: Vec<HexastoreIndex> = triples
+            .par_chunks(chunk_size)
+            .map(|chunk| {
+                let mut local_index = HexastoreIndex::new();
+
+                // Pre-allocate local index
+                let local_capacity = chunk.len() / 50;
+                local_index.spo.reserve(local_capacity);
+                local_index.pos.reserve(local_capacity);
+                local_index.osp.reserve(local_capacity);
+                local_index.pso.reserve(local_capacity);
+                local_index.ops.reserve(local_capacity);
+                local_index.sop.reserve(local_capacity);
+
+                // Insert triples into local index
+                for triple in chunk {
+                    local_index.insert_optimized(triple);
+                }
+
+                local_index
+            })
+            .collect();
+
+        // Sequentially merge partial indexes
+        for partial_index in partial_indexes {
+            self.merge_from(partial_index);
+        }
+
+        // Optimize memory layout after building
+
self.optimize_post_build(); + } + + /// Query the index + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + match (s, p, o) { + // Fully bound + (Some(ss), Some(pp), Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objects) = pred_map.get(&pp) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + // (S, P, -) + (Some(ss), Some(pp), None) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objects) = pred_map.get(&pp) { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + // (S, -, O) + (Some(ss), None, Some(oo)) => { + if let Some(obj_map) = self.sop.get(&ss) { + if let Some(predicates) = obj_map.get(&oo) { + for &pred in predicates { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + // (-, P, O) + (None, Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subjects) = obj_map.get(&oo) { + for &subj in subjects { + results.push(Triple { subject: subj, predicate: pp, object: oo }); + } + } + } + } + // (S, -, -) + (Some(ss), None, None) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objects) in pred_map { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + // (-, P, -) + (None, Some(pp), None) => { + if let Some(obj_map) = self.pso.get(&pp) { + for (&subj, objects) in obj_map { + for &obj in objects { + results.push(Triple { subject: subj, predicate: pp, object: obj }); + } + } + } + } + // (-, -, O) + (None, None, Some(oo)) => { + if let Some(pred_map) = self.ops.get(&oo) { + for (&pred, subjects) in pred_map { + for &subj in subjects { + results.push(Triple { subject: subj, predicate: pred, object: oo }); + } + } + } + } + // (-, -, -) => all + (None, None, None) => { + for (&subj, pred_map) in &self.spo { + for (&pred, 
objects) in pred_map { + for &obj in objects { + results.push(Triple { subject: subj, predicate: pred, object: obj }); + } + } + } + } + } + + results + } + + /// Return all triples that match a given `TriplePattern` + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + + self.query(sub, pre, obj) + } + + /// Clear all data in the indexes + fn clear(&mut self) { + self.spo.clear(); + self.pos.clear(); + self.osp.clear(); + self.pso.clear(); + self.ops.clear(); + self.sop.clear(); + } + + /// Scan using the Subject-Predicate index (spo) + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo + .get(&s) + .and_then(|pred_map| pred_map.get(&p)) + } + + /// Scan using the Subject-Object index (sop) + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop + .get(&s) + .and_then(|obj_map| obj_map.get(&o)) + } + + /// Scan using the Predicate-Object index (pos) + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos + .get(&p) + .and_then(|obj_map| obj_map.get(&o)) + } + + /// Scan using the Predicate-Subject index (pso) + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso + .get(&p) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Subject index (osp) + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp + .get(&o) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Predicate index (ops) + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { + self.ops + .get(&o) + .and_then(|pred_map| pred_map.get(&p)) + } + + fn optimize(&mut self) { + use rayon::prelude::*; + + // Optimize SPO index + 
self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.spo.shrink_to_fit(); + + // Optimize PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.pso.shrink_to_fit(); + + // Optimize OPS index + self.ops.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.ops.shrink_to_fit(); + + // Optimize POS index + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.pos.shrink_to_fit(); + + // Optimize OSP index + self.osp.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.osp.shrink_to_fit(); + + // Optimize SOP index + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.sop.shrink_to_fit(); + } + +} + +impl HexastoreIndex { + pub fn new() -> Self { + Self { + spo: HashMap::new(), + pos: HashMap::new(), + osp: HashMap::new(), + pso: HashMap::new(), + ops: HashMap::new(), + sop: HashMap::new(), + } + } + + /// Efficiently merge another index into this one using parallel processing where possible + pub fn merge_from(&mut self, other: HexastoreIndex) { + // Merge SPO index + for (s, pred_map) in other.spo { + let entry = self.spo.entry(s).or_insert_with(HashMap::new); + for (p, obj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); + } + } + + // Merge PSO index + for (p, subj_map) in other.pso { + let entry = 
self.pso.entry(p).or_insert_with(HashMap::new); + for (s, obj_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); + } + } + + // Merge OPS index + for (o, pred_map) in other.ops { + let entry = self.ops.entry(o).or_insert_with(HashMap::new); + for (p, subj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(subj_set); + } + } + + // Merge POS index + for (p, obj_map) in other.pos { + let entry = self.pos.entry(p).or_insert_with(HashMap::new); + for (o, subj_set) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(subj_set); + } + } + + // Merge OSP index + for (o, subj_map) in other.osp { + let entry = self.osp.entry(o).or_insert_with(HashMap::new); + for (s, pred_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(pred_set); + } + } + + // Merge SOP index + for (s, obj_map) in other.sop { + let entry = self.sop.entry(s).or_insert_with(HashMap::new); + for (o, pred_set) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(pred_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + // Check for duplicates only in SPO index (most selective) + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; + } + } + } + + // Batch insert into all indexes + self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(8)) + .entry(p).or_insert_with(|| HashSet::with_capacity(16)) + .insert(o); + + self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + + self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(8)) + .entry(s).or_insert_with(|| HashSet::with_capacity(16)) + .insert(p); + + self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + + 
self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16)) + .entry(p).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + + self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(8)) + .entry(o).or_insert_with(|| HashSet::with_capacity(16)) + .insert(p); + + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + + // Parallelize the optimization of each index + rayon::scope(|s| { + s.spawn(|_| { + // SPO index + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.spo.shrink_to_fit(); + }); + + s.spawn(|_| { + // POS index + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + }); + self.pos.shrink_to_fit(); + }); + + s.spawn(|_| { + // OSP index + self.osp.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + }); + self.osp.shrink_to_fit(); + }); + + s.spawn(|_| { + // PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.pso.shrink_to_fit(); + }); + + s.spawn(|_| { + // OPS index + self.ops.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, subj_set)| { + subj_set.shrink_to_fit(); + }); + }); + self.ops.shrink_to_fit(); + }); + + s.spawn(|_| { + // SOP index + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, pred_set)| { + pred_set.shrink_to_fit(); + }); + }); + self.sop.shrink_to_fit(); + }); + }); + } +} \ No newline at end of file diff --git a/shared/src/index_manager/mod.rs b/shared/src/index_manager/mod.rs new file mode 100644 
index 0000000..436e0f5 --- /dev/null +++ b/shared/src/index_manager/mod.rs @@ -0,0 +1,166 @@ +/* + * Copyright © 2024 Volodymyr Kadzhaia + * Copyright © 2024 Pieter Bonte + * KU Leuven — Stream Intelligence Lab, Belgium + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * you can obtain one at https://mozilla.org/MPL/2.0/. + */ + +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::terms::Term::*; +use crate::triple::Triple; + +pub use hexastore::HexastoreIndex; +pub use ops_single::OPSSingleIndex; +pub use osp_single::OSPSingleIndex; +pub use pos_single::POSSingleIndex; +pub use pso_single::PSOSingleIndex; +pub use sop_single::SOPSingleIndex; +pub use spo_single::SPOSingleIndex; +pub use single_table::SingleTableIndex; +pub use buckets::BucketIndex; +pub use partial_hexastore::PartialHexastoreIndex; +pub mod partial_hexastore; +pub mod hexastore; +pub mod ops_single; +pub mod osp_single; +pub mod pos_single; +pub mod pso_single; +pub mod sop_single; +pub mod spo_single; +pub mod single_table; +pub mod buckets; + +#[derive(Debug, Clone)] +pub enum IndexConfig { + /// Full hexastore — all 6 permutations, no questions asked. + Hexastore, + + /// A single permutation index. + SPO, + POS, + OSP, + PSO, + OPS, + SOP, + + /// Flat single-table index. + SingleTable, + + /// Buckets + Buckets { + queries: Vec, + }, + + PartialHexastore { + queries: Vec, + } + + // ── Future index types go here ── + // YourNewIndex { + // some_setting: usize, + // queries: Vec, // if it needs resolved patterns + // }, +} + +impl Default for IndexConfig { + fn default() -> Self { + IndexConfig::Hexastore + } +} + +/// Describes which access patterns an index can serve efficiently. 
+#[derive(Debug, Clone, Default)] +pub struct AccessPatternSupport { + pub sp: bool, // subject+predicate -> objects + pub so: bool, // subject+object -> predicates + pub po: bool, // predicate+object -> subjects + pub ps: bool, // predicate+subject -> objects + pub os: bool, // object+subject -> predicates + pub op: bool, // object+predicate -> subjects +} + +pub trait TripleIndex: Send + Sync + std::fmt::Debug { + // ── Mutation ── + fn insert(&mut self, triple: &Triple) -> bool; + fn delete(&mut self, triple: &Triple) -> bool; + fn clear(&mut self); + fn clone_empty(&self) -> Box; + + // ── Pattern query ── + /// Returns all triples matching the (s?, p?, o?) pattern. + /// Always works regardless of existing indexes. + fn query(&self, s: Option, p: Option, o: Option) -> Vec; + + /// Same as query but works with TriplePattern (for convenience). + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec; + + // ── Two-key scans ── + // These return None if the index doesn't support this access path + // efficiently — the engine will then fall back to query() + filter. + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet>; + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet>; + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet>; + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet>; + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet>; + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet>; + + // ── Bulk operations ── + + /// Absorb all triples from a slice. The default implementation + /// calls insert() in a loop, concrete types can override with + /// a faster path. + fn build_from_triples(&mut self, triples: &[Triple]) { + for triple in triples { + self.insert(triple); + } + } + + /// Reclaim wasted memory / compact internal data structures. + /// The default is to do nothing, concrete types override if they + /// have internal structures that benefit from compaction. 
+ fn optimize(&mut self) {} + + // ── Metadata ── + /// Reports which access patterns this index supports efficiently. + fn supported_access_patterns(&self) -> AccessPatternSupport; + fn triple_count(&self) -> usize { + self.query(None, None, None).len() // default: expensive but correct + } + + // ── Cloning support for Box ── + fn clone_box(&self) -> Box; +} + +/// Allow `Clone` on `Box`. +impl Clone for Box { + fn clone(&self) -> Self { + self.clone_box() + } +} + +/// Helper function to remove a triple from a nested index structure and clean up empty collections +#[inline] +fn remove_from_index( + index: &mut HashMap>>, + key1: u32, + key2: u32, + value: u32, +) { + if let Some(inner_map) = index.get_mut(&key1) { + if let Some(set) = inner_map.get_mut(&key2) { + set.remove(&value); + // Clean up empty inner set + if set.is_empty() { + inner_map.remove(&key2); + } + } + // Clean up empty inner map + if inner_map.is_empty() { + index.remove(&key1); + } + } +} diff --git a/shared/src/index_manager/ops_single.rs b/shared/src/index_manager/ops_single.rs new file mode 100644 index 0000000..3365faa --- /dev/null +++ b/shared/src/index_manager/ops_single.rs @@ -0,0 +1,331 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OPSSingleIndex { + // The six permutations, using HashMap of HashMap of HashSet. 
+    pub ops: HashMap<u32, HashMap<u32, HashSet<u32>>>,
+}
+
+impl TripleIndex for OPSSingleIndex {
+    fn clone_empty(&self) -> Box<dyn TripleIndex> {
+        Box::new(OPSSingleIndex::new())
+    }
+
+    fn clone_box(&self) -> Box<dyn TripleIndex> {
+        Box::new(self.clone())
+    }
+
+    fn triple_count(&self) -> usize {
+        self.ops.values()
+            .map(|pred_map| pred_map.values().map(|subjs| subjs.len()).sum::<usize>())
+            .sum()
+    }
+
+    fn supported_access_patterns(&self) -> AccessPatternSupport {
+        AccessPatternSupport {
+            sp: false, so: false, po: false,
+            ps: false, os: false, op: true
+        }
+    }
+
+    /// Insert a single triple; returns `false` if it was already stored
+    fn insert(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+        if let Some(pred_map) = self.ops.get(&o) {
+            if let Some(subjects) = pred_map.get(&p) {
+                if subjects.contains(&s) {
+                    return false; // triple already stored
+                }
+            }
+        }
+        self.ops.entry(o).or_default().entry(p).or_default().insert(s);
+        true
+    }
+
+    /// Delete a single triple; returns `false` if it was not stored
+    fn delete(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+
+        let exists = self.ops
+            .get(&o)
+            .and_then(|pred_map| pred_map.get(&p))
+            .map_or(false, |subjects| subjects.contains(&s));
+
+        if !exists {
+            return false; // triple doesn't exist
+        }
+
+        // The map is keyed object -> predicate -> subject, so pass the keys in that order
+        remove_from_index(&mut self.ops, o, p, s);
+        true
+    }
+
+    /// Bulk-build the index from a list of triples, replacing any existing contents
+    fn build_from_triples(&mut self, triples: &[Triple]) {
+        // No sequential pre-insert pass here: `clear()` below empties the index
+        // before the parallel rebuild, so inserting every triple first would be
+        // work that is thrown away immediately.
+        use rayon::prelude::*;
+
+        self.clear();
+
+        if triples.is_empty() {
+            return;
+        }
+
+        // Pre-allocate with capacity estimates
+        let capacity = triples.len() / 100;
+
+        self.ops.reserve(capacity);
+
+        // Build indexes in parallel by creating partial indexes and merging
+        let num_threads = rayon::current_num_threads();
+        let chunk_size = (triples.len() / num_threads).max(10_000);
+
+        let partial_indexes: Vec<OPSSingleIndex> = triples
+            .par_chunks(chunk_size)
+            .map(|chunk| {
+                let mut local_index =
OPSSingleIndex::new(); + + // Pre-allocate local index + let local_capacity = chunk.len() / 50; + local_index.ops.reserve(local_capacity); + + // Insert triples into local index + for triple in chunk { + local_index.insert_optimized(triple); + } + + local_index + }) + .collect(); + + // Sequentially merge partial indexes + for partial_index in partial_indexes { + self.merge_from(partial_index); + } + + // Optimize memory layout after building + self.optimize_post_build(); + } + + /// Query the index + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + match (s, p, o) { + // Fully bound + (Some(ss), Some(pp), Some(oo)) => { + if let Some(pred_map) = self.ops.get(&oo) { + if let Some(subjects) = pred_map.get(&pp) { + if subjects.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + // (S, P, -) + (Some(ss), Some(pp), None) => { + for (&obj, pred_map) in &self.ops { + if let Some(subjects) = pred_map.get(&pp) { + if subjects.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + // (S, -, O) + (Some(ss), None, Some(oo)) => { + if let Some(pred_map) = self.ops.get(&oo) { + for (&pred, subjects) in pred_map { + if subjects.contains(&ss) { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + // (-, P, O) + (None, Some(pp), Some(oo)) => { + if let Some(pred_map) = self.ops.get(&oo) { + if let Some(subjects) = pred_map.get(&pp) { + for &subj in subjects { + results.push(Triple { subject: subj, predicate: pp, object: oo }); + } + } + } + } + // (S, -, -) + (Some(ss), None, None) => { + for (&obj, pred_map) in &self.ops { + for (&pred, subjects) in pred_map { + if subjects.contains(&ss) { + results.push( Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + // (-, P, -) + (None, Some(pp), None) => { + for (&obj, pred_map) in &self.ops { + if let Some(subjects) = pred_map.get(&pp) { + for 
&subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pp, object: obj });
+                        }
+                    }
+                }
+            }
+            // (-, -, O)
+            (None, None, Some(oo)) => {
+                if let Some(pred_map) = self.ops.get(&oo) {
+                    for (&pred, subjects) in pred_map {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pred, object: oo });
+                        }
+                    }
+                }
+            }
+            // (-, -, -) => all
+            (None, None, None) => {
+                for (&obj, pred_map) in &self.ops {
+                    for (&pred, subjects) in pred_map {
+                        for &subj in subjects {
+                            results.push(Triple { subject: subj, predicate: pred, object: obj });
+                        }
+                    }
+                }
+            }
+        }
+
+        results
+    }
+
+    /// Return all triples that match a given `TriplePattern`; only constants bind a position
+    fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec<Triple> {
+        let (s, p, o) = pattern;
+        let sub = match s {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+            QuotedTriple(_) => None,
+        };
+        let pre = match p {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+            QuotedTriple(_) => None,
+        };
+        let obj = match o {
+            Constant(x) => Some(*x),
+            Variable(_) => None,
+            QuotedTriple(_) => None,
+        };
+
+        self.query(sub, pre, obj)
+    }
+
+    /// Clear all data in the index
+    fn clear(&mut self) {
+        self.ops.clear();
+    }
+
+    /// Subject-Predicate scan: not backed by this OPS-only index, always `None`
+    fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Subject-Object scan: not backed by this OPS-only index, always `None`
+    fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Predicate-Object scan: not backed by this OPS-only index, always `None`
+    fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Predicate-Subject scan: not backed by this OPS-only index, always `None`
+    fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Object-Subject scan: not backed by this OPS-only index, always `None`
+    fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet<u32>> {
+        None
+    }
+
+    /// Scan using the Object-Predicate index (ops); yields the matching subjects
+    fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet<u32>> {
+        self.ops
+            .get(&o)
+            .and_then(|pred_map|
pred_map.get(&p))
+    }
+
+    fn optimize(&mut self) {
+        use rayon::prelude::*;
+
+        // Optimize OPS index: shrink every subject set, then each inner map, then the outer map
+        self.ops.par_iter_mut().for_each(|(_, pred_map)| {
+            pred_map.par_iter_mut().for_each(|(_, subj_set)| {
+                subj_set.shrink_to_fit();
+            });
+            pred_map.shrink_to_fit();
+        });
+        self.ops.shrink_to_fit();
+    }
+
+}
+
+impl OPSSingleIndex {
+    pub fn new() -> Self {
+        Self {
+            ops: HashMap::new(),
+        }
+    }
+
+    /// Merge another index into this one, consuming `other` (the merge itself runs sequentially)
+    pub fn merge_from(&mut self, other: OPSSingleIndex) {
+
+        // Merge OPS index
+        for (o, pred_map) in other.ops {
+            let entry = self.ops.entry(o).or_default();
+            for (p, subj_set) in pred_map {
+                entry.entry(p).or_default().extend(subj_set);
+            }
+        }
+    }
+
+    #[inline]
+    fn insert_optimized(&mut self, triple: &Triple) -> bool {
+        let Triple { subject: s, predicate: p, object: o } = *triple;
+
+        // Check for duplicates
+        if let Some(pred_map) = self.ops.get(&o) {
+            if let Some(subjects) = pred_map.get(&p) {
+                if subjects.contains(&s) {
+                    return false;
+                }
+            }
+        }
+
+        // Insert into the OPS index, pre-sizing any newly created inner collections
+        self.ops.entry(o).or_insert_with(|| HashMap::with_capacity(16))
+            .entry(p).or_insert_with(|| HashSet::with_capacity(8))
+            .insert(s);
+
+        true
+    }
+
+    fn optimize_post_build(&mut self) {
+        use rayon::prelude::*;
+
+        self.ops.par_iter_mut().for_each(|(_, pred_map)| {
+            pred_map.shrink_to_fit();
+            pred_map.par_iter_mut().for_each(|(_, subj_set)| {
+                subj_set.shrink_to_fit();
+            });
+        });
+        self.ops.shrink_to_fit();
+    }
+}
diff --git a/shared/src/index_manager/osp_single.rs b/shared/src/index_manager/osp_single.rs
new file mode 100644
index 0000000..1496fdf
--- /dev/null
+++ b/shared/src/index_manager/osp_single.rs
@@ -0,0 +1,220 @@
+use serde::{Serialize, Deserialize};
+use std::collections::{HashMap, HashSet};
+use crate::terms::*;
+use crate::triple::Triple;
+use crate::index_manager::*;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct
OSPSingleIndex { + pub osp: HashMap>>, +} + +impl TripleIndex for OSPSingleIndex { + fn clone_empty(&self) -> Box { Box::new(OSPSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.osp.values().map(|sub_map| sub_map.values().map(|ps| ps.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: false, + ps: false, os: true, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get(&o) { + if let Some(preds) = sub_map.get(&s) { + if preds.contains(&p) { return false; } + } + } + self.osp.entry(o).or_default().entry(s).or_default().insert(p); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get_mut(&o) { + if let Some(preds) = sub_map.get_mut(&s) { + if preds.remove(&p) { + if preds.is_empty() { sub_map.remove(&s); } + if sub_map.is_empty() { self.osp.remove(&o); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.osp.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = OSPSingleIndex::new(); + local.osp.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if 
let Some(sub_map) = self.osp.get(&oo) { + if let Some(preds) = sub_map.get(&ss) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + for (&obj, sub_map) in &self.osp { + if let Some(preds) = sub_map.get(&ss) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + if let Some(preds) = sub_map.get(&ss) { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + for (&sub, preds) in sub_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + for (&obj, sub_map) in &self.osp { + if let Some(preds) = sub_map.get(&ss) { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + for (&obj, sub_map) in &self.osp { + for (&sub, preds) in sub_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + if let Some(sub_map) = self.osp.get(&oo) { + for (&sub, preds) in sub_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&obj, sub_map) in &self.osp { + for (&sub, preds) in sub_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None, 
QuotedTriple(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.osp.clear(); } + + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp.get(&o).and_then(|sub_map| sub_map.get(&s)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.osp.par_iter_mut().for_each(|(_, sub_map)| { + sub_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + sub_map.shrink_to_fit(); + }); + self.osp.shrink_to_fit(); + } +} + +impl OSPSingleIndex { + pub fn new() -> Self { Self { osp: HashMap::new() } } + + pub fn merge_from(&mut self, other: OSPSingleIndex) { + for (o, sub_map) in other.osp { + let entry = self.osp.entry(o).or_insert_with(HashMap::new); + for (s, preds) in sub_map { + entry.entry(s).or_insert_with(HashSet::new).extend(preds); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.osp.get(&o) { + if let Some(preds) = sub_map.get(&s) { + if preds.contains(&p) { return false; } + } + } + self.osp.entry(o).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(p); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.osp.par_iter_mut().for_each(|(_, sub_map)| { + sub_map.shrink_to_fit(); + sub_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + }); + self.osp.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/partial_hexastore.rs 
b/shared/src/index_manager/partial_hexastore.rs new file mode 100644 index 0000000..ec6582e --- /dev/null +++ b/shared/src/index_manager/partial_hexastore.rs @@ -0,0 +1,405 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; +use crate::query::PlannedAccessPattern; + +/// The six possible triple-index orderings. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum IndexType { + SPO, // subject → predicate → object + SOP, // subject → object → predicate + PSO, // predicate → subject → object + POS, // predicate → object → subject + OSP, // object → subject → predicate + OPS, // object → predicate → subject +} + +impl IndexType { + /// All six permutations. + const ALL: [IndexType; 6] = [ + IndexType::SPO, IndexType::SOP, + IndexType::PSO, IndexType::POS, + IndexType::OSP, IndexType::OPS, + ]; + + /// Create a fresh, empty `Box` for this type. + fn create_empty(&self) -> Box { + match self { + IndexType::SPO => Box::new(SPOSingleIndex::new()), + IndexType::SOP => Box::new(SOPSingleIndex::new()), + IndexType::PSO => Box::new(PSOSingleIndex::new()), + IndexType::POS => Box::new(POSSingleIndex::new()), + IndexType::OSP => Box::new(OSPSingleIndex::new()), + IndexType::OPS => Box::new(OPSSingleIndex::new()), + } + } + + /// Which two-key scan does this index natively support? + fn native_scan(&self) -> ScanKind { + match self { + IndexType::SPO => ScanKind::SP, + IndexType::SOP => ScanKind::SO, + IndexType::PSO => ScanKind::PS, + IndexType::POS => ScanKind::PO, + IndexType::OSP => ScanKind::OS, + IndexType::OPS => ScanKind::OP, + } + } +} + +/// The six possible two-key scans. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum ScanKind { SP, SO, PS, PO, OS, OP } + +/// Lightweight snapshot of the unique-count statistics needed by the heuristic. 
+#[derive(Debug, Clone)] +pub struct CardinalitySnapshot { + pub num_subjects: f64, + pub num_predicates: f64, + pub num_objects: f64, + pub num_sp_pairs: f64, +} + +impl CardinalitySnapshot { + /// Build from raw counts. + pub fn from_stats( + _total_triples: u64, + unique_subjects: usize, + unique_predicates: usize, + unique_objects: usize, + unique_sp_pairs: usize, + ) -> Self { + // Use at least 1.0 to avoid division-by-zero in cost formulas + Self { + num_subjects: (unique_subjects as f64).max(1.0), + num_predicates: (unique_predicates as f64).max(1.0), + num_objects: (unique_objects as f64).max(1.0), + num_sp_pairs: (unique_sp_pairs as f64).max(1.0), + } + } + + /// A default snapshot when we have no data yet. + fn unknown() -> Self { + Self { num_subjects: 1.0, num_predicates: 1.0, num_objects: 1.0, num_sp_pairs: 1.0 } + } +} + +#[derive(Debug, Clone)] +pub struct PartialHexastoreIndex { + pub spo: Option>>>, + pub pos: Option>>>, + pub osp: Option>>>, + pub pso: Option>>>, + pub ops: Option>>>, + pub sop: Option>>>, + + pub latest_card: CardinalitySnapshot, +} + +impl PartialHexastoreIndex { + pub fn new(patterns: Vec) -> Self { + let required_indexes = Self::determine_smallest_index_set(&patterns); + + let mut created_names = Vec::new(); + if required_indexes.contains(&IndexType::SPO) { created_names.push("SPO"); } + if required_indexes.contains(&IndexType::POS) { created_names.push("POS"); } + if required_indexes.contains(&IndexType::OSP) { created_names.push("OSP"); } + if required_indexes.contains(&IndexType::PSO) { created_names.push("PSO"); } + if required_indexes.contains(&IndexType::OPS) { created_names.push("OPS"); } + if required_indexes.contains(&IndexType::SOP) { created_names.push("SOP"); } + + println!("PartialHexastoreIndex initialized with indexes: {:?}", created_names); + + Self { + spo: if required_indexes.contains(&IndexType::SPO) { Some(HashMap::new()) } else { None }, + pos: if required_indexes.contains(&IndexType::POS) { 
Some(HashMap::new()) } else { None }, + osp: if required_indexes.contains(&IndexType::OSP) { Some(HashMap::new()) } else { None }, + pso: if required_indexes.contains(&IndexType::PSO) { Some(HashMap::new()) } else { None }, + ops: if required_indexes.contains(&IndexType::OPS) { Some(HashMap::new()) } else { None }, + sop: if required_indexes.contains(&IndexType::SOP) { Some(HashMap::new()) } else { None }, + latest_card: CardinalitySnapshot::from_stats(0, 1, 1, 1, 1), + } + } + + pub fn update_cardinalities(&mut self, card: CardinalitySnapshot) { + self.latest_card = card; + } + + /// Finds the absolute smallest set of indexes that covers all physical access patterns efficiently. + fn determine_smallest_index_set(patterns: &[PlannedAccessPattern]) -> HashSet { + if patterns.is_empty() { + return HashSet::from([IndexType::SPO]); + } + + let all_types = [ + IndexType::SPO, IndexType::SOP, IndexType::PSO, + IndexType::POS, IndexType::OSP, IndexType::OPS + ]; + + let mut valid_types_per_pattern = Vec::new(); + for planned in patterns { + let (s, p, o) = &planned.pattern; + + // A variable is considered bound if it's a true constant OR if it's pipeline-bound (from previous steps) + let bound_s = matches!(s, Term::Constant(_)) || planned.bound_subject; + let bound_p = matches!(p, Term::Constant(_)) || planned.bound_predicate; + let bound_o = matches!(o, Term::Constant(_)) || planned.bound_object; + + let mut valid = Vec::new(); + match (bound_s, bound_p, bound_o) { + (true, true, _) => { valid.push(IndexType::SPO); valid.push(IndexType::PSO); } + (true, _, true) => { valid.push(IndexType::SOP); valid.push(IndexType::OSP); } + (_, true, true) => { valid.push(IndexType::POS); valid.push(IndexType::OPS); } + (true, false, false) => { valid.push(IndexType::SPO); valid.push(IndexType::SOP); } + (false, true, false) => { valid.push(IndexType::PSO); valid.push(IndexType::POS); } + (false, false, true) => { valid.push(IndexType::OSP); valid.push(IndexType::OPS); } + (false, 
false, false) | (true, true, true) => { + valid.extend_from_slice(&all_types); + } + } + valid_types_per_pattern.push(valid); + } + + let mut min_size = usize::MAX; + let mut best_set = HashSet::new(); + let n = all_types.len(); + + for mask in 1..=(1 << n) - 1 { + let mut candidate_set = HashSet::new(); + for (i, &t) in all_types.iter().enumerate() { + if mask & (1 << i) != 0 { + candidate_set.insert(t); + } + } + + let covers_all = valid_types_per_pattern.iter().all(|valid| { + valid.iter().any(|vt| candidate_set.contains(vt)) + }); + + if covers_all && candidate_set.len() < min_size { + min_size = candidate_set.len(); + best_set = candidate_set.clone(); + } + } + + if best_set.is_empty() { + best_set.insert(IndexType::SPO); + } + + best_set + } + + /// Selects the best index on-query based on bound variables and root cardinality (lean index tiebreaker). + fn select_best_index(&self, s: Option, p: Option, o: Option) -> IndexType { + let mut candidates = Vec::new(); + + let add_if_available = |candidates: &mut Vec<(IndexType, f64)>, idx: IndexType, available: bool, root_card: f64| { + if available { + candidates.push((idx, root_card)); + } + }; + + // Rule 1: Bound variables first. 
+ match (s.is_some(), p.is_some(), o.is_some()) { + (true, true, _) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + } + (true, _, true) => { + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + } + (_, true, true) => { + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + (true, false, false) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + } + (false, true, false) => { + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + } + (false, false, true) => { + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + (false, false, false) | (true, true, true) => { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, 
IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + } + + // Fallback if none of the optimal indexes for this query shape were instantiated + if candidates.is_empty() { + add_if_available(&mut candidates, IndexType::SPO, self.spo.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::SOP, self.sop.is_some(), self.latest_card.num_subjects); + add_if_available(&mut candidates, IndexType::PSO, self.pso.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::POS, self.pos.is_some(), self.latest_card.num_predicates); + add_if_available(&mut candidates, IndexType::OSP, self.osp.is_some(), self.latest_card.num_objects); + add_if_available(&mut candidates, IndexType::OPS, self.ops.is_some(), self.latest_card.num_objects); + } + + // Rule 2: Tiebreaker - lean index is better (smaller root cardinality) + candidates.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + candidates[0].0 + } + + #[inline] + fn query_index( + index: &HashMap>>, + q_root: Option, q_mid: Option, q_leaf: Option, + build_triple: impl Fn(u32, u32, u32) -> Triple + ) -> Vec { + let mut results = Vec::new(); + let mut scan_mid = |root_val: u32, mid_map: &HashMap>| { + if let Some(mv) = q_mid { + if let Some(leaf_set) = mid_map.get(&mv) { + if let Some(lv) = q_leaf { + if leaf_set.contains(&lv) { results.push(build_triple(root_val, mv, lv)); } + } else { + for &lv in leaf_set { results.push(build_triple(root_val, mv, lv)); } + } + } + } else { + for (&mv, leaf_set) in mid_map { + if let Some(lv) = q_leaf { + if leaf_set.contains(&lv) { results.push(build_triple(root_val, mv, lv)); } + } else { + for &lv in leaf_set { results.push(build_triple(root_val, mv, lv)); } + } + } + } + }; + + if let Some(rv) = q_root { + if let Some(mid_map) = index.get(&rv) { scan_mid(rv, mid_map); } + } else { + 
for (&rv, mid_map) in index { scan_mid(rv, mid_map); } + } + results + } +} + +impl TripleIndex for PartialHexastoreIndex { + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + let mut inserted = false; + if let Some(ref mut idx) = self.spo { inserted |= idx.entry(s).or_default().entry(p).or_default().insert(o); } + if let Some(ref mut idx) = self.pos { inserted |= idx.entry(p).or_default().entry(o).or_default().insert(s); } + if let Some(ref mut idx) = self.osp { inserted |= idx.entry(o).or_default().entry(s).or_default().insert(p); } + if let Some(ref mut idx) = self.pso { inserted |= idx.entry(p).or_default().entry(s).or_default().insert(o); } + if let Some(ref mut idx) = self.ops { inserted |= idx.entry(o).or_default().entry(p).or_default().insert(s); } + if let Some(ref mut idx) = self.sop { inserted |= idx.entry(s).or_default().entry(o).or_default().insert(p); } + inserted + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + let mut deleted = false; + + let check_and_delete = |idx: &mut Option>>>, r, m, l| { + if let Some(map) = idx { + remove_from_index(map, r, m, l); + return true; + } + false + }; + + if let Some(ref mut idx) = self.spo { + let exists = idx.get(&s).and_then(|pm| pm.get(&p)).map_or(false, |os| os.contains(&o)); + if !exists { return false; } + } + + deleted |= check_and_delete(&mut self.spo, s, p, o); + check_and_delete(&mut self.pos, p, o, s); + check_and_delete(&mut self.osp, o, s, p); + check_and_delete(&mut self.pso, p, s, o); + check_and_delete(&mut self.ops, o, p, s); + check_and_delete(&mut self.sop, s, o, p); + + deleted + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let best_index = self.select_best_index(s, p, o); + match best_index { + IndexType::SPO => Self::query_index(self.spo.as_ref().unwrap(), s, p, o, |s, p, o| Triple { subject: s, predicate: p, object: o }), + 
IndexType::SOP => Self::query_index(self.sop.as_ref().unwrap(), s, o, p, |s, o, p| Triple { subject: s, predicate: p, object: o }), + IndexType::PSO => Self::query_index(self.pso.as_ref().unwrap(), p, s, o, |p, s, o| Triple { subject: s, predicate: p, object: o }), + IndexType::POS => Self::query_index(self.pos.as_ref().unwrap(), p, o, s, |p, o, s| Triple { subject: s, predicate: p, object: o }), + IndexType::OSP => Self::query_index(self.osp.as_ref().unwrap(), o, s, p, |o, s, p| Triple { subject: s, predicate: p, object: o }), + IndexType::OPS => Self::query_index(self.ops.as_ref().unwrap(), o, p, s, |o, p, s| Triple { subject: s, predicate: p, object: o }), + } + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Term::Constant(x) => Some(*x), Term::Variable(_) => None, Term::QuotedTriple(_) => None }; + let pre = match p { Term::Constant(x) => Some(*x), Term::Variable(_) => None, Term::QuotedTriple(_) => None }; + let obj = match o { Term::Constant(x) => Some(*x), Term::Variable(_) => None, Term::QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + if let Some(idx) = &mut self.spo { idx.clear(); } + if let Some(idx) = &mut self.pos { idx.clear(); } + if let Some(idx) = &mut self.osp { idx.clear(); } + if let Some(idx) = &mut self.pso { idx.clear(); } + if let Some(idx) = &mut self.ops { idx.clear(); } + if let Some(idx) = &mut self.sop { idx.clear(); } + } + + fn clone_empty(&self) -> Box { + Box::new(Self { + spo: self.spo.as_ref().map(|_| HashMap::new()), + pos: self.pos.as_ref().map(|_| HashMap::new()), + osp: self.osp.as_ref().map(|_| HashMap::new()), + pso: self.pso.as_ref().map(|_| HashMap::new()), + ops: self.ops.as_ref().map(|_| HashMap::new()), + sop: self.sop.as_ref().map(|_| HashMap::new()), + latest_card: self.latest_card.clone(), + }) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn supported_access_patterns(&self) -> 
AccessPatternSupport { + AccessPatternSupport { + sp: self.spo.is_some() || self.pso.is_some(), + so: self.sop.is_some() || self.osp.is_some(), + po: self.pos.is_some() || self.ops.is_some(), + ps: self.pso.is_some() || self.spo.is_some(), + os: self.osp.is_some() || self.sop.is_some(), + op: self.ops.is_some() || self.pos.is_some(), + } + } + + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo.as_ref().and_then(|idx| idx.get(&s).and_then(|m| m.get(&p))) + } + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop.as_ref().and_then(|idx| idx.get(&s).and_then(|m| m.get(&o))) + } + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos.as_ref().and_then(|idx| idx.get(&p).and_then(|m| m.get(&o))) + } + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso.as_ref().and_then(|idx| idx.get(&p).and_then(|m| m.get(&s))) + } + fn scan_os(&self, o: u32, s: u32) -> Option<&HashSet> { + self.osp.as_ref().and_then(|idx| idx.get(&o).and_then(|m| m.get(&s))) + } + fn scan_op(&self, o: u32, p: u32) -> Option<&HashSet> { + self.ops.as_ref().and_then(|idx| idx.get(&o).and_then(|m| m.get(&p))) + } +} \ No newline at end of file diff --git a/shared/src/index_manager/pos_single.rs b/shared/src/index_manager/pos_single.rs new file mode 100644 index 0000000..7a6b407 --- /dev/null +++ b/shared/src/index_manager/pos_single.rs @@ -0,0 +1,220 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct POSSingleIndex { + pub pos: HashMap>>, +} + +impl TripleIndex for POSSingleIndex { + fn clone_empty(&self) -> Box { Box::new(POSSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.pos.values().map(|obj_map| obj_map.values().map(|subs| subs.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> 
AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: true, + ps: false, os: false, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get(&p) { + if let Some(subs) = obj_map.get(&o) { + if subs.contains(&s) { return false; } + } + } + self.pos.entry(p).or_default().entry(o).or_default().insert(s); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get_mut(&p) { + if let Some(subs) = obj_map.get_mut(&o) { + if subs.remove(&s) { + if subs.is_empty() { obj_map.remove(&o); } + if obj_map.is_empty() { self.pos.remove(&p); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.pos.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = POSSingleIndex::new(); + local.pos.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subs) = obj_map.get(&oo) { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(obj_map) = self.pos.get(&pp) { + for (&obj, subs) in obj_map { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pp, 
object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + for (&pred, obj_map) in &self.pos { + if let Some(subs) = obj_map.get(&oo) { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + if let Some(obj_map) = self.pos.get(&pp) { + if let Some(subs) = obj_map.get(&oo) { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + for (&pred, obj_map) in &self.pos { + for (&obj, subs) in obj_map { + if subs.contains(&ss) { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + if let Some(obj_map) = self.pos.get(&pp) { + for (&obj, subs) in obj_map { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&pred, obj_map) in &self.pos { + if let Some(subs) = obj_map.get(&oo) { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&pred, obj_map) in &self.pos { + for (&obj, subs) in obj_map { + for &sub in subs { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.pos.clear(); } + + fn scan_po(&self, p: u32, o: u32) -> Option<&HashSet> { + self.pos.get(&p).and_then(|obj_map| obj_map.get(&o)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn 
scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, subs)| { subs.shrink_to_fit(); }); + obj_map.shrink_to_fit(); + }); + self.pos.shrink_to_fit(); + } +} + +impl POSSingleIndex { + pub fn new() -> Self { Self { pos: HashMap::new() } } + + pub fn merge_from(&mut self, other: POSSingleIndex) { + for (p, obj_map) in other.pos { + let entry = self.pos.entry(p).or_insert_with(HashMap::new); + for (o, subs) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(subs); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.pos.get(&p) { + if let Some(subs) = obj_map.get(&o) { + if subs.contains(&s) { return false; } + } + } + self.pos.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(s); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.pos.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, subs)| { subs.shrink_to_fit(); }); + }); + self.pos.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/pso_single.rs b/shared/src/index_manager/pso_single.rs new file mode 100644 index 0000000..c9cfbb5 --- /dev/null +++ b/shared/src/index_manager/pso_single.rs @@ -0,0 +1,331 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PSOSingleIndex { + // The six permutations, 
using HashMap of HashMap of HashSet. + pub pso: HashMap>>, +} + +impl TripleIndex for PSOSingleIndex { + fn clone_empty(&self) -> Box { + Box::new(PSOSingleIndex::new()) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + self.pso.values() + .map(|sub_map| sub_map.values().map(|objs| objs.len()).sum::()) + .sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: false, po: false, + ps: true, os: false, op: false + } + } + + /// Insert a single triple into all six indexes + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(sub_map) = self.pso.get(&p) { + if let Some(objects) = sub_map.get(&s) { + if objects.contains(&o) { + return false; // triple already stored + } + } + } + self.pso.entry(p).or_default().entry(s).or_default().insert(o); + true + } + + /// Delete a single triple from all six indexes + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + let exists = self.pso + .get(&p) + .and_then(|sub_map| sub_map.get(&s)) + .map_or(false, |objects| objects.contains(&o)); + + if !exists { + return false; // triple doesn't exist + } + + // Remove from all six indexes using helper function + remove_from_index(&mut self.pso, p, s, o); + true + } + + /// Bulk-build the index from a list of triples + fn build_from_triples(&mut self, triples: &[Triple]) { + for triple in triples { + self.insert(triple); + } + use rayon::prelude::*; + + self.clear(); + + if triples.is_empty() { + return; + } + + // Pre-allocate with capacity estimates + let capacity = triples.len() / 100; + + self.pso.reserve(capacity); + + // Build indexes in parallel by creating partial indexes and merging + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partial_indexes: Vec = 
triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local_index = PSOSingleIndex::new(); + + // Pre-allocate local index + let local_capacity = chunk.len() / 50; + local_index.pso.reserve(local_capacity); + + // Insert triples into local index + for triple in chunk { + local_index.insert_optimized(triple); + } + + local_index + }) + .collect(); + + // Sequentially merge partial indexes + for partial_index in partial_indexes { + self.merge_from(partial_index); + } + + // Optimize memory layout after building + self.optimize_post_build(); + } + + /// Query the index + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + + match (s, p, o) { + // Fully bound + (Some(ss), Some(pp), Some(oo)) => { + if let Some(sub_map) = self.pso.get(&pp) { + if let Some(objects) = sub_map.get(&ss) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + // (S, P, -) + (Some(ss), Some(pp), None) => { + if let Some(sub_map) = self.pso.get(&pp) { + if let Some(objects) = sub_map.get(&ss) { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + // (S, -, O) + (Some(ss), None, Some(oo)) => { + for (&pred, sub_map) in &self.pso { + if let Some(objects) = sub_map.get(&ss) { + if objects.contains(&oo) { + results.push(Triple { subject: ss, predicate: pred, object: oo }) + } + } + } + } + // (-, P, O) + (None, Some(pp), Some(oo)) => { + if let Some(sub_map) = self.pso.get(&pp) { + for (&sub, objects) in sub_map { + if objects.contains(&oo) { + results.push(Triple { subject: sub, predicate: pp, object: oo }) + } + } + } + } + // (S, -, -) + (Some(ss), None, None) => { + for (&pred, sub_map) in &self.pso { + if let Some(objects) = sub_map.get(&ss) { + for &obj in objects { + results.push(Triple { subject: ss, predicate: pred, object: obj }) + } + } + } + } + // (-, P, -) + (None, Some(pp), None) => { + if let Some(sub_map) = 
self.pso.get(&pp) { + for (&sub, objects) in sub_map { + for &obj in objects { + results.push(Triple { subject: sub, predicate: pp, object: obj }) + } + } + } + } + // (-, -, O) + (None, None, Some(oo)) => { + for (&pred, sub_map) in &self.pso { + for (&sub, objects) in sub_map { + if objects.contains(&oo) { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + // (-, -, -) => all + (None, None, None) => { + for (&pred, sub_map) in &self.pso { + for (&sub, objects) in sub_map { + for &obj in objects { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + + results + } + + /// Return all triples that match a given `TriplePattern` + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let pre = match p { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + let obj = match o { + Constant(x) => Some(*x), + Variable(_) => None, + QuotedTriple(_) => None, + }; + + self.query(sub, pre, obj) + } + + /// Clear all data in the indexes + fn clear(&mut self) { + self.pso.clear(); + } + + /// Scan using the Subject-Predicate index (spo) + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Subject-Object index (sop) + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Predicate-Object index (pos) + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Predicate-Subject index (pso) + fn scan_ps(&self, p: u32, s: u32) -> Option<&HashSet> { + self.pso + .get(&p) + .and_then(|subj_map| subj_map.get(&s)) + } + + /// Scan using the Object-Subject index (osp) + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { + return None; + } + + /// Scan using the Object-Predicate index (ops) + fn scan_op(&self, 
_: u32, _: u32) -> Option<&HashSet> { + return None; + } + + fn optimize(&mut self) { + use rayon::prelude::*; + + // Optimize PSO index + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + subj_map.shrink_to_fit(); + }); + self.pso.shrink_to_fit(); + } + +} + +impl PSOSingleIndex { + pub fn new() -> Self { + Self { + pso: HashMap::new(), + } + } + + /// Efficiently merge another index into this one using parallel processing where possible + pub fn merge_from(&mut self, other: PSOSingleIndex) { + + // Merge PSO index + for (p, subj_map) in other.pso { + let entry = self.pso.entry(p).or_insert_with(HashMap::new); + for (s, obj_set) in subj_map { + entry.entry(s).or_insert_with(HashSet::new).extend(obj_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + // Check for duplicates + if let Some(sub_map) = self.pso.get(&p) { + if let Some(objects) = sub_map.get(&s) { + if objects.contains(&o) { + return false; + } + } + } + + // Batch insert into all indexes + self.pso.entry(p).or_insert_with(|| HashMap::with_capacity(16)) + .entry(s).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + + self.pso.par_iter_mut().for_each(|(_, subj_map)| { + subj_map.shrink_to_fit(); + subj_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.pso.shrink_to_fit(); + } +} \ No newline at end of file diff --git a/shared/src/index_manager/single_table.rs b/shared/src/index_manager/single_table.rs new file mode 100644 index 0000000..f889fe8 --- /dev/null +++ b/shared/src/index_manager/single_table.rs @@ -0,0 +1,149 @@ +use serde::{Serialize, Deserialize}; +use std::collections::HashSet; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SingleTableIndex { + pub table: HashSet, +} + +impl TripleIndex for SingleTableIndex { + fn clone_empty(&self) -> Box { + Box::new(SingleTableIndex::new()) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + self.table.len() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + // no specialized access pattern supported + AccessPatternSupport { sp: false, so: false, po: false, ps: false, os: false, op: false } + } + + fn insert(&mut self, triple: &Triple) -> bool { + // Insert returns true only when the triple was not present before. + self.table.insert(triple.clone()) + } + + fn delete(&mut self, triple: &Triple) -> bool { + // HashSet::remove accepts &T and returns whether an element was removed. + self.table.remove(triple) + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + // simple replace-with-new-set strategy; keeps semantics consistent. + use rayon::prelude::*; + + self.clear(); + + if triples.is_empty() { + return; + } + + // Reserve a reasonable capacity (heuristic) + self.table.reserve(triples.len()); + + // If rayon is available, we can build partial sets and merge them. 
+ let num_threads = rayon::current_num_threads(); + if triples.len() >= 10_000 && num_threads > 1 { + // parallel build + let chunk_size = (triples.len() / num_threads).max(1_000); + let partials: Vec> = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = HashSet::with_capacity(chunk.len()); + for t in chunk { + local.insert(t.clone()); + } + local + }) + .collect(); + + for part in partials { + self.table.extend(part); + } + } else { + // serial build + self.table.extend(triples.iter().cloned()); + } + + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + // brute-force scan across the table; acceptable because this index has no sub-indexes + let mut results = Vec::new(); + + for triple in &self.table { + let matches = match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => triple.subject == ss && triple.predicate == pp && triple.object == oo, + (Some(ss), Some(pp), None) => triple.subject == ss && triple.predicate == pp, + (Some(ss), None, Some(oo)) => triple.subject == ss && triple.object == oo, + (None, Some(pp), Some(oo)) => triple.predicate == pp && triple.object == oo, + (Some(ss), None, None) => triple.subject == ss, + (None, Some(pp), None) => triple.predicate == pp, + (None, None, Some(oo)) => triple.object == oo, + (None, None, None) => true, + }; + + if matches { + results.push(triple.clone()); + } + } + + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + self.table.clear(); + } + + // no specialized scans possible; all return None + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn 
scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + // nothing complex to optimize here — just shrink to fit + self.table.shrink_to_fit(); + } +} +impl SingleTableIndex { + pub fn new() -> Self { + Self { table: HashSet::new() } + } + + /// merge another NoIndex into this one + pub fn merge_from(&mut self, other: SingleTableIndex) { + self.table.extend(other.table); + } + + /// optimized single-triple insert used during parallel builds + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + // returns true if inserted, false if already present + self.table.insert(triple.clone()) + } + + fn optimize_post_build(&mut self) { + self.table.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/sop_single.rs b/shared/src/index_manager/sop_single.rs new file mode 100644 index 0000000..206f21a --- /dev/null +++ b/shared/src/index_manager/sop_single.rs @@ -0,0 +1,222 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SOPSingleIndex { + pub sop: HashMap>>, +} + +impl TripleIndex for SOPSingleIndex { + fn clone_empty(&self) -> Box { Box::new(SOPSingleIndex::new()) } + fn clone_box(&self) -> Box { Box::new(self.clone()) } + + fn triple_count(&self) -> usize { + self.sop.values().map(|obj_map| obj_map.values().map(|ps| ps.len()).sum::()).sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: false, so: true, po: false, + ps: false, os: false, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: 
s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get(&s) { + if let Some(preds) = obj_map.get(&o) { + if preds.contains(&p) { return false; } + } + } + self.sop.entry(s).or_default().entry(o).or_default().insert(p); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get_mut(&s) { + if let Some(preds) = obj_map.get_mut(&o) { + if preds.remove(&p) { + if preds.is_empty() { obj_map.remove(&o); } + if obj_map.is_empty() { self.sop.remove(&s); } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + self.clear(); + if triples.is_empty() { return; } + + let capacity = (triples.len() / 100).max(1); + self.sop.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = SOPSingleIndex::new(); + local.sop.reserve((chunk.len() / 50).max(1)); + for t in chunk { local.insert_optimized(t); } + local + }) + .collect(); + + for p in partials { self.merge_from(p); } + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(obj_map) = self.sop.get(&ss) { + if let Some(preds) = obj_map.get(&oo) { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(obj_map) = self.sop.get(&ss) { + for (&obj, preds) in obj_map { + if preds.contains(&pp) { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(obj_map) = self.sop.get(&ss) { + if let Some(preds) = obj_map.get(&oo) { + for &pred in preds { + 
results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + for (&sub, obj_map) in &self.sop { + if let Some(preds) = obj_map.get(&oo) { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + if let Some(obj_map) = self.sop.get(&ss) { + for (&obj, preds) in obj_map { + for &pred in preds { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + for (&sub, obj_map) in &self.sop { + for (&obj, preds) in obj_map { + if preds.contains(&pp) { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&sub, obj_map) in &self.sop { + if let Some(preds) = obj_map.get(&oo) { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&sub, obj_map) in &self.sop { + for (&obj, preds) in obj_map { + for &pred in preds { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let obj = match o { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { self.sop.clear(); } + + fn scan_so(&self, s: u32, o: u32) -> Option<&HashSet> { + self.sop.get(&s).and_then(|obj_map| obj_map.get(&o)) + } + + fn scan_sp(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> 
Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.par_iter_mut().for_each(|(_, preds)| { + preds.shrink_to_fit(); + }); + obj_map.shrink_to_fit(); + }); + self.sop.shrink_to_fit(); + } +} + +impl SOPSingleIndex { + pub fn new() -> Self { Self { sop: HashMap::new() } } + + pub fn merge_from(&mut self, other: SOPSingleIndex) { + for (s, obj_map) in other.sop { + let entry = self.sop.entry(s).or_insert_with(HashMap::new); + for (o, preds) in obj_map { + entry.entry(o).or_insert_with(HashSet::new).extend(preds); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(obj_map) = self.sop.get(&s) { + if let Some(preds) = obj_map.get(&o) { + if preds.contains(&p) { return false; } + } + } + self.sop.entry(s).or_insert_with(|| HashMap::with_capacity(16)) + .entry(o).or_insert_with(|| HashSet::with_capacity(8)) + .insert(p); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.sop.par_iter_mut().for_each(|(_, obj_map)| { + obj_map.shrink_to_fit(); + obj_map.par_iter_mut().for_each(|(_, preds)| { preds.shrink_to_fit(); }); + }); + self.sop.shrink_to_fit(); + } +} diff --git a/shared/src/index_manager/spo_single.rs b/shared/src/index_manager/spo_single.rs new file mode 100644 index 0000000..3f8978d --- /dev/null +++ b/shared/src/index_manager/spo_single.rs @@ -0,0 +1,253 @@ +use serde::{Serialize, Deserialize}; +use std::collections::{HashMap, HashSet}; +use crate::terms::*; +use crate::triple::Triple; +use crate::index_manager::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SPOSingleIndex { + pub spo: HashMap>>, +} + +impl TripleIndex for SPOSingleIndex { + fn clone_empty(&self) -> Box { + Box::new(SPOSingleIndex::new()) + } + + fn clone_box(&self) -> Box { + 
Box::new(self.clone()) + } + + fn triple_count(&self) -> usize { + self.spo.values() + .map(|pred_map| pred_map.values().map(|objs| objs.len()).sum::()) + .sum() + } + + fn supported_access_patterns(&self) -> AccessPatternSupport { + AccessPatternSupport { + sp: true, so: false, po: false, + ps: false, os: false, op: false + } + } + + fn insert(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objects) = pred_map.get(&p) { + if objects.contains(&o) { + return false; + } + } + } + self.spo.entry(s).or_default().entry(p).or_default().insert(o); + true + } + + fn delete(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + + if let Some(pred_map) = self.spo.get_mut(&s) { + if let Some(obj_set) = pred_map.get_mut(&p) { + if obj_set.remove(&o) { + // cleanup empty maps + if obj_set.is_empty() { + pred_map.remove(&p); + } + if pred_map.is_empty() { + self.spo.remove(&s); + } + return true; + } + } + } + false + } + + fn build_from_triples(&mut self, triples: &[Triple]) { + use rayon::prelude::*; + + self.clear(); + if triples.is_empty() { + return; + } + + let capacity = (triples.len() / 100).max(1); + self.spo.reserve(capacity); + + let num_threads = rayon::current_num_threads(); + let chunk_size = (triples.len() / num_threads).max(10_000); + + let partials: Vec = triples + .par_chunks(chunk_size) + .map(|chunk| { + let mut local = SPOSingleIndex::new(); + let local_capacity = (chunk.len() / 50).max(1); + local.spo.reserve(local_capacity); + + for t in chunk { + local.insert_optimized(t); + } + local + }) + .collect(); + + for part in partials { + self.merge_from(part); + } + + self.optimize_post_build(); + } + + fn query(&self, s: Option, p: Option, o: Option) -> Vec { + let mut results = Vec::new(); + match (s, p, o) { + (Some(ss), Some(pp), Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let 
Some(objs) = pred_map.get(&pp) { + if objs.contains(&oo) { + results.push(Triple { subject: ss, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), Some(pp), None) => { + if let Some(pred_map) = self.spo.get(&ss) { + if let Some(objs) = pred_map.get(&pp) { + for &obj in objs { + results.push(Triple { subject: ss, predicate: pp, object: obj }); + } + } + } + } + (Some(ss), None, Some(oo)) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objs) in pred_map { + if objs.contains(&oo) { + results.push(Triple { subject: ss, predicate: pred, object: oo }); + } + } + } + } + (None, Some(pp), Some(oo)) => { + for (&sub, pred_map) in &self.spo { + if let Some(objs) = pred_map.get(&pp) { + if objs.contains(&oo) { + results.push(Triple { subject: sub, predicate: pp, object: oo }); + } + } + } + } + (Some(ss), None, None) => { + if let Some(pred_map) = self.spo.get(&ss) { + for (&pred, objs) in pred_map { + for &obj in objs { + results.push(Triple { subject: ss, predicate: pred, object: obj }); + } + } + } + } + (None, Some(pp), None) => { + for (&sub, pred_map) in &self.spo { + if let Some(objs) = pred_map.get(&pp) { + for &obj in objs { + results.push(Triple { subject: sub, predicate: pp, object: obj }); + } + } + } + } + (None, None, Some(oo)) => { + for (&sub, pred_map) in &self.spo { + for (&pred, objs) in pred_map { + if objs.contains(&oo) { + results.push(Triple { subject: sub, predicate: pred, object: oo }); + } + } + } + } + (None, None, None) => { + for (&sub, pred_map) in &self.spo { + for (&pred, objs) in pred_map { + for &obj in objs { + results.push(Triple { subject: sub, predicate: pred, object: obj }); + } + } + } + } + } + results + } + + fn get_matching_triples(&self, pattern: &TriplePattern) -> Vec { + let (s, p, o) = pattern; + let sub = match s { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let pre = match p { Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + let obj = match o { 
Constant(x) => Some(*x), Variable(_) => None, QuotedTriple(_) => None }; + self.query(sub, pre, obj) + } + + fn clear(&mut self) { + self.spo.clear(); + } + + fn scan_sp(&self, s: u32, p: u32) -> Option<&HashSet> { + self.spo.get(&s).and_then(|pred_map| pred_map.get(&p)) + } + + fn scan_so(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_po(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_ps(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_os(&self, _: u32, _: u32) -> Option<&HashSet> { None } + fn scan_op(&self, _: u32, _: u32) -> Option<&HashSet> { None } + + fn optimize(&mut self) { + use rayon::prelude::*; + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + pred_map.shrink_to_fit(); + }); + self.spo.shrink_to_fit(); + } +} + +impl SPOSingleIndex { + pub fn new() -> Self { + Self { spo: HashMap::new() } + } + + pub fn merge_from(&mut self, other: SPOSingleIndex) { + for (s, pred_map) in other.spo { + let entry = self.spo.entry(s).or_insert_with(HashMap::new); + for (p, obj_set) in pred_map { + entry.entry(p).or_insert_with(HashSet::new).extend(obj_set); + } + } + } + + #[inline] + fn insert_optimized(&mut self, triple: &Triple) -> bool { + let Triple { subject: s, predicate: p, object: o } = *triple; + if let Some(pred_map) = self.spo.get(&s) { + if let Some(objs) = pred_map.get(&p) { + if objs.contains(&o) { return false; } + } + } + self.spo.entry(s).or_insert_with(|| HashMap::with_capacity(16)) + .entry(p).or_insert_with(|| HashSet::with_capacity(8)) + .insert(o); + true + } + + fn optimize_post_build(&mut self) { + use rayon::prelude::*; + self.spo.par_iter_mut().for_each(|(_, pred_map)| { + pred_map.shrink_to_fit(); + pred_map.par_iter_mut().for_each(|(_, obj_set)| { + obj_set.shrink_to_fit(); + }); + }); + self.spo.shrink_to_fit(); + } +} diff --git a/shared/src/join_algorithm.rs b/shared/src/join_algorithm.rs index 
1f7afd8..6855cee 100644 --- a/shared/src/join_algorithm.rs +++ b/shared/src/join_algorithm.rs @@ -9,7 +9,7 @@ */ use crate::dictionary::Dictionary; -use crate::index_manager::UnifiedIndex; +use crate::index_manager::TripleIndex; use crate::terms::{Term, TriplePattern}; use crate::triple::Triple; use rayon::prelude::*; @@ -20,7 +20,7 @@ pub fn perform_join_par_simd_with_strict_filter_4_redesigned_streaming( subject_var: String, predicate: String, object_var: String, - index_manager: &UnifiedIndex, // ← Pass index instead of database + index_manager: Box, // ← Pass index instead of database dictionary: &Dictionary, final_results: Vec>, literal_filter: Option, @@ -47,45 +47,15 @@ pub fn perform_join_par_simd_with_strict_filter_4_redesigned_streaming( dictionary, ); - // FIX: Use PSO index instead of POS for better ordering let mut filtered_triples: Vec = if let Some(pred_id) = predicate_id { - // Use PSO index (Predicate -> Subject -> Object) - // This gives results sorted by subject first! - if let Some(subject_map) = index_manager. pso.get(&pred_id) { - // Collect subjects in sorted order - let mut subjects: Vec<_> = subject_map.iter().collect(); - subjects.sort_unstable_by_key(|(subj, _)| *subj); // Sort by subject - - subjects - .par_iter() - .flat_map(|(&subject, objects)| { - // Objects are in HashSet, convert to sorted Vec - let mut sorted_objects: Vec = objects.iter().copied().collect(); - sorted_objects.sort_unstable(); // Sort objects within each subject - - // Build triples - naturally sorted by (subject, object)! 
- sorted_objects - .into_iter() - .filter_map(|object| { - // Apply literal filter if present - if let Some(filter_id) = literal_filter_id { - if object != filter_id { - return None; - } - } - - Some(Triple { - subject, - predicate: pred_id, - object, - }) - }) - .collect::>() - }) - .collect() - } else { - Vec::new() + let mut triples = index_manager.query(None, Some(pred_id), None); + + // Apply literal filter if present + if let Some(filter_id) = literal_filter_id { + triples.retain(|t| t.object == filter_id); } + + triples } else { Vec::new() }; diff --git a/shared/src/query.rs b/shared/src/query.rs index de6c00a..90a4805 100644 --- a/shared/src/query.rs +++ b/shared/src/query.rs @@ -11,6 +11,16 @@ use std::collections::HashMap; use std::time::Duration; +use crate::terms::TriplePattern; + +#[derive(Debug, Clone)] +pub struct PlannedAccessPattern { + pub pattern: TriplePattern, + pub bound_subject: bool, + pub bound_predicate: bool, + pub bound_object: bool, +} + #[derive(Debug, Clone)] pub enum FilterExpression<'a> { Comparison(&'a str, &'a str, &'a str),