diff --git a/kolibrie/src/sparql_database.rs b/kolibrie/src/sparql_database.rs index f858883..12e8cbb 100644 --- a/kolibrie/src/sparql_database.rs +++ b/kolibrie/src/sparql_database.rs @@ -727,24 +727,24 @@ impl SparqlDatabase { } } - // New parse_turtle function - pub fn parse_turtle(&mut self, turtle_data: &str) { - let lines = turtle_data.lines(); - for line in lines { - let line = line.trim(); + pub fn parse_turtle(&mut self, turtle_data: &str) { + for raw_line in turtle_data.lines() { + let line = raw_line.trim(); // Skip empty lines and comments if line.is_empty() || line.starts_with("#") { continue; } + // Prefix declarations if line.starts_with("@prefix") || line.starts_with("PREFIX") { let prefix_line = line .trim_start_matches("@prefix") .trim_start_matches("PREFIX") .trim_end_matches('.') .trim(); + let parts: Vec<&str> = prefix_line.split_whitespace().collect(); if parts.len() >= 2 { let prefix = parts[0].trim_end_matches(':').to_string(); @@ -756,83 +756,153 @@ impl SparqlDatabase { continue; } - // Use quoted-triple-aware tokenization - let line_no_dot = line.trim_end_matches('.').trim(); - let tokens = Self::tokenize_turtle_star_line(line_no_dot); - - if tokens.len() >= 3 { - let subject_raw = &tokens[0]; - let predicate_raw = &tokens[1]; - let object_raw = tokens[2..].join(" "); - - // Check for annotation syntax {| ... |} - let (object_part, annotations) = if let Some(ann_start) = object_raw.find("{|") { - let obj = object_raw[..ann_start].trim().to_string(); - // Extract annotation content between {| and |} - if let Some(ann_end) = object_raw.find("|}") { - let ann_content = object_raw[ann_start + 2..ann_end].trim(); - let ann_parts: Vec<&str> = ann_content.splitn(2, char::is_whitespace).collect(); - if ann_parts.len() == 2 { - (obj, vec![(ann_parts[0].to_string(), ann_parts[1].to_string())]) + // Tokenize, but keep ; , . as delimiters only when outside URIs, literals, and quoted triples. + let tokens = Self::tokenize_turtle_star_line(line); + + let mut subject_raw: Option = None; + let mut predicate_raw: Option = None; + let mut object_tokens: Vec = Vec::new(); + + let mut expect_subject = true; + let mut expect_predicate = false; + let mut expect_object = false; + + let flush_object = |this: &mut Self, + subject_raw: &Option, + predicate_raw: &Option, + object_tokens: &mut Vec| { + if let (Some(s_raw), Some(p_raw)) = (subject_raw.as_ref(), predicate_raw.as_ref()) { + if object_tokens.is_empty() { + return; + } + + let object_raw = object_tokens.join(" "); + + // Handle annotation syntax {| ... |} + let (object_part, annotations) = if let Some(ann_start) = object_raw.find("{|") { + let obj = object_raw[..ann_start].trim().to_string(); + + if let Some(ann_end) = object_raw.find("|}") { + let ann_content = object_raw[ann_start + 2..ann_end].trim(); + let ann_parts: Vec<&str> = + ann_content.splitn(2, char::is_whitespace).collect(); + + if ann_parts.len() == 2 { + (obj, vec![(ann_parts[0].to_string(), ann_parts[1].to_string())]) + } else { + (obj, vec![]) + } } else { - (obj, vec![]) + (object_raw, vec![]) } } else { (object_raw, vec![]) + }; + + let subject = this.resolve_query_term(&Self::clean_turtle_term(s_raw), &this.prefixes); + let predicate = this.resolve_query_term(&Self::clean_turtle_term(p_raw), &this.prefixes); + let object = this.resolve_query_term(&Self::clean_turtle_term(&object_part), &this.prefixes); + + // Emit the main triple + if subject.starts_with("<<") || object.starts_with("<<") { + let s_id = this.encode_term_star(&subject); + let p_id = this.encode_term_star(&predicate); + let o_id = this.encode_term_star(&object); + let triple = Triple { + subject: s_id, + predicate: p_id, + object: o_id, + }; + this.add_triple(triple); + } else { + let mut dict = this.dictionary.write().unwrap(); + let triple = Triple { + subject: dict.encode(&subject), + predicate: dict.encode(&predicate), + object: dict.encode(&object), + }; + drop(dict); + this.add_triple(triple); } - } else { - (object_raw, vec![]) - }; - let subject = self.resolve_query_term(&Self::clean_turtle_term(subject_raw), &self.prefixes); - let predicate = self.resolve_query_term(&Self::clean_turtle_term(predicate_raw), &self.prefixes); - let object = self.resolve_query_term(&Self::clean_turtle_term(&object_part), &self.prefixes); - - // Check if subject or object is a quoted triple - if subject.starts_with("<<") || object.starts_with("<<") { - let s_id = self.encode_term_star(&subject); - let p_id = self.encode_term_star(&predicate); - let o_id = self.encode_term_star(&object); - let triple = Triple { subject: s_id, predicate: p_id, object: o_id }; - self.add_triple(triple); - } else { - let mut dict = self.dictionary.write().unwrap(); - let triple = Triple { - subject: dict.encode(&subject), - predicate: dict.encode(&predicate), - object: dict.encode(&object), - }; - drop(dict); - self.add_triple(triple); + // Emit annotation triples, if any + for (ann_pred, ann_obj) in &annotations { + let qt_str = format!("<< {} {} {} >>", subject, predicate, object); + let qt_id = this.encode_term_star(&qt_str); + + let ann_p_id = this.encode_term_star( + &this.resolve_query_term(&Self::clean_turtle_term(ann_pred), &this.prefixes), + ); + let ann_o_id = this.encode_term_star( + &this.resolve_query_term(&Self::clean_turtle_term(ann_obj), &this.prefixes), + ); + + let ann_triple = Triple { + subject: qt_id, + predicate: ann_p_id, + object: ann_o_id, + }; + this.add_triple(ann_triple); + } + + object_tokens.clear(); } + }; - // Handle annotations: emit additional triples with << s p o >> as subject - for (ann_pred, ann_obj) in &annotations { - let qt_str = format!("<< {} {} {} >>", subject, predicate, object); - let qt_id = self.encode_term_star(&qt_str); - let ann_p_id = self.encode_term_star( - &self.resolve_query_term(&Self::clean_turtle_term(ann_pred), &self.prefixes), - ); - let ann_o_id = self.encode_term_star( - &self.resolve_query_term(&Self::clean_turtle_term(ann_obj), &self.prefixes), - ); - let ann_triple = Triple { subject: qt_id, predicate: ann_p_id, object: ann_o_id }; - self.add_triple(ann_triple); + for token in tokens { + match token.as_str() { + "." => { + flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens); + subject_raw = None; + predicate_raw = None; + expect_subject = true; + expect_predicate = false; + expect_object = false; + } + ";" => { + flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens); + predicate_raw = None; + expect_predicate = true; + expect_object = false; + } + "," => { + flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens); + expect_object = true; + } + _ => { + if expect_subject { + subject_raw = Some(token); + expect_subject = false; + expect_predicate = true; + } else if expect_predicate { + predicate_raw = Some(token); + expect_predicate = false; + expect_object = true; + } else if expect_object { + object_tokens.push(token); + } else { + // Fallback for slightly malformed input + object_tokens.push(token); + } + } } - } else { - eprintln!("Skipping invalid line: {}", line); } + + // Flush any trailing object if the line does not end with '.' + flush_object(self, &subject_raw, &predicate_raw, &mut object_tokens); } } - /// Tokenize a Turtle-star line, keeping `<< ... >>` as a single token. + /// Tokenize a Turtle-star line, keeping `<< ... >>` and punctuation structure intact. fn tokenize_turtle_star_line(line: &str) -> Vec { let mut tokens = Vec::new(); let mut current = String::new(); - let mut depth = 0i32; - let mut in_uri = false; - let mut in_literal = false; + + let mut depth = 0i32; // quoted-triple nesting depth + let mut in_uri = false; // inside <...> + let mut in_literal = false; // inside "..." let mut escaped = false; + let mut chars = line.chars().peekable(); while let Some(ch) = chars.next() { @@ -841,19 +911,23 @@ impl SparqlDatabase { escaped = false; continue; } + match ch { '\\' if in_literal => { current.push(ch); escaped = true; } + '"' if !in_uri && depth == 0 => { in_literal = !in_literal; current.push(ch); } + '"' if depth > 0 => { in_literal = !in_literal; current.push(ch); } + '<' if !in_literal => { if chars.peek() == Some(&'<') && !in_uri { current.push(ch); @@ -870,6 +944,7 @@ impl SparqlDatabase { current.push(ch); } } + '>' if !in_literal => { if depth > 0 && !in_uri { current.push(ch); @@ -892,6 +967,16 @@ impl SparqlDatabase { current.push(ch); } } + + ';' | ',' | '.' if depth == 0 && !in_uri && !in_literal => { + let trimmed = current.trim().to_string(); + if !trimmed.is_empty() { + tokens.push(trimmed); + current.clear(); + } + tokens.push(ch.to_string()); + } + ' ' | '\t' | '\n' | '\r' if depth == 0 && !in_uri && !in_literal => { let trimmed = current.trim().to_string(); if !trimmed.is_empty() { @@ -899,15 +984,18 @@ impl SparqlDatabase { current.clear(); } } + _ => { current.push(ch); } } } + let trimmed = current.trim().to_string(); if !trimmed.is_empty() { tokens.push(trimmed); } + tokens }