Skip to content
Open
170 changes: 162 additions & 8 deletions vortex-duckdb/src/multi_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,38 @@ use crate::duckdb::ClientContextRef;
use crate::duckdb::LogicalType;
use crate::filesystem::resolve_filesystem;

/// Parse a glob string into a [`Url`].
///
/// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as
/// bare file paths. For bare paths, the path is made absolute (without requiring it to exist)
/// so that relative paths such as `./data/*.vortex` or `../data/*.vortex` are resolved correctly.
fn parse_glob_url(glob_url_str: &str) -> VortexResult<Url> {
Url::parse(glob_url_str).or_else(|_| {
let path = absolute(Path::new(glob_url_str))
.map_err(|e| vortex_err!("Failed making {glob_url_str} absolute: {e}"))?;
// `absolute()` does not normalize `..` components, so `/a/b/../c` stays as-is.
// Normalizing manually avoids `..` being percent-encoded in the resulting URL.
let path = normalize_path(path);
Url::from_file_path(path).map_err(|_| vortex_err!("Neither URL nor path: {glob_url_str}"))
})
}

/// Normalize a path by resolving `.` and `..` components without accessing the filesystem.
fn normalize_path(path: std::path::PathBuf) -> std::path::PathBuf {
use std::path::Component;
let mut normalized = std::path::PathBuf::new();
for component in path.components() {
match component {
Component::CurDir => {}
Component::ParentDir => {
normalized.pop();
}
c => normalized.push(c),
}
}
normalized
}

/// Vortex multi-file scan table function (`vortex_scan` / `read_vortex`).
///
/// Takes a file glob parameter and resolves it into a [`MultiFileDataSource`].
Expand All @@ -39,14 +71,9 @@ impl DataSourceTableFunction for VortexMultiFileScan {
.ok_or_else(|| vortex_err!("Missing file glob parameter"))?;

// Parse the URL and separate the base URL (keep scheme, host, etc.) from the path.
let glob_url_str = glob_url_parameter.as_string();

let glob_url = Url::parse(&glob_url_str).or_else(|_| {
let glob_url = glob_url_str.as_str();
let path = absolute(Path::new(glob_url));
let path = path.map_err(|e| vortex_err!("Failed making {glob_url} absolute: {e}"))?;
Url::from_file_path(path).map_err(|_| vortex_err!("Neither URL nor path: {glob_url}"))
})?;
let glob_url_string = glob_url_parameter.as_string();
let glob_url_str = glob_url_string.as_str();
let glob_url = parse_glob_url(glob_url_str)?;

let mut base_url = glob_url.clone();
base_url.set_path("");
Expand All @@ -62,3 +89,130 @@ impl DataSourceTableFunction for VortexMultiFileScan {
})
}
}

#[cfg(test)]
mod tests {
#[allow(clippy::wildcard_imports)]
use rstest::rstest;

use super::*;

#[test]
fn test_parse_glob_url_s3() -> VortexResult<()> {
let url = parse_glob_url("s3://my-bucket/prefix/*.vortex")?;
assert_eq!(url.scheme(), "s3");
assert_eq!(url.host_str(), Some("my-bucket"));
assert_eq!(url.path(), "/prefix/*.vortex");
Ok(())
}

#[test]
fn test_parse_glob_url_file_scheme() -> VortexResult<()> {
let url = parse_glob_url("file:///absolute/path/data.vortex")?;
assert_eq!(url.scheme(), "file");
assert_eq!(url.path(), "/absolute/path/data.vortex");
Ok(())
}

#[test]
fn test_parse_glob_url_absolute_glob_path() -> VortexResult<()> {
let tmpdir = tempfile::tempdir().unwrap();
let glob = format!("{}/*.vortex", tmpdir.path().display());
let url = parse_glob_url(&glob)?;
assert_eq!(url.scheme(), "file");
assert!(url.path().ends_with("/*.vortex"));
Ok(())
}

#[test]
fn test_parse_glob_url_absolute_existing_path() -> VortexResult<()> {
let tmpfile = tempfile::NamedTempFile::new().unwrap();
let canonical = std::fs::canonicalize(tmpfile.path()).unwrap();
let path_str = canonical.to_str().unwrap();
let url = parse_glob_url(path_str)?;
assert_eq!(url.scheme(), "file");
assert_eq!(url.path(), path_str);
Ok(())
}

#[test]
fn test_parse_glob_url_relative_path() -> VortexResult<()> {
// Create a tempfile in the current working directory so we can refer to it
// by a relative name (just the filename, without any directory component).
let tmpfile = tempfile::NamedTempFile::new_in(".").unwrap();
let filename = tmpfile.path().file_name().unwrap().to_str().unwrap();

let url = parse_glob_url(filename)?;
assert_eq!(url.scheme(), "file");
// The relative name must have been resolved to an absolute path.
assert!(url.path().ends_with(filename));
assert!(url.path().starts_with('/'));
Ok(())
}

#[test]
fn test_parse_glob_url_relative_glob_path() -> VortexResult<()> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I believe the PR is nearly ready.

Can you also add a test for relative paths like "../test.vortex", please?
I've tested and it fails, likely a flaw in my implementation.

$ duckdb -c 'SELECT * FROM "../vortex/vortex-bench/data/tpch/1.0/vortex-file-compressed/customer_0.vortex"'
Binder Error:
Object store error: Object at location /home/myrrc/duckdb-vortex/%2E%2E/vortex/vortex-bench/data/tpch/1.0/vortex-file-compressed/customer_0.vortex not found: No such file or directory (os error 2)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Root cause

std::path::absolute() makes a path absolute but does not normalize .. and/or . components. So ../test.vortex from /home/user/vortex/ becomes /home/user/vortex/../test.vortex, and Url::from_file_path then percent-encodes the dots as %2E%2E, causing DuckDB to look up a wrong/encoded path.

Fix

Added a normalize_path helper function that walks the path components and resolves . and .. without touching the filesystem. This is called after absolute() before creating the Url.

Tests

  • added: test_parse_glob_url_parent_relative_path — creates a temp file in .., constructs a ../filename relative path, and asserts the resulting URL is absolute, has no %2E encoding, and ends with the correct filename.
  • added test_parse_glob_url_dot_normalization, just test the normalize_path algorithm without creating any files.

// A relative path with a glob character (e.g. `./data/*.vortex`) must also resolve
// correctly.
let tmpdir = tempfile::tempdir_in(".").unwrap();
let dir_name = tmpdir.path().file_name().unwrap().to_str().unwrap();
let glob = format!("./{dir_name}/*.vortex");
let url = parse_glob_url(&glob)?;
assert_eq!(url.scheme(), "file");
assert!(url.path().starts_with('/'));
assert!(url.path().ends_with("/*.vortex"));
Ok(())
}

#[test]
fn test_parse_glob_url_nonexistent_path() -> VortexResult<()> {
// absolute() does not require the path to exist, so a non-existent path succeeds.
let url = parse_glob_url("/nonexistent/path/file.vortex")?;
assert_eq!(url.scheme(), "file");
assert_eq!(url.path(), "/nonexistent/path/file.vortex");
Ok(())
}

#[test]
fn test_parse_glob_url_parent_relative_path() -> VortexResult<()> {
// A path starting with `..` must be resolved to an absolute path without
// percent-encoding the `..` component in the resulting URL.
let tmpfile = tempfile::NamedTempFile::new_in("..").unwrap();
let filename = tmpfile.path().file_name().unwrap().to_str().unwrap();
let relative = format!("../{filename}");

let url = parse_glob_url(&relative)?;
assert_eq!(url.scheme(), "file");
// The resolved path must be absolute and must not contain encoded dots.
assert!(url.path().starts_with('/'));
assert!(
!url.path().contains("%2E"),
"path must not contain percent-encoded dots"
);
assert!(url.path().ends_with(filename));
Ok(())
}

// Use absolute paths so the expected result is cwd-independent.
#[rstest]
#[case("/a/./b", "/a/b")]
#[case("/a/b/./c", "/a/b/c")]
#[case("/a/../b", "/b")]
#[case("/a/b/../c", "/a/c")]
#[case("/a/b/../../c", "/c")]
#[case("/a/./b/.././c", "/a/c")]
#[case("/a/b/../..", "/")]
fn test_parse_glob_url_dot_normalization(
#[case] input: &str,
#[case] expected_path: &str,
) -> VortexResult<()> {
let url = parse_glob_url(input)?;
assert_eq!(url.scheme(), "file");
assert_eq!(
url.path(),
expected_path,
"input {input:?} should normalize to {expected_path:?}"
);
Ok(())
}
}