From aff7c79f0bbc550b0f025a29cb6629438a7b5c0d Mon Sep 17 00:00:00 2001 From: Ruoshi Li Date: Sun, 8 Mar 2026 12:47:19 -0700 Subject: [PATCH 1/4] Improve URL parsing in VortexMultiFileScan to handle file paths more robustly Fixes #6841 where relative file paths caused a "Neither URL nor path" error. Relative paths are now resolved to absolute paths via `std::fs::canonicalize` before being converted to a `file://` URL. Glob patterns containing `*`, `?`, or `[` skip canonicalization because the pattern itself does not correspond to an existing path. The URL parsing logic is extracted into a `parse_glob_url` helper with unit tests covering s3 URLs, `file://` URLs, absolute glob patterns, absolute file paths, relative file paths, and the error case for non-existent paths. Signed-off-by: Ruoshi Li --- vortex-duckdb/src/multi_file.rs | 97 +++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 6 deletions(-) diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index 3ce2ed3856e..010bf72ff03 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -19,6 +19,30 @@ use crate::duckdb::ClientContextRef; use crate::duckdb::LogicalType; use crate::filesystem::resolve_filesystem; +/// Parse a glob string into a [`Url`]. +/// +/// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as +/// bare file paths. For bare paths that contain no glob characters the path is first canonicalized +/// so that relative paths (e.g. `./data/file.vortex`) are resolved to absolute paths before +/// conversion. Paths that already contain glob characters (`*`, `?`, `[`) are used as-is because +/// [`std::fs::canonicalize`] would fail on a non-existent glob pattern; note that such paths must +/// be absolute for the conversion to succeed. +pub(crate) fn parse_glob_url(glob_url_str: &str) -> VortexResult { + Url::parse(glob_url_str).or_else(|_| { + let path = Path::new(glob_url_str); + + let path = if glob_url_str.contains(['*', '?', '[']) { + path.to_path_buf() + } else { + std::fs::canonicalize(path) + .map_err(|_| vortex_err!("Neither URL nor path: '{}'", glob_url_str))? + }; + + Url::from_file_path(path) + .map_err(|_| vortex_err!("Neither URL nor path: '{}'", glob_url_str)) + }) +} + /// Vortex multi-file scan table function (`vortex_scan` / `read_vortex`). /// /// Takes a file glob parameter and resolves it into a [`MultiFileDataSource`]. @@ -38,12 +62,9 @@ impl DataSourceTableFunction for VortexMultiFileScan { .ok_or_else(|| vortex_err!("Missing file glob parameter"))?; // Parse the URL and separate the base URL (keep scheme, host, etc.) from the path. - let glob_url_str = glob_url_parameter.as_string(); - let glob_url = match Url::parse(glob_url_str.as_str()) { - Ok(url) => Ok(url), - Err(_) => Url::from_file_path(Path::new(glob_url_str.as_str())) - .map_err(|_| vortex_err!("Neither URL nor path: '{}' ", glob_url_str.as_str())), - }?; + let glob_url_string = glob_url_parameter.as_string(); + let glob_url_str = glob_url_string.as_str(); + let glob_url = parse_glob_url(glob_url_str)?; let mut base_url = glob_url.clone(); base_url.set_path(""); @@ -59,3 +80,67 @@ impl DataSourceTableFunction for VortexMultiFileScan { }) } } + +#[cfg(test)] +mod tests { + #[allow(clippy::wildcard_imports)] + use super::*; + + #[test] + fn test_parse_glob_url_s3() -> VortexResult<()> { + let url = parse_glob_url("s3://my-bucket/prefix/*.vortex")?; + assert_eq!(url.scheme(), "s3"); + assert_eq!(url.host_str(), Some("my-bucket")); + assert_eq!(url.path(), "/prefix/*.vortex"); + Ok(()) + } + + #[test] + fn test_parse_glob_url_file_scheme() -> VortexResult<()> { + let url = parse_glob_url("file:///absolute/path/data.vortex")?; + assert_eq!(url.scheme(), "file"); + assert_eq!(url.path(), "/absolute/path/data.vortex"); + Ok(()) + } + + #[test] + fn test_parse_glob_url_absolute_glob_path() -> VortexResult<()> { + // Absolute paths with glob characters are used as-is (no canonicalize). + let url = parse_glob_url("/tmp/data/*.vortex")?; + assert_eq!(url.scheme(), "file"); + assert_eq!(url.path(), "/tmp/data/*.vortex"); + Ok(()) + } + + #[test] + fn test_parse_glob_url_absolute_existing_path() -> VortexResult<()> { + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let canonical = std::fs::canonicalize(tmpfile.path()).unwrap(); + let path_str = canonical.to_str().unwrap(); + let url = parse_glob_url(path_str)?; + assert_eq!(url.scheme(), "file"); + assert_eq!(url.path(), path_str); + Ok(()) + } + + #[test] + fn test_parse_glob_url_relative_path() -> VortexResult<()> { + // Create a tempfile in the current working directory so we can refer to it + // by a relative name (just the filename, without any directory component). + let tmpfile = tempfile::NamedTempFile::new_in(".").unwrap(); + let filename = tmpfile.path().file_name().unwrap().to_str().unwrap(); + + let url = parse_glob_url(filename)?; + assert_eq!(url.scheme(), "file"); + // The relative name must have been resolved to an absolute path. + assert!(url.path().ends_with(filename)); + assert!(url.path().starts_with('/')); + Ok(()) + } + + #[test] + fn test_parse_glob_url_nonexistent_path_returns_error() { + let result = parse_glob_url("/nonexistent/path/file.vortex"); + assert!(result.is_err()); + } +} From 654b40306f26f11c9e7e1f09c8f0c24f7f35a3a6 Mon Sep 17 00:00:00 2001 From: Ruoshi Li Date: Sun, 8 Mar 2026 13:14:15 -0700 Subject: [PATCH 2/4] Fix relative glob paths and improve error messages in parse_glob_url - Extract URL parsing into parse_glob_url and path resolution into canonicalize_path_prefix for testability. - Fix relative glob paths (e.g. ./data/*.vortex): canonicalize only the directory prefix before the first glob character, then append the glob suffix, so relative patterns resolve to absolute file URLs. - Replace repeated generic "Neither URL nor path" errors with specific messages that include the underlying IO error (e.g. "No such file or directory"), making diagnosis easier. - Add unit tests covering s3/file URLs, absolute/relative paths, absolute/relative glob patterns, and the not-found error case. Signed-off-by: Ruoshi Li --- vortex-duckdb/src/multi_file.rs | 85 +++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index 010bf72ff03..232e9d467bb 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::path::Path; +use std::path::PathBuf; use std::sync::Arc; use url::Url; @@ -22,27 +22,57 @@ use crate::filesystem::resolve_filesystem; /// Parse a glob string into a [`Url`]. /// /// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as -/// bare file paths. For bare paths that contain no glob characters the path is first canonicalized -/// so that relative paths (e.g. `./data/file.vortex`) are resolved to absolute paths before -/// conversion. Paths that already contain glob characters (`*`, `?`, `[`) are used as-is because -/// [`std::fs::canonicalize`] would fail on a non-existent glob pattern; note that such paths must -/// be absolute for the conversion to succeed. +/// bare file paths. For bare paths the portion of the path before any glob character (`*`, `?`, +/// `[`) is canonicalized so that relative paths such as `./data/*.vortex` are resolved to +/// absolute paths before conversion. pub(crate) fn parse_glob_url(glob_url_str: &str) -> VortexResult { Url::parse(glob_url_str).or_else(|_| { - let path = Path::new(glob_url_str); - - let path = if glob_url_str.contains(['*', '?', '[']) { - path.to_path_buf() - } else { - std::fs::canonicalize(path) - .map_err(|_| vortex_err!("Neither URL nor path: '{}'", glob_url_str))? - }; - - Url::from_file_path(path) - .map_err(|_| vortex_err!("Neither URL nor path: '{}'", glob_url_str)) + let path = canonicalize_path_prefix(glob_url_str)?; + // from_file_path only fails when the path is not absolute, which cannot happen after + // canonicalization, so this error is purely defensive. + Url::from_file_path(&path) + .map_err(|_| vortex_err!("Neither URL nor valid path: '{}'", glob_url_str)) }) } +/// Canonicalize the non-glob prefix of `path_str` and return the resulting absolute path. +/// +/// For paths without any glob characters the whole path is canonicalized. For paths that contain +/// glob characters (`*`, `?`, `[`) the directory portion that precedes the first glob character +/// is canonicalized and the remaining glob suffix is appended unchanged. This allows relative glob +/// patterns such as `./data/*.vortex` to be resolved correctly. +fn canonicalize_path_prefix(path_str: &str) -> VortexResult { + let first_glob_char_pos = path_str.find(['*', '?', '[']); + + let Some(first_glob_char_pos) = first_glob_char_pos else { + // No glob characters — canonicalize the whole path. + return std::fs::canonicalize(path_str) + .map_err(|e| vortex_err!("Cannot resolve path '{}': {}", path_str, e)); + }; + + // Find the last path separator before the first glob character to split the string into a + // concrete directory prefix and a glob suffix. + let last_separator_before_glob = + path_str[..first_glob_char_pos].rfind(std::path::MAIN_SEPARATOR); + + let (dir_prefix, glob_suffix) = match last_separator_before_glob { + Some(sep_pos) => (&path_str[..sep_pos], &path_str[sep_pos + 1..]), + // No separator before the glob (e.g. `*.vortex`); canonicalize the current directory. + None => (".", path_str), + }; + + let canonical_dir = std::fs::canonicalize(dir_prefix).map_err(|e| { + vortex_err!( + "Cannot resolve directory '{}' in glob pattern '{}': {}", + dir_prefix, + path_str, + e + ) + })?; + + Ok(canonical_dir.join(glob_suffix)) +} + /// Vortex multi-file scan table function (`vortex_scan` / `read_vortex`). /// /// Takes a file glob parameter and resolves it into a [`MultiFileDataSource`]. @@ -105,10 +135,11 @@ mod tests { #[test] fn test_parse_glob_url_absolute_glob_path() -> VortexResult<()> { - // Absolute paths with glob characters are used as-is (no canonicalize). - let url = parse_glob_url("/tmp/data/*.vortex")?; + let tmpdir = tempfile::tempdir().unwrap(); + let glob = format!("{}/*.vortex", tmpdir.path().display()); + let url = parse_glob_url(&glob)?; assert_eq!(url.scheme(), "file"); - assert_eq!(url.path(), "/tmp/data/*.vortex"); + assert!(url.path().ends_with("/*.vortex")); Ok(()) } @@ -138,6 +169,20 @@ mod tests { Ok(()) } + #[test] + fn test_parse_glob_url_relative_glob_path() -> VortexResult<()> { + // A relative path with a glob character (e.g. `./data/*.vortex`) must also resolve + // correctly; this was broken before the canonicalize-prefix fix. + let tmpdir = tempfile::tempdir_in(".").unwrap(); + let dir_name = tmpdir.path().file_name().unwrap().to_str().unwrap(); + let glob = format!("./{dir_name}/*.vortex"); + let url = parse_glob_url(&glob)?; + assert_eq!(url.scheme(), "file"); + assert!(url.path().starts_with('/')); + assert!(url.path().ends_with("/*.vortex")); + Ok(()) + } + #[test] fn test_parse_glob_url_nonexistent_path_returns_error() { let result = parse_glob_url("/nonexistent/path/file.vortex"); From 854453b4b3f76577e47de3abffe09be7fb12d409 Mon Sep 17 00:00:00 2001 From: Ruoshi Li Date: Sun, 8 Mar 2026 13:20:29 -0700 Subject: [PATCH 3/4] Make parse_glob_url module-private Signed-off-by: Ruoshi Li --- vortex-duckdb/src/multi_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index 232e9d467bb..486a3aff4ba 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -25,7 +25,7 @@ use crate::filesystem::resolve_filesystem; /// bare file paths. For bare paths the portion of the path before any glob character (`*`, `?`, /// `[`) is canonicalized so that relative paths such as `./data/*.vortex` are resolved to /// absolute paths before conversion. -pub(crate) fn parse_glob_url(glob_url_str: &str) -> VortexResult { +fn parse_glob_url(glob_url_str: &str) -> VortexResult { Url::parse(glob_url_str).or_else(|_| { let path = canonicalize_path_prefix(glob_url_str)?; // from_file_path only fails when the path is not absolute, which cannot happen after From ab65460d731c0a1832d6c151749f4006cb7db3fd Mon Sep 17 00:00:00 2001 From: Ruoshi Li Date: Tue, 10 Mar 2026 12:55:38 -0700 Subject: [PATCH 4/4] feat: enhance glob URL parsing and add normalization for relative paths Signed-off-by: Ruoshi Li --- vortex-duckdb/src/multi_file.rs | 69 +++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 3 deletions(-) diff --git a/vortex-duckdb/src/multi_file.rs b/vortex-duckdb/src/multi_file.rs index 2b25a5ded15..cba9a316ff0 100644 --- a/vortex-duckdb/src/multi_file.rs +++ b/vortex-duckdb/src/multi_file.rs @@ -24,16 +24,34 @@ use crate::filesystem::resolve_filesystem; /// /// Accepts full URLs (e.g. `s3://bucket/prefix/*.vortex`, `file:///data/*.vortex`) as well as /// bare file paths. For bare paths, the path is made absolute (without requiring it to exist) -/// so that relative paths such as `./data/*.vortex` are resolved correctly. +/// so that relative paths such as `./data/*.vortex` or `../data/*.vortex` are resolved correctly. fn parse_glob_url(glob_url_str: &str) -> VortexResult { Url::parse(glob_url_str).or_else(|_| { let path = absolute(Path::new(glob_url_str)) .map_err(|e| vortex_err!("Failed making {glob_url_str} absolute: {e}"))?; - Url::from_file_path(path) - .map_err(|_| vortex_err!("Neither URL nor path: {glob_url_str}")) + // `absolute()` does not normalize `..` components, so `/a/b/../c` stays as-is. + // Normalizing manually avoids `..` being percent-encoded in the resulting URL. + let path = normalize_path(path); + Url::from_file_path(path).map_err(|_| vortex_err!("Neither URL nor path: {glob_url_str}")) }) } +/// Normalize a path by resolving `.` and `..` components without accessing the filesystem. +fn normalize_path(path: std::path::PathBuf) -> std::path::PathBuf { + use std::path::Component; + let mut normalized = std::path::PathBuf::new(); + for component in path.components() { + match component { + Component::CurDir => {} + Component::ParentDir => { + normalized.pop(); + } + c => normalized.push(c), + } + } + normalized +} + /// Vortex multi-file scan table function (`vortex_scan` / `read_vortex`). /// /// Takes a file glob parameter and resolves it into a [`MultiFileDataSource`]. @@ -75,6 +93,8 @@ impl DataSourceTableFunction for VortexMultiFileScan { #[cfg(test)] mod tests { #[allow(clippy::wildcard_imports)] + use rstest::rstest; + use super::*; #[test] @@ -152,4 +172,47 @@ mod tests { assert_eq!(url.path(), "/nonexistent/path/file.vortex"); Ok(()) } + + #[test] + fn test_parse_glob_url_parent_relative_path() -> VortexResult<()> { + // A path starting with `..` must be resolved to an absolute path without + // percent-encoding the `..` component in the resulting URL. + let tmpfile = tempfile::NamedTempFile::new_in("..").unwrap(); + let filename = tmpfile.path().file_name().unwrap().to_str().unwrap(); + let relative = format!("../{filename}"); + + let url = parse_glob_url(&relative)?; + assert_eq!(url.scheme(), "file"); + // The resolved path must be absolute and must not contain encoded dots. + assert!(url.path().starts_with('/')); + assert!( + !url.path().contains("%2E"), + "path must not contain percent-encoded dots" + ); + assert!(url.path().ends_with(filename)); + Ok(()) + } + + // Use absolute paths so the expected result is cwd-independent. + #[rstest] + #[case("/a/./b", "/a/b")] + #[case("/a/b/./c", "/a/b/c")] + #[case("/a/../b", "/b")] + #[case("/a/b/../c", "/a/c")] + #[case("/a/b/../../c", "/c")] + #[case("/a/./b/.././c", "/a/c")] + #[case("/a/b/../..", "/")] + fn test_parse_glob_url_dot_normalization( + #[case] input: &str, + #[case] expected_path: &str, + ) -> VortexResult<()> { + let url = parse_glob_url(input)?; + assert_eq!(url.scheme(), "file"); + assert_eq!( + url.path(), + expected_path, + "input {input:?} should normalize to {expected_path:?}" + ); + Ok(()) + } }