Skip to content

Commit 54f6f39

Browse files
committed
fix to truncate Unicode correctly and add related test
1 parent bd1c68a commit 54f6f39

2 files changed

Lines changed: 47 additions & 8 deletions

File tree

crates/iceberg/Cargo.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,19 @@ license = "Apache-2.0"
2727
keywords = ["iceberg"]
2828

2929
[dependencies]
30-
arrow-array = { version = ">=46" }
31-
arrow-schema = { version = ">=46" }
32-
arrow-arith = { version = ">=46" }
3330
anyhow = "1.0.72"
3431
apache-avro = "0.15"
32+
arrow-arith = { version = ">=46" }
33+
arrow-array = { version = ">=46" }
34+
arrow-schema = { version = ">=46" }
3535
bimap = "0.6"
3636
bitvec = "1.0.1"
3737
chrono = "0.4"
3838
derive_builder = "0.12.0"
3939
either = "1"
4040
itertools = "0.11"
4141
lazy_static = "1"
42+
murmur3 = "0.5.2"
4243
once_cell = "1"
4344
ordered-float = "3.7.0"
4445
rust_decimal = "1.31.0"
@@ -48,8 +49,6 @@ serde_derive = "^1.0"
4849
serde_json = "^1.0"
4950
serde_repr = "0.1.16"
5051
uuid = "1.4.1"
51-
murmur3 = "0.5.2"
52-
5352

5453
[dev-dependencies]
5554
pretty_assertions = "1.4.0"

crates/iceberg/src/transform/truncate.rs

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,13 @@ impl Truncate {
3232
pub fn new(width: u32) -> Self {
3333
Self { width }
3434
}
35+
36+
/// Returns the longest prefix of `s` that holds at most `max_chars` characters.
///
/// The limit is counted in `char`s (Unicode scalar values), not bytes, so the
/// returned slice always ends on a UTF-8 character boundary. Plain byte slicing
/// such as `&s[..n]` panics when `n` falls inside a multi-byte character.
///
/// If `s` contains `max_chars` characters or fewer, `s` is returned unchanged.
/// The result borrows from `s`; nothing is allocated.
fn truncate_str_by_char(s: &str, max_chars: usize) -> &str {
    // `char_indices` yields the byte offset at which each character starts.
    // The offset of the character at position `max_chars` (the first one to be
    // dropped) is exactly where the kept prefix ends. `None` means the string
    // is already short enough.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(end, _)| &s[..end])
}
3542
}
3643

3744
impl TransformFunction for Truncate {
@@ -79,7 +86,7 @@ impl TransformFunction for Truncate {
7986
.downcast_ref::<arrow_array::StringArray>()
8087
.unwrap()
8188
.iter()
82-
.map(|v| v.map(|v| &v[..len])),
89+
.map(|v| v.map(|v| Self::truncate_str_by_char(v, len))),
8390
);
8491
Ok(Arc::new(res))
8592
}
@@ -91,7 +98,7 @@ impl TransformFunction for Truncate {
9198
.downcast_ref::<arrow_array::LargeStringArray>()
9299
.unwrap()
93100
.iter()
94-
.map(|v| v.map(|v| &v[..len])),
101+
.map(|v| v.map(|v| Self::truncate_str_by_char(v, len))),
95102
);
96103
Ok(Arc::new(res))
97104
}
@@ -112,7 +119,7 @@ mod test {
112119

113120
// Test case ref from: https://iceberg.apache.org/spec/#truncate-transform-details
114121
#[test]
115-
fn test_truncate() {
122+
fn test_truncate_simple() {
116123
// test truncate int
117124
let input = Arc::new(Int32Array::from(vec![1, -1]));
118125
let res = super::Truncate::new(10).transform(input).unwrap();
@@ -174,4 +181,37 @@ mod test {
174181
"ice"
175182
);
176183
}
184+
185+
// Verifies that string truncation counts characters rather than bytes, so that
// multi-byte UTF-8 input is never cut in the middle of a character.
#[test]
fn test_string_truncate() {
    let katakana = "イロハニホヘト";
    let mixed = "щщаεはчωいにπάほхεろへσκζ";
    let bmp_max = "\u{FFFF}\u{FFFF}";
    let supplementary = "\u{10000}\u{10000}";

    // Each case is (input, max_chars, expected prefix).
    let cases = [
        // 3-byte characters.
        (katakana, 2, "イロ"),
        (katakana, 3, "イロハ"),
        // Mixed 2-byte and 3-byte characters.
        (mixed, 7, "щщаεはчω"),
        // Limit equal to the character count: input comes back untouched.
        (bmp_max, 2, bmp_max),
        // 4-byte characters outside the Basic Multilingual Plane.
        (supplementary, 1, "\u{10000}"),
        // Edge cases: nothing to truncate, and nothing to keep.
        ("", 3, ""),
        (katakana, 0, ""),
    ];

    for (input, max_chars, expected) in cases {
        assert_eq!(
            super::Truncate::truncate_str_by_char(input, max_chars),
            expected,
            "input {:?}, max_chars {}",
            input,
            max_chars
        );
    }
}
177217
}

0 commit comments

Comments
 (0)