martin-g commented on code in PR #20305:
URL: https://github.com/apache/datafusion/pull/20305#discussion_r2818084513
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -199,6 +267,99 @@ where
Ok(Arc::new(result) as ArrayRef)
}
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character). Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+ if !from.is_ascii() || !to.is_ascii() {
+ return None;
+ }
+ let mut table = [0u8; 128];
+ for i in 0..128u8 {
+ table[i as usize] = i;
+ }
+ let to_bytes = to.as_bytes();
+ let mut seen = [false; 128];
+ for (i, from_byte) in from.bytes().enumerate() {
+ let idx = from_byte as usize;
+ if !seen[idx] {
+ seen[idx] = true;
+ if i < to_bytes.len() {
+ table[idx] = to_bytes[i];
+ } else {
+ table[idx] = ASCII_DELETE;
+ }
+ }
+ }
+ Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a
pre-built
+/// translation map instead of rebuilding it for every row. When an ASCII byte
+/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
Review Comment:
```suggestion
/// lookup table is provided, ASCII input rows use the lookup table;
non-ASCII
```
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -199,6 +267,99 @@ where
Ok(Arc::new(result) as ArrayRef)
}
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character). Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+ if !from.is_ascii() || !to.is_ascii() {
+ return None;
+ }
+ let mut table = [0u8; 128];
+ for i in 0..128u8 {
+ table[i as usize] = i;
+ }
+ let to_bytes = to.as_bytes();
+ let mut seen = [false; 128];
+ for (i, from_byte) in from.bytes().enumerate() {
+ let idx = from_byte as usize;
+ if !seen[idx] {
+ seen[idx] = true;
+ if i < to_bytes.len() {
+ table[idx] = to_bytes[i];
+ } else {
+ table[idx] = ASCII_DELETE;
+ }
+ }
+ }
+ Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a
pre-built
+/// translation map instead of rebuilding it for every row. When an ASCII byte
+/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
+/// inputs fallback to using the map.
+fn translate_with_map<'a, T: OffsetSizeTrait, V>(
+ string_array: V,
+ from_map: &HashMap<&str, usize>,
+ to_graphemes: &[&str],
+ ascii_table: Option<&[u8; 128]>,
+) -> Result<ArrayRef>
+where
+ V: ArrayAccessor<Item = &'a str>,
+{
+ let mut string_graphemes: Vec<&str> = Vec::new();
+ let mut result_graphemes: Vec<&str> = Vec::new();
+ let mut ascii_buf: Vec<u8> = Vec::new();
+
+ let result = ArrayIter::new(string_array)
+ .map(|string| {
+ string.map(|s| {
+ // Fast path: byte-level table lookup for ASCII strings
+ if let Some(table) = ascii_table
+ && s.is_ascii()
+ {
+ ascii_buf.clear();
+ for &b in s.as_bytes() {
+ let mapped = table[b as usize];
+ if mapped != ASCII_DELETE {
+ ascii_buf.push(mapped);
+ }
+ }
+ // ascii_buf contains only ASCII bytes, so it is valid
+ // UTF-8.
+ return String::from_utf8(ascii_buf.clone()).unwrap();
Review Comment:
How about:
```suggestion
// SAFETY: all bytes are ASCII, hence valid UTF-8.
return unsafe {
std::str::from_utf8_unchecked(&ascii_buf).to_owned() };
```
to avoid the cloning of `ascii_buf`
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -199,6 +267,99 @@ where
Ok(Arc::new(result) as ArrayRef)
}
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character). Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+ if !from.is_ascii() || !to.is_ascii() {
+ return None;
+ }
+ let mut table = [0u8; 128];
+ for i in 0..128u8 {
+ table[i as usize] = i;
+ }
+ let to_bytes = to.as_bytes();
+ let mut seen = [false; 128];
+ for (i, from_byte) in from.bytes().enumerate() {
+ let idx = from_byte as usize;
+ if !seen[idx] {
+ seen[idx] = true;
+ if i < to_bytes.len() {
+ table[idx] = to_bytes[i];
+ } else {
+ table[idx] = ASCII_DELETE;
+ }
+ }
+ }
+ Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a
pre-built
+/// translation map instead of rebuilding it for every row. When an ASCII byte
+/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
+/// inputs fallback to using the map.
+fn translate_with_map<'a, T: OffsetSizeTrait, V>(
+ string_array: V,
+ from_map: &HashMap<&str, usize>,
+ to_graphemes: &[&str],
+ ascii_table: Option<&[u8; 128]>,
+) -> Result<ArrayRef>
+where
+ V: ArrayAccessor<Item = &'a str>,
+{
+ let mut string_graphemes: Vec<&str> = Vec::new();
+ let mut result_graphemes: Vec<&str> = Vec::new();
+ let mut ascii_buf: Vec<u8> = Vec::new();
+
+ let result = ArrayIter::new(string_array)
+ .map(|string| {
+ string.map(|s| {
+ // Fast path: byte-level table lookup for ASCII strings
+ if let Some(table) = ascii_table
+ && s.is_ascii()
+ {
+ ascii_buf.clear();
+ for &b in s.as_bytes() {
+ let mapped = table[b as usize];
+ if mapped != ASCII_DELETE {
+ ascii_buf.push(mapped);
+ }
+ }
+ // ascii_buf contains only ASCII bytes, so it is valid
+ // UTF-8.
+ return String::from_utf8(ascii_buf.clone()).unwrap();
+ }
+
+ // Slow path: grapheme-based translation
+ string_graphemes.clear();
+ result_graphemes.clear();
+
+ string_graphemes.extend(s.graphemes(true));
+ for c in &string_graphemes {
Review Comment:
Is `string_graphemes` really needed ? You can just use `s.graphemes(true)`
here, no ?
No `suggestion` because you will need to remove the Vec above anyway :-)
##########
datafusion/functions/src/unicode/translate.rs:
##########
Review Comment:
Hm. The `from` and `to` are `Utf8`, not `LargeUtf8`. Should they use
`.as_string<i32>()` instead ?!
The new SLT test will tell us whether I'm right.
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -71,6 +71,7 @@ impl TranslateFunc {
vec![
Exact(vec![Utf8View, Utf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8]),
+ Exact(vec![LargeUtf8, Utf8, Utf8]),
Review Comment:
You may add an SLT test for using translate() with LargeUtf8 input to
prevent from regressions.
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -99,6 +100,65 @@ impl ScalarUDFImpl for TranslateFunc {
&self,
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
+ // When from and to are scalars, pre-build the translation map once
+ if let (Some(from_str), Some(to_str)) = (
Review Comment:
What if `to` is a scalar Null ? `try_as_scalar_str()` will return None and
it will go to the "process array" (line 162).
Could it be optimized too ? To treat all characters from `from` as "to
remove".
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]