Re: [PR] Implementation for regex_instr [datafusion]

via GitHub Wed, 18 Jun 2025 16:16:55 -0700


blaginin commented on code in PR #15928:
URL: https://github.com/apache/datafusion/pull/15928#discussion_r2155673833



##########
datafusion/functions/src/regex/regexpcount.rs:
##########
@@ -29,10 +30,10 @@ use datafusion_expr::{
 use datafusion_macros::user_doc;
 use itertools::izip;
 use regex::Regex;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Arc;
 
+// Ensure the `compile_and_cache_regex` function is defined in the `regex` 
module or imported correctly.

Review Comment:
   I don't think this comment is needed - the code simply won't compile otherwise.



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified 
occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first 
position is 1) to search for the regular expression. Can be a constant, column, 
or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to 
find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the 
behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) 
to return the position for. Defaults to 0, which returns the position of the 
entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.to_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_instr_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. 
It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    match values.data_type() {
+        Utf8 | LargeUtf8 | Utf8View => (),
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function regexp_instr"
+            );
+        }
+    }
+
+    regexp_instr(
+        values,
+        &args[1],
+        if args_len > 2 { Some(&args[2]) } else { None },
+        if args_len > 3 { Some(&args[3]) } else { None },
+        if args_len > 4 { Some(&args[4]) } else { None },
+        if args_len > 5 { Some(&args[5]) } else { None },
+    )
+    .map_err(|e| e.into())
+}
+
+/// `arrow-rs` style implementation of `regexp_instr` function.
+/// This function `regexp_instr` is responsible for returning the index of a 
regular expression pattern
+/// within a string array. It supports optional start positions and flags for 
case insensitivity.
+///
+/// The function accepts a variable number of arguments:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The array of regular expression patterns to search for.
+/// - `start_array` (optional): The array of start positions for the search.
+/// - `nth_array` (optional): The array of start nth for the search.
+/// - `endoption_array` (optional): The array of endoption positions for the 
search.
+/// - `flags_array` (optional): The array of flags to modify the search 
behavior (e.g., case insensitivity).
+/// - `subexpr_array` (optional): The array of subexpr positions for the 
search.
+///
+/// The function handles different combinations of scalar and array inputs for 
the regex patterns, start positions,
+/// and flags. It uses a cache to store compiled regular expressions for 
efficiency.
+///
+/// # Errors
+/// Returns an error if the input arrays have mismatched lengths or if the 
regular expression fails to compile.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = start_array.map_or((None, true), 
|start| {
+        let (start, is_start_scalar) = start.get();
+        (Some(start), is_start_scalar)
+    });
+    let (nth_array, is_nth_scalar) = nth_array.map_or((None, true), |nth| {
+        let (nth, is_nth_scalar) = nth.get();
+        (Some(nth), is_nth_scalar)
+    });
+    let (flags_array, is_flags_scalar) = flags_array.map_or((None, true), 
|flags| {
+        let (flags, is_flags_scalar) = flags.get();
+        (Some(flags), is_flags_scalar)
+    });
+    let (subexpr_array, is_subexpr_scalar) =
+        subexpr_array.map_or((None, true), |subexpr| {
+            let (subexpr, is_subexpr_scalar) = subexpr.get();
+            (Some(subexpr), is_subexpr_scalar)
+        });
+
+    match (values.data_type(), regex_array.data_type(), flags_array) {
+        (Utf8, Utf8, None) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => 
regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() 
== LargeUtf8 => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == 
Utf8View => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, 
LargeUtf8, or Utf8View and the data types of the values, regex_array, and 
flags_array to match".to_string(),
+        )),
+    }
+}
+
+enum ScalarOrArray<T> {
+    Scalar(T),
+    Array(Vec<T>),
+}
+
+impl<T: Clone> ScalarOrArray<T> {
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> {
+        match self {
+            ScalarOrArray::Scalar(val) => 
Box::new(std::iter::repeat_n(val.clone(), len)),
+            ScalarOrArray::Array(arr) => Box::new(arr.iter().cloned()),
+        }
+    }

Review Comment:
   I don't think this construction is needed - it makes the code much 
harder to read, and no other regex function uses it. I think you may be 
reinventing `Datum`, which is exactly "A possibly Scalar".
   
   https://docs.rs/arrow/latest/arrow/array/trait.Datum.html
   
   Also see `ColumnarValue`



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified 
occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first 
position is 1) to search for the regular expression. Can be a constant, column, 
or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to 
find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the 
behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) 
to return the position for. Defaults to 0, which returns the position of the 
entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.to_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_instr_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. 
It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    match values.data_type() {
+        Utf8 | LargeUtf8 | Utf8View => (),
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function regexp_instr"
+            );
+        }
+    }
+
+    regexp_instr(
+        values,
+        &args[1],
+        if args_len > 2 { Some(&args[2]) } else { None },
+        if args_len > 3 { Some(&args[3]) } else { None },
+        if args_len > 4 { Some(&args[4]) } else { None },
+        if args_len > 5 { Some(&args[5]) } else { None },
+    )
+    .map_err(|e| e.into())
+}
+
+/// `arrow-rs` style implementation of `regexp_instr` function.
+/// This function `regexp_instr` is responsible for returning the index of a 
regular expression pattern
+/// within a string array. It supports optional start positions and flags for 
case insensitivity.
+///
+/// The function accepts a variable number of arguments:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The array of regular expression patterns to search for.
+/// - `start_array` (optional): The array of start positions for the search.
+/// - `nth_array` (optional): The array of start nth for the search.
+/// - `endoption_array` (optional): The array of endoption positions for the 
search.
+/// - `flags_array` (optional): The array of flags to modify the search 
behavior (e.g., case insensitivity).
+/// - `subexpr_array` (optional): The array of subexpr positions for the 
search.
+///
+/// The function handles different combinations of scalar and array inputs for 
the regex patterns, start positions,
+/// and flags. It uses a cache to store compiled regular expressions for 
efficiency.
+///
+/// # Errors
+/// Returns an error if the input arrays have mismatched lengths or if the 
regular expression fails to compile.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = start_array.map_or((None, true), 
|start| {
+        let (start, is_start_scalar) = start.get();
+        (Some(start), is_start_scalar)
+    });
+    let (nth_array, is_nth_scalar) = nth_array.map_or((None, true), |nth| {
+        let (nth, is_nth_scalar) = nth.get();
+        (Some(nth), is_nth_scalar)
+    });
+    let (flags_array, is_flags_scalar) = flags_array.map_or((None, true), 
|flags| {
+        let (flags, is_flags_scalar) = flags.get();
+        (Some(flags), is_flags_scalar)
+    });
+    let (subexpr_array, is_subexpr_scalar) =
+        subexpr_array.map_or((None, true), |subexpr| {
+            let (subexpr, is_subexpr_scalar) = subexpr.get();
+            (Some(subexpr), is_subexpr_scalar)
+        });
+
+    match (values.data_type(), regex_array.data_type(), flags_array) {
+        (Utf8, Utf8, None) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => 
regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() 
== LargeUtf8 => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == 
Utf8View => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, 
LargeUtf8, or Utf8View and the data types of the values, regex_array, and 
flags_array to match".to_string(),
+        )),
+    }
+}
+
+enum ScalarOrArray<T> {
+    Scalar(T),
+    Array(Vec<T>),
+}
+
+impl<T: Clone> ScalarOrArray<T> {
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> {
+        match self {
+            ScalarOrArray::Scalar(val) => 
Box::new(std::iter::repeat_n(val.clone(), len)),
+            ScalarOrArray::Array(arr) => Box::new(arr.iter().cloned()),
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+pub fn regexp_instr_inner<'a, S>(
+    values: S,
+    regex_array: S,
+    is_regex_scalar: bool,
+    start_array: Option<&Int64Array>,
+    is_start_scalar: bool,
+    nth_array: Option<&Int64Array>,
+    is_nth_scalar: bool,
+    flags_array: Option<S>,
+    is_flags_scalar: bool,
+    subexp_array: Option<&Int64Array>,
+    is_subexp_scalar: bool,
+) -> Result<ArrayRef, ArrowError>
+where
+    S: StringArrayType<'a>,
+{
+    let len = values.len();
+
+    let regex_input = if is_regex_scalar || regex_array.len() == 1 {
+        ScalarOrArray::Scalar(Some(regex_array.value(0)))
+    } else {
+        let regex_vec: Vec<Option<&str>> = regex_array.iter().collect();
+        ScalarOrArray::Array(regex_vec)
+    };
+
+    let start_input = if let Some(start) = start_array {
+        if is_start_scalar || start.len() == 1 {
+            ScalarOrArray::Scalar(start.value(0))
+        } else {
+            let start_vec: Vec<i64> = (0..start.len())
+                .map(|i| if start.is_null(i) { 0 } else { start.value(i) }) // 
handle nulls as 0
+                .collect();
+
+            ScalarOrArray::Array(start_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(1)
+    } else {
+        ScalarOrArray::Array(vec![1; len])

Review Comment:
   I think this code may be too complex (and expensive). For example, if 
`start_input` is passed as `None`, you can pass `None` along and handle that 
nullable case later (same for the other params). This will save you all of 
this heavy processing of "defaults" here.



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified 
occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first 
position is 1) to search for the regular expression. Can be a constant, column, 
or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to 
find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the 
behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) 
to return the position for. Defaults to 0, which returns the position of the 
entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.to_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_instr_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. 
It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    match values.data_type() {
+        Utf8 | LargeUtf8 | Utf8View => (),
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function regexp_instr"
+            );
+        }
+    }
+
+    regexp_instr(
+        values,
+        &args[1],
+        if args_len > 2 { Some(&args[2]) } else { None },
+        if args_len > 3 { Some(&args[3]) } else { None },
+        if args_len > 4 { Some(&args[4]) } else { None },
+        if args_len > 5 { Some(&args[5]) } else { None },
+    )
+    .map_err(|e| e.into())
+}
+
+/// `arrow-rs` style implementation of `regexp_instr` function.
+/// This function `regexp_instr` is responsible for returning the index of a 
regular expression pattern
+/// within a string array. It supports optional start positions and flags for 
case insensitivity.
+///
+/// The function accepts a variable number of arguments:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The array of regular expression patterns to search for.
+/// - `start_array` (optional): The array of start positions for the search.
+/// - `nth_array` (optional): The array of start nth for the search.
+/// - `endoption_array` (optional): The array of endoption positions for the 
search.
+/// - `flags_array` (optional): The array of flags to modify the search 
behavior (e.g., case insensitivity).
+/// - `subexpr_array` (optional): The array of subexpr positions for the 
search.
+///
+/// The function handles different combinations of scalar and array inputs for 
the regex patterns, start positions,
+/// and flags. It uses a cache to store compiled regular expressions for 
efficiency.
+///
+/// # Errors
+/// Returns an error if the input arrays have mismatched lengths or if the 
regular expression fails to compile.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = start_array.map_or((None, true), 
|start| {
+        let (start, is_start_scalar) = start.get();
+        (Some(start), is_start_scalar)
+    });
+    let (nth_array, is_nth_scalar) = nth_array.map_or((None, true), |nth| {
+        let (nth, is_nth_scalar) = nth.get();
+        (Some(nth), is_nth_scalar)
+    });
+    let (flags_array, is_flags_scalar) = flags_array.map_or((None, true), 
|flags| {
+        let (flags, is_flags_scalar) = flags.get();
+        (Some(flags), is_flags_scalar)
+    });
+    let (subexpr_array, is_subexpr_scalar) =
+        subexpr_array.map_or((None, true), |subexpr| {
+            let (subexpr, is_subexpr_scalar) = subexpr.get();
+            (Some(subexpr), is_subexpr_scalar)
+        });
+
+    match (values.data_type(), regex_array.data_type(), flags_array) {
+        (Utf8, Utf8, None) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => 
regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() 
== LargeUtf8 => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == 
Utf8View => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, 
LargeUtf8, or Utf8View and the data types of the values, regex_array, and 
flags_array to match".to_string(),
+        )),
+    }
+}
+
+enum ScalarOrArray<T> {
+    Scalar(T),
+    Array(Vec<T>),
+}
+
+impl<T: Clone> ScalarOrArray<T> {
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> {
+        match self {
+            ScalarOrArray::Scalar(val) => 
Box::new(std::iter::repeat_n(val.clone(), len)),
+            ScalarOrArray::Array(arr) => Box::new(arr.iter().cloned()),
+        }
+    }

Review Comment:
   Example where that's used: `datafusion/functions/src/regex/regexpmatch.rs`



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified 
occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first 
position is 1) to search for the regular expression. Can be a constant, column, 
or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to 
find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the 
behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) 
to return the position for. Defaults to 0, which returns the position of the 
entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        let args = &args.args;
+
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+        let inferred_length = len.unwrap_or(1);
+        let args = args
+            .iter()
+            .map(|arg| arg.to_array(inferred_length))
+            .collect::<Result<Vec<_>>>()?;
+
+        let result = regexp_instr_func(&args);
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. 
It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    match values.data_type() {
+        Utf8 | LargeUtf8 | Utf8View => (),
+        other => {
+            return internal_err!(
+                "Unsupported data type {other:?} for function regexp_instr"
+            );
+        }
+    }
+
+    regexp_instr(
+        values,
+        &args[1],
+        if args_len > 2 { Some(&args[2]) } else { None },
+        if args_len > 3 { Some(&args[3]) } else { None },
+        if args_len > 4 { Some(&args[4]) } else { None },
+        if args_len > 5 { Some(&args[5]) } else { None },
+    )
+    .map_err(|e| e.into())
+}
+
+/// `arrow-rs` style implementation of `regexp_instr` function.
+/// This function `regexp_instr` is responsible for returning the index of a 
regular expression pattern
+/// within a string array. It supports optional start positions and flags for 
case insensitivity.
+///
+/// The function accepts a variable number of arguments:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The array of regular expression patterns to search for.
+/// - `start_array` (optional): The array of start positions for the search.
+/// - `nth_array` (optional): The array of start nth for the search.
+/// - `endoption_array` (optional): The array of endoption positions for the 
search.
+/// - `flags_array` (optional): The array of flags to modify the search 
behavior (e.g., case insensitivity).
+/// - `subexpr_array` (optional): The array of subexpr positions for the 
search.
+///
+/// The function handles different combinations of scalar and array inputs for 
the regex patterns, start positions,
+/// and flags. It uses a cache to store compiled regular expressions for 
efficiency.
+///
+/// # Errors
+/// Returns an error if the input arrays have mismatched lengths or if the 
regular expression fails to compile.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = start_array.map_or((None, true), 
|start| {
+        let (start, is_start_scalar) = start.get();
+        (Some(start), is_start_scalar)
+    });
+    let (nth_array, is_nth_scalar) = nth_array.map_or((None, true), |nth| {
+        let (nth, is_nth_scalar) = nth.get();
+        (Some(nth), is_nth_scalar)
+    });
+    let (flags_array, is_flags_scalar) = flags_array.map_or((None, true), 
|flags| {
+        let (flags, is_flags_scalar) = flags.get();
+        (Some(flags), is_flags_scalar)
+    });
+    let (subexpr_array, is_subexpr_scalar) =
+        subexpr_array.map_or((None, true), |subexpr| {
+            let (subexpr, is_subexpr_scalar) = subexpr.get();
+            (Some(subexpr), is_subexpr_scalar)
+        });
+
+    match (values.data_type(), regex_array.data_type(), flags_array) {
+        (Utf8, Utf8, None) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => 
regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() 
== LargeUtf8 => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            None,
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == 
Utf8View => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            Some(flags_array.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, 
LargeUtf8, or Utf8View and the data types of the values, regex_array, and 
flags_array to match".to_string(),
+        )),
+    }
+}
+
+enum ScalarOrArray<T> {
+    Scalar(T),
+    Array(Vec<T>),
+}
+
+impl<T: Clone> ScalarOrArray<T> {
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> {
+        match self {
+            ScalarOrArray::Scalar(val) => 
Box::new(std::iter::repeat_n(val.clone(), len)),
+            ScalarOrArray::Array(arr) => Box::new(arr.iter().cloned()),
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+pub fn regexp_instr_inner<'a, S>(
+    values: S,
+    regex_array: S,
+    is_regex_scalar: bool,
+    start_array: Option<&Int64Array>,
+    is_start_scalar: bool,
+    nth_array: Option<&Int64Array>,
+    is_nth_scalar: bool,
+    flags_array: Option<S>,
+    is_flags_scalar: bool,
+    subexp_array: Option<&Int64Array>,
+    is_subexp_scalar: bool,
+) -> Result<ArrayRef, ArrowError>
+where
+    S: StringArrayType<'a>,
+{
+    let len = values.len();
+
+    let regex_input = if is_regex_scalar || regex_array.len() == 1 {
+        ScalarOrArray::Scalar(Some(regex_array.value(0)))
+    } else {
+        let regex_vec: Vec<Option<&str>> = regex_array.iter().collect();
+        ScalarOrArray::Array(regex_vec)
+    };
+
+    let start_input = if let Some(start) = start_array {
+        if is_start_scalar || start.len() == 1 {
+            ScalarOrArray::Scalar(start.value(0))
+        } else {
+            let start_vec: Vec<i64> = (0..start.len())
+                .map(|i| if start.is_null(i) { 0 } else { start.value(i) }) // 
handle nulls as 0
+                .collect();
+
+            ScalarOrArray::Array(start_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(1)
+    } else {
+        ScalarOrArray::Array(vec![1; len])
+    };
+
+    let nth_input = if let Some(nth) = nth_array {
+        if is_nth_scalar || nth.len() == 1 {
+            ScalarOrArray::Scalar(nth.value(0))
+        } else {
+            let nth_vec: Vec<i64> = (0..nth.len())
+                .map(|i| if nth.is_null(i) { 0 } else { nth.value(i) }) // 
handle nulls as 0
+                .collect();
+            ScalarOrArray::Array(nth_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(1)
+    }
+    // Default nth = 0
+    else {
+        ScalarOrArray::Array(vec![1; len])
+    };
+
+    let flags_input = if let Some(ref flags) = flags_array {
+        if is_flags_scalar || flags.len() == 1 {
+            ScalarOrArray::Scalar(flags.value(0))
+        } else {
+            let flags_vec: Vec<&str> = flags.iter().map(|v| 
v.unwrap_or("")).collect();
+            ScalarOrArray::Array(flags_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar("")
+    }
+    // Default flags = ""
+    else {
+        ScalarOrArray::Array(vec![""; len])
+    };
+
+    let subexp_input = if let Some(subexp) = subexp_array {
+        if is_subexp_scalar || subexp.len() == 1 {
+            ScalarOrArray::Scalar(subexp.value(0))
+        } else {
+            let subexp_vec: Vec<i64> = (0..subexp.len())
+                .map(|i| {
+                    if subexp.is_null(i) {
+                        0
+                    } else {
+                        subexp.value(i)
+                    }
+                }) // handle nulls as 0
+                .collect();
+            ScalarOrArray::Array(subexp_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(0)
+    }
+    // Default subexp = 0
+    else {
+        ScalarOrArray::Array(vec![0; len])
+    };
+
+    let mut regex_cache = HashMap::new();
+
+    let result: Result<Vec<Option<i64>>, ArrowError> = izip!(
+        values.iter(),
+        regex_input.iter(len),
+        start_input.iter(len),
+        nth_input.iter(len),
+        flags_input.iter(len),
+        subexp_input.iter(len)
+    )
+    .map(|(value, regex, start, nth, flags, subexp)| match regex {
+        None => Ok(None),
+        Some("") => Ok(None),
+        Some(regex) => get_index(

Review Comment:
   I'm not sure this is wrong, but I'm not sure I follow the idea - isn't regex a 
mandatory argument? How can it be `None`, and why is this allowed?



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags[, subexpr]]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) to return the position for. Defaults to 0, which returns the position of the entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    /// Creates the UDF with one signature per supported argument layout.
+    ///
+    /// The string arguments (`str`, `regexp`, `flags`) must all share the same
+    /// string type; `start`, `N` and `subexpr` are `Int64`. The six-argument
+    /// layouts expose the documented `subexpr` argument, which
+    /// `regexp_instr_func` already accepts as its sixth argument.
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8, Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The function yields `Int64` regardless of the argument types.
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    /// Materialises every argument as an array, evaluates the function, and
+    /// converts the result back to a scalar when no argument was an array.
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        // The last array argument, if any, dictates the batch length.
+        let batch_len = args.args.iter().rev().find_map(|arg| match arg {
+            ColumnarValue::Array(array) => Some(array.len()),
+            ColumnarValue::Scalar(_) => None,
+        });
+
+        // Scalars are expanded to the batch length (or to 1 when everything is scalar).
+        let mut arrays = Vec::with_capacity(args.args.len());
+        for arg in &args.args {
+            arrays.push(arg.to_array(batch_len.unwrap_or(1))?);
+        }
+
+        let output = regexp_instr_func(&arrays)?;
+        match batch_len {
+            Some(_) => Ok(ColumnarValue::Array(output)),
+            // All inputs were scalar, so the output stays scalar too.
+            None => ScalarValue::try_from_array(&output, 0).map(ColumnarValue::Scalar),
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Validates the argument count and the type of the first argument, then
+/// forwards to [`regexp_instr`] with the trailing arguments made optional.
+///
+/// # Errors
+/// Returns an execution error unless 2 to 6 arguments are given, an internal
+/// error when the first argument is not `Utf8`, `LargeUtf8` or `Utf8View`, and
+/// otherwise whatever [`regexp_instr`] reports.
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    if !matches!(values.data_type(), Utf8 | LargeUtf8 | Utf8View) {
+        let other = values.data_type();
+        return internal_err!(
+            "Unsupported data type {other:?} for function regexp_instr"
+        );
+    }
+
+    // Arguments past the second are optional; absent ones become `None`.
+    let optional = |idx: usize| args.get(idx).map(|array| array as &dyn Datum);
+
+    regexp_instr(
+        values,
+        &args[1],
+        optional(2),
+        optional(3),
+        optional(4),
+        optional(5),
+    )
+    .map_err(Into::into)
+}
+
+/// `arrow-rs` style implementation of the `regexp_instr` function.
+///
+/// Unpacks every [`Datum`] into its backing array plus an "is scalar" flag,
+/// then dispatches on the string type of `values` / `regex_array` to
+/// [`regexp_instr_inner`], which computes the per-row result.
+///
+/// Arguments, in order:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The regular expression pattern(s) to search for.
+/// - `start_array` (optional): The start position(s) for the search.
+/// - `nth_array` (optional): Which occurrence of the pattern to report.
+/// - `flags_array` (optional): Flags that modify the search behavior (e.g.,
+///   case insensitivity); must have the same string type as `values`.
+/// - `subexpr_array` (optional): The capture group (subexpression) index(es).
+///
+/// The optional integer arguments are downcast with `as_primitive::<Int64Type>()`.
+/// NOTE(review): that downcast is expected to panic on non-`Int64` input, so
+/// callers should only pass `Int64` there - confirm against the arrow docs.
+///
+/// # Errors
+/// Returns `ArrowError::ComputeError` unless `values`, `regex_array` and (when
+/// present) `flags_array` all share one of the types `Utf8`, `LargeUtf8` or
+/// `Utf8View`. Errors from [`regexp_instr_inner`] are propagated unchanged.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    // Splits an optional datum into (array, is_scalar); a missing argument is
+    // reported as `(None, true)`.
+    fn unpack(datum: Option<&dyn Datum>) -> (Option<&dyn Array>, bool) {
+        datum.map_or((None, true), |datum| {
+            let (array, is_scalar) = datum.get();
+            (Some(array), is_scalar)
+        })
+    }
+
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = unpack(start_array);
+    let (nth_array, is_nth_scalar) = unpack(nth_array);
+    let (flags_array, is_flags_scalar) = unpack(flags_array);
+    let (subexpr_array, is_subexpr_scalar) = unpack(subexpr_array);
+
+    // `None` means "no flags given"; otherwise the flags type must match the
+    // string type of the other two string arguments.
+    let flags_type = flags_array.map(|flags| flags.data_type());
+
+    match (values.data_type(), regex_array.data_type(), flags_type) {
+        (Utf8, Utf8, None | Some(Utf8)) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None | Some(LargeUtf8)) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None | Some(Utf8View)) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, LargeUtf8, or Utf8View and the data types of the values, regex_array, and flags_array to match".to_string(),
+        )),
+    }
+}
+
+/// An argument that is either a single value shared by every row or a list
+/// holding one value per row.
+enum ScalarOrArray<T> {
+    Scalar(T),
+    Array(Vec<T>),
+}
+
+impl<T: Clone> ScalarOrArray<T> {
+    /// Iterates over owned copies of the values: a scalar is repeated `len`
+    /// times, while an array yields its own elements and ignores `len`.
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> {
+        match self {
+            Self::Array(items) => Box::new(items.iter().cloned()),
+            Self::Scalar(item) => Box::new(std::iter::repeat_n(item.clone(), len)),
+        }
+    }
+}
+
+/// Computes `regexp_instr` for every row of `values`.
+///
+/// Each optional argument is either absent (its default is then used for every
+/// row: `start = 1`, `nth = 1`, `flags = ""`, `subexp = 0`), a scalar or
+/// single-element array (shared by every row), or a per-row array. Null entries
+/// of a per-row `start`, `nth` or `subexp` array are read as 0, and null flags
+/// as `""`.
+///
+/// Rows whose pattern is null or empty produce a null result; every other row
+/// is delegated to `get_index`.
+///
+/// # Errors
+/// Propagates the first error returned by `get_index`.
+#[allow(clippy::too_many_arguments)]
+pub fn regexp_instr_inner<'a, S>(
+    values: S,
+    regex_array: S,
+    is_regex_scalar: bool,
+    start_array: Option<&Int64Array>,
+    is_start_scalar: bool,
+    nth_array: Option<&Int64Array>,
+    is_nth_scalar: bool,
+    flags_array: Option<S>,
+    is_flags_scalar: bool,
+    subexp_array: Option<&Int64Array>,
+    is_subexp_scalar: bool,
+) -> Result<ArrayRef, ArrowError>
+where
+    S: StringArrayType<'a>,
+{
+    // Expands an optional Int64 argument, using `default` when it is absent.
+    fn int_input(
+        array: Option<&Int64Array>,
+        is_scalar: bool,
+        default: i64,
+    ) -> ScalarOrArray<i64> {
+        match array {
+            None => ScalarOrArray::Scalar(default),
+            Some(array) if is_scalar || array.len() == 1 => {
+                ScalarOrArray::Scalar(array.value(0))
+            }
+            // Per-row values; nulls are read as 0.
+            Some(array) => {
+                ScalarOrArray::Array(array.iter().map(|v| v.unwrap_or(0)).collect())
+            }
+        }
+    }
+
+    let len = values.len();
+
+    let regex_input = if is_regex_scalar || regex_array.len() == 1 {
+        // Honour the validity bit rather than reading the payload of a null
+        // slot: a null scalar pattern must behave like a null per-row pattern.
+        let pattern = (!regex_array.is_null(0)).then(|| regex_array.value(0));
+        ScalarOrArray::Scalar(pattern)
+    } else {
+        ScalarOrArray::Array(regex_array.iter().collect())
+    };
+
+    // Defaults: start = 1, nth = 1, subexp = 0.
+    let start_input = int_input(start_array, is_start_scalar, 1);
+    let nth_input = int_input(nth_array, is_nth_scalar, 1);
+    let subexp_input = int_input(subexp_array, is_subexp_scalar, 0);
+
+    // Default flags = ""
+    let flags_input = match flags_array {
+        None => ScalarOrArray::Scalar(""),
+        Some(ref flags) if is_flags_scalar || flags.len() == 1 => {
+            ScalarOrArray::Scalar(flags.value(0))
+        }
+        Some(ref flags) => {
+            ScalarOrArray::Array(flags.iter().map(|v| v.unwrap_or("")).collect())
+        }
+    };
+
+    // Compiled patterns are shared across rows, keyed by (pattern, flags).
+    let mut regex_cache = HashMap::new();
+
+    let positions = izip!(
+        values.iter(),
+        regex_input.iter(len),
+        start_input.iter(len),
+        nth_input.iter(len),
+        flags_input.iter(len),
+        subexp_input.iter(len)
+    )
+    .map(|(value, regex, start, nth, flags, subexp)| match regex {
+        // A null or empty pattern yields a null result for the row.
+        None | Some("") => Ok(None),
+        Some(regex) => get_index(
+            value,
+            regex,
+            start,
+            nth,
+            subexp,
+            Some(flags),
+            &mut regex_cache,
+        ),
+    })
+    .collect::<Result<Vec<Option<i64>>, ArrowError>>()?;
+
+    Ok(Arc::new(Int64Array::from(positions)))
+}
+
+fn get_index<'strings, 'cache>(
+    value: Option<&str>,
+    pattern: &'strings str,
+    start: i64,
+    n: i64,
+    subexpr: i64,
+    flags: Option<&'strings str>,
+    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), 
Regex>,
+) -> Result<Option<i64>, ArrowError>
+where
+    'strings: 'cache,
+{
+    let value = match value {
+        None => return Ok(None),
+        Some("") => return Ok(Some(0)),
+        Some(value) => value,
+    };
+
+    let pattern = compile_and_cache_regex(pattern, flags, regex_cache)?;
+    if start < 1 {
+        return Err(ArrowError::ComputeError(
+            "regexp_instr() requires start to be 1-based".to_string(),
+        ));
+    }
+
+    if n < 1 {
+        return Err(ArrowError::ComputeError(
+            "N must be 1 or greater".to_string(),
+        ));
+    }
+
+    // --- Simplified byte_start_offset calculation ---
+    let total_chars = value.chars().count() as i64;
+    let byte_start_offset = if start > total_chars {
+        // If start is beyond the total characters, it means we start searching
+        // after the string effectively. No matches possible.
+        return Ok(Some(0));
+    } else {
+        // Get the byte offset for the (start - 1)-th character (0-based)
+        value
+            .char_indices()
+            .nth((start - 1) as usize)
+            .map(|(idx, _)| idx)
+            .unwrap_or(0) // Should not happen if start is valid and <= 
total_chars
+    };
+    // --- End simplified calculation ---
+
+    let search_slice = &value[byte_start_offset..];
+
+    // Handle subexpression capturing first, as it takes precedence
+    if subexpr > 0 {
+        if let Some(captures) = pattern.captures(search_slice) {
+            if let Some(matched) = captures.get(subexpr as usize) {
+                // Convert byte offset relative to search_slice back to 
1-based character offset
+                // relative to the original `value` string.
+                let start_char_offset =
+                    value[..byte_start_offset + 
matched.start()].chars().count() as i64
+                        + 1;
+                return Ok(Some(start_char_offset));
+            }
+        }
+        return Ok(Some(0)); // Return 0 if the subexpression was not found
+    }
+
+    // Use nth to get the N-th match (n is 1-based, nth is 0-based)
+    if let Some(mat) = pattern.find_iter(search_slice).nth((n - 1) as usize) {
+        // Convert byte offset relative to search_slice back to 1-based 
character offset
+        // relative to the original `value` string.
+        let match_start_byte_offset = byte_start_offset + mat.start();
+        let match_start_char_offset =
+            value[..match_start_byte_offset].chars().count() as i64 + 1;
+        Ok(Some(match_start_char_offset))

Review Comment:
   (nit) If you want, each of these could be extracted into its own small function.



##########
datafusion/functions/src/regex/regexpinstr.rs:
##########
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array, 
StringArrayType};
+use arrow::datatypes::{DataType, Int64Type};
+use arrow::datatypes::{
+    DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
+};
+use arrow::error::ArrowError;
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, 
TypeSignature::Exact,
+    TypeSignature::Uniform, Volatility,
+};
+use datafusion_macros::user_doc;
+use itertools::izip;
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use crate::regex::compile_and_cache_regex;
+
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the position in a string where the specified occurrence of a POSIX regular expression is located.",
+    syntax_example = "regexp_instr(str, regexp[, start[, N[, flags[, subexpr]]]])",
+    sql_example = r#"```sql
+> SELECT regexp_instr('ABCDEF', 'C(.)(..)');
++---------------------------------------------------------------+
+| regexp_instr(Utf8("ABCDEF"),Utf8("C(.)(..)"))                 |
++---------------------------------------------------------------+
+| 3                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function. Defaults to 1"
+    ),
+    argument(
+        name = "N",
+        description = "- **N**: Optional The N-th occurrence of pattern to find. Defaults to 1 (first match). Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    ),
+    argument(
+        name = "subexpr",
+        description = "Optional Specifies which capture group (subexpression) to return the position for. Defaults to 0, which returns the position of the entire match."
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpInstrFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpInstrFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RegexpInstrFunc {
+    /// Creates the UDF with one signature per supported argument layout.
+    ///
+    /// The string arguments (`str`, `regexp`, `flags`) must all share the same
+    /// string type; `start`, `N` and `subexpr` are `Int64`. The six-argument
+    /// layouts expose the documented `subexpr` argument, which
+    /// `regexp_instr_func` already accepts as its sixth argument.
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8]),
+                    Exact(vec![Utf8View, Utf8View, Int64, Int64, Utf8View, Int64]),
+                    Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64, LargeUtf8, Int64]),
+                    Exact(vec![Utf8, Utf8, Int64, Int64, Utf8, Int64]),
+                ],
+                Volatility::Immutable,
+            ),
+        }
+    }
+}
+
+impl ScalarUDFImpl for RegexpInstrFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "regexp_instr"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// The function yields `Int64` regardless of the argument types.
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(Int64)
+    }
+
+    /// Materialises every argument as an array, evaluates the function, and
+    /// converts the result back to a scalar when no argument was an array.
+    fn invoke_with_args(
+        &self,
+        args: datafusion_expr::ScalarFunctionArgs,
+    ) -> Result<ColumnarValue> {
+        // The last array argument, if any, dictates the batch length.
+        let batch_len = args.args.iter().rev().find_map(|arg| match arg {
+            ColumnarValue::Array(array) => Some(array.len()),
+            ColumnarValue::Scalar(_) => None,
+        });
+
+        // Scalars are expanded to the batch length (or to 1 when everything is scalar).
+        let mut arrays = Vec::with_capacity(args.args.len());
+        for arg in &args.args {
+            arrays.push(arg.to_array(batch_len.unwrap_or(1))?);
+        }
+
+        let output = regexp_instr_func(&arrays)?;
+        match batch_len {
+            Some(_) => Ok(ColumnarValue::Array(output)),
+            // All inputs were scalar, so the output stays scalar too.
+            None => ScalarValue::try_from_array(&output, 0).map(ColumnarValue::Scalar),
+        }
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+}
+
+/// Validates the argument count and the type of the first argument, then
+/// forwards to [`regexp_instr`] with the trailing arguments made optional.
+///
+/// # Errors
+/// Returns an execution error unless 2 to 6 arguments are given, an internal
+/// error when the first argument is not `Utf8`, `LargeUtf8` or `Utf8View`, and
+/// otherwise whatever [`regexp_instr`] reports.
+pub fn regexp_instr_func(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let args_len = args.len();
+    if !(2..=6).contains(&args_len) {
+        return exec_err!("regexp_instr was called with {args_len} arguments. It requires at least 2 and at most 6.");
+    }
+
+    let values = &args[0];
+    if !matches!(values.data_type(), Utf8 | LargeUtf8 | Utf8View) {
+        let other = values.data_type();
+        return internal_err!(
+            "Unsupported data type {other:?} for function regexp_instr"
+        );
+    }
+
+    // Arguments past the second are optional; absent ones become `None`.
+    let optional = |idx: usize| args.get(idx).map(|array| array as &dyn Datum);
+
+    regexp_instr(
+        values,
+        &args[1],
+        optional(2),
+        optional(3),
+        optional(4),
+        optional(5),
+    )
+    .map_err(Into::into)
+}
+
+/// `arrow-rs` style implementation of the `regexp_instr` function.
+///
+/// Unpacks every [`Datum`] into its backing array plus an "is scalar" flag,
+/// then dispatches on the string type of `values` / `regex_array` to
+/// [`regexp_instr_inner`], which computes the per-row result.
+///
+/// Arguments, in order:
+/// - `values`: The array of strings to search within.
+/// - `regex_array`: The regular expression pattern(s) to search for.
+/// - `start_array` (optional): The start position(s) for the search.
+/// - `nth_array` (optional): Which occurrence of the pattern to report.
+/// - `flags_array` (optional): Flags that modify the search behavior (e.g.,
+///   case insensitivity); must have the same string type as `values`.
+/// - `subexpr_array` (optional): The capture group (subexpression) index(es).
+///
+/// The optional integer arguments are downcast with `as_primitive::<Int64Type>()`.
+/// NOTE(review): that downcast is expected to panic on non-`Int64` input, so
+/// callers should only pass `Int64` there - confirm against the arrow docs.
+///
+/// # Errors
+/// Returns `ArrowError::ComputeError` unless `values`, `regex_array` and (when
+/// present) `flags_array` all share one of the types `Utf8`, `LargeUtf8` or
+/// `Utf8View`. Errors from [`regexp_instr_inner`] are propagated unchanged.
+pub fn regexp_instr(
+    values: &dyn Array,
+    regex_array: &dyn Datum,
+    start_array: Option<&dyn Datum>,
+    nth_array: Option<&dyn Datum>,
+    flags_array: Option<&dyn Datum>,
+    subexpr_array: Option<&dyn Datum>,
+) -> Result<ArrayRef, ArrowError> {
+    // Splits an optional datum into (array, is_scalar); a missing argument is
+    // reported as `(None, true)`.
+    fn unpack(datum: Option<&dyn Datum>) -> (Option<&dyn Array>, bool) {
+        datum.map_or((None, true), |datum| {
+            let (array, is_scalar) = datum.get();
+            (Some(array), is_scalar)
+        })
+    }
+
+    let (regex_array, is_regex_scalar) = regex_array.get();
+    let (start_array, is_start_scalar) = unpack(start_array);
+    let (nth_array, is_nth_scalar) = unpack(nth_array);
+    let (flags_array, is_flags_scalar) = unpack(flags_array);
+    let (subexpr_array, is_subexpr_scalar) = unpack(subexpr_array);
+
+    // `None` means "no flags given"; otherwise the flags type must match the
+    // string type of the other two string arguments.
+    let flags_type = flags_array.map(|flags| flags.data_type());
+
+    match (values.data_type(), regex_array.data_type(), flags_type) {
+        (Utf8, Utf8, None | Some(Utf8)) => regexp_instr_inner(
+            values.as_string::<i32>(),
+            regex_array.as_string::<i32>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string::<i32>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (LargeUtf8, LargeUtf8, None | Some(LargeUtf8)) => regexp_instr_inner(
+            values.as_string::<i64>(),
+            regex_array.as_string::<i64>(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string::<i64>()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        (Utf8View, Utf8View, None | Some(Utf8View)) => regexp_instr_inner(
+            values.as_string_view(),
+            regex_array.as_string_view(),
+            is_regex_scalar,
+            start_array.map(|start| start.as_primitive::<Int64Type>()),
+            is_start_scalar,
+            nth_array.map(|nth| nth.as_primitive::<Int64Type>()),
+            is_nth_scalar,
+            flags_array.map(|flags| flags.as_string_view()),
+            is_flags_scalar,
+            subexpr_array.map(|subexpr| subexpr.as_primitive::<Int64Type>()),
+            is_subexpr_scalar,
+        ),
+        _ => Err(ArrowError::ComputeError(
+            "regexp_instr() expected the input arrays to be of type Utf8, LargeUtf8, or Utf8View and the data types of the values, regex_array, and flags_array to match".to_string(),
+        )),
+    }
+}
+
+enum ScalarOrArray<T> { // holds either one shared value or a vector of values
+    Scalar(T), // a single value; `iter` repeats it `len` times
+    Array(Vec<T>), // stored values; `iter` yields them in order
+}
+
+impl<T: Clone> ScalarOrArray<T> { // `T: Clone` because `iter` hands out owned copies
+    fn iter(&self, len: usize) -> Box<dyn Iterator<Item = T> + '_> { // boxed so both match arms share one return type
+        match self {
+            ScalarOrArray::Scalar(val) => 
Box::new(std::iter::repeat_n(val.clone(), len)), // yields `len` clones of the single value
+            ScalarOrArray::Array(arr) => Box::new(arr.iter().cloned()), // yields a clone of each stored element; `len` is not consulted here
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+pub fn regexp_instr_inner<'a, S>(
+    values: S,
+    regex_array: S,
+    is_regex_scalar: bool,
+    start_array: Option<&Int64Array>,
+    is_start_scalar: bool,
+    nth_array: Option<&Int64Array>,
+    is_nth_scalar: bool,
+    flags_array: Option<S>,
+    is_flags_scalar: bool,
+    subexp_array: Option<&Int64Array>,
+    is_subexp_scalar: bool,
+) -> Result<ArrayRef, ArrowError>
+where
+    S: StringArrayType<'a>,
+{
+    let len = values.len();
+
+    let regex_input = if is_regex_scalar || regex_array.len() == 1 {
+        ScalarOrArray::Scalar(Some(regex_array.value(0)))
+    } else {
+        let regex_vec: Vec<Option<&str>> = regex_array.iter().collect();
+        ScalarOrArray::Array(regex_vec)
+    };
+
+    let start_input = if let Some(start) = start_array {
+        if is_start_scalar || start.len() == 1 {
+            ScalarOrArray::Scalar(start.value(0))
+        } else {
+            let start_vec: Vec<i64> = (0..start.len())
+                .map(|i| if start.is_null(i) { 0 } else { start.value(i) }) // 
handle nulls as 0
+                .collect();
+
+            ScalarOrArray::Array(start_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(1)
+    } else {
+        ScalarOrArray::Array(vec![1; len])
+    };
+
+    let nth_input = if let Some(nth) = nth_array {
+        if is_nth_scalar || nth.len() == 1 {
+            ScalarOrArray::Scalar(nth.value(0))
+        } else {
+            let nth_vec: Vec<i64> = (0..nth.len())
+                .map(|i| if nth.is_null(i) { 0 } else { nth.value(i) }) // 
handle nulls as 0
+                .collect();
+            ScalarOrArray::Array(nth_vec)
+        }
+    } else if len == 1 {
+        ScalarOrArray::Scalar(1)
+    }
+    // Default nth = 1
+    else {
+        ScalarOrArray::Array(vec![1; len])
+    };
+
+    let flags_input = if let Some(ref flags) = flags_array {
+        if is_flags_scalar || flags.len() == 1 {
+            ScalarOrArray::Scalar(flags.value(0))
+        } else {
+            let flags_vec: Vec<&str> = flags.iter().map(|v| 
v.unwrap_or("")).collect();
+            ScalarOrArray::Array(flags_vec)

Review Comment:
   Same as above: you don't need to set flags to "" - they can be null. Could 
you please check existing implementations for an example? E.g. 
https://arrow.apache.org/rust/src/arrow_string/regexp.rs.html#356



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Implementation for regex_instr [datafusion]

Reply via email to