Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
f72c11f
Implement regexp_ccount
xinlifoobar Aug 20, 2024
682a50a
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 20, 2024
ee23b97
Update document
xinlifoobar Aug 20, 2024
d5b63f4
fix check
xinlifoobar Aug 20, 2024
2acd148
add more tests
xinlifoobar Aug 20, 2024
a3563ee
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 21, 2024
27a6fc6
Update the world to 1.80
xinlifoobar Aug 21, 2024
d17e45d
Fix doc format
xinlifoobar Aug 21, 2024
ee14adf
Add null tests
xinlifoobar Aug 22, 2024
08343dd
Add uft8 support and bench
xinlifoobar Aug 22, 2024
218ff7b
Refactoring regexp_count
xinlifoobar Aug 28, 2024
0333ec4
Merge branch 'main' of github.com:apache/datafusion into dev/xinli/re…
xinlifoobar Aug 29, 2024
07312be
Refactoring regexp_count
xinlifoobar Aug 29, 2024
4eb7e6b
Revert ci change
xinlifoobar Aug 29, 2024
cb13556
Fix ci
xinlifoobar Aug 29, 2024
574047a
Merge remote-tracking branch 'upstream/main' into fork/xinlifoobar/de…
Omega359 Oct 12, 2024
5a41fbf
Updates for documentation, minor improvements.
Omega359 Oct 13, 2024
2e4cd78
Updates for documentation, minor improvements.
Omega359 Oct 13, 2024
59432f3
Merge remote-tracking branch 'origin/feature/regexp_count' into featu…
Omega359 Oct 13, 2024
01509e8
Merge remote-tracking branch 'upstream/main' into feature/regexp_count
Omega359 Oct 16, 2024
97e61ae
updates to fix scalar tests, doc updates.
Omega359 Oct 16, 2024
74b545a
Merge remote-tracking branch 'origin/main' into feature/regexp_count
Omega359 Oct 17, 2024
696545f
Merge remote-tracking branch 'origin/main' into feature/regexp_count
Omega359 Oct 18, 2024
7371923
updated regex and string features to remove deps on other features.
Omega359 Oct 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Updates for documentation, minor improvements.
  • Loading branch information
Omega359 committed Oct 13, 2024
commit 2e4cd78681dbcbec8de69cefa6400eed109ff9ba
78 changes: 52 additions & 26 deletions datafusion/functions/src/regex/regexpcount.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,24 @@
// specific language governing permissions and limitations
// under the License.

use crate::string::common::StringArrayType;
use arrow::array::{Array, ArrayRef, AsArray, Datum, Int64Array};
use arrow::datatypes::{DataType, Int64Type};
use arrow::datatypes::{
DataType::Int64, DataType::LargeUtf8, DataType::Utf8, DataType::Utf8View,
};
use arrow::error::ArrowError;
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{
ColumnarValue, ScalarUDFImpl, Signature, TypeSignature::Exact,
ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature::Exact,
TypeSignature::Uniform, Volatility,
};
use itertools::izip;
use regex::Regex;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;

use crate::string::common::StringArrayType;
use std::sync::{Arc, OnceLock};

#[derive(Debug)]
pub struct RegexpCountFunc {
Expand All @@ -50,13 +50,13 @@ impl RegexpCountFunc {
Self {
signature: Signature::one_of(
vec![
Uniform(2, vec![Utf8, LargeUtf8, Utf8View]),
Exact(vec![Utf8, Utf8, Int64]),
Exact(vec![Utf8, Utf8, Int64, Utf8]),
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
Exact(vec![LargeUtf8, LargeUtf8, Int64, LargeUtf8]),
Uniform(2, vec![Utf8View, LargeUtf8, Utf8]),
Exact(vec![Utf8View, Utf8View, Int64]),
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
Exact(vec![Utf8, Utf8, Int64]),
Exact(vec![Utf8View, Utf8View, Int64, Utf8View]),
Exact(vec![LargeUtf8, LargeUtf8, Int64, LargeUtf8]),
Exact(vec![Utf8, Utf8, Int64, Utf8]),
],
Volatility::Immutable,
),
Expand All @@ -81,7 +81,7 @@ impl ScalarUDFImpl for RegexpCountFunc {
Ok(Int64)
}

fn invoke(&self, args: &[datafusion_expr::ColumnarValue]) -> Result<ColumnarValue> {
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
let len = args
.iter()
.fold(Option::<usize>::None, |acc, arg| match arg {
Expand All @@ -105,6 +105,36 @@ impl ScalarUDFImpl for RegexpCountFunc {
result.map(ColumnarValue::Array)
}
}

/// Returns the documentation for this function, delegating to
/// `get_regexp_count_doc()`; this implementation always returns `Some`.
fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_count_doc())
}
}

/// Holds the `regexp_count` documentation; populated on first access by
/// `get_regexp_count_doc`.
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

/// Returns the user-facing documentation for `regexp_count`.
///
/// The `Documentation` value is constructed on the first call via
/// `OnceLock::get_or_init`; later calls return the same instance.
///
/// # Panics
///
/// Panics if the documentation builder's `build()` returns an error.
fn get_regexp_count_doc() -> &'static Documentation {
    DOCUMENTATION.get_or_init(|| {
        Documentation::builder()
            .with_doc_section(DOC_SECTION_REGEX)
            .with_description("Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string.")
            .with_syntax_example("regexp_count(str, regexp[, start, flags])")
            .with_sql_example(r#"```sql
> select regexp_count('abcAbAbc', 'abc', 2, 'i');
+---------------------------------------------------------------+
| regexp_count(Utf8("abcAbAbc"),Utf8("abc"),Int64(2),Utf8("i")) |
+---------------------------------------------------------------+
| 1                                                             |
+---------------------------------------------------------------+
```"#)
            .with_standard_argument("str", "String")
            .with_standard_argument("regexp","Regular")
            // The argument name is supplied as the first parameter (as for
            // `flags` below), so the description does not repeat it.
            .with_argument("start", "Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function.")
            .with_argument("flags",
                           r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
            .build()
            .unwrap()
    })
}

pub fn regexp_count_func(args: &[ArrayRef]) -> Result<ArrayRef> {
Expand Down Expand Up @@ -264,6 +294,8 @@ where
(None, None, true)
};

let mut regex_cache = HashMap::new();

match (is_regex_scalar, is_start_scalar, is_flags_scalar) {
(true, true, true) => {
let regex = match regex_scalar {
Expand Down Expand Up @@ -294,12 +326,11 @@ where
if values.len() != flags_array.len() {
return Err(ArrowError::ComputeError(format!(
"flags_array must be the same length as values array; got {} and {}",
flags_array.len(),
values.len(),
flags_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
values
.iter()
Expand Down Expand Up @@ -344,12 +375,11 @@ where
if values.len() != flags_array.len() {
return Err(ArrowError::ComputeError(format!(
"flags_array must be the same length as values array; got {} and {}",
flags_array.len(),
values.len(),
flags_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
izip!(
values.iter(),
Expand All @@ -369,12 +399,11 @@ where
if values.len() != regex_array.len() {
return Err(ArrowError::ComputeError(format!(
"regex_array must be the same length as values array; got {} and {}",
regex_array.len(),
values.len(),
regex_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
values
.iter()
Expand All @@ -399,21 +428,20 @@ where
if values.len() != regex_array.len() {
return Err(ArrowError::ComputeError(format!(
"regex_array must be the same length as values array; got {} and {}",
regex_array.len(),
values.len(),
regex_array.len()
)));
}

let flags_array = flags_array.unwrap();
if values.len() != flags_array.len() {
return Err(ArrowError::ComputeError(format!(
"flags_array must be the same length as values array; got {} and {}",
flags_array.len(),
values.len(),
flags_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
izip!(values.iter(), regex_array.iter(), flags_array.iter())
.map(|(value, regex, flags)| {
Expand All @@ -434,21 +462,20 @@ where
if values.len() != regex_array.len() {
return Err(ArrowError::ComputeError(format!(
"regex_array must be the same length as values array; got {} and {}",
regex_array.len(),
values.len(),
regex_array.len()
)));
}

let start_array = start_array.unwrap();
if values.len() != start_array.len() {
return Err(ArrowError::ComputeError(format!(
"start_array must be the same length as values array; got {} and {}",
start_array.len(),
values.len(),
start_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
izip!(values.iter(), regex_array.iter(), start_array.iter())
.map(|(value, regex, start)| {
Expand All @@ -471,30 +498,29 @@ where
if values.len() != regex_array.len() {
return Err(ArrowError::ComputeError(format!(
"regex_array must be the same length as values array; got {} and {}",
regex_array.len(),
values.len(),
regex_array.len()
)));
}

let start_array = start_array.unwrap();
if values.len() != start_array.len() {
return Err(ArrowError::ComputeError(format!(
"start_array must be the same length as values array; got {} and {}",
start_array.len(),
values.len(),
start_array.len()
)));
}

let flags_array = flags_array.unwrap();
if values.len() != flags_array.len() {
return Err(ArrowError::ComputeError(format!(
"flags_array must be the same length as values array; got {} and {}",
flags_array.len(),
values.len(),
flags_array.len()
)));
}

let mut regex_cache = HashMap::new();
Ok(Arc::new(Int64Array::from_iter_values(
izip!(
values.iter(),
Expand Down
25 changes: 0 additions & 25 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -569,38 +569,13 @@ Apache DataFusion uses a [PCRE-like] regular expression [syntax]
(minus support for several features including look-around and backreferences).
The following regular expression functions are supported:

- [regexp_count](#regexp_count)
- [regexp_like](#regexp_like)
- [regexp_match](#regexp_match)
- [regexp_replace](#regexp_replace)

[pcre-like]: https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions
[syntax]: https://docs.rs/regex/latest/regex/#syntax

### `regexp_count`

Returns the number of matches that a [regular expression] has in a string.

```
regexp_count(str, regexp[, start, flags])
```

#### Arguments

- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of string operators.
- **regexp**: Regular expression to test against the string expression.
Can be a constant, column, or function.
- **start**: Optional start position to search for the regular expression.
Can be a constant, column, or function.
- **flags**: Optional regular expression flags that control the behavior of the
regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?

### `regexp_like`

Returns true if a [regular expression] has at least one match in a string,
Expand Down