Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

doc-gen: migrate scalar functions (encoding & regex) documentation #13919

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 29 additions & 36 deletions datafusion/functions/src/encoding/inner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,27 @@ use datafusion_common::{
use datafusion_common::{exec_err, ScalarValue};
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::{ColumnarValue, Documentation};
use std::sync::{Arc, OnceLock};
use std::sync::Arc;
use std::{fmt, str::FromStr};

use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;
use std::any::Any;

#[user_doc(
doc_section(label = "Binary String Functions"),
description = "Encode binary data into a textual representation.",
syntax_example = "encode(expression, format)",
argument(
name = "expression",
description = "Expression containing string or binary data"
),
argument(
name = "format",
description = "Supported formats are: `base64`, `hex`"
),
related_udf(name = "decode")
)]
#[derive(Debug)]
pub struct EncodeFunc {
signature: Signature,
Expand All @@ -58,22 +72,6 @@ impl EncodeFunc {
}
}

static ENCODE_DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_encode_doc() -> &'static Documentation {
ENCODE_DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_BINARY_STRING,
"Encode binary data into a textual representation.",
"encode(expression, format)",
)
.with_argument("expression", "Expression containing string or binary data")
.with_argument("format", "Supported formats are: `base64`, `hex`")
.with_related_udf("decode")
.build()
})
}

impl ScalarUDFImpl for EncodeFunc {
fn as_any(&self) -> &dyn Any {
self
Expand Down Expand Up @@ -126,10 +124,21 @@ impl ScalarUDFImpl for EncodeFunc {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_encode_doc())
self.doc()
}
}

#[user_doc(
doc_section(label = "Binary String Functions"),
description = "Decode binary data from textual representation in string.",
syntax_example = "decode(e xpression, format)",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
syntax_example = "decode(e xpression, format)",
syntax_example = "decode(expression, format)",

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alamb thanks for the correction

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @alamb for the correction, I have fixed the typo

argument(
name = "expression",
description = "Expression containing encoded string data"
),
argument(name = "format", description = "Same arguments as [encode](#encode)"),
related_udf(name = "encode")
)]
#[derive(Debug)]
pub struct DecodeFunc {
signature: Signature,
Expand All @@ -149,22 +158,6 @@ impl DecodeFunc {
}
}

static DECODE_DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_decode_doc() -> &'static Documentation {
DECODE_DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_BINARY_STRING,
"Decode binary data from textual representation in string.",
"decode(expression, format)",
)
.with_argument("expression", "Expression containing encoded string data")
.with_argument("format", "Same arguments as [encode](#encode)")
.with_related_udf("encode")
.build()
})
}

impl ScalarUDFImpl for DecodeFunc {
fn as_any(&self) -> &dyn Any {
self
Expand Down Expand Up @@ -217,7 +210,7 @@ impl ScalarUDFImpl for DecodeFunc {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_decode_doc())
self.doc()
}
}

Expand Down
64 changes: 31 additions & 33 deletions datafusion/functions/src/regex/regexpcount.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,45 @@ use arrow::datatypes::{
};
use arrow::error::ArrowError;
use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{
ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature::Exact,
TypeSignature::Uniform, Volatility,
};
use datafusion_macros::user_doc;
use itertools::izip;
use regex::Regex;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;

#[user_doc(
doc_section(label = "Regular Expression Functions"),
description = "Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string.",
syntax_example = "regexp_count(str, regexp[, start, flags])",
sql_example = r#"```sql
> select regexp_count('abcAbAbc', 'abc', 2, 'i');
+---------------------------------------------------------------+
| regexp_count(Utf8("abcAbAbc"),Utf8("abc"),Int64(2),Utf8("i")) |
+---------------------------------------------------------------+
| 1 |
+---------------------------------------------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
standard_argument(name = "regexp", prefix = "Regular"),
argument(
name = "start",
description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function."
),
argument(
name = "flags",
description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#
)
)]
#[derive(Debug)]
pub struct RegexpCountFunc {
signature: Signature,
Expand Down Expand Up @@ -111,40 +139,10 @@ impl ScalarUDFImpl for RegexpCountFunc {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_count_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_regexp_count_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_REGEX,
"Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string.",
"regexp_count(str, regexp[, start, flags])")
.with_sql_example(r#"```sql
> select regexp_count('abcAbAbc', 'abc', 2, 'i');
+---------------------------------------------------------------+
| regexp_count(Utf8("abcAbAbc"),Utf8("abc"),Int64(2),Utf8("i")) |
+---------------------------------------------------------------+
| 1 |
+---------------------------------------------------------------+
```"#)
.with_standard_argument("str", Some("String"))
.with_standard_argument("regexp",Some("Regular"))
.with_argument("start", "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function.")
.with_argument("flags",
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
.build()
})
}

pub fn regexp_count_func(args: &[ArrayRef]) -> Result<ArrayRef> {
let args_len = args.len();
if !(2..=4).contains(&args_len) {
Expand Down
58 changes: 30 additions & 28 deletions datafusion/functions/src/regex/regexplike.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,18 @@ use datafusion_common::exec_err;
use datafusion_common::ScalarValue;
use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{internal_err, DataFusionError, Result};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;

use std::any::Any;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;

#[derive(Debug)]
pub struct RegexpLikeFunc {
signature: Signature,
}

impl Default for RegexpLikeFunc {
fn default() -> Self {
Self::new()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_regexp_like_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(DOC_SECTION_REGEX,"Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.","regexp_like(str, regexp[, flags])")
.with_sql_example(r#"```sql
#[user_doc(
doc_section(label = "Regular Expression Functions"),
description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
syntax_example = "regexp_like(str, regexp[, flags])",
sql_example = r#"```sql
select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+--------------------------------------------------------+
| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
Expand All @@ -63,18 +51,32 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
+--------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
.with_standard_argument("str", Some("String"))
.with_standard_argument("regexp", Some("Regular"))
.with_argument("flags",
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
"#,
standard_argument(name = "str", prefix = "String"),
standard_argument(name = "regexp", prefix = "Regular"),
argument(
name = "start",
description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function."
),
argument(
name = "flags",
description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
.build()
})
- **U**: swap the meaning of x* and x*?"#
)
)]
#[derive(Debug)]
pub struct RegexpLikeFunc {
signature: Signature,
}

impl Default for RegexpLikeFunc {
fn default() -> Self {
Self::new()
}
}

impl RegexpLikeFunc {
Expand Down Expand Up @@ -142,7 +144,7 @@ impl ScalarUDFImpl for RegexpLikeFunc {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_like_doc())
self.doc()
}
}

Expand Down
79 changes: 38 additions & 41 deletions datafusion/functions/src/regex/regexpmatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,47 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
};
use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
use datafusion_macros::user_doc;
use std::any::Any;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;

#[user_doc(
doc_section(label = "Regular Expression Functions"),
description = "Returns the first [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.",
syntax_example = "regexp_match(str, regexp[, flags])",
sql_example = r#"```sql
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#,
standard_argument(name = "str", prefix = "String"),
argument(
name = "regexp",
description = "Regular expression to match against. Can be a constant, column, or function."
),
argument(
name = "flags",
description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#
)
)]
#[derive(Debug)]
pub struct RegexpMatchFunc {
signature: Signature,
Expand Down Expand Up @@ -113,48 +148,10 @@ impl ScalarUDFImpl for RegexpMatchFunc {
}

fn documentation(&self) -> Option<&Documentation> {
Some(get_regexp_match_doc())
self.doc()
}
}

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

fn get_regexp_match_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_REGEX,
"Returns the first [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.",
"regexp_match(str, regexp[, flags])")
.with_sql_example(r#"```sql
> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+---------------------------------------------------------+
| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+---------------------------------------------------------+
| [Köln] |
+---------------------------------------------------------+
SELECT regexp_match('aBc', '(b|d)', 'i');
+---------------------------------------------------+
| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+---------------------------------------------------+
| [B] |
+---------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#)
.with_standard_argument("str", Some("String"))
.with_argument("regexp","Regular expression to match against.
Can be a constant, column, or function.")
.with_argument("flags",
r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
- **i**: case-insensitive: letters match both upper and lower case
- **m**: multi-line mode: ^ and $ match begin/end of line
- **s**: allow . to match \n
- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
- **U**: swap the meaning of x* and x*?"#)
.build()
})
}

fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => regexp_match::<i32>(args),
Expand Down
Loading
Loading