apache · Chen-Yuan-Lai · Dec 25, 2024 · Jan 3, 2025 · alamb · Dec 28, 2024
diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs
@@ -32,13 +32,27 @@ use datafusion_common::{
 use datafusion_common::{exec_err, ScalarValue};
 use datafusion_common::{DataFusionError, Result};
 use datafusion_expr::{ColumnarValue, Documentation};
-use std::sync::{Arc, OnceLock};
+use std::sync::Arc;
 use std::{fmt, str::FromStr};
 
-use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
 use std::any::Any;
 
+#[user_doc(
+    doc_section(label = "Binary String Functions"),
+    description = "Encode binary data into a textual representation.",
+    syntax_example = "encode(expression, format)",
+    argument(
+        name = "expression",
+        description = "Expression containing string or binary data"
+    ),
+    argument(
+        name = "format",
+        description = "Supported formats are: `base64`, `hex`"
+    ),
+    related_udf(name = "decode")
+)]
 #[derive(Debug)]
 pub struct EncodeFunc {
     signature: Signature,
@@ -58,22 +72,6 @@ impl EncodeFunc {
     }
 }
 
-static ENCODE_DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
-
-fn get_encode_doc() -> &'static Documentation {
-    ENCODE_DOCUMENTATION.get_or_init(|| {
-        Documentation::builder(
-            DOC_SECTION_BINARY_STRING,
-            "Encode binary data into a textual representation.",
-            "encode(expression, format)",
-        )
-        .with_argument("expression", "Expression containing string or binary data")
-        .with_argument("format", "Supported formats are: `base64`, `hex`")
-        .with_related_udf("decode")
-        .build()
-    })
-}
-
 impl ScalarUDFImpl for EncodeFunc {
     fn as_any(&self) -> &dyn Any {
         self
@@ -126,10 +124,21 @@ impl ScalarUDFImpl for EncodeFunc {
     }
 
     fn documentation(&self) -> Option<&Documentation> {
-        Some(get_encode_doc())
+        self.doc()
     }
 }
 
+#[user_doc(
+    doc_section(label = "Binary String Functions"),
+    description = "Decode binary data from textual representation in string.",
+    syntax_example = "decode(expression, format)",
+    argument(
+        name = "expression",
+        description = "Expression containing encoded string data"
+    ),
+    argument(name = "format", description = "Same arguments as [encode](#encode)"),
+    related_udf(name = "encode")
+)]
 #[derive(Debug)]
 pub struct DecodeFunc {
     signature: Signature,
@@ -149,22 +158,6 @@ impl DecodeFunc {
     }
 }
 
-static DECODE_DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
-
-fn get_decode_doc() -> &'static Documentation {
-    DECODE_DOCUMENTATION.get_or_init(|| {
-        Documentation::builder(
-            DOC_SECTION_BINARY_STRING,
-            "Decode binary data from textual representation in string.",
-            "decode(expression, format)",
-        )
-        .with_argument("expression", "Expression containing encoded string data")
-        .with_argument("format", "Same arguments as [encode](#encode)")
-        .with_related_udf("encode")
-        .build()
-    })
-}
-
 impl ScalarUDFImpl for DecodeFunc {
     fn as_any(&self) -> &dyn Any {
         self
@@ -217,7 +210,7 @@ impl ScalarUDFImpl for DecodeFunc {
     }
 
     fn documentation(&self) -> Option<&Documentation> {
-        Some(get_decode_doc())
+        self.doc()
     }
 }
 

diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs
@@ -23,17 +23,45 @@ use arrow::datatypes::{
 };
 use arrow::error::ArrowError;
 use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
-use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
 use datafusion_expr::{
     ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature::Exact,
     TypeSignature::Uniform, Volatility,
 };
+use datafusion_macros::user_doc;
 use itertools::izip;
 use regex::Regex;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
-use std::sync::{Arc, OnceLock};
+use std::sync::Arc;
 
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string.",
+    syntax_example = "regexp_count(str, regexp[, start, flags])",
+    sql_example = r#"```sql
+> select regexp_count('abcAbAbc', 'abc', 2, 'i');
++---------------------------------------------------------------+
+| regexp_count(Utf8("abcAbAbc"),Utf8("abc"),Int64(2),Utf8("i")) |
++---------------------------------------------------------------+
+| 1                                                             |
++---------------------------------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    )
+)]
 #[derive(Debug)]
 pub struct RegexpCountFunc {
     signature: Signature,
@@ -111,40 +139,10 @@ impl ScalarUDFImpl for RegexpCountFunc {
     }
 
     fn documentation(&self) -> Option<&Documentation> {
-        Some(get_regexp_count_doc())
+        self.doc()
     }
 }
 
-static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
-
-fn get_regexp_count_doc() -> &'static Documentation {
-    DOCUMENTATION.get_or_init(|| {
-        Documentation::builder(
-            DOC_SECTION_REGEX,
-            "Returns the number of matches that a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has in a string.",
-            "regexp_count(str, regexp[, start, flags])")
-            .with_sql_example(r#"```sql
-> select regexp_count('abcAbAbc', 'abc', 2, 'i');
-+---------------------------------------------------------------+
-| regexp_count(Utf8("abcAbAbc"),Utf8("abc"),Int64(2),Utf8("i")) |
-+---------------------------------------------------------------+
-| 1                                                             |
-+---------------------------------------------------------------+
-```"#)
-            .with_standard_argument("str", Some("String"))
-            .with_standard_argument("regexp",Some("Regular"))
-            .with_argument("start", "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function.")
-            .with_argument("flags",
-                           r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
-  - **i**: case-insensitive: letters match both upper and lower case
-  - **m**: multi-line mode: ^ and $ match begin/end of line
-  - **s**: allow . to match \n
-  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
-  - **U**: swap the meaning of x* and x*?"#)
-            .build()
-    })
-}
-
 pub fn regexp_count_func(args: &[ArrayRef]) -> Result<ArrayRef> {
     let args_len = args.len();
     if !(2..=4).contains(&args_len) {

diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
@@ -25,30 +25,18 @@ use datafusion_common::exec_err;
 use datafusion_common::ScalarValue;
 use datafusion_common::{arrow_datafusion_err, plan_err};
 use datafusion_common::{internal_err, DataFusionError, Result};
-use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
 use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
 
 use std::any::Any;
-use std::sync::{Arc, OnceLock};
+use std::sync::Arc;
 
-#[derive(Debug)]
-pub struct RegexpLikeFunc {
-    signature: Signature,
-}
-
-impl Default for RegexpLikeFunc {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
-
-fn get_regexp_like_doc() -> &'static Documentation {
-    DOCUMENTATION.get_or_init(|| {
-        Documentation::builder(DOC_SECTION_REGEX,"Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.","regexp_like(str, regexp[, flags])")
-            .with_sql_example(r#"```sql
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
+    syntax_example = "regexp_like(str, regexp[, flags])",
+    sql_example = r#"```sql
 select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
 +--------------------------------------------------------+
 | regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
@@ -63,18 +51,32 @@ SELECT regexp_like('aBc', '(b|d)', 'i');
 +--------------------------------------------------+
 ```
 Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
-"#)
-            .with_standard_argument("str", Some("String"))
-            .with_standard_argument("regexp", Some("Regular"))
-            .with_argument("flags",
-                           r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+"#,
+    standard_argument(name = "str", prefix = "String"),
+    standard_argument(name = "regexp", prefix = "Regular"),
+    argument(
+        name = "start",
+        description = "- **start**: Optional start position (the first position is 1) to search for the regular expression. Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
   - **i**: case-insensitive: letters match both upper and lower case
   - **m**: multi-line mode: ^ and $ match begin/end of line
   - **s**: allow . to match \n
   - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
-  - **U**: swap the meaning of x* and x*?"#)
-            .build()
-    })
+  - **U**: swap the meaning of x* and x*?"#
+    )
+)]
+#[derive(Debug)]
+pub struct RegexpLikeFunc {
+    signature: Signature,
+}
+
+impl Default for RegexpLikeFunc {
+    fn default() -> Self {
+        Self::new()
+    }
 }
 
 impl RegexpLikeFunc {
@@ -142,7 +144,7 @@ impl ScalarUDFImpl for RegexpLikeFunc {
     }
 
     fn documentation(&self) -> Option<&Documentation> {
-        Some(get_regexp_like_doc())
+        self.doc()
     }
 }
 

diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
@@ -26,12 +26,47 @@ use datafusion_common::{arrow_datafusion_err, plan_err};
 use datafusion_common::{
     cast::as_generic_string_array, internal_err, DataFusionError, Result,
 };
-use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX;
 use datafusion_expr::{ColumnarValue, Documentation, TypeSignature};
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use datafusion_macros::user_doc;
 use std::any::Any;
-use std::sync::{Arc, OnceLock};
+use std::sync::Arc;
 
+#[user_doc(
+    doc_section(label = "Regular Expression Functions"),
+    description = "Returns the first [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.",
+    syntax_example = "regexp_match(str, regexp[, flags])",
+    sql_example = r#"```sql
+> select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
++---------------------------------------------------------+
+| regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
++---------------------------------------------------------+
+| [Köln]                                                  |
++---------------------------------------------------------+
+SELECT regexp_match('aBc', '(b|d)', 'i');
++---------------------------------------------------+
+| regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
++---------------------------------------------------+
+| [B]                                               |
++---------------------------------------------------+
+```
+Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
+"#,
+    standard_argument(name = "str", prefix = "String"),
+    argument(
+        name = "regexp",
+        description = "Regular expression to match against. Can be a constant, column, or function."
+    ),
+    argument(
+        name = "flags",
+        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
+  - **i**: case-insensitive: letters match both upper and lower case
+  - **m**: multi-line mode: ^ and $ match begin/end of line
+  - **s**: allow . to match \n
+  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
+  - **U**: swap the meaning of x* and x*?"#
+    )
+)]
 #[derive(Debug)]
 pub struct RegexpMatchFunc {
     signature: Signature,
@@ -113,48 +148,10 @@ impl ScalarUDFImpl for RegexpMatchFunc {
     }
 
     fn documentation(&self) -> Option<&Documentation> {
-        Some(get_regexp_match_doc())
+        self.doc()
     }
 }
 
-static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
-
-fn get_regexp_match_doc() -> &'static Documentation {
-    DOCUMENTATION.get_or_init(|| {
-        Documentation::builder(
-            DOC_SECTION_REGEX,
-            "Returns the first [regular expression](https://docs.rs/regex/latest/regex/#syntax) matches in a string.",
-            "regexp_match(str, regexp[, flags])")
-            .with_sql_example(r#"```sql
-            > select regexp_match('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
-            +---------------------------------------------------------+
-            | regexp_match(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
-            +---------------------------------------------------------+
-            | [Köln]                                                  |
-            +---------------------------------------------------------+
-            SELECT regexp_match('aBc', '(b|d)', 'i');
-            +---------------------------------------------------+
-            | regexp_match(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
-            +---------------------------------------------------+
-            | [B]                                               |
-            +---------------------------------------------------+
-```
-Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
-"#)
-            .with_standard_argument("str", Some("String"))
-            .with_argument("regexp","Regular expression to match against.
-            Can be a constant, column, or function.")
-            .with_argument("flags",
-                           r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
-  - **i**: case-insensitive: letters match both upper and lower case
-  - **m**: multi-line mode: ^ and $ match begin/end of line
-  - **s**: allow . to match \n
-  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
-  - **U**: swap the meaning of x* and x*?"#)
-            .build()
-    })
-}
-
 fn regexp_match_func(args: &[ArrayRef]) -> Result<ArrayRef> {
     match args[0].data_type() {
         DataType::Utf8 => regexp_match::<i32>(args),