Skip to content

Commit

Permalink
feature: support posix classes in Re.Posix
Browse files Browse the repository at this point in the history
Signed-off-by: Rudi Grinberg <[email protected]>

<!-- ps-id: 0d54478a-0925-4ca9-bf78-c0d887a94d80 -->
  • Loading branch information
rgrinberg committed Apr 17, 2024
1 parent 07de6f6 commit 4de7ecd
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 66 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Unreleased
----------
* Add `Re.split_delim` (#233)
* Fix handling of empty matches in splitting and substitution functions (#233)
* Add support for character classes in `Re.Posix` (#263)

1.11.0 (19-Aug-2023)
--------------------
Expand Down
54 changes: 11 additions & 43 deletions lib/perl.ml
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,13 @@ module Re = Core
exception Parse_error = Parse_buffer.Parse_error
exception Not_supported

let posix_class_of_string = function
| "alpha" -> Re.alpha
| "alnum" -> Re.alnum
| "ascii" -> Re.ascii
| "blank" -> Re.blank
| "cntrl" -> Re.cntrl
| "digit" -> Re.digit
| "lower" -> Re.lower
| "print" -> Re.print
| "space" -> Re.space
| "upper" -> Re.upper
| "word" -> Re.wordc
| "punct" -> Re.punct
| "graph" -> Re.graph
| "xdigit" -> Re.xdigit
| class_ -> invalid_arg ("Invalid pcre class: " ^ class_)

let posix_class_strings =
[ "alpha" ; "alnum" ; "ascii"
; "blank" ; "cntrl" ; "digit"
; "lower" ; "print" ; "space"
; "upper" ; "word" ; "punct"
; "graph" ; "xdigit" ]

let parse multiline dollar_endonly dotall ungreedy s =
let buf = Parse_buffer.create s in
let accept = Parse_buffer.accept buf in
let eos () = Parse_buffer.eos buf in
let test c = Parse_buffer.test buf c in
let unget () = Parse_buffer.unget buf in
let get () = Parse_buffer.get buf in
let accept_s = Parse_buffer.accept_s buf in
let greedy_mod r =
let gr = accept '?' in
let gr = if ungreedy then not gr else gr in
Expand Down Expand Up @@ -222,24 +197,17 @@ let parse multiline dollar_endonly dotall ungreedy s =
let c = get () in
if c = '[' then begin
if accept '=' then raise Not_supported;
if accept ':' then
let compl = accept '^' in
let cls =
try List.find accept_s posix_class_strings
with Not_found -> raise Parse_error in
if not (accept_s ":]") then raise Parse_error;
let re =
let posix_class = posix_class_of_string cls in
if compl then Re.compl [posix_class] else posix_class in
`Set (re)
else if accept '.' then begin
if eos () then raise Parse_error;
let c = get () in
if not (accept '.') then raise Not_supported;
if not (accept ']') then raise Parse_error;
`Char c
end else
`Char c
match Posix_class.parse buf with
| Some set -> `Set set
| None ->
if accept '.' then begin
if eos () then raise Parse_error;
let c = get () in
if not (accept '.') then raise Not_supported;
if not (accept ']') then raise Parse_error;
`Char c
end else
`Char c
end else if c = '\\' then begin
if eos () then raise Parse_error;
let c = get () in
Expand Down
45 changes: 25 additions & 20 deletions lib/posix.ml
Original file line number Diff line number Diff line change
Expand Up @@ -97,32 +97,37 @@ let parse newline s =
end
and bracket s =
if s <> [] && accept ']' then s else begin
let c = char () in
if accept '-' then begin
if accept ']' then Re.char c :: Re.char '-' :: s else begin
let c' = char () in
bracket (Re.rg c c' :: s)
end
end else
bracket (Re.char c :: s)
match char () with
| `Char c ->
if accept '-' then begin
if accept ']' then Re.char c :: Re.char '-' :: s else begin
match char () with
`Char c' ->
bracket (Re.rg c c' :: s)
| `Set st' ->
bracket (Re.char c :: Re.char '-' :: st' :: s)
end
end else
bracket (Re.char c :: s)
| `Set st -> bracket (st :: s)
end
and char () =
if eos () then raise Parse_error;
let c = get () in
if c = '[' then begin
if accept '=' then raise Not_supported
else if accept ':' then begin
raise Not_supported (*XXX*)
end else if accept '.' then begin
if eos () then raise Parse_error;
let c = get () in
if not (accept '.') then raise Not_supported;
if not (accept ']') then raise Parse_error;
c
end else
c
match Posix_class.parse buf with
| Some set -> `Set set
| None ->
if accept '.' then begin
if eos () then raise Parse_error;
let c = get () in
if not (accept '.') then raise Not_supported;
if not (accept ']') then raise Parse_error;
`Char c
end else
`Char c
end else
c
`Char c
in
let res = regexp () in
if not (eos ()) then raise Parse_error;
Expand Down
40 changes: 40 additions & 0 deletions lib/posix_class.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
module Re = Core

let of_name = function
| "alpha" -> Re.alpha
| "alnum" -> Re.alnum
| "ascii" -> Re.ascii
| "blank" -> Re.blank
| "cntrl" -> Re.cntrl
| "digit" -> Re.digit
| "lower" -> Re.lower
| "print" -> Re.print
| "space" -> Re.space
| "upper" -> Re.upper
| "word" -> Re.wordc
| "punct" -> Re.punct
| "graph" -> Re.graph
| "xdigit" -> Re.xdigit
| class_ -> invalid_arg ("Invalid pcre class: " ^ class_)

let names =
[ "alpha" ; "alnum" ; "ascii"
; "blank" ; "cntrl" ; "digit"
; "lower" ; "print" ; "space"
; "upper" ; "word" ; "punct"
; "graph" ; "xdigit" ]

let parse buf =
let accept = Parse_buffer.accept buf in
let accept_s = Parse_buffer.accept_s buf in
match accept ':' with
| false -> None
| true ->
let compl = accept '^' in
let cls =
try List.find accept_s names
with Not_found -> raise Parse_buffer.Parse_error
in
if not (accept_s ":]") then raise Parse_buffer.Parse_error;
let posix_class = of_name cls in
Some (if compl then Re.compl [posix_class] else posix_class)
3 changes: 3 additions & 0 deletions lib/posix_class.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
val names : string list
val of_name : string -> Core.t
val parse : Parse_buffer.t -> Core.t option
10 changes: 7 additions & 3 deletions lib_test/test_posix.ml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
open OUnit

let execp = Re.execp

let test_class_space () =
match Re.Posix.compile_pat {|a[[:space:]]b|} with
| exception Re.Posix.Not_supported -> ()
| (_ : Re.re) -> assert false
let re = Re.Posix.compile_pat {|a[[:space:]]b|} in
let exec = Re.execp re in
assert_bool "matches with space" (exec "a b");
assert_bool "does not match without a space" (not (exec "ab"));
assert_bool "does not match with a different char" (not (exec "a_b"))

let suite = "posix" >:::
[ "regression 213" >:: test_class_space
Expand Down

0 comments on commit 4de7ecd

Please sign in to comment.