Skip to content

Commit

Permalink
Implementing Char for graphemes and all the necessary changes to make it happen. (#698)
Browse files Browse the repository at this point in the history

* feat!: add newtypes for working with graphemes

add `Grapheme`, `Graphemes` and `GraphemeIter` types

move the implementation of input traits to the `text` module

replace type `Input::Item` with `Grapheme`

implement seq traits for graphemes

* refactor!: update the `Char` trait

Remove `Char::Str` and everything related to it.

Remove a number of supertraits for the `Char` trait.

Replace `Char::to_char()` with `Char::to_ascii()`.

Remove `Char::from_ascii()`.

Add `Char::is_newline()`.

Remove the generic `C` from `StrInput`.

* feat: add a `Char` implementation for `Grapheme`

* fix: fix errors generated in features

* fix: remove the use of unstable features

* fix: fix backwards compatibility

* fix: fix `clippy` warnings

* refactor!: make `Grapheme` and `Graphemes` unsized

Replace their uses with references to them.

* fix: remove unused lifetime

in `src/text.rs:609:10`

* feat: implement `StrInput` for `&Graphemes`

* fix: fix `clippy` errors

* feat!: manually implement `Debug` and `Display` for graphemes
  • Loading branch information
Hedgehogo authored Jan 1, 2025
1 parent ed345d9 commit ae01819
Show file tree
Hide file tree
Showing 10 changed files with 636 additions and 288 deletions.
6 changes: 3 additions & 3 deletions benches/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,9 @@ fn bench_then(c: &mut Criterion) {

#[cfg(feature = "regex")]
fn bench_regex(c: &mut Criterion) {
let re_foo = regex::<_, _, extra::Default>("foo");
let re_foo2 = regex::<_, _, extra::Default>("[fF]oo");
let re_rep = regex::<_, _, extra::Default>("(?:abc){4}");
let re_foo = regex::<_, extra::Default>("foo");
let re_foo2 = regex::<_, extra::Default>("[fF]oo");
let re_rep = regex::<_, extra::Default>("(?:abc){4}");

let mut group = c.benchmark_group("regex");

Expand Down
8 changes: 4 additions & 4 deletions src/combinator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let row_4 = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let row_4 = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .at_most(4)
Expand Down Expand Up @@ -1661,7 +1661,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let coordinate_3d = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let coordinate_3d = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .exactly(3)
Expand Down Expand Up @@ -1690,7 +1690,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let r#enum = text::ascii::keyword::<_, _, _, extra::Err<Simple<char>>>("enum")
/// let r#enum = text::ascii::keyword::<_, _, extra::Err<Simple<char>>>("enum")
/// .padded()
/// .ignore_then(text::ascii::ident()
/// .padded()
Expand Down Expand Up @@ -1720,7 +1720,7 @@ where
///
/// ```
/// # use chumsky::prelude::*;
/// let numbers = text::int::<_, _, extra::Err<Simple<char>>>(10)
/// let numbers = text::int::<_, extra::Err<Simple<char>>>(10)
/// .padded()
/// .separated_by(just(','))
/// .allow_trailing()
Expand Down
68 changes: 65 additions & 3 deletions src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,7 @@ impl<'p> Seq<'p, char> for str {
}
}

impl<'p> Seq<'p, char> for &'p str {
impl<'p> Seq<'p, char> for String {
type Item<'a>
= char
where
Expand Down Expand Up @@ -792,7 +792,7 @@ impl<'p> Seq<'p, char> for &'p str {
}
}

impl<'p> Seq<'p, char> for String {
impl<'p> Seq<'p, char> for &'p str {
type Item<'a>
= char
where
Expand Down Expand Up @@ -822,6 +822,66 @@ impl<'p> Seq<'p, char> for String {
}
}

/// Lets a borrowed string slice be used as a sequence whose items are `&Grapheme`s.
///
/// Every method first wraps the string with `Graphemes::new` and then works on that view.
impl<'p> Seq<'p, &'p Grapheme> for &'p str {
    type Item<'a>
        = &'p Grapheme
    where
        Self: 'a;

    type Iter<'a>
        = GraphemesIter<'p>
    where
        Self: 'a;

    /// Returns the grapheme iterator of the wrapped string.
    #[inline(always)]
    fn seq_iter(&self) -> Self::Iter<'_> {
        let graphemes = Graphemes::new(self);
        graphemes.iter()
    }

    /// Reports whether `val` is found in the wrapped string, as decided by the
    /// `contains` of the `Graphemes` view.
    #[inline(always)]
    fn contains(&self, val: &&'p Grapheme) -> bool {
        let graphemes = Graphemes::new(self);
        graphemes.contains(val)
    }

    /// Items are already references that live for `'p`, so they are passed on by value.
    #[inline]
    fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
    where
        'p: 'b,
    {
        let grapheme: &'p Grapheme = item;
        MaybeRef::Val(grapheme)
    }
}

/// Lets a borrowed `Graphemes` be used as a sequence whose items are `&Grapheme`s.
impl<'p> Seq<'p, &'p Grapheme> for &'p Graphemes {
    type Item<'a>
        = &'p Grapheme
    where
        Self: 'a;

    type Iter<'a>
        = GraphemesIter<'p>
    where
        Self: 'a;

    /// Returns the iterator produced by `Graphemes::iter`.
    #[inline(always)]
    fn seq_iter(&self) -> Self::Iter<'_> {
        self.iter()
    }

    /// Scans the graphemes in order and stops at the first one equal to `val`.
    #[inline(always)]
    fn contains(&self, val: &&'p Grapheme) -> bool {
        for grapheme in self.iter() {
            if grapheme == *val {
                return true;
            }
        }
        false
    }

    /// Items are already references that live for `'p`, so they are passed on by value.
    #[inline]
    fn to_maybe_ref<'b>(item: Self::Item<'b>) -> MaybeRef<'p, &'p Grapheme>
    where
        'p: 'b,
    {
        let grapheme: &'p Grapheme = item;
        MaybeRef::Val(grapheme)
    }
}

/// A utility trait to abstract over *linear* container-like things.
///
/// This trait is likely to change in future versions of the crate, so avoid implementing it yourself.
Expand All @@ -838,8 +898,10 @@ impl<'p, T> OrderedSeq<'p, T> for core::ops::RangeInclusive<T> where Self: Seq<'
impl<'p, T> OrderedSeq<'p, T> for RangeFrom<T> where Self: Seq<'p, T> {}

impl OrderedSeq<'_, char> for str {}
impl<'p> OrderedSeq<'p, char> for &'p str {}
impl OrderedSeq<'_, char> for String {}
impl<'p> OrderedSeq<'p, char> for &'p str {}
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p str {}
impl<'p> OrderedSeq<'p, &'p Grapheme> for &'p Graphemes {}

#[cfg(test)]
mod test {
Expand Down
107 changes: 12 additions & 95 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ pub use crate::stream::{BoxedExactSizeStream, BoxedStream, IterInput, Stream};
use super::*;
#[cfg(feature = "std")]
use std::io::{BufReader, Read, Seek};
use unicode_segmentation::{Graphemes, UnicodeSegmentation};

/// A trait for types that represents a stream of input tokens. Unlike [`Iterator`], this type
/// supports backtracking and a few other features required by the crate.
Expand Down Expand Up @@ -210,8 +209,9 @@ pub trait SliceInput<'src>: ExactSizeInput<'src> {
// Implemented by inputs that reference a string slice and use byte indices as their cursor. This trait is sealed right
// now because `StrInput` places additional requirements on its cursor semantics.
/// A trait for types that represent string-like streams of input tokens.
pub trait StrInput<'src, C: Char>:
Sealed + ValueInput<'src, Cursor = usize, Token = C> + SliceInput<'src, Slice = &'src C::Str>
pub trait StrInput<'src>: Sealed + ValueInput<'src, Cursor = usize> + SliceInput<'src>
where
Self::Token: Char,
{
}

Expand Down Expand Up @@ -298,7 +298,7 @@ impl<'src> ValueInput<'src> for &'src str {
}

impl Sealed for &str {}
impl<'src> StrInput<'src, char> for &'src str {}
impl<'src> StrInput<'src> for &'src str {}

impl<'src> SliceInput<'src> for &'src str {
type Slice = &'src str;
Expand All @@ -319,89 +319,6 @@ impl<'src> SliceInput<'src> for &'src str {
}
}

/// `Input` implementation for the `unicode_segmentation` `Graphemes` iterator.
///
/// Tokens are `&'src str` slices, one per grapheme cluster, and the cursor is a byte offset
/// into the string stored in the cache.
impl<'src> Input<'src> for Graphemes<'src> {
// Byte offset into the cached string.
type Cursor = usize;
type Span = SimpleSpan<usize>;

// Tokens are handed out by value in both forms: each one is a sub-slice of the input.
type Token = &'src str;
type MaybeToken = &'src str;

// The string the iterator was viewing when `begin` was called.
type Cache = &'src str;

/// Starts at offset 0 of the string reported by `Graphemes::as_str()`.
#[inline]
fn begin(self) -> (Self::Cursor, Self::Cache) {
(0, self.as_str())
}

/// The cursor already is the location, so it is returned unchanged.
#[inline]
fn cursor_location(cursor: &Self::Cursor) -> usize {
*cursor
}

/// Returns the grapheme cluster starting at `*cursor` and advances the cursor past it, or
/// returns `None` once the cursor has reached the end of the cached string.
///
/// # Safety
///
/// The body slices the string without checks, so `*cursor` has to be an offset at which
/// slicing the string is valid (see the `SAFETY` comment below).
#[inline(always)]
unsafe fn next_maybe(
this: &mut Self::Cache,
cursor: &mut Self::Cursor,
) -> Option<Self::MaybeToken> {
if *cursor < this.len() {
// SAFETY: `cursor < self.len()` above guarantees cursor is in-bounds
// We only ever return cursors that are at a code point boundary.
// The `next()` implementation returns `None`, only in the
// situation of zero length of the remaining part of the string.
// And the Unicode standard guarantees that any sequence of code
// points is a valid sequence of grapheme clusters, so the
// behaviour of the `next()` function should not change.
// `graphemes(true)` re-segments the remaining tail on every call; only its first
// cluster is consumed.
let c = this
.get_unchecked(*cursor..)
.graphemes(true)
.next()
.unwrap_unchecked();
// Advance by the byte length of the cluster just read.
*cursor += c.len();
Some(c)
} else {
None
}
}

/// Builds the span from the two cursor values alone; the cache is not consulted.
#[inline(always)]
unsafe fn span(_this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Span {
(*range.start..*range.end).into()
}
}

/// The cached string has a known length, so an open-ended range can be closed at its end.
impl<'src> ExactSizeInput<'src> for Graphemes<'src> {
    /// Builds the span running from `range.start` to the byte length of the cached string.
    #[inline(always)]
    unsafe fn span_from(this: &mut Self::Cache, range: RangeFrom<&Self::Cursor>) -> Self::Span {
        let start = *range.start;
        let end = this.len();
        (start..end).into()
    }
}

impl<'src> ValueInput<'src> for Graphemes<'src> {
#[inline(always)]
unsafe fn next(this: &mut Self::Cache, cursor: &mut Self::Cursor) -> Option<Self::Token> {
Self::next_maybe(this, cursor)
}
}

impl<'src> SliceInput<'src> for Graphemes<'src> {
type Slice = Graphemes<'src>;

#[inline(always)]
fn full_slice(this: &mut Self::Cache) -> Self::Slice {
this.graphemes(true)
}

#[inline(always)]
unsafe fn slice(this: &mut Self::Cache, range: Range<&Self::Cursor>) -> Self::Slice {
this[*range.start..*range.end].graphemes(true)
}

#[inline(always)]
unsafe fn slice_from(this: &mut Self::Cache, from: RangeFrom<&Self::Cursor>) -> Self::Slice {
this[*from.start..].graphemes(true)
}
}

impl<'src, T> Input<'src> for &'src [T] {
type Cursor = usize;
type Span = SimpleSpan<usize>;
Expand Down Expand Up @@ -448,7 +365,7 @@ impl<'src, T> ExactSizeInput<'src> for &'src [T] {
}

impl Sealed for &[u8] {}
impl<'src> StrInput<'src, u8> for &'src [u8] {}
impl<'src> StrInput<'src> for &'src [u8] {}

impl<'src, T> SliceInput<'src> for &'src [T] {
type Slice = &'src [T];
Expand Down Expand Up @@ -532,7 +449,7 @@ impl<'src, T: 'src, const N: usize> ExactSizeInput<'src> for &'src [T; N] {
}

impl<const N: usize> Sealed for &[u8; N] {}
impl<'src, const N: usize> StrInput<'src, u8> for &'src [u8; N] {}
impl<'src, const N: usize> StrInput<'src> for &'src [u8; N] {}

impl<'src, T: 'src, const N: usize> SliceInput<'src> for &'src [T; N] {
type Slice = &'src [T];
Expand Down Expand Up @@ -881,14 +798,14 @@ where
F: Fn(I::Span) -> S,
{
}
impl<'src, C, S, I, F: 'src> StrInput<'src, C> for MappedSpan<S, I, F>
impl<'src, S, I, F: 'src> StrInput<'src> for MappedSpan<S, I, F>
where
I: StrInput<'src, C>,
I: StrInput<'src>,
I::Token: Char,
S: Span + Clone + 'src,
S::Context: Clone + 'src,
S::Offset: From<<I::Span as Span>::Offset>,
F: Fn(I::Span) -> S,
C: Char,
{
}

Expand Down Expand Up @@ -1027,13 +944,13 @@ where
S::Offset: From<<I::Span as Span>::Offset>,
{
}
impl<'src, C, S, I> StrInput<'src, C> for WithContext<S, I>
impl<'src, S, I> StrInput<'src> for WithContext<S, I>
where
I: StrInput<'src, C>,
I: StrInput<'src>,
I::Token: Char,
S: Span + Clone + 'src,
S::Context: Clone + 'src,
S::Offset: From<<I::Span as Span>::Offset>,
C: Char,
{
}

Expand Down
Loading

0 comments on commit ae01819

Please sign in to comment.