diff --git a/lib/core/locale.nit b/lib/core/locale.nit new file mode 100644 index 0000000000..6cec3ba62f --- /dev/null +++ b/lib/core/locale.nit @@ -0,0 +1,41 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Access to the name of the default locale, as reported by ICU +module locale is pkgconfig ("icu-io", "icu-i18n", "icu-uc") + +`{ + #include <unicode/uloc.h> + #include <string.h> +`} + +# Copies the name of ICU's default locale into `locale`, unless `locale` is a NULL pointer. +# Returns the length of that name in bytes, not counting the terminating NUL. +private fun get_default_locale(locale: CString): Int `{ + const char * default_locale = uloc_getDefault(); + if (locale != NULL) { + strcpy(locale, default_locale); + } + return strlen(default_locale); +`} + +redef class Sys + # Name of the default locale as reported by ICU's `uloc_getDefault` + fun default_locale: String do + var required_length = get_default_locale(new CString.nul) + var locale = new CString(required_length + 1) + get_default_locale(locale) + return locale.to_s + end +end diff --git a/lib/core/text/case_modification.nit b/lib/core/text/case_modification.nit new file mode 100644 index 0000000000..3c6d9b346a --- /dev/null +++ b/lib/core/text/case_modification.nit @@ -0,0 +1,114 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Locale-sensitive case modification (lowercasing, uppercasing and titlecasing) +module case_modification is pkgconfig ("icu-io", "icu-i18n", "icu-uc") + +intrude import u16_string +import locale + +`{ + #include <unicode/utypes.h> + #include <unicode/ustring.h> +`} + +redef class U16String + # Returns an uppercased `U16String` from `self` considering a `locale` + fun uni_to_upper(locale : CString) : U16String + do + var required_length = uchar_string.uni_to_upper(new UCharString.nul, 0, locale, code_units) + var result = new U16String(required_length + 1, required_length + 1) + uchar_string.uni_to_upper(result.uchar_string, result.capacity , locale, code_units) + return result + end + + # Returns a lowercased `U16String` from `self` considering a `locale` + fun uni_to_lower(locale : CString) : U16String + do + var required_length = uchar_string.uni_to_lower(new UCharString.nul, 0, locale, code_units) + var result = new U16String(required_length + 1, required_length + 1) + uchar_string.uni_to_lower(result.uchar_string, result.capacity , locale, code_units) + return result + end + + # Returns a titlecased `U16String` from `self` considering a `locale` + fun uni_to_title(locale : CString) : U16String + do + var required_length = uchar_string.uni_to_title(new UCharString.nul, 0, locale, code_units) + var result = new U16String(required_length + 1, required_length + 1) + uchar_string.uni_to_title(result.uchar_string, result.capacity , locale, code_units) + return result + end +end + +redef class UCharString + # Returns the number of code units required for the uppercased `UCharString` from 
`self` and writes the resulting `UCharString` in `dest` + fun uni_to_upper(dest : UCharString, dest_cap : Int, locale : CString, src_length : Int) : Int `{ + UErrorCode error = U_ZERO_ERROR; + int32_t res = u_strToUpper(dest, dest_cap, self, src_length, locale, &error); + return res; + `} + + # Returns the number of code units required for the lowercased `UCharString` from `self` and writes the resulting `UCharString` in `dest` + fun uni_to_lower(dest : UCharString, dest_cap : Int, locale : CString, src_length : Int) : Int `{ + UErrorCode error = U_ZERO_ERROR; + int32_t res = u_strToLower(dest, dest_cap, self, src_length, locale, &error); + return res; + `} + + # Returns the number of code units required for the titlecased `UCharString` from `self` and writes the resulting `UCharString` in `dest` + fun uni_to_title(dest : UCharString, dest_cap : Int, locale : CString, src_length : Int) : Int `{ + UErrorCode error = U_ZERO_ERROR; + int32_t res = u_strToTitle(dest, dest_cap, self, src_length, NULL, locale, &error); + return res; + `} +end + +redef class String + # Returns an uppercased `String` from `self` considering an optional `locale` parameter (`default_locale` is used when `locale` is null) + # + # ~~~raw + # assert "kedi".uni_to_upper("tr_TR") == "KEDİ" + # assert "kedi".uni_to_upper("en_US") != "KEDİ" + # ~~~ + fun uni_to_upper(locale : nullable String) : String + do + if locale == null then locale = default_locale + return to_u16string.uni_to_upper(locale.to_cstring).to_s + end + + # Returns a lowercased `String` from `self` considering an optional `locale` parameter (`default_locale` is used when `locale` is null) + # + # ~~~raw + # assert "YAZIM".uni_to_lower("tr_TR") == "yazım" + # assert "YAZIM".uni_to_lower("en_US") != "yazım" + # ~~~ + fun uni_to_lower(locale : nullable String) : String + do + if locale == null then locale = default_locale + return to_u16string.uni_to_lower(locale.to_cstring).to_s + end + + # Returns a titlecased `String` from `self` considering an optional `locale` parameter (`default_locale` is used when `locale` is null) + # + # ~~~raw + # assert 
"istanbul".uni_to_title("tr_TR") == "İstanbul" + # assert "istanbul".uni_to_title("en_US") != "İstanbul" + # ~~~ + fun uni_to_title(locale : nullable String) : String + do + if locale == null then locale = default_locale + return to_u16string.uni_to_title(locale.to_cstring).to_s + end +end diff --git a/lib/core/text/formatting.nit b/lib/core/text/formatting.nit new file mode 100644 index 0000000000..a229adb2e8 --- /dev/null +++ b/lib/core/text/formatting.nit @@ -0,0 +1,181 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Locale-sensitive number and date formatting module +module formatting is pkgconfig ("icu-io", "icu-i18n", "icu-uc") + +intrude import u16_string +import date +import locale + +`{ + #include <unicode/unum.h> + #include <unicode/udat.h> + #include <unicode/ustring.h> + #include <string.h> + #include <time.h> +`} + +# Wrapper for ICU's `UNumberFormatStyle` enum. +# There are 6 styles available : decimal, percent, scientific, ordinal, currency and spellout. 
+extern class NumberFormatStyle `{ UNumberFormatStyle `} + new (style : CString) `{ + if (!strcmp(style, "decimal")) { + return UNUM_DECIMAL; + } else if (!strcmp(style, "percent")) { + return UNUM_PERCENT; + } else if (!strcmp(style, "scientific")) { + return UNUM_SCIENTIFIC; + } else if (!strcmp(style, "ordinal")) { + return UNUM_ORDINAL; + } else if (!strcmp(style, "currency")) { + return UNUM_CURRENCY; + } else if (!strcmp(style, "spellout")) { + return UNUM_SPELLOUT; + } else { + return UNUM_DEFAULT; + } + `} +end + +# Wrapper for ICU's `UDateFormatStyle` enum. +# There are 4 styles available : full, long, medium and short. +extern class DateFormatStyle `{ UDateFormatStyle `} + new (style : CString) `{ + if (!strcmp(style, "full")) { + return UDAT_FULL; + } else if (!strcmp(style, "long")) { + return UDAT_LONG; + } else if (!strcmp(style, "medium")) { + return UDAT_MEDIUM; + } else if (!strcmp(style, "short")) { + return UDAT_SHORT; + } else { + return UDAT_DEFAULT; + } + `} +end + +# Wrapper for ICU's `UNumberFormat` structure +extern class NumberFormatter `{ UNumberFormat * `} + new (locale_name : CString, style : NumberFormatStyle) `{ + UErrorCode error = U_ZERO_ERROR; + UNumberFormat* numberFormatter = unum_open(style, NULL, -1, locale_name, NULL, &error); + return numberFormatter; + `} + + # Formats `number` into `dest` (of capacity `dest_length`) and returns the length reported by `unum_formatDouble` + fun format (number : Float, dest : UCharString, dest_length : Int) : Int `{ + UErrorCode error = U_ZERO_ERROR; + UFieldPosition pos = {0}; /* ICU reads `pos.field` as input, so it must not be left uninitialized */ + + int32_t required_length = unum_formatDouble(self, number, dest, dest_length, &pos, &error); + return required_length; + `} + + redef fun free `{ + unum_close(self); + `} +end + +# Wrapper for ICU's `UDateFormat` structure +extern class TimeFormatter `{ UDateFormat *`} + new (locale_name : CString, style : DateFormatStyle) `{ + UErrorCode error = U_ZERO_ERROR; + UDateFormat * dateFormatter = udat_open(style, UDAT_NONE, locale_name, NULL, -1, NULL, -1, &error); + return dateFormatter; 
+ `} + + # Formats `time` (in milliseconds) into `dest` (of capacity `dest_length`) and returns the length reported by `udat_format` + fun format(time : Int, dest : UCharString, dest_length : Int) : Int `{ + UErrorCode error = U_ZERO_ERROR; + extern long timezone; + tzset(); + UDate date = time + timezone * 1000; + + int32_t required_length = udat_format(self, date, dest, dest_length, NULL, &error); + return required_length; + `} + + redef fun free `{ + udat_close(self); + `} +end + +redef class Float + # Returns a formatted `String` from `self` considering a locale and a style. + # There are 6 styles available : decimal, percent, scientific, ordinal, currency and spellout. + # ~~~raw + # assert 12.7.format("spellout", "en_US") == "twelve point seven" + # ~~~ + fun format(style : String, locale_name : nullable String) : String + do + if locale_name == null then locale_name = default_locale + + var nf = new NumberFormatter(locale_name.to_cstring, new NumberFormatStyle(style.to_cstring)) + var required_length = nf.format(self, new UCharString.nul, 0) + var result = new U16String(required_length + 1, required_length + 1) + nf.format(self, result.uchar_string, result.capacity) + nf.free + return result.to_s + end +end + +redef class Int + # Returns a formatted `String` from `self` considering a locale and a style. + # There are 6 styles available : decimal, percent, scientific, ordinal, currency and spellout. + # ~~~raw + # assert 12.format("spellout", "en_US") == "twelve" + # ~~~ + fun format(style : String, locale_name : nullable String) : String + do + return to_f.format(style, locale_name) + end +end + +redef class Time + redef fun to_s : String do return format("", default_locale) + + # Returns the number of milliseconds in `self` + # ~~~raw + # var time = new Time(5,5,5) + # assert time.to_ms == 18305000 + # ~~~ + fun to_ms : Int + do + var h = hour * 60 * 60 * 1000 + var m = minute * 60 * 1000 + var s = second * 1000 + + return h + m + s + end + + # Returns a formatted `String` from `self` considering a locale and a style. 
+ # There are 4 styles available : full, long, medium and short. + # ~~~raw + # var time = new Time(5,5,5) + # assert time.format("full", "en_US") == "5:05:05 AM" + # ~~~ + fun format(style : String, locale_name : nullable String) : String + do + if locale_name == null then locale_name = default_locale + + var df = new TimeFormatter(locale_name.to_cstring, new DateFormatStyle(style.to_cstring)) + var required_length = df.format(to_ms, new UCharString.nul, 0) + var result = new U16String(required_length + 1, required_length + 1) + df.format(to_ms, result.uchar_string, required_length + 1) + df.free + return result.to_s + end +end diff --git a/lib/core/text/native.nit b/lib/core/text/native.nit index 372ac5a301..6439efe01a 100644 --- a/lib/core/text/native.nit +++ b/lib/core/text/native.nit @@ -1,325 +1,328 @@ -# This file is part of NIT ( http://www.nitlanguage.org ). -# -# This file is free software, which comes along with NIT. This software is -# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; -# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. You can modify it is you want, provided this header -# is kept unaltered, and a notification of the changes is added. -# You are allowed to redistribute it and sell it, alone or is a part of -# another product. 
- -# Native structures for text and bytes -module native - -import kernel -import math -import fixed_ints - -in "C" `{ -#ifdef __linux__ - #include -#endif -#ifdef __APPLE__ - #include - #define be32toh(x) OSSwapBigToHostInt32(x) -#endif -#ifdef _WIN32 - #define be32toh(val) _byteswap_ulong(val) -#endif - -#ifndef be32toh - #define be32toh(val) betoh32(val) -#endif - -#include -#include -`} - -redef class Int - # Gives the length of the UTF-8 char starting with `self` - fun u8len: Int do - if self & 0b1000_0000 == 0 then - return 1 - else if self & 0b1110_0000 == 0b1100_0000 then - return 2 - else if self & 0b1111_0000 == 0b1110_0000 then - return 3 - else if self & 0b1111_1000 == 0b1111_0000 then - return 4 - else - return 1 - end - end - - # Is `self` a valid UTF-8 sequence start ? - # - # ~~~nit - # assert 0.is_valid_utf8_start - # assert 0xC0.is_valid_utf8_start - # assert 0xE0.is_valid_utf8_start - # assert 0xF0.is_valid_utf8_start - # ~~~ - fun is_valid_utf8_start: Bool do - if self & 0x80 == 0 then return true - if self & 0b1110_0000 == 0b1100_0000 then return true - if self & 0b1111_0000 == 0b1110_0000 then return true - if self & 0b1111_1000 == 0b1111_0000 then return true - return false - end -end - -redef class UInt32 - # Returns the code_point from a utf16 surrogate pair - # - # assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32 - fun from_utf16_surr: UInt32 do - var hi = (self & 0xFFFF0000u32) >> 16 - var lo = self & 0xFFFFu32 - var cp = 0u32 - cp += (hi - 0xD800u32) << 10 - cp += lo - 0xDC00u32 - cp += 0x10000u32 - return cp - end - - # The character which code point (unicode-wise) is `self` - # - # assert 65u32.code_point == 'A' - # assert 10u32.code_point == '\n' - # assert 0x220Bu32.code_point == '∋' - fun code_point: Char `{ return self; `} -end - -# C string `char *` -# -# Used as underlying implementation for `String` and some other `Text`. 
-extern class CString `{ char* `} - # Create a new `CString` with the capacity for `length` characters - new(length: Int) is intern - - # Get a char* starting at `index`. - # - # WARNING: Unsafe for extern code, use only for temporary - # pointer manipulation purposes (e.g. write to file or such) - fun fast_cstring(index: Int): CString is intern - - # Get char at `index`. - fun [](index: Int): Int is intern - - # Set char `item` at index. - fun []=(index: Int, item: Int) is intern - - # Copy `self` to `dest`. - fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern - - redef fun ==(o) is intern do return is_same_instance(o) - - redef fun !=(o) is intern do return not is_same_instance(o) - - # Position of the first nul character. - fun cstring_length: Int - do - var l = 0 - while self[l] != 0 do l += 1 - return l - end - - # Parse `self` as an Int. - fun atoi: Int is intern - - # Parse `self` as a Float. - fun atof: Float `{ return atof(self); `} - - # Gets the UTF-8 char at index `pos` - # - # Index is expressed in Unicode chars - # - # ~~~raw - # assert "かきく".as(FlatString).items.char_at(0) == 'か' - # ~~~ - # - # If the char at position pos is an invalid Unicode char, - # the Unicode replacement character � (0xFFFD) will be used. 
- # - # ~~~raw - # assert "かきく".as(FlatString).items.char_at(1) == '�' - # ~~~ - fun char_at(pos: Int): Char do - var c = self[pos] - if c & 0x80 == 0 then return c.code_point - var b = fetch_4_hchars(pos) - var ret = 0u32 - if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point - if b & 0xE0000000u32 == 0xC0000000u32 then - ret |= (b & 0x1F000000u32) >> 18 - ret |= (b & 0x3F0000u32) >> 16 - return ret.code_point - end - if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point - if b & 0xF0000000u32 == 0xE0000000u32 then - ret |= (b & 0xF000000u32) >> 12 - ret |= (b & 0x3F0000u32) >> 10 - ret |= (b & 0x3F00u32) >> 8 - return ret.code_point - end - if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point - if b & 0xF8000000u32 == 0xF0000000u32 then - ret |= (b & 0x7000000u32) >> 6 - ret |= (b & 0x3F0000u32) >> 4 - ret |= (b & 0x3F00u32) >> 2 - ret |= b & 0x3Fu32 - return ret.code_point - end - return 0xFFFD.code_point - end - - # Gets the byte index of char at position `n` in UTF-8 String - fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0) - - # Gets the length of the character at position `pos` (1 if invalid sequence) - fun length_of_char_at(pos: Int): Int do - var c = self[pos] - if c & 0x80 == 0x00 then - return 1 - else if c & 0xE0 == 0xC0 and self[pos + 1] & 0xC0 == 0x80 then - return 2 - else if c & 0xF0 == 0xE0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 then - return 3 - else if c & 0xF8 == 0xF0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 and self[pos + 3] & 0xC0 == 0x80 then - return 4 - else - return 1 - end - end - - # Gets the byte index of char at position `n` in UTF-8 String - # - # `char_from` and `byte_from` are cached values to seek from. 
- # - # NOTE: char_from and byte_from are not guaranteed to be valid cache values - # It it up to the client to ensure the validity of the information - fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do - var ns_i = byte_from - var my_i = char_from - - var dist = n - my_i - - while dist > 0 do - while dist >= 4 do - var i = fetch_4_chars(ns_i) - if i & 0x80808080u32 != 0u32 then break - ns_i += 4 - my_i += 4 - dist -= 4 - end - if dist == 0 then break - ns_i += length_of_char_at(ns_i) - my_i += 1 - dist -= 1 - end - - while dist < 0 do - while dist <= -4 do - var i = fetch_4_chars(ns_i - 4) - if i & 0x80808080u32 != 0u32 then break - ns_i -= 4 - my_i -= 4 - dist += 4 - end - if dist == 0 then break - ns_i = find_beginning_of_char_at(ns_i - 1) - my_i -= 1 - dist += 1 - end - - return ns_i - end - - # Gets the char index of byte at position `n` in a UTF-8 String - # - # `char_from` and `byte_from` are cached values to seek from. - # - # NOTE: char_from and byte_from are not guaranteed to be valid cache values - # It it up to the client to ensure the validity of the information - fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do - var ns_i = byte_from - var my_i = char_from - - while ns_i < n do - ns_i += length_of_char_at(ns_i) - my_i += 1 - end - - while ns_i > n do - ns_i = find_beginning_of_char_at(ns_i - 1) - my_i -= 1 - end - - return my_i - end - - # Returns the beginning position of the char at position `pos` - # - # If the char is invalid UTF-8, `pos` is returned as-is - # - # ~~~raw - # assert "abc".items.find_beginning_of_char_at(2) == 2 - # assert "か".items.find_beginning_of_char_at(1) == 0 - # assert [0x41, 233].to_s.items.find_beginning_of_char_at(1) == 1 - # ~~~ - fun find_beginning_of_char_at(pos: Int): Int do - var endpos = pos - var c = self[pos] - if c & 0x80 == 0x00 then return pos - while c & 0xC0 == 0x80 do - pos -= 1 - c = self[pos] - end - var stpos = pos - if length_of_char_at(stpos) >= (endpos - stpos + 1) 
then return pos - return endpos - end - - # Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length` - fun utf8_length(from, byte_length: Int): Int is intern do - var st = from - var ln = 0 - while byte_length > 0 do - while byte_length >= 4 do - var i = fetch_4_chars(st) - if i & 0x80808080u32 != 0u32 then break - byte_length -= 4 - st += 4 - ln += 4 - end - if byte_length == 0 then break - var cln = length_of_char_at(st) - st += cln - ln += 1 - byte_length -= cln - end - return ln - end - - # Fetch 4 chars in `self` at `pos` - fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `} - - # Fetch 4 chars in `self` at `pos` - fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `} - - # Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos` - fun rshift(sh, len, pos: Int) do - copy_to(self, len, pos, pos + sh) - end - - # Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos` - fun lshift(sh, len, pos: Int) do - copy_to(self, len, pos, pos - sh) - end - - # Sets the contents of `self` to `value` for `len` bytes - fun memset(value, len: Int) `{ - assert(len >= 0); - memset(self, value, len); - `} -end +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# This file is free software, which comes along with NIT. This software is +# distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. You can modify it is you want, provided this header +# is kept unaltered, and a notification of the changes is added. +# You are allowed to redistribute it and sell it, alone or is a part of +# another product. 
+ +# Native structures for text and bytes +module native + +import kernel +import math +import fixed_ints + +in "C" `{ +#ifdef __linux__ + #include <endian.h> +#endif +#ifdef __APPLE__ + #include <libkern/OSByteOrder.h> + #define be32toh(x) OSSwapBigToHostInt32(x) +#endif +#ifdef _WIN32 + #define be32toh(val) _byteswap_ulong(val) +#endif + +#ifndef be32toh + #define be32toh(val) betoh32(val) +#endif + +#include <stdlib.h> +#include <string.h> +`} + +redef class Int + # Gives the length of the UTF-8 char starting with `self` + fun u8len: Int do + if self & 0b1000_0000 == 0 then + return 1 + else if self & 0b1110_0000 == 0b1100_0000 then + return 2 + else if self & 0b1111_0000 == 0b1110_0000 then + return 3 + else if self & 0b1111_1000 == 0b1111_0000 then + return 4 + else + return 1 + end + end + + # Is `self` a valid UTF-8 sequence start ? + # + # ~~~nit + # assert 0.is_valid_utf8_start + # assert 0xC0.is_valid_utf8_start + # assert 0xE0.is_valid_utf8_start + # assert 0xF0.is_valid_utf8_start + # ~~~ + fun is_valid_utf8_start: Bool do + if self & 0x80 == 0 then return true + if self & 0b1110_0000 == 0b1100_0000 then return true + if self & 0b1111_0000 == 0b1110_0000 then return true + if self & 0b1111_1000 == 0b1111_0000 then return true + return false + end +end + +redef class UInt32 + # Returns the code_point from a utf16 surrogate pair + # + # assert 0xD83DDE02u32.from_utf16_surr == 0x1F602u32 + fun from_utf16_surr: UInt32 do + var hi = (self & 0xFFFF0000u32) >> 16 + var lo = self & 0xFFFFu32 + var cp = 0u32 + cp += (hi - 0xD800u32) << 10 + cp += lo - 0xDC00u32 + cp += 0x10000u32 + return cp + end + + # The character which code point (unicode-wise) is `self` + # + # assert 65u32.code_point == 'A' + # assert 10u32.code_point == '\n' + # assert 0x220Bu32.code_point == '∋' + fun code_point: Char `{ return self; `} +end + +# C string `char *` +# +# Used as underlying implementation for `String` and some other `Text`. 
+extern class CString `{ char* `} + # Create a new `CString` with the capacity for `length` characters + new(length: Int) is intern + + # Returns a null `char *` + new nul `{ return NULL; `} + + # Get a char* starting at `index`. + # + # WARNING: Unsafe for extern code, use only for temporary + # pointer manipulation purposes (e.g. write to file or such) + fun fast_cstring(index: Int): CString is intern + + # Get char at `index`. + fun [](index: Int): Int is intern + + # Set char `item` at index. + fun []=(index: Int, item: Int) is intern + + # Copy `self` to `dest`. + fun copy_to(dest: CString, length: Int, from: Int, to: Int) is intern + + redef fun ==(o) is intern do return is_same_instance(o) + + redef fun !=(o) is intern do return not is_same_instance(o) + + # Position of the first nul character. + fun cstring_length: Int + do + var l = 0 + while self[l] != 0 do l += 1 + return l + end + + # Parse `self` as an Int. + fun atoi: Int is intern + + # Parse `self` as a Float. + fun atof: Float `{ return atof(self); `} + + # Gets the UTF-8 char at index `pos` + # + # Index is expressed in Unicode chars + # + # ~~~raw + # assert "かきく".as(FlatString).items.char_at(0) == 'か' + # ~~~ + # + # If the char at position pos is an invalid Unicode char, + # the Unicode replacement character � (0xFFFD) will be used. 
+ # + # ~~~raw + # assert "かきく".as(FlatString).items.char_at(1) == '�' + # ~~~ + fun char_at(pos: Int): Char do + var c = self[pos] + if c & 0x80 == 0 then return c.code_point + var b = fetch_4_hchars(pos) + var ret = 0u32 + if b & 0xC00000u32 != 0x800000u32 then return 0xFFFD.code_point + if b & 0xE0000000u32 == 0xC0000000u32 then + ret |= (b & 0x1F000000u32) >> 18 + ret |= (b & 0x3F0000u32) >> 16 + return ret.code_point + end + if not b & 0xC000u32 == 0x8000u32 then return 0xFFFD.code_point + if b & 0xF0000000u32 == 0xE0000000u32 then + ret |= (b & 0xF000000u32) >> 12 + ret |= (b & 0x3F0000u32) >> 10 + ret |= (b & 0x3F00u32) >> 8 + return ret.code_point + end + if not b & 0xC0u32 == 0x80u32 then return 0xFFFD.code_point + if b & 0xF8000000u32 == 0xF0000000u32 then + ret |= (b & 0x7000000u32) >> 6 + ret |= (b & 0x3F0000u32) >> 4 + ret |= (b & 0x3F00u32) >> 2 + ret |= b & 0x3Fu32 + return ret.code_point + end + return 0xFFFD.code_point + end + + # Gets the byte index of char at position `n` in UTF-8 String + fun char_to_byte_index(n: Int): Int do return char_to_byte_index_cached(n, 0, 0) + + # Gets the length of the character at position `pos` (1 if invalid sequence) + fun length_of_char_at(pos: Int): Int do + var c = self[pos] + if c & 0x80 == 0x00 then + return 1 + else if c & 0xE0 == 0xC0 and self[pos + 1] & 0xC0 == 0x80 then + return 2 + else if c & 0xF0 == 0xE0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 then + return 3 + else if c & 0xF8 == 0xF0 and self[pos + 1] & 0xC0 == 0x80 and self[pos + 2] & 0xC0 == 0x80 and self[pos + 3] & 0xC0 == 0x80 then + return 4 + else + return 1 + end + end + + # Gets the byte index of char at position `n` in UTF-8 String + # + # `char_from` and `byte_from` are cached values to seek from. 
+ # + # NOTE: char_from and byte_from are not guaranteed to be valid cache values + # It is up to the client to ensure the validity of the information + fun char_to_byte_index_cached(n, char_from, byte_from: Int): Int do + var ns_i = byte_from + var my_i = char_from + + var dist = n - my_i + + while dist > 0 do + while dist >= 4 do + var i = fetch_4_chars(ns_i) + if i & 0x80808080u32 != 0u32 then break + ns_i += 4 + my_i += 4 + dist -= 4 + end + if dist == 0 then break + ns_i += length_of_char_at(ns_i) + my_i += 1 + dist -= 1 + end + + while dist < 0 do + while dist <= -4 do + var i = fetch_4_chars(ns_i - 4) + if i & 0x80808080u32 != 0u32 then break + ns_i -= 4 + my_i -= 4 + dist += 4 + end + if dist == 0 then break + ns_i = find_beginning_of_char_at(ns_i - 1) + my_i -= 1 + dist += 1 + end + + return ns_i + end + + # Gets the char index of byte at position `n` in a UTF-8 String + # + # `char_from` and `byte_from` are cached values to seek from. + # + # NOTE: char_from and byte_from are not guaranteed to be valid cache values + # It is up to the client to ensure the validity of the information + fun byte_to_char_index_cached(n, char_from, byte_from: Int): Int do + var ns_i = byte_from + var my_i = char_from + + while ns_i < n do + ns_i += length_of_char_at(ns_i) + my_i += 1 + end + + while ns_i > n do + ns_i = find_beginning_of_char_at(ns_i - 1) + my_i -= 1 + end + + return my_i + end + + # Returns the beginning position of the char at position `pos` + # + # If the char is invalid UTF-8, `pos` is returned as-is + # + # ~~~raw + # assert "abc".items.find_beginning_of_char_at(2) == 2 + # assert "か".items.find_beginning_of_char_at(1) == 0 + # assert [0x41, 233].to_s.items.find_beginning_of_char_at(1) == 1 + # ~~~ + fun find_beginning_of_char_at(pos: Int): Int do + var endpos = pos + var c = self[pos] + if c & 0x80 == 0x00 then return pos + while c & 0xC0 == 0x80 do + pos -= 1 + c = self[pos] + end + var stpos = pos + if length_of_char_at(stpos) >= (endpos - stpos + 1) 
then return pos + return endpos + end + + # Number of UTF-8 characters in `self` starting at `from`, for a length of `byte_length` + fun utf8_length(from, byte_length: Int): Int is intern do + var st = from + var ln = 0 + while byte_length > 0 do + while byte_length >= 4 do + var i = fetch_4_chars(st) + if i & 0x80808080u32 != 0u32 then break + byte_length -= 4 + st += 4 + ln += 4 + end + if byte_length == 0 then break + var cln = length_of_char_at(st) + st += cln + ln += 1 + byte_length -= cln + end + return ln + end + + # Fetch 4 chars in `self` at `pos` + fun fetch_4_chars(pos: Int): UInt32 is intern `{ return *((uint32_t*)(self+pos)); `} + + # Fetch 4 chars in `self` at `pos`, converted from big-endian to host byte order (`be32toh`) + fun fetch_4_hchars(pos: Int): UInt32 is intern `{ return (uint32_t)be32toh(*((uint32_t*)(self+pos))); `} + + # Right shifts `len` bytes of `self` from `sh` bytes starting at position `pos` + fun rshift(sh, len, pos: Int) do + copy_to(self, len, pos, pos + sh) + end + + # Left shifts `len` bytes of `self` from `sh` bytes starting at position `pos` + fun lshift(sh, len, pos: Int) do + copy_to(self, len, pos, pos - sh) + end + + # Sets the contents of `self` to `value` for `len` bytes + fun memset(value, len: Int) `{ + assert(len >= 0); + memset(self, value, len); + `} +end diff --git a/lib/core/text/string_sort.nit b/lib/core/text/string_sort.nit new file mode 100644 index 0000000000..d451913461 --- /dev/null +++ b/lib/core/text/string_sort.nit @@ -0,0 +1,87 @@ +# This file is part of NIT ( http://www.nitlanguage.org ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Locale-sensitive string sorting module including a `Collator` class and an `Array[String]` sorter +module string_sort is pkgconfig ("icu-io", "icu-i18n", "icu-uc") + +intrude import u16_string +import core::collection::sorter +import locale + +`{ + #include <unicode/ustring.h> + #include <unicode/ucol.h> +`} + +redef class String + # Returns `self` <=> other considering a locale (`default_locale` is used when `locale_name` is null) + fun uni_compare_to (other : String, locale_name : nullable String) : Int + do + if locale_name == null then locale_name = default_locale + + var u16other = other.to_u16string + var u16self = to_u16string + var collator = new Collator(locale_name.to_cstring) + var result = collator.collate(u16self.uchar_string, u16other.uchar_string, u16self.code_units , u16other.code_units) + collator.free + + return result + end +end + +# Wrapper for ICU's `UCollator` structure +extern class Collator `{ UCollator * `} + new (locale_name : CString) `{ + UErrorCode error = U_ZERO_ERROR; + UCollator * collator = ucol_open(locale_name, &error); + return collator; + `} + + # Performs the collation of `str1` and `str2` and returns the result. + # The collator returns `str1 <=> str2`. 
+ # ~~~raw + # var collator = new Collator("fr_FR") + # assert collator.collate("côte", "coté") == 1 + # ~~~ + fun collate(str1, str2 : UCharString, len1, len2 : Int) : Int `{ + UCollationResult collRes = ucol_strcoll(self, str1, len1, str2, len2); + return collRes; + `} + + redef fun free `{ + ucol_close(self); + `} + +end + +# Sorter for `Array[String]`s +# ~~~raw +# var words = ["cote", "coté", "côte", "côté"] +# var frenchSorter = new StrSorter("fr_FR") +# assert frenchSorter.sort(words) == ["cote", "côte", "coté", "côté"] +# ~~~ +class StrSorter + super Comparator + + var locale_name : String = "" + + init do end + + redef type COMPARED: String is fixed + + redef fun compare(a: String, b: String): Int + do + return a.uni_compare_to(b, locale_name) + end +end diff --git a/lib/core/text/u16_string.nit b/lib/core/text/u16_string.nit index e9a9986e6a..9302baf1cb 100644 --- a/lib/core/text/u16_string.nit +++ b/lib/core/text/u16_string.nit @@ -243,8 +243,3 @@ redef class String return n end end - -redef class CString - # Returns a null `char *` - new nul `{ return NULL; `} -end