From 9b190fd3643cadeaf303c5c87c09ed344bacd089 Mon Sep 17 00:00:00 2001 From: Joshua James Venter <25530194@sun.ac.za> Date: Fri, 5 Jul 2024 23:19:25 +0200 Subject: [PATCH] Refactor and bug fix of atol function - Support leading underscores - Add handle_base_prefix and trim_and_handle_sign helper functions - Rename atol_error to str_to_base_error for clarity - Update atol docstring for improved clarity Breaks up the functionality of atol for better readability and reusability, as suggested in PR #3178. Signed-off-by: Joshua James Venter --- stdlib/src/builtin/string.mojo | 137 ++++++++++++++++++--------- stdlib/test/builtin/test_string.mojo | 16 +++- 2 files changed, 102 insertions(+), 51 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index dca633d6a3c..0b77bb71aec 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -219,7 +219,7 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: if (base != 0) and (base < 2 or base > 36): raise Error("Base must be >= 2 and <= 36, or 0.") if not str_ref: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) var real_base: Int var ord_num_max: Int @@ -229,35 +229,12 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: var is_negative: Bool = False var start: Int = 0 var str_len = len(str_ref) - var buff = str_ref.unsafe_ptr() - for pos in range(start, str_len): - if _isspace(buff[pos]): - continue - - if str_ref[pos] == "-": - is_negative = True - start = pos + 1 - elif str_ref[pos] == "+": - start = pos + 1 - else: - start = pos - break + start, is_negative = _trim_and_handle_sign(str_ref, str_len) - if str_ref[start] == "0" and start + 1 < str_len: - if base == 2 and ( - str_ref[start + 1] == "b" or str_ref[start + 1] == "B" - ): - start += 2 - elif base == 8 and ( - str_ref[start + 1] == "o" or str_ref[start + 1] == "O" - ): - start += 2 - elif base == 16 and ( - str_ref[start + 1] == "x" or str_ref[start + 1] == "X" - ): - start += 2 + start = _handle_base_prefix(start, str_ref, str_len, base) + var buff = str_ref.unsafe_ptr() alias ord_0 = ord("0") # FIXME: # Change this to `alias` after fixing support for __getitem__ of alias. @@ -269,7 +246,7 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: real_base = real_base_new_start[0] start = real_base_new_start[1] if real_base == -1: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) else: real_base = base @@ -284,20 +261,17 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: var found_valid_chars_after_start = False var has_space_after_number = False - # single underscores are only allowed between digits - # starting "was_last_digit_undescore" to true such that - # if the first digit is an undesrcore an error is raised - var was_last_digit_undescore = True + var was_last_digit_underscore = True if real_base == 10 else False for pos in range(start, str_len): var ord_current = int(buff[pos]) if ord_current == ord_underscore: - if was_last_digit_undescore: - raise Error(_atol_error(base, str_ref)) + if was_last_digit_underscore: + raise Error(_str_to_base_error(base, str_ref)) else: - was_last_digit_undescore = True + was_last_digit_underscore = True continue else: - was_last_digit_undescore = False + was_last_digit_underscore = False if ord_0 <= ord_current <= ord_num_max: result += ord_current - ord_0 found_valid_chars_after_start = True @@ -312,29 +286,86 @@ fn _atol(str_ref: StringRef, base: Int = 10) raises -> Int: start = pos + 1 break else: - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) if pos + 1 < str_len and not _isspace(buff[pos + 1]): var nextresult = result * real_base if nextresult < result: raise Error( - _atol_error(base, str_ref) + _str_to_base_error(base, str_ref) + " String expresses an integer too large to store in Int." ) result = nextresult - if was_last_digit_undescore or (not found_valid_chars_after_start): - raise Error(_atol_error(base, str_ref)) + if was_last_digit_underscore or (not found_valid_chars_after_start): + raise Error(_str_to_base_error(base, str_ref)) if has_space_after_number: for pos in range(start, str_len): if not _isspace(buff[pos]): - raise Error(_atol_error(base, str_ref)) + raise Error(_str_to_base_error(base, str_ref)) if is_negative: result = -result return result -fn _atol_error(base: Int, str_ref: StringRef) -> String: +@always_inline +fn _trim_and_handle_sign(str_ref: StringRef, str_len: Int) -> (Int, Bool): + """Trims leading whitespace and handles the sign of the number in the string. + Args: + str_ref: A StringRef containing the number to parse. + str_len: The length of the string. + Returns: + A tuple containing: + - The starting index of the number after whitespace and sign. + - A boolean indicating whether the number is negative. + """ + var buff = str_ref.unsafe_ptr() + var is_negative: Bool = False + var start: Int = 0 + for pos in range(start, str_len): + if _isspace(buff[pos]): + continue + + if str_ref[pos] == "-": + is_negative = True + start = pos + 1 + elif str_ref[pos] == "+": + start = pos + 1 + else: + start = pos + break + + return start, is_negative + + +@always_inline +fn _handle_base_prefix( + pos: Int, str_ref: StringRef, str_len: Int, base: Int +) -> Int: + """Adjusts the starting position if a valid base prefix is present. + Handles "0b"/"0B" for base 2, "0o"/"0O" for base 8, and "0x"/"0X" for base 16. + Only adjusts if the base matches the prefix. + Args: + pos: Current position in the string. + str_ref: The input string. + str_len: Length of the input string. + base: The specified base. + Returns: + Updated position after the prefix, if applicable. + """ + var start = pos + if start + 1 < str_len: + var prefix_char = str_ref[start + 1] + if str_ref[start] == "0" and ( + (base == 2 and (prefix_char == "b" or prefix_char == "B")) + or (base == 8 and (prefix_char == "o" or prefix_char == "O")) + or (base == 16 and (prefix_char == "x" or prefix_char == "X")) + ): + start += 2 + return start + + +fn _str_to_base_error(base: Int, str_ref: StringRef) -> String: return ( "String is not convertible to integer with base " + str(base) @@ -381,12 +412,18 @@ fn _identify_base(str_ref: StringRef, start: Int) -> Tuple[Int, Int]: fn atol(str: String, base: Int = 10) raises -> Int: """Parses and returns the given string as an integer in the given base. - For example, `atol("19")` returns `19`. If base is 0 the the string is - parsed as an Integer literal, see: https://docs.python.org/3/reference/lexical_analysis.html#integers. + For example, `atol("32")` returns `32`, and `atol("FF", 16)` returns `255`. + If base is set to 0, the string is parsed as an Integer literal, with the + following considerations: + - '0b' or '0B' prefix indicates binary (base 2) + - '0o' or '0O' prefix indicates octal (base 8) + - '0x' or '0X' prefix indicates hexadecimal (base 16) + - Without a prefix, it's treated as decimal (base 10) Raises: - If the given string cannot be parsed as an integer value. For example in - `atol("hi")`. + - If the given string cannot be parsed as an integer value. For example + in `atol("Mojo")`. + - Incorrect base is provided. Args: str: A string to be parsed as an integer in the given base. @@ -394,6 +431,14 @@ fn atol(str: String, base: Int = 10) raises -> Int: Returns: An integer value that represents the string, or otherwise raises. + + Notes: + This follows [Python's integer literals](\ + https://docs.python.org/3/reference/lexical_analysis.html#integers). + + See Also: + - function `stol`: A similar function that returns both the parsed + integer and the remaining unparsed string. """ return _atol(str._strref_dangerous(), base) diff --git a/stdlib/test/builtin/test_string.mojo b/stdlib/test/builtin/test_string.mojo index 541ef040a38..9d53ee5aef7 100644 --- a/stdlib/test/builtin/test_string.mojo +++ b/stdlib/test/builtin/test_string.mojo @@ -365,6 +365,9 @@ def test_atol(): assert_equal(10, atol("0o12", 8)) assert_equal(10, atol("0O12", 8)) assert_equal(35, atol("Z", 36)) + assert_equal(255, atol("0x_00_ff", 16)) + assert_equal(18, atol("0b0001_0010", 2)) + assert_equal(18, atol("0b_000_1001_0", 2)) # Negative cases with assert_raises( @@ -433,6 +436,14 @@ def test_atol_base_0(): assert_equal(0, atol("0X0", base=0)) + assert_equal(255, atol("0x_00_ff", base=0)) + + assert_equal(18, atol("0b_0001_0010", base=0)) + assert_equal(18, atol("0b000_1001_0", base=0)) + + assert_equal(10, atol("0o_000_12", base=0)) + assert_equal(10, atol("0o00_12", base=0)) + with assert_raises( contains="String is not convertible to integer with base 0: ' 0x'" ): @@ -453,11 +464,6 @@ def test_atol_base_0(): ): _ = atol("0r100", base=0) - with assert_raises( - contains="String is not convertible to integer with base 0: '0b_0'" - ): - _ = atol("0b_0", base=0) - with assert_raises( contains="String is not convertible to integer with base 0: '0xf__f'" ):