-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring.py
117 lines (104 loc) · 4.55 KB
/
string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# String utilities
# Imports
import re
import codecs
import unidecode
# Basic conversion of string to ranged integer
def ranged_int(string, imin=None, imax=None):
    """Convert a value to an int and check it against optional inclusive bounds.

    Args:
        string: Value handed to int() for conversion.
        imin: Smallest allowed result, or None for no lower bound.
        imax: Largest allowed result, or None for no upper bound.

    Returns:
        The converted integer.

    Raises:
        ValueError: If int() rejects the text, or the result lies outside
            the requested bounds.
        TypeError: If int() does not accept the argument's type (e.g. None).
    """
    number = int(string)
    # The lower bound is checked first so that it wins when both bounds are violated
    if imin is not None:
        if number < imin:
            raise ValueError(f"Value {number} is less than allowed minimum ({imin})")
    if imax is not None:
        if number > imax:
            raise ValueError(f"Value {number} is greater than allowed maximum ({imax})")
    return number
# Safely convert a string to a non-negative integer (just use int() if the string already passed a regex and MUST be valid, returns None if it fails)
def parse_uint(string, clean=True):
    """Convert a value to a non-negative int, returning None on any failure.

    Args:
        string: Value to convert, normally a str.
        clean: If true and the input is a str, every character that is not a
            decimal digit is dropped before conversion. Note that this also
            discards signs and separators, so '-5' is parsed as 5.

    Returns:
        The parsed integer if conversion succeeds and the result is >= 0,
        otherwise None.
    """
    try:
        # Cleaning happens inside the try block and only for real strings, so
        # that non-string input (e.g. None) yields None instead of raising
        if clean and isinstance(string, str):
            string = ''.join(ch for ch in string if ch.isdecimal())
        value = int(string)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf'))
        return None
    return value if value >= 0 else None
# Safely convert a string to a float (just use float() if the string already passed a regex and MUST be valid, returns None if it fails)
def parse_float(string, clean=True):
    """Convert a value to a float, returning None on any failure.

    Args:
        string: Value to convert, normally a str.
        clean: If true and the input is a str, all whitespace (including
            whitespace inside the number) is removed before conversion.

    Returns:
        The parsed float, or None if float() rejects the input.
    """
    try:
        # Cleaning happens inside the try block and only for real strings, so
        # that non-string input (e.g. None) yields None instead of raising
        if clean and isinstance(string, str):
            string = ''.join(string.split())
        return float(string)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers integers too large to be represented as a float
        return None
# Remove whitespace from a string
def remove_spaces(string):
    """Return the string with every whitespace character removed."""
    # split() without arguments cuts on runs of whitespace and drops empty pieces
    chunks = string.split()
    return ''.join(chunks)
# Clean up whitespace in a string (converts all sequences of whitespace to a single space, and removes whitespace completely from the start and end of the string)
def clean_spaces(string):
    """Collapse each run of whitespace to one space and trim both ends."""
    # split() without arguments already ignores leading and trailing whitespace
    words = string.split()
    return ' '.join(words)
# Convert a string to its standard representation for comparisons (i.e. removing accents, making lowercase, removing all characters that are neither word characters nor whitespace, and cleaning whitespace)
def clean_string(string):
    """Normalize a string into a canonical form suitable for comparisons.

    The text is passed through unidecode.unidecode, lowercased, stripped of
    every character that is neither a word character nor whitespace (so
    digits and underscores are kept), and finally has its whitespace
    collapsed by clean_spaces.
    """
    lowered = unidecode.unidecode(string).lower()
    # Drop punctuation and symbols, keeping word characters and whitespace
    without_symbols = re.sub(r'[^\w\s]', '', lowered)
    return clean_spaces(without_symbols)
# Clean up a string somewhat (lite version of clean_string above)
def clean_string_lite(string):
    """Lightly normalize a string: unidecode it and tidy its whitespace.

    Unlike clean_string, the letter case and punctuation are left alone.
    """
    transliterated = unidecode.unidecode(string)
    return clean_spaces(transliterated)
# Convert a string representation of truth to a boolean
def strtobool(value):
    """Interpret a textual truth value as a bool (case-insensitive).

    Args:
        value: One of 'y', 'yes', 't', 'true', 'on', '1' for True, or
            'n', 'no', 'f', 'false', 'off', '0' for False, in any letter case.

    Returns:
        The corresponding boolean.

    Raises:
        ValueError: If the lowercased text is not one of the recognized words.
            The message reports the lowercased form of the input.
    """
    lowered = value.lower()
    if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError(f"Invalid truth value: {lowered}")
# Ensure that a string corresponds to a legal unix filename, making the minimal changes possible
# Resource: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_170
def ensure_filename(string):
    """Turn a string into a legal unix filename with as few changes as possible.

    Slashes become underscores, null characters are removed, and the result
    is limited to 255 bytes of UTF-8. The reserved names '.' and '..' are
    replaced by '...', and an empty result is replaced by '_'.
    """
    name = string.replace('/', '_').replace('\0', '')
    # Cut at 255 bytes; a multibyte character split by the cut is discarded by errors='ignore'
    raw = name.encode('utf-8')
    name = raw[:255].decode('utf-8', errors='ignore')
    if name in ('.', '..'):
        return '...'
    return name if name else '_'
# Capitalize the first letter of a string and leave the rest untouched (str.capitalize makes the rest lower case)
def capitalize_first(string):
    """Uppercase the first character of a string and keep the remainder as is."""
    # Slicing (rather than indexing) keeps the empty string working
    head = string[:1]
    tail = string[1:]
    return head.upper() + tail
# Add a certain prefix to each line of a multiline string
def add_line_prefix(string, line_prefix, include_first_line=True):
    """Insert a prefix at the start of each line of a multiline string.

    Args:
        string: Text to prefix; its line endings are preserved.
        line_prefix: Text inserted in front of every line after the first.
        include_first_line: If true, the prefix is also put at the very
            start of the output (this happens even for an empty string).

    Returns:
        The prefixed text.
    """
    # keepends=True leaves each line terminator attached to its line
    pieces = string.splitlines(True)
    joined = line_prefix.join(pieces)
    return line_prefix + joined if include_first_line else joined
# Decode escape sequences in a string (e.g. replace every instance of sequential characters '\' and 'n' with the one newline character)
# Inspiration: https://stackoverflow.com/questions/4020539/process-escape-sequences-in-a-string-in-python
# noinspection RegExpRedundantEscape
# Matches a backslash followed by any escape body, including unrecognized single characters
EscapeSeqRegex = re.compile(r'''
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
| \\. # Arbitrary single character escape
)''', re.VERBOSE)
# Matches only the escape bodies (the part after the backslash) that are passed on to the unicode-escape codec
# noinspection RegExpRedundantEscape
EscapeSeqNameRegex = re.compile(r'''
( U........ # 8-digit hex escapes
| u.... # 4-digit hex escapes
| x.. # 2-digit hex escapes
| [0-7]{1,3} # Octal escapes
| N\{[^}]+\} # Unicode characters by name
| [\\'"abfnrtv] # Single-character escapes
)''', re.VERBOSE)


def decode_escapes(string):
    r"""Replace backslash escape sequences in a string by the characters they denote.

    Sequences shaped like a Python string escape (hex, octal, named unicode
    and the single-character escapes) are decoded with the 'unicode-escape'
    codec. Any other backslash-character pair simply loses its backslash.

    Args:
        string: Text that may contain literal backslash escape sequences.

    Returns:
        The text with all matched escape sequences substituted.

    Raises:
        UnicodeDecodeError: If a sequence has the shape of a recognized escape
            but the codec cannot decode it. The hex patterns accept arbitrary
            characters, so for example '\xgg' reaches the codec and fails.
    """
    def _expand(found):
        token = found.group(0)
        # pos=1 skips the leading backslash so that only the escape body is classified
        if EscapeSeqNameRegex.fullmatch(token, pos=1) is None:
            return token[1:]
        return codecs.decode(token, 'unicode-escape')

    return EscapeSeqRegex.sub(_expand, string)
# EOF