From 99f44a85b8e618e2b02bbc25c5f818bff4c7792f Mon Sep 17 00:00:00 2001 From: Sean Kim <40130483+theskim@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:42:24 -0500 Subject: [PATCH] Normalize URL paths: convert /.//p, /..//p, and //p to p --- url/src/lib.rs | 54 ++++++++++++++++++++++++++++++++- url/tests/expected_failures.txt | 4 --- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/url/src/lib.rs b/url/src/lib.rs index 3eacded1..b953d349 100644 --- a/url/src/lib.rs +++ b/url/src/lib.rs @@ -1757,6 +1757,39 @@ impl Url { let old_after_path_pos = to_u32(self.serialization.len()).unwrap(); let cannot_be_a_base = self.cannot_be_a_base(); let scheme_type = SchemeType::from(self.scheme()); + let mut path_empty = false; + + // Check ':' and then see if the next character is '/' + let mut has_host = if let Some(index) = self.serialization.find(":") { + if self.serialization.len() > index + 1 + && self.serialization.as_bytes().get(index + 1) == Some(&b'/') + { + let rest = &self.serialization[(index + ":/".len())..]; + let host_part = rest.split('/').next().unwrap_or(""); + path_empty = rest.is_empty(); + !host_part.is_empty() && !host_part.contains('@') + } else { + false + } + } else { + false + }; + + // Ensure the path length is greater than 1 to account + // for cases where "/." is already appended from serialization + // If we set path, then we already checked the other two conditions: + // https://url.spec.whatwg.org/#url-serializing + // 1. The host is null + // 2. 
The first segment of the URL's path is an empty string + if path.len() > 1 { + if let Some(index) = self.serialization.find(":") { + let removal_start = index + ":".len(); + if self.serialization[removal_start..].starts_with("/.") { + self.path_start -= "/.".len() as u32; + } + } + } + self.serialization.truncate(self.path_start as usize); self.mutate(|parser| { if cannot_be_a_base { @@ -1766,7 +1799,6 @@ impl Url { } parser.parse_cannot_be_a_base_path(parser::Input::new_no_trim(path)); } else { - let mut has_host = true; // FIXME parser.parse_path_start( scheme_type, &mut has_host, @@ -1774,6 +1806,26 @@ impl Url { ); } }); + + // Handle cases where normalization must be applied across both the serialization and the path: + // insert "/." immediately after the scheme's ":". + // This is done only if all three conditions are met: + // https://url.spec.whatwg.org/#url-serializing + // 1. The host is null + // 2. The URL's path length is greater than 1 + // 3. The first segment of the URL's path is an empty string + if !has_host && path.len() > 1 && path_empty { + if let Some(index) = self.serialization.find(":") { + if self.serialization.len() > index + 2 + && self.serialization.as_bytes().get(index + 1) == Some(&b'/') + && self.serialization.as_bytes().get(index + 2) == Some(&b'/') + { + self.serialization.insert_str(index + ":".len(), "/."); + self.path_start += "/.".len() as u32; + } + } + } + self.restore_after_path(old_after_path_pos, &after_path); } diff --git a/url/tests/expected_failures.txt b/url/tests/expected_failures.txt index 238f7a66..9bf60b34 100644 --- a/url/tests/expected_failures.txt +++ b/url/tests/expected_failures.txt @@ -41,7 +41,3 @@ set pathname to <\\\\> set pathname to set pathname to - set pathname to - set pathname to - set pathname to - set pathname to