Update to Chromium //url at Chromium commit 79dc59ac7602413181079ecb463873e29a1d7d0a. I think the most significant change is that url::Origin is now actually an origin. TBR=jamesr@chromium.org Review URL: https://codereview.chromium.org/2029803003 .
diff --git a/BUILD.gn b/BUILD.gn index 4f6b637..34dff8a 100644 --- a/BUILD.gn +++ b/BUILD.gn
@@ -24,6 +24,8 @@ "gurl.h", "origin.cc", "origin.h", + "scheme_host_port.cc", + "scheme_host_port.h", "third_party/mozilla/url_parse.cc", "third_party/mozilla/url_parse.h", "url_canon.h", @@ -91,10 +93,20 @@ # TODO(dpranke): crbug.com/360936. Get this to build and run on Android. if (!is_android) { + # TODO(GYP): Delete this after we've converted everything to GN. + # The _run targets exist only for compatibility w/ GYP. + group("url_unittests_run") { + testonly = true + deps = [ + ":url_unittests", + ] + } + test("url_unittests") { sources = [ "gurl_unittest.cc", "origin_unittest.cc", + "scheme_host_port_unittest.cc", "url_canon_icu_unittest.cc", "url_canon_unittest.cc", "url_parse_unittest.cc",
diff --git a/android/java/src/org/chromium/url/IDNStringUtil.java b/android/java/src/org/chromium/url/IDNStringUtil.java index 32000fd..37d77dc 100644 --- a/android/java/src/org/chromium/url/IDNStringUtil.java +++ b/android/java/src/org/chromium/url/IDNStringUtil.java
@@ -4,8 +4,8 @@ package org.chromium.url; -import org.chromium.base.CalledByNative; -import org.chromium.base.JNINamespace; +import org.chromium.base.annotations.CalledByNative; +import org.chromium.base.annotations.JNINamespace; import java.net.IDN;
diff --git a/gurl.cc b/gurl.cc index 46ca408..c22236f 100644 --- a/gurl.cc +++ b/gurl.cc
@@ -14,6 +14,8 @@ #include "url/gurl.h" #include "base/logging.h" +#include "base/strings/string_piece.h" +#include "base/strings/string_util.h" #include "url/url_canon_stdstring.h" #include "url/url_util.h" @@ -59,7 +61,7 @@ #endif // WIN32 -} // namespace +} // namespace GURL::GURL() : is_valid_(false) { } @@ -130,7 +132,7 @@ #ifndef NDEBUG // For testing purposes, check that the parsed canonical URL is identical to // what we would have produced. Skip checking for invalid URLs have no meaning - // and we can't always canonicalize then reproducabely. + // and we can't always canonicalize then reproducibly. if (is_valid_) { url::Component scheme; // We can't do this check on the inner_url of a filesystem URL, as @@ -193,17 +195,8 @@ return spec_ > other.spec_; } -GURL GURL::Resolve(const std::string& relative) const { - return ResolveWithCharsetConverter(relative, NULL); -} -GURL GURL::Resolve(const base::string16& relative) const { - return ResolveWithCharsetConverter(relative, NULL); -} - // Note: code duplicated below (it's inconvenient to use a template here). -GURL GURL::ResolveWithCharsetConverter( - const std::string& relative, - url::CharsetConverter* charset_converter) const { +GURL GURL::Resolve(const std::string& relative) const { // Not allowed for invalid URLs. if (!is_valid_) return GURL(); @@ -218,7 +211,7 @@ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()), parsed_, relative.data(), static_cast<int>(relative.length()), - charset_converter, &output, &result.parsed_)) { + nullptr, &output, &result.parsed_)) { // Error resolving, return an empty URL. return GURL(); } @@ -234,9 +227,7 @@ } // Note: code duplicated above (it's inconvenient to use a template here). -GURL GURL::ResolveWithCharsetConverter( - const base::string16& relative, - url::CharsetConverter* charset_converter) const { +GURL GURL::Resolve(const base::string16& relative) const { // Not allowed for invalid URLs. 
if (!is_valid_) return GURL(); @@ -251,7 +242,7 @@ if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()), parsed_, relative.data(), static_cast<int>(relative.length()), - charset_converter, &output, &result.parsed_)) { + nullptr, &output, &result.parsed_)) { // Error resolving, return an empty URL. return GURL(); } @@ -320,7 +311,7 @@ GURL GURL::GetOrigin() const { // This doesn't make sense for invalid or nonstandard URLs, so return - // the empty URL + // the empty URL. if (!is_valid_ || !IsStandard()) return GURL(); @@ -382,9 +373,10 @@ bool GURL::SchemeIs(const char* lower_ascii_scheme) const { if (parsed_.scheme.len <= 0) return lower_ascii_scheme == NULL; - return url::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin, - spec_.data() + parsed_.scheme.end(), - lower_ascii_scheme); + return base::LowerCaseEqualsASCII( + base::StringPiece(spec_.data() + parsed_.scheme.begin, + parsed_.scheme.len), + lower_ascii_scheme); } bool GURL::SchemeIsHTTPOrHTTPS() const { @@ -416,16 +408,17 @@ } std::string GURL::PathForRequest() const { - DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty"; + DCHECK(parsed_.path.len > 0) + << "Canonical path for requests should be non-empty"; if (parsed_.ref.len >= 0) { - // Clip off the reference when it exists. The reference starts after the # - // sign, so we have to subtract one to also remove it. + // Clip off the reference when it exists. The reference starts after the + // #-sign, so we have to subtract one to also remove it. return std::string(spec_, parsed_.path.begin, parsed_.ref.begin - parsed_.path.begin - 1); } // Compute the actual path length, rather than depending on the spec's - // terminator. If we're an inner_url, our spec continues on into our outer - // url's path/query/ref. + // terminator. If we're an inner_url, our spec continues on into our outer + // URL's path/query/ref. 
int path_len = parsed_.path.len; if (parsed_.query.is_valid()) path_len = parsed_.query.end() - parsed_.path.begin; @@ -490,48 +483,45 @@ #endif // WIN32 -bool GURL::DomainIs(const char* lower_ascii_domain, - int domain_len) const { - // Return false if this URL is not valid or domain is empty. - if (!is_valid_ || !domain_len) +bool GURL::DomainIs(base::StringPiece lower_ascii_domain) const { + if (!is_valid_ || lower_ascii_domain.empty()) return false; // FileSystem URLs have empty parsed_.host, so check this first. if (SchemeIsFileSystem() && inner_url_) - return inner_url_->DomainIs(lower_ascii_domain, domain_len); + return inner_url_->DomainIs(lower_ascii_domain); if (!parsed_.host.is_nonempty()) return false; - // Check whether the host name is end with a dot. If yes, treat it - // the same as no-dot unless the input comparison domain is end - // with dot. - const char* last_pos = spec_.data() + parsed_.host.end() - 1; + // If the host name ends with a dot but the input domain doesn't, + // then we ignore the dot in the host name. + const char* host_last_pos = spec_.data() + parsed_.host.end() - 1; int host_len = parsed_.host.len; - if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) { - last_pos--; + int domain_len = lower_ascii_domain.length(); + if ('.' == *host_last_pos && '.' != lower_ascii_domain[domain_len - 1]) { + host_last_pos--; host_len--; } - // Return false if host's length is less than domain's length. if (host_len < domain_len) return false; - // Compare this url whether belong specific domain. - const char* start_pos = spec_.data() + parsed_.host.begin + - host_len - domain_len; + // |host_first_pos| is the start of the compared part of the host name, not + // start of the whole host name. 
+ const char* host_first_pos = spec_.data() + parsed_.host.begin + + host_len - domain_len; - if (!url::LowerCaseEqualsASCII(start_pos, - last_pos + 1, - lower_ascii_domain, - lower_ascii_domain + domain_len)) + if (!base::LowerCaseEqualsASCII( + base::StringPiece(host_first_pos, domain_len), lower_ascii_domain)) return false; - // Check whether host has right domain start with dot, make sure we got - // right domain range. For example www.google.com has domain - // "google.com" but www.iamnotgoogle.com does not. + // Make sure there aren't extra characters in host before the compared part; + // if the host name is longer than the input domain name, then the character + // immediately before the compared part should be a dot. For example, + // www.google.com has domain "google.com", but www.iamnotgoogle.com does not. if ('.' != lower_ascii_domain[0] && host_len > domain_len && - '.' != *(start_pos - 1)) + '.' != *(host_first_pos - 1)) return false; return true;
diff --git a/gurl.h b/gurl.h index 566fc5e..dccfec4 100644 --- a/gurl.h +++ b/gurl.h
@@ -10,11 +10,12 @@ #include "base/memory/scoped_ptr.h" #include "base/strings/string16.h" +#include "base/strings/string_piece.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_canon.h" #include "url/url_canon_stdstring.h" #include "url/url_constants.h" #include "url/url_export.h" -#include "url/url_parse.h" class URL_EXPORT GURL { public: @@ -91,7 +92,7 @@ // Returns the potentially invalid spec for a the URL. This spec MUST NOT be // modified or sent over the network. It is designed to be displayed in error - // messages to the user, as the apperance of the spec may explain the error. + // messages to the user, as the appearance of the spec may explain the error. // If the spec is valid, the valid spec will be returned. // // The returned string is guaranteed to be valid UTF-8. @@ -124,9 +125,8 @@ // pages. // // It may be impossible to resolve the URLs properly. If the input is not - // "standard" (SchemeIsStandard() == false) and the input looks relative, we - // can't resolve it. In these cases, the result will be an empty, invalid - // GURL. + // "standard" (IsStandard() == false) and the input looks relative, we can't + // resolve it. In these cases, the result will be an empty, invalid GURL. // // The result may also be a nonempty, invalid URL if the input has some kind // of encoding error. In these cases, we will try to construct a "good" URL @@ -137,20 +137,6 @@ GURL Resolve(const std::string& relative) const; GURL Resolve(const base::string16& relative) const; - // Like Resolve() above but takes a character set encoder which will be used - // for any query text specified in the input. The charset converter parameter - // may be NULL, in which case it will be treated as UTF-8. - // - // TODO(brettw): These should be replaced with versions that take something - // more friendly than a raw CharsetConverter (maybe like an ICU character set - // name). 
- GURL ResolveWithCharsetConverter( - const std::string& relative, - url::CharsetConverter* charset_converter) const; - GURL ResolveWithCharsetConverter( - const base::string16& relative, - url::CharsetConverter* charset_converter) const; - // Creates a new GURL by replacing the current URL's components with the // supplied versions. See the Replacements class in url_canon.h for more. // @@ -194,10 +180,11 @@ // returned. GURL GetAsReferrer() const; - // Returns true if the scheme for the current URL is a known "standard" - // scheme. Standard schemes have an authority and a path section. This - // includes file: and filesystem:, which some callers may want to filter out - // explicitly by calling SchemeIsFile[System]. + // Returns true if the scheme for the current URL is a known "standard-format" + // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic + // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes + // file: and filesystem:, which some callers may want to filter out explicitly + // by calling SchemeIsFile[System]. bool IsStandard() const; // Returns true if the given parameter (should be lower-case ASCII to match @@ -223,10 +210,32 @@ return SchemeIs(url::kFileSystemScheme); } - // If the scheme indicates a secure connection + // Returns true if the scheme indicates a secure connection. + // + // NOTE: This function is deprecated. You probably want + // |SchemeIsCryptographic| (if you just want to know if a scheme uses TLS for + // network transport) or Chromium's |IsOriginSecure| for a higher-level test + // about an origin's security. See those functions' documentation for more + // detail. + // + // TODO(palmer): Audit callers and change them to |SchemeIsCryptographic| or + // |IsOriginSecure|, as appropriate. Then remove |SchemeIsSecure|. 
+ // crbug.com/362214 bool SchemeIsSecure() const { return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme) || - (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure()); + (SchemeIsFileSystem() && inner_url() && + inner_url()->SchemeIsSecure()); + } + + // Returns true if the scheme indicates a network connection that uses TLS or + // some other cryptographic protocol (e.g. QUIC) for security. + // + // This function is not a complete test of whether or not an origin's code + // is minimally trustworthy. For that, see Chromium's |IsOriginSecure| for + // higher-level and more complete semantics. See that function's documentation + // for more detail. + bool SchemeIsCryptographic() const { + return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme); } // Returns true if the scheme is "blob". @@ -235,13 +244,12 @@ } // The "content" of the URL is everything after the scheme (skipping the - // scheme delimiting colon). It is an error to get the origin of an invalid - // URL. The result will be an empty string. + // scheme delimiting colon). It is an error to get the content of an invalid + // URL: the result will be an empty string. std::string GetContent() const; // Returns true if the hostname is an IP address. Note: this function isn't // as cheap as a simple getter because it re-parses the hostname to verify. - // This currently identifies only IPv4 addresses (bug 822685). bool HostIsIPAddress() const; // Getters for various components of the URL. The returned string will be @@ -274,8 +282,8 @@ return ComponentString(parsed_.ref); } - // Existance querying. These functions will return true if the corresponding - // URL component exists in this URL. Note that existance is different than + // Existence querying. These functions will return true if the corresponding + // URL component exists in this URL. Note that existence is different than // being nonempty. http://www.google.com/? 
has a query that just happens to // be empty, and has_query() will return true. bool has_scheme() const { @@ -288,7 +296,7 @@ return parsed_.password.len >= 0; } bool has_host() const { - // Note that hosts are special, absense of host means length 0. + // Note that hosts are special, absence of host means length 0. return parsed_.host.len > 0; } bool has_port() const { @@ -310,7 +318,7 @@ // values defined in Parsed for ExtractPort. int IntPort() const; - // Returns the port number of the url, or the default port number. + // Returns the port number of the URL, or the default port number. // If the scheme has no concept of port (or unknown default) returns // PORT_UNSPECIFIED. int EffectiveIntPort() const; @@ -324,29 +332,21 @@ std::string PathForRequest() const; // Returns the host, excluding the square brackets surrounding IPv6 address - // literals. This can be useful for passing to getaddrinfo(). + // literals. This can be useful for passing to getaddrinfo(). std::string HostNoBrackets() const; // Returns true if this URL's host matches or is in the same domain as - // the given input string. For example if this URL was "www.google.com", - // this would match "com", "google.com", and "www.google.com - // (input domain should be lower-case ASCII to match the canonicalized - // scheme). This call is more efficient than getting the host and check + // the given input string. For example, if the hostname of the URL is + // "www.google.com", this will return true for "com", "google.com", and + // "www.google.com". + // + // The input domain should be lower-case ASCII to match the canonicalized + // scheme. This call is more efficient than getting the host and check // whether host has the specific domain or not because no copies or // object constructions are done. - // - // If function DomainIs has parameter domain_len, which means the parameter - // lower_ascii_domain does not gurantee to terminate with NULL character. 
- bool DomainIs(const char* lower_ascii_domain, int domain_len) const; + bool DomainIs(base::StringPiece lower_ascii_domain) const; - // If function DomainIs only has parameter lower_ascii_domain, which means - // domain string should be terminate with NULL character. - bool DomainIs(const char* lower_ascii_domain) const { - return DomainIs(lower_ascii_domain, - static_cast<int>(strlen(lower_ascii_domain))); - } - - // Swaps the contents of this GURL object with the argument without doing + // Swaps the contents of this GURL object with |other|, without doing // any memory allocations. void Swap(GURL* other); @@ -363,8 +363,8 @@ private: // Variant of the string parsing constructor that allows the caller to elect - // retain trailing whitespace, if any, on the passed URL spec but only if the - // scheme is one that allows trailing whitespace. The primary use-case is + // retain trailing whitespace, if any, on the passed URL spec, but only if + // the scheme is one that allows trailing whitespace. The primary use-case is // for data: URLs. In most cases, you want to use the single parameter // constructor above. enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
diff --git a/gurl_unittest.cc b/gurl_unittest.cc index bea1a0c..18aa2ae 100644 --- a/gurl_unittest.cc +++ b/gurl_unittest.cc
@@ -45,14 +45,15 @@ EXPECT_EQ("something:///HOSTNAME.com/", TypesTestCase("something:///HOSTNAME.com/")); - // In the reverse, known schemes should always trigger standard URL handling. + // Conversely, URLs with known schemes should always trigger standard URL + // handling. EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com")); EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com")); EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com")); EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com")); #ifdef WIN32 - // URLs that look like absolute Windows drive specs. + // URLs that look like Windows absolute path specs. EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt")); EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt")); EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt")); @@ -60,7 +61,7 @@ #endif } -// Test the basic creation and querying of components in a GURL. We assume +// Test the basic creation and querying of components in a GURL. We assume that // the parser is already tested and works, so we are mostly interested if the // object does the right thing with the results. TEST(GURLTest, Components) { @@ -175,7 +176,7 @@ EXPECT_EQ("", invalid2.ref()); } -// This is a regression test for http://crbug.com/309975 . +// This is a regression test for http://crbug.com/309975. TEST(GURLTest, SelfAssign) { GURL a("filesystem:http://example.com/temporary/"); // This should not crash. @@ -245,9 +246,9 @@ } TEST(GURLTest, ExtraSlashesBeforeAuthority) { - // According to RFC3986, the hier-part for URI with an authority must use only - // two slashes, GURL intentionally just ignores slashes more than 2 and parses - // the following part as an authority. 
+ // According to RFC3986, the hierarchical part for URI with an authority + // must use only two slashes; GURL intentionally just ignores extra slashes + // if there are more than 2, and parses the following part as an authority. GURL url("http:///host"); EXPECT_EQ("host", url.host()); EXPECT_EQ("/", url.path()); @@ -378,7 +379,7 @@ } TEST(GURLTest, Replacements) { - // The url canonicalizer replacement test will handle most of these case. + // The URL canonicalizer replacement test will handle most of these case. // The most important thing to do here is to check that the proper // canonicalizer gets called based on the scheme of the input. struct ReplaceCase { @@ -395,7 +396,7 @@ } replace_cases[] = { {"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"}, {"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"}, - {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"}, + {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo", "search", "ref", "http://www.google.com:99/foo?search#ref"}, #ifdef WIN32 {"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"}, #endif @@ -435,7 +436,7 @@ EXPECT_EQ("data: one ? two ", url_no_ref.spec()); - // Importing a parsed url via this constructor overload will retain trailing + // Importing a parsed URL via this constructor overload will retain trailing // whitespace. 
GURL import_url(url_no_ref.spec(), url_no_ref.parsed_for_possibly_invalid_spec(), @@ -561,43 +562,56 @@ } TEST(GURLTest, DomainIs) { - const char google_domain[] = "google.com"; + GURL url_1("http://google.com/foo"); + EXPECT_TRUE(url_1.DomainIs("google.com")); - GURL url_1("http://www.google.com:99/foo"); - EXPECT_TRUE(url_1.DomainIs(google_domain)); + // Subdomain and port are ignored. + GURL url_2("http://www.google.com:99/foo"); + EXPECT_TRUE(url_2.DomainIs("google.com")); - GURL url_2("http://google.com:99/foo"); - EXPECT_TRUE(url_2.DomainIs(google_domain)); + // Different top-level domain. + GURL url_3("http://www.google.com.cn/foo"); + EXPECT_FALSE(url_3.DomainIs("google.com")); - GURL url_3("http://google.com./foo"); - EXPECT_TRUE(url_3.DomainIs(google_domain)); + // Different host name. + GURL url_4("http://www.iamnotgoogle.com/foo"); + EXPECT_FALSE(url_4.DomainIs("google.com")); - GURL url_4("http://google.com/foo"); - EXPECT_FALSE(url_4.DomainIs("google.com.")); + // The input must be lower-cased otherwise DomainIs returns false. + GURL url_5("http://www.google.com/foo"); + EXPECT_FALSE(url_5.DomainIs("Google.com")); - GURL url_5("http://google.com./foo"); - EXPECT_TRUE(url_5.DomainIs("google.com.")); + // If the URL is invalid, DomainIs returns false. + GURL invalid_url("google.com"); + EXPECT_FALSE(invalid_url.is_valid()); + EXPECT_FALSE(invalid_url.DomainIs("google.com")); +} - GURL url_6("http://www.google.com./foo"); - EXPECT_TRUE(url_6.DomainIs(".com.")); +TEST(GURLTest, DomainIsTerminatingDotBehavior) { + // If the host part ends with a dot, it matches input domains + // with or without a dot. 
+ GURL url_with_dot("http://www.google.com./foo"); + EXPECT_TRUE(url_with_dot.DomainIs("google.com")); + EXPECT_TRUE(url_with_dot.DomainIs("google.com.")); + EXPECT_TRUE(url_with_dot.DomainIs(".com")); + EXPECT_TRUE(url_with_dot.DomainIs(".com.")); - GURL url_7("http://www.balabala.com/foo"); - EXPECT_FALSE(url_7.DomainIs(google_domain)); + // But, if the host name doesn't end with a dot and the input + // domain does, then it's considered to not match. + GURL url_without_dot("http://google.com/foo"); + EXPECT_FALSE(url_without_dot.DomainIs("google.com.")); - GURL url_8("http://www.google.com.cn/foo"); - EXPECT_FALSE(url_8.DomainIs(google_domain)); + // If the URL ends with two dots, it doesn't match. + GURL url_with_two_dots("http://www.google.com../foo"); + EXPECT_FALSE(url_with_two_dots.DomainIs("google.com")); +} - GURL url_9("http://www.iamnotgoogle.com/foo"); - EXPECT_FALSE(url_9.DomainIs(google_domain)); +TEST(GURLTest, DomainIsWithFilesystemScheme) { + GURL url_1("filesystem:http://www.google.com:99/foo/"); + EXPECT_TRUE(url_1.DomainIs("google.com")); - GURL url_10("http://www.iamnotgoogle.com../foo"); - EXPECT_FALSE(url_10.DomainIs(".com")); - - GURL url_11("filesystem:http://www.google.com:99/foo/"); - EXPECT_TRUE(url_11.DomainIs(google_domain)); - - GURL url_12("filesystem:http://www.iamnotgoogle.com/foo/"); - EXPECT_FALSE(url_12.DomainIs(google_domain)); + GURL url_2("filesystem:http://www.iamnotgoogle.com/foo/"); + EXPECT_FALSE(url_2.DomainIs("google.com")); } // Newlines should be stripped from inputs. 
@@ -642,4 +656,29 @@ EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob()); } +TEST(GURLTest, ContentAndPathForNonStandardURLs) { + struct TestCase { + const char* url; + const char* expected; + } cases[] = { + {"null", ""}, + {"not-a-standard-scheme:this is arbitrary content", + "this is arbitrary content"}, + {"view-source:http://example.com/path", "http://example.com/path"}, + {"blob:http://example.com/GUID", "http://example.com/GUID"}, + {"blob://http://example.com/GUID", "//http://example.com/GUID"}, + {"blob:http://user:password@example.com/GUID", + "http://user:password@example.com/GUID"}, + + // TODO(mkwst): This seems like a bug. https://crbug.com/513600 + {"filesystem:http://example.com/path", "/"}, + }; + + for (const auto& test : cases) { + GURL url(test.url); + EXPECT_EQ(test.expected, url.path()) << test.url; + EXPECT_EQ(test.expected, url.GetContent()) << test.url; + } +} + } // namespace url
diff --git a/origin.cc b/origin.cc index cebf5dd..9d0c4f0 100644 --- a/origin.cc +++ b/origin.cc
@@ -1,20 +1,82 @@ -// Copyright 2014 The Chromium Authors. All rights reserved. +// Copyright 2015 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "url/origin.h" +#include <string.h> + #include "base/logging.h" -#include "base/strings/pattern.h" +#include "base/strings/string_number_conversions.h" +#include "url/gurl.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" +#include "url/url_util.h" namespace url { -Origin::Origin() : string_("null") {} +Origin::Origin() : unique_(true) { +} -Origin::Origin(const std::string& origin) : string_(origin) { - DCHECK(origin == "null" || base::MatchPattern(origin, "?*://?*")); - DCHECK_GT(origin.size(), 0u); - DCHECK(origin == "file://" || origin[origin.size() - 1] != '/'); +Origin::Origin(const GURL& url) : unique_(true) { + if (!url.is_valid() || (!url.IsStandard() && !url.SchemeIsBlob())) + return; + + if (url.SchemeIsFileSystem()) { + tuple_ = SchemeHostPort(*url.inner_url()); + } else if (url.SchemeIsBlob()) { + // If we're dealing with a 'blob:' URL, https://url.spec.whatwg.org/#origin + // defines the origin as the origin of the URL which results from parsing + // the "path", which boils down to everything after the scheme. GURL's + // 'GetContent()' gives us exactly that. 
+ tuple_ = SchemeHostPort(GURL(url.GetContent())); + } else { + tuple_ = SchemeHostPort(url); + } + + unique_ = tuple_.IsInvalid(); +} + +Origin::Origin(base::StringPiece scheme, base::StringPiece host, uint16 port) + : tuple_(scheme, host, port) { + unique_ = tuple_.IsInvalid(); +} + +Origin::~Origin() { +} + +// static +Origin Origin::UnsafelyCreateOriginWithoutNormalization( + base::StringPiece scheme, + base::StringPiece host, + uint16 port) { + return Origin(scheme, host, port); +} + +std::string Origin::Serialize() const { + if (unique()) + return "null"; + + if (scheme() == kFileScheme) + return "file://"; + + return tuple_.Serialize(); +} + +bool Origin::IsSameOriginWith(const Origin& other) const { + if (unique_ || other.unique_) + return false; + + return tuple_.Equals(other.tuple_); +} + +bool Origin::operator<(const Origin& other) const { + return tuple_ < other.tuple_; +} + +std::ostream& operator<<(std::ostream& out, const url::Origin& origin) { + return out << origin.Serialize(); } } // namespace url
diff --git a/origin.h b/origin.h index 777e4e1..c94c38c 100644 --- a/origin.h +++ b/origin.h
@@ -1,4 +1,4 @@ -// Copyright 2014 The Chromium Authors. All rights reserved. +// Copyright 2015 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -7,27 +7,130 @@ #include <string> +#include "base/strings/string16.h" +#include "base/strings/string_piece.h" +#include "url/scheme_host_port.h" +#include "url/third_party/mozilla/url_parse.h" +#include "url/url_canon.h" +#include "url/url_constants.h" #include "url/url_export.h" +class GURL; + namespace url { -// Origin represents a Web Origin serialized to a string. -// See RFC6454 for details. +// An Origin is a tuple of (scheme, host, port), as described in RFC 6454. +// +// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'. +// If you only need to extract the bits of a URL which are relevant for a +// network connection, use 'url::SchemeHostPort'. +// +// STL;SDR: If you aren't making actual network connections, use 'url::Origin'. +// +// 'Origin', like 'SchemeHostPort', is composed of a tuple of (scheme, host, +// port), but contains a number of additional concepts which make it appropriate +// for use as a security boundary and access control mechanism between contexts. +// +// This class ought to be used when code needs to determine if two resources +// are "same-origin", and when a canonical serialization of an origin is +// required. Note that some origins are "unique", meaning that they are not +// same-origin with any other origin (including themselves). +// +// There are a few subtleties to note: +// +// * Invalid and non-standard GURLs are parsed as unique origins. This includes +// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'. +// +// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the +// internals of the URL. That is, 'filesystem:https://example.com/temporary/f' +// is parsed as ('https', 'example.com', 443). 
+// +// * Unique origins all serialize to the string "null"; this means that the +// serializations of two unique origins are identical to each other, though +// the origins themselves are not "the same". This means that origins' +// serializations must not be relied upon for security checks. +// +// * GURLs with a 'file' scheme are tricky. They are parsed as ('file', '', 0), +// but their behavior may differ from embedder to embedder. +// +// * The host component of an IPv6 address includes brackets, just like the URL +// representation. +// +// Usage: +// +// * Origins are generally constructed from an already-canonicalized GURL: +// +// GURL url("https://example.com/"); +// url::Origin origin(url); +// origin.scheme(); // "https" +// origin.host(); // "example.com" +// origin.port(); // 443 +// origin.unique(); // false +// +// * To answer the question "Are |this| and |that| "same-origin" with each +// other?", use |Origin::IsSameOriginWith|: +// +// if (this.IsSameOriginWith(that)) { +// // Amazingness goes here. +// } class URL_EXPORT Origin { public: + // Creates a unique Origin. Origin(); - explicit Origin(const std::string& origin); - const std::string& string() const { return string_; } + // Creates an Origin from |url|, as described at + // https://url.spec.whatwg.org/#origin, with the following additions: + // + // 1. If |url| is invalid or non-standard, a unique Origin is constructed. + // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed + // out of everything in the URL which follows the scheme). + // 3. 'file' URLs all parse as ("file", "", 0). + explicit Origin(const GURL& url); - bool IsSameAs(const Origin& that) const { - return string_ == that.string_; - } + // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters + // must be valid and canonicalized. In particular, note that this cannot be + // used to create unique origins; 'url::Origin()' is the right way to do that. 
+ // + // This constructor should be used in order to pass 'Origin' objects back and + // forth over IPC (as transitioning through GURL would risk potentially + // dangerous recanonicalization); other potential callers should prefer the + // 'GURL'-based constructor. + static Origin UnsafelyCreateOriginWithoutNormalization( + base::StringPiece scheme, + base::StringPiece host, + uint16 port); + + ~Origin(); + + // For unique origins, these return ("", "", 0). + const std::string& scheme() const { return tuple_.scheme(); } + const std::string& host() const { return tuple_.host(); } + uint16 port() const { return tuple_.port(); } + + bool unique() const { return unique_; } + + // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with + // the addition that all Origins with a 'file' scheme serialize to "file://". + std::string Serialize() const; + + // Two Origins are "same-origin" if their schemes, hosts, and ports are exact + // matches; and neither is unique. + bool IsSameOriginWith(const Origin& other) const; + + // Allows Origin to be used as a key in STL (for example, a std::set or + // std::map). + bool operator<(const Origin& other) const; private: - std::string string_; + Origin(base::StringPiece scheme, base::StringPiece host, uint16 port); + + SchemeHostPort tuple_; + bool unique_; }; +URL_EXPORT std::ostream& operator<<(std::ostream& out, + const Origin& origin); + } // namespace url #endif // URL_ORIGIN_H_
diff --git a/origin_unittest.cc b/origin_unittest.cc index c094ee6..ec4ec65 100644 --- a/origin_unittest.cc +++ b/origin_unittest.cc
@@ -1,41 +1,251 @@ -// Copyright 2014 The Chromium Authors. All rights reserved. +// Copyright 2015 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "testing/gtest/include/gtest/gtest.h" +#include "base/logging.h" #include "url/origin.h" - -namespace url { +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" namespace { -// Each test examines the Origin is constructed correctly without -// violating DCHECKs. -TEST(OriginTest, constructEmpty) { - Origin origin; - EXPECT_EQ("null", origin.string()); +TEST(OriginTest, UniqueOriginComparison) { + url::Origin unique_origin; + EXPECT_EQ("", unique_origin.scheme()); + EXPECT_EQ("", unique_origin.host()); + EXPECT_EQ(0, unique_origin.port()); + EXPECT_TRUE(unique_origin.unique()); + EXPECT_FALSE(unique_origin.IsSameOriginWith(unique_origin)); + + const char* const urls[] = {"data:text/html,Hello!", + "javascript:alert(1)", + "file://example.com:443/etc/passwd", + "yay", + "http::///invalid.example.com/"}; + + for (const auto& test_url : urls) { + SCOPED_TRACE(test_url); + GURL url(test_url); + url::Origin origin(url); + EXPECT_EQ("", origin.scheme()); + EXPECT_EQ("", origin.host()); + EXPECT_EQ(0, origin.port()); + EXPECT_TRUE(origin.unique()); + EXPECT_FALSE(origin.IsSameOriginWith(origin)); + EXPECT_FALSE(unique_origin.IsSameOriginWith(origin)); + EXPECT_FALSE(origin.IsSameOriginWith(unique_origin)); + } } -TEST(OriginTest, constructNull) { - Origin origin("null"); - EXPECT_EQ("null", origin.string()); +TEST(OriginTest, ConstructFromGURL) { + url::Origin different_origin(GURL("https://not-in-the-list.test/")); + + struct TestCases { + const char* const url; + const char* const expected_scheme; + const char* const expected_host; + const uint16 expected_port; + } cases[] = { + // IP Addresses + {"http://192.168.9.1/", "http", "192.168.9.1", 80}, + {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 
80}, + + // Punycode + {"http://☃.net/", "http", "xn--n3h.net", 80}, + {"blob:http://☃.net/", "http", "xn--n3h.net", 80}, + + // Generic URLs + {"http://example.com/", "http", "example.com", 80}, + {"http://example.com:123/", "http", "example.com", 123}, + {"https://example.com/", "https", "example.com", 443}, + {"https://example.com:123/", "https", "example.com", 123}, + {"http://user:pass@example.com/", "http", "example.com", 80}, + {"http://example.com:123/?query", "http", "example.com", 123}, + {"https://example.com/#1234", "https", "example.com", 443}, + {"https://u:p@example.com:123/?query#1234", "https", "example.com", 123}, + + // Registered URLs + {"ftp://example.com/", "ftp", "example.com", 21}, + {"gopher://example.com/", "gopher", "example.com", 70}, + {"ws://example.com/", "ws", "example.com", 80}, + {"wss://example.com/", "wss", "example.com", 443}, + + // file: URLs + {"file:///etc/passwd", "file", "", 0}, + {"file://example.com/etc/passwd", "file", "example.com", 0}, + + // Filesystem: + {"filesystem:http://example.com/type/", "http", "example.com", 80}, + {"filesystem:http://example.com:123/type/", "http", "example.com", 123}, + {"filesystem:https://example.com/type/", "https", "example.com", 443}, + {"filesystem:https://example.com:123/type/", "https", "example.com", 123}, + + // Blob: + {"blob:http://example.com/guid-goes-here", "http", "example.com", 80}, + {"blob:http://example.com:123/guid-goes-here", "http", "example.com", 123}, + {"blob:https://example.com/guid-goes-here", "https", "example.com", 443}, + {"blob:http://u:p@example.com/guid-goes-here", "http", "example.com", 80}, + }; + + for (const auto& test_case : cases) { + SCOPED_TRACE(test_case.url); + GURL url(test_case.url); + EXPECT_TRUE(url.is_valid()); + url::Origin origin(url); + EXPECT_EQ(test_case.expected_scheme, origin.scheme()); + EXPECT_EQ(test_case.expected_host, origin.host()); + EXPECT_EQ(test_case.expected_port, origin.port()); + EXPECT_FALSE(origin.unique()); + 
EXPECT_TRUE(origin.IsSameOriginWith(origin)); + EXPECT_FALSE(different_origin.IsSameOriginWith(origin)); + EXPECT_FALSE(origin.IsSameOriginWith(different_origin)); + } } -TEST(OriginTest, constructValidOrigin) { - Origin origin("http://example.com:8080"); - EXPECT_EQ("http://example.com:8080", origin.string()); +TEST(OriginTest, Serialization) { + struct TestCases { + const char* const url; + const char* const expected; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", "http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://"}, + {"file://example.com/etc/passwd", "file://"}, + }; + + for (const auto& test_case : cases) { + SCOPED_TRACE(test_case.url); + GURL url(test_case.url); + EXPECT_TRUE(url.is_valid()); + url::Origin origin(url); + EXPECT_EQ(test_case.expected, origin.Serialize()); + + // The '<<' operator should produce the same serialization as Serialize(). 
+ std::stringstream out; + out << origin; + EXPECT_EQ(test_case.expected, out.str()); + } } -TEST(OriginTest, constructValidFileOrigin) { - Origin origin("file://"); - EXPECT_EQ("file://", origin.string()); +TEST(OriginTest, Comparison) { + // These URLs are arranged in increasing order: + const char* const urls[] = { + "data:uniqueness", + "http://a:80", + "http://b:80", + "https://a:80", + "https://b:80", + "http://a:81", + "http://b:81", + "https://a:81", + "https://b:81", + }; + + for (size_t i = 0; i < arraysize(urls); i++) { + GURL current_url(urls[i]); + url::Origin current(current_url); + for (size_t j = i; j < arraysize(urls); j++) { + GURL compare_url(urls[j]); + url::Origin to_compare(compare_url); + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } } -TEST(OriginTest, constructValidOriginWithoutPort) { - Origin origin("wss://example2.com"); - EXPECT_EQ("wss://example2.com", origin.string()); +TEST(OriginTest, UnsafelyCreate) { + struct TestCase { + const char* scheme; + const char* host; + uint16 port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"file", "", 0}, + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization( + test.scheme, test.host, test.port); + EXPECT_EQ(test.scheme, origin.scheme()); + EXPECT_EQ(test.host, origin.host()); + EXPECT_EQ(test.port, origin.port()); + EXPECT_FALSE(origin.unique()); + EXPECT_TRUE(origin.IsSameOriginWith(origin)); + } } -} // namespace +TEST(OriginTest, UnsafelyCreateUniqueOnInvalidInput) { + struct TestCases { + const char* scheme; + const char* host; + uint16 port; + } cases[] = {{"", "", 0}, + {"data", "", 0}, + {"blob", "", 0}, + {"filesystem", "", 
0}, + {"data", "example.com", 80}, + {"http", "☃.net", 80}, + {"http\nmore", "example.com", 80}, + {"http\rmore", "example.com", 80}, + {"http\n", "example.com", 80}, + {"http\r", "example.com", 80}, + {"http", "example.com\nnot-example.com", 80}, + {"http", "example.com\rnot-example.com", 80}, + {"http", "example.com\n", 80}, + {"http", "example.com\r", 80}, + {"http", "example.com", 0}, + {"file", "", 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization( + test.scheme, test.host, test.port); + EXPECT_EQ("", origin.scheme()); + EXPECT_EQ("", origin.host()); + EXPECT_EQ(0, origin.port()); + EXPECT_TRUE(origin.unique()); + EXPECT_FALSE(origin.IsSameOriginWith(origin)); + } +} + +TEST(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) { + struct TestCases { + const char* scheme; + size_t scheme_length; + const char* host; + size_t host_length; + uint16 port; + } cases[] = {{"http\0more", 9, "example.com", 11, 80}, + {"http\0", 5, "example.com", 11, 80}, + {"\0http", 5, "example.com", 11, 80}, + {"http", 4, "example.com\0not-example.com", 27, 80}, + {"http", 4, "example.com\0", 12, 80}, + {"http", 4, "\0example.com", 12, 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization( + std::string(test.scheme, test.scheme_length), + std::string(test.host, test.host_length), test.port); + EXPECT_EQ("", origin.scheme()); + EXPECT_EQ("", origin.host()); + EXPECT_EQ(0, origin.port()); + EXPECT_TRUE(origin.unique()); + EXPECT_FALSE(origin.IsSameOriginWith(origin)); + } +} } // namespace url
diff --git a/scheme_host_port.cc b/scheme_host_port.cc new file mode 100644 index 0000000..c2fe830 --- /dev/null +++ b/scheme_host_port.cc
@@ -0,0 +1,129 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "url/scheme_host_port.h" + +#include <string.h> + +#include "base/logging.h" +#include "base/strings/string_number_conversions.h" +#include "url/gurl.h" +#include "url/url_canon.h" +#include "url/url_canon_stdstring.h" +#include "url/url_constants.h" +#include "url/url_util.h" + +namespace url { + +SchemeHostPort::SchemeHostPort() : port_(0) { +} + +SchemeHostPort::SchemeHostPort(base::StringPiece scheme, + base::StringPiece host, + uint16 port) + : scheme_(scheme.data(), scheme.length()), + host_(host.data(), host.length()), + port_(port) { + // Try to canonicalize the host (copy/pasted from net/base. :( ). + const url::Component raw_host_component(0, static_cast<int>(host.length())); + std::string canon_host; + url::StdStringCanonOutput canon_host_output(&canon_host); + url::CanonHostInfo host_info; + url::CanonicalizeHostVerbose(host.data(), raw_host_component, + &canon_host_output, &host_info); + + if (host_info.out_host.is_nonempty() && + host_info.family != url::CanonHostInfo::BROKEN) { + // Success! Assert that there's no extra garbage. + canon_host_output.Complete(); + DCHECK_EQ(host_info.out_host.len, static_cast<int>(canon_host.length())); + } else { + // Empty host, or canonicalization failed. + canon_host.clear(); + } + + // Return an invalid SchemeHostPort object if any of the following conditions + // hold: + // + // 1. The provided scheme is non-standard, 'blob:', or 'filesystem:'. + // 2. The provided host is non-canonical. + // 3. The scheme is 'file' and the port is non-zero. + // 4. The scheme is not 'file', and the port is zero or the host is empty. 
+ bool isUnsupportedScheme = + !url::IsStandard(scheme.data(), + url::Component(0, static_cast<int>(scheme.length()))) || + scheme == kFileSystemScheme || scheme == kBlobScheme; + bool isNoncanonicalHost = host != canon_host; + bool isFileSchemeWithPort = scheme == kFileScheme && port != 0; + bool isNonFileSchemeWithoutPortOrHost = + scheme != kFileScheme && (port == 0 || host.empty()); + if (isUnsupportedScheme || isNoncanonicalHost || isFileSchemeWithPort || + isNonFileSchemeWithoutPortOrHost) { + scheme_.clear(); + host_.clear(); + port_ = 0; + } +} + +SchemeHostPort::SchemeHostPort(const GURL& url) : port_(0) { + if (!url.is_valid() || !url.IsStandard()) + return; + + // These schemes do not follow the generic URL syntax, so we treat them as + // invalid (scheme, host, port) tuples (even though such URLs' _Origin_ might + // have a (scheme, host, port) tuple, they themselves do not). + if (url.SchemeIsBlob() || url.SchemeIsFileSystem()) + return; + + scheme_ = url.scheme(); + host_ = url.host(); + port_ = url.EffectiveIntPort() == url::PORT_UNSPECIFIED + ? 
0 + : url.EffectiveIntPort(); +} + +SchemeHostPort::~SchemeHostPort() { +} + +bool SchemeHostPort::IsInvalid() const { + return scheme_.empty() && host_.empty() && !port_; +} + +std::string SchemeHostPort::Serialize() const { + std::string result; + if (IsInvalid()) + return result; + + bool is_default_port = + port_ == url::DefaultPortForScheme(scheme_.data(), + static_cast<int>(scheme_.length())); + + result.append(scheme_); + result.append(kStandardSchemeSeparator); + result.append(host_); + + if (scheme_ != kFileScheme && !is_default_port) { + result.push_back(':'); + result.append(base::IntToString(port_)); + } + + return result; +} + +bool SchemeHostPort::Equals(const SchemeHostPort& other) const { + return port_ == other.port() && scheme_ == other.scheme() && + host_ == other.host(); +} + +bool SchemeHostPort::operator<(const SchemeHostPort& other) const { + if (port_ != other.port_) + return port_ < other.port_; + if (scheme_ != other.scheme_) + return scheme_ < other.scheme_; + if (host_ != other.host_) + return host_ < other.host_; + return false; +} + +} // namespace url
diff --git a/scheme_host_port.h b/scheme_host_port.h new file mode 100644 index 0000000..2cc9e07 --- /dev/null +++ b/scheme_host_port.h
@@ -0,0 +1,132 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef URL_SCHEME_HOST_PORT_H_ +#define URL_SCHEME_HOST_PORT_H_ + +#include <string> + +#include "base/basictypes.h" +#include "base/strings/string_piece.h" +#include "url/url_export.h" + +class GURL; + +namespace url { + +// This class represents a (scheme, host, port) tuple extracted from a URL. +// +// The primary purpose of this class is to represent relevant network-authority +// information for a URL. It is _not_ an Origin, as described in RFC 6454. In +// particular, it is generally NOT the right thing to use for security +// decisions. +// +// Instead, this class is a mechanism for simplifying URLs with standard schemes +// (that is, those which follow the generic syntax of RFC 3986) down to the +// uniquely identifying information necessary for network fetches. This makes it +// suitable as a cache key for a collection of active connections, for instance. +// It may, however, be inappropriate to use as a cache key for persistent +// storage associated with a host. +// +// In particular, note that: +// +// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax +// (e.g. those registered with GURL as "standard schemes"). Non-standard +// schemes such as "blob", "filesystem", "data", and "javascript" can only be +// represented as invalid SchemeHostPort objects. +// +// * The "file" scheme follows the standard syntax, but it is important to note +// that the authority portion (host, port) is optional. URLs without an +// authority portion will be represented with an empty string for the host, +// and a port of 0 (e.g. "file:///etc/hosts" => ("file", "", 0)), and URLs +// with a host-only authority portion will be represented with a port of 0 +// (e.g. "file://example.com/etc/hosts" => ("file", "example.com", 0)). 
See +// Section 3 of RFC 3986 to better understand these constructs. +// +// * SchemeHostPort has no notion of the Origin concept (RFC 6454), and in +// particular, it has no notion of a "unique" Origin. If you need to take +// uniqueness into account (and, if you're making security-relevant decisions +// then you absolutely do), please use 'url::Origin' instead[1]. +// +// [1]: // TODO(mkwst): Land 'url::Origin'. :) +// +// Usage: +// +// * SchemeHostPort objects are commonly created from GURL objects: +// +// GURL url("https://example.com/"); +// url::SchemeHostPort tuple(url); +// tuple.scheme(); // "https" +// tuple.host(); // "example.com" +// tuple.port(); // 443 +// +// * Objects may also be explicitly created and compared: +// +// url::SchemeHostPort tuple(url::kHttpsScheme, "example.com", 443); +// tuple.scheme(); // "https" +// tuple.host(); // "example.com" +// tuple.port(); // 443 +// +// GURL url("https://example.com/"); +// tuple.Equals(url::SchemeHostPort(url)); // true +class URL_EXPORT SchemeHostPort { + public: + // Creates an invalid (scheme, host, port) tuple, which represents an invalid + // or non-standard URL. + SchemeHostPort(); + + // Creates a (scheme, host, port) tuple. |host| must be a canonicalized + // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme| + // must be a standard scheme. |port| must not be 0, unless |scheme| does not + // support ports (e.g. 'file'). In that case, |port| must be 0. + // + // Copies the data in |scheme| and |host|. + SchemeHostPort(base::StringPiece scheme, base::StringPiece host, uint16 port); + + // Creates a (scheme, host, port) tuple from |url|, as described at + // https://tools.ietf.org/html/rfc6454#section-4 + // + // If |url| is invalid or non-standard, the result will be an invalid + // SchemeHostPort object. + explicit SchemeHostPort(const GURL& url); + + ~SchemeHostPort(); + + // Returns the host component, in URL form. 
That is, all IDN domain names will + // be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'), and + // all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]"). + const std::string& host() const { return host_; } + const std::string& scheme() const { return scheme_; } + uint16 port() const { return port_; } + bool IsInvalid() const; + + // Serializes the SchemeHostPort tuple to a canonical form. + // + // While this string form resembles the Origin serialization specified in + // Section 6.2 of RFC 6454, it is important to note that invalid + // SchemeHostPort tuples serialize to the empty string, rather than being + // serialized as a unique Origin. + std::string Serialize() const; + + // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports + // are exact matches. + // + // Note that this comparison is _not_ the same as an origin-based comparison. + // In particular, invalid SchemeHostPort objects match each other (and + // themselves). Unique origins, on the other hand, would not. + bool Equals(const SchemeHostPort& other) const; + + // Allows SchemeHostPort to be used as a key in STL (for example, a std::set or + // std::map). + bool operator<(const SchemeHostPort& other) const; + + private: + std::string scheme_; + std::string host_; + uint16 port_; +}; + +} // namespace url + +#endif // URL_SCHEME_HOST_PORT_H_
diff --git a/scheme_host_port_unittest.cc b/scheme_host_port_unittest.cc new file mode 100644 index 0000000..817631d --- /dev/null +++ b/scheme_host_port_unittest.cc
@@ -0,0 +1,215 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" +#include "url/scheme_host_port.h" + +namespace { + +TEST(SchemeHostPortTest, Invalid) { + url::SchemeHostPort invalid; + EXPECT_EQ("", invalid.scheme()); + EXPECT_EQ("", invalid.host()); + EXPECT_EQ(0, invalid.port()); + EXPECT_TRUE(invalid.IsInvalid()); + EXPECT_TRUE(invalid.Equals(invalid)); + + const char* urls[] = {"data:text/html,Hello!", + "javascript:alert(1)", + "file://example.com:443/etc/passwd", + "blob:https://example.com/uuid-goes-here", + "filesystem:https://example.com/temporary/yay.png"}; + + for (const auto& test : urls) { + SCOPED_TRACE(test); + GURL url(test); + url::SchemeHostPort tuple(url); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + EXPECT_TRUE(tuple.Equals(tuple)); + EXPECT_TRUE(tuple.Equals(invalid)); + EXPECT_TRUE(invalid.Equals(tuple)); + } +} + +TEST(SchemeHostPortTest, ExplicitConstruction) { + struct TestCases { + const char* scheme; + const char* host; + uint16 port; + } cases[] = { + {"http", "example.com", 80}, + {"http", "example.com", 123}, + {"https", "example.com", 443}, + {"https", "example.com", 123}, + {"file", "", 0}, + {"file", "example.com", 0}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_FALSE(tuple.IsInvalid()); + EXPECT_TRUE(tuple.Equals(tuple)); + } +} + +TEST(SchemeHostPortTest, InvalidConstruction) { + struct TestCases { + const char* scheme; + const char* host; + uint16 port; + } cases[] = {{"", 
"", 0}, + {"data", "", 0}, + {"blob", "", 0}, + {"filesystem", "", 0}, + {"http", "", 80}, + {"data", "example.com", 80}, + {"http", "☃.net", 80}, + {"http\nmore", "example.com", 80}, + {"http\rmore", "example.com", 80}, + {"http\n", "example.com", 80}, + {"http\r", "example.com", 80}, + {"http", "example.com\nnot-example.com", 80}, + {"http", "example.com\rnot-example.com", 80}, + {"http", "example.com\n", 80}, + {"http", "example.com\r", 80}, + {"http", "example.com", 0}, + {"file", "", 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(test.scheme, test.host, test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + EXPECT_TRUE(tuple.Equals(tuple)); + } +} + +TEST(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) { + struct TestCases { + const char* scheme; + size_t scheme_length; + const char* host; + size_t host_length; + uint16 port; + } cases[] = {{"http\0more", 9, "example.com", 11, 80}, + {"http\0", 5, "example.com", 11, 80}, + {"\0http", 5, "example.com", 11, 80}, + {"http", 4, "example.com\0not-example.com", 27, 80}, + {"http", 4, "example.com\0", 12, 80}, + {"http", 4, "\0example.com", 12, 80}}; + + for (const auto& test : cases) { + SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":" + << test.port); + url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length), + std::string(test.host, test.host_length), + test.port); + EXPECT_EQ("", tuple.scheme()); + EXPECT_EQ("", tuple.host()); + EXPECT_EQ(0, tuple.port()); + EXPECT_TRUE(tuple.IsInvalid()); + } +} + +TEST(SchemeHostPortTest, GURLConstruction) { + struct TestCases { + const char* url; + const char* scheme; + const char* host; + uint16 port; + } cases[] = { + {"http://192.168.9.1/", "http", "192.168.9.1", 80}, + {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 
80}, + {"http://☃.net/", "http", "xn--n3h.net", 80}, + {"http://example.com/", "http", "example.com", 80}, + {"http://example.com:123/", "http", "example.com", 123}, + {"https://example.com/", "https", "example.com", 443}, + {"https://example.com:123/", "https", "example.com", 123}, + {"file:///etc/passwd", "file", "", 0}, + {"file://example.com/etc/passwd", "file", "example.com", 0}, + {"http://u:p@example.com/", "http", "example.com", 80}, + {"http://u:p@example.com/path", "http", "example.com", 80}, + {"http://u:p@example.com/path?123", "http", "example.com", 80}, + {"http://u:p@example.com/path?123#hash", "http", "example.com", 80}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + EXPECT_TRUE(url.is_valid()); + url::SchemeHostPort tuple(url); + EXPECT_EQ(test.scheme, tuple.scheme()); + EXPECT_EQ(test.host, tuple.host()); + EXPECT_EQ(test.port, tuple.port()); + EXPECT_FALSE(tuple.IsInvalid()); + EXPECT_TRUE(tuple.Equals(tuple)); + } +} + +TEST(SchemeHostPortTest, Serialization) { + struct TestCases { + const char* url; + const char* expected; + } cases[] = { + {"http://192.168.9.1/", "http://192.168.9.1"}, + {"http://[2001:db8::1]/", "http://[2001:db8::1]"}, + {"http://☃.net/", "http://xn--n3h.net"}, + {"http://example.com/", "http://example.com"}, + {"http://example.com:123/", "http://example.com:123"}, + {"https://example.com/", "https://example.com"}, + {"https://example.com:123/", "https://example.com:123"}, + {"file:///etc/passwd", "file://"}, + {"file://example.com/etc/passwd", "file://example.com"}, + }; + + for (const auto& test : cases) { + SCOPED_TRACE(test.url); + GURL url(test.url); + url::SchemeHostPort tuple(url); + EXPECT_EQ(test.expected, tuple.Serialize()); + } +} + +TEST(SchemeHostPortTest, Comparison) { + // These tuples are arranged in increasing order: + struct SchemeHostPorts { + const char* scheme; + const char* host; + uint16 port; + } tuples[] = { + {"http", "a", 80}, + {"http", "b", 80}, + 
{"https", "a", 80}, + {"https", "b", 80}, + {"http", "a", 81}, + {"http", "b", 81}, + {"https", "a", 81}, + {"https", "b", 81}, + }; + + for (size_t i = 0; i < arraysize(tuples); i++) { + url::SchemeHostPort current(tuples[i].scheme, tuples[i].host, + tuples[i].port); + for (size_t j = i; j < arraysize(tuples); j++) { + url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host, + tuples[j].port); + EXPECT_EQ(i < j, current < to_compare) << i << " < " << j; + EXPECT_EQ(j < i, to_compare < current) << j << " < " << i; + } + } +} + +} // namespace url
diff --git a/third_party/mozilla/url_parse.h b/third_party/mozilla/url_parse.h index 71dbb78..7bfcdc8 100644 --- a/third_party/mozilla/url_parse.h +++ b/third_party/mozilla/url_parse.h
@@ -5,9 +5,6 @@ #ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ #define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_ -#include <string> - -#include "base/basictypes.h" #include "base/strings/string16.h" #include "url/url_export.h"
diff --git a/url_canon.h b/url_canon.h index 432f291..95d5345 100644 --- a/url_canon.h +++ b/url_canon.h
@@ -9,8 +9,8 @@ #include <string.h> #include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_export.h" -#include "url/url_parse.h" namespace url { @@ -285,7 +285,7 @@ // User info: username/password. If present, this will add the delimiters so // the output will be "<username>:<password>@" or "<username>@". Empty // username/password pairs, or empty passwords, will get converted to -// nonexistant in the canonical version. +// nonexistent in the canonical version. // // The components for the username and password refer to ranges in the // respective source strings. Usually, these will be the same string, which @@ -317,13 +317,13 @@ // This field summarizes how the input was classified by the canonicalizer. enum Family { - NEUTRAL, // - Doesn't resemble an IP address. As far as the IP + NEUTRAL, // - Doesn't resemble an IP address. As far as the IP // canonicalizer is concerned, it should be treated as a // hostname. - BROKEN, // - Almost an IP, but was not canonicalized. This could be an + BROKEN, // - Almost an IP, but was not canonicalized. This could be an // IPv4 address where truncation occurred, or something // containing the special characters :[] which did not parse - // as an IPv6 address. Never attempt to connect to this + // as an IPv6 address. Never attempt to connect to this // address, because it might actually succeed! IPV4, // - Successfully canonicalized as an IPv4 address. IPV6, // - Successfully canonicalized as an IPv6 address. @@ -331,7 +331,7 @@ Family family; // If |family| is IPV4, then this is the number of nonempty dot-separated - // components in the input text, from 1 to 4. If |family| is not IPV4, + // components in the input text, from 1 to 4. If |family| is not IPV4, // this value is undefined. int num_ipv4_components; @@ -355,7 +355,7 @@ // Host. // -// The 8-bit version requires UTF-8 encoding. Use this version when you only +// The 8-bit version requires UTF-8 encoding. 
Use this version when you only // need to know whether canonicalization succeeded. URL_EXPORT bool CanonicalizeHost(const char* spec, const Component& host, @@ -368,7 +368,7 @@ // Extended version of CanonicalizeHost, which returns additional information. // Use this when you need to know whether the hostname was an IP address. -// A successful return is indicated by host_info->family != BROKEN. See the +// A successful return is indicated by host_info->family != BROKEN. See the // definition of CanonHostInfo above for details. URL_EXPORT void CanonicalizeHostVerbose(const char* spec, const Component& host, @@ -554,7 +554,7 @@ CanonOutput* output, Parsed* new_parsed); -// Use for mailto URLs. This "canonicalizes" the url into a path and query +// Use for mailto URLs. This "canonicalizes" the URL into a path and query // component. It does not attempt to merge "to" fields. It uses UTF-8 for // the query encoding if there is a query. This is because a mailto URL is // really intended for an external mail program, and the encoding of a page, @@ -578,9 +578,9 @@ // treated on the same code path as regular canonicalization (the same string // for each component). // -// A Parsed structure usually goes along with this. Those -// components identify offsets within these strings, so that they can all be -// in the same string, or spread arbitrarily across different ones. +// A Parsed structure usually goes along with this. Those components identify +// offsets within these strings, so that they can all be in the same string, +// or spread arbitrarily across different ones. // // This structures does not own any data. It is the caller's responsibility to // ensure that the data the pointers point to stays in scope and is not @@ -725,7 +725,7 @@ } bool IsRefOverridden() const { return sources_.ref != NULL; } - // Getters for the itnernal data. See the variables below for how the + // Getters for the internal data. 
See the variables below for how the + // information is encoded. const URLComponentSource<CHAR>& sources() const { return sources_; } const Parsed& components() const { return components_; } @@ -863,7 +863,7 @@ // The base URL should be canonical and have a host (may be empty for file // URLs) and a path. If it doesn't have these, we can't resolve relative // URLs off of it and will return the base as the output with an error flag. -// Becausee it is canonical is should also be ASCII. +// Because it is canonical it should also be ASCII. // // The query charset converter follows the same rules as CanonicalizeQuery. //
diff --git a/url_canon_etc.cc b/url_canon_etc.cc index 7409efd..e9da94c 100644 --- a/url_canon_etc.cc +++ b/url_canon_etc.cc
@@ -95,9 +95,9 @@ // The output scheme starts from the current position. out_scheme->begin = output->length(); - // Danger: it's important that this code does not strip any characters: it - // only emits the canonical version (be it valid or escaped) of each of - // the input characters. Stripping would put it out of sync with + // Danger: it's important that this code does not strip any characters; + // it only emits the canonical version (be it valid or escaped) for each + // of the input characters. Stripping would put it out of sync with // FindAndCompareScheme, which could cause some security checks on // schemes to be incorrect. bool success = true; @@ -218,7 +218,7 @@ char buf[buf_size]; WritePortInt(buf, buf_size, port_num); - // Append the port number to the output, preceeded by a colon. + // Append the port number to the output, preceded by a colon. output->push_back(':'); out_port->begin = output->length(); for (int i = 0; i < buf_size && buf[i]; i++)
diff --git a/url_canon_host.cc b/url_canon_host.cc index 513248a..fce4d3a 100644 --- a/url_canon_host.cc +++ b/url_canon_host.cc
@@ -34,7 +34,7 @@ // NOTE: I didn't actually test all the control characters. Some may be // disallowed in the input, but they are all accepted escaped except for 0. // I also didn't test if characters affecting HTML parsing are allowed -// unescaped, eg. (") or (#), which would indicate the beginning of the path. +// unescaped, e.g. (") or (#), which would indicate the beginning of the path. // Surprisingly, space is accepted in the input and always escaped. // This table lists the canonical version of all characters we allow in the @@ -316,11 +316,11 @@ } if (!success) { - // Canonicalization failed. Set BROKEN to notify the caller. + // Canonicalization failed. Set BROKEN to notify the caller. host_info->family = CanonHostInfo::BROKEN; } else { // After all the other canonicalization, check if we ended up with an IP - // address. IP addresses are small, so writing into this temporary buffer + // address. IP addresses are small, so writing into this temporary buffer // should not cause an allocation. RawCanonOutput<64> canon_ip; CanonicalizeIPAddress(output->data(), @@ -328,7 +328,7 @@ &canon_ip, host_info); // If we got an IPv4/IPv6 address, copy the canonical form back to the - // real buffer. Otherwise, it's a hostname or broken IP, in which case + // real buffer. Otherwise, it's a hostname or broken IP, in which case // we just leave it in place. if (host_info->IsIPAddress()) { output->set_length(output_begin);
diff --git a/url_canon_icu.cc b/url_canon_icu.cc index 741bed2..8a80d71 100644 --- a/url_canon_icu.cc +++ b/url_canon_icu.cc
@@ -99,8 +99,10 @@ // TODO(jungshik): Change options as different parties (browsers, // registrars, search engines) converge toward a consensus. value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err); - if (U_FAILURE(err)) + if (U_FAILURE(err)) { + CHECK(false) << "failed to open UTS46 data with error: " << err; value = NULL; + } } UIDNA* value;
diff --git a/url_canon_internal.cc b/url_canon_internal.cc index 1554814..164c6cf 100644 --- a/url_canon_internal.cc +++ b/url_canon_internal.cc
@@ -249,9 +249,9 @@ bool ReadUTFChar(const char* str, int* begin, int length, unsigned* code_point_out) { - // This depends on ints and int32s being the same thing. If they're not, it + // This depends on ints and int32s being the same thing. If they're not, it // will fail to compile. - // TODO(mmenke): This should probably be fixed. + // TODO(mmenke): This should probably be fixed. if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) || !base::IsValidCharacter(*code_point_out)) { *code_point_out = kUnicodeReplacementCharacter; @@ -262,9 +262,9 @@ bool ReadUTFChar(const base::char16* str, int* begin, int length, unsigned* code_point_out) { - // This depends on ints and int32s being the same thing. If they're not, it + // This depends on ints and int32s being the same thing. If they're not, it // will fail to compile. - // TODO(mmenke): This should probably be fixed. + // TODO(mmenke): This should probably be fixed. if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) || !base::IsValidCharacter(*code_point_out)) { *code_point_out = kUnicodeReplacementCharacter;
diff --git a/url_canon_internal.h b/url_canon_internal.h index 71bfc40..8a926b6 100644 --- a/url_canon_internal.h +++ b/url_canon_internal.h
@@ -7,7 +7,7 @@ // This file is intended to be included in another C++ file where the character // types are defined. This allows us to write mostly generic code, but not have -// templace bloat because everything is inlined when anybody calls any of our +// template bloat because everything is inlined when anybody calls any of our // functions. #include <stdlib.h> @@ -41,7 +41,7 @@ // Valid in an ASCII-representation of an octal digit. CHAR_OCT = 32, - // Characters that do not require escaping in encodeURIComponent. Characters + // Characters that do not require escaping in encodeURIComponent. Characters // that do not have this flag will be escaped; see url_util.cc. CHAR_COMPONENT = 64, }; @@ -175,7 +175,7 @@ output); Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)), output); - } else if (char_value <= 0x10FFFF) { // Max unicode code point. + } else if (char_value <= 0x10FFFF) { // Max Unicode code point. // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)), output); @@ -199,7 +199,7 @@ } // Writes the given character to the output as UTF-8. This does NO checking -// of the validity of the unicode characters; the caller should ensure that +// of the validity of the Unicode characters; the caller should ensure that // the value it is appending is valid to append. inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) { DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output); @@ -207,7 +207,7 @@ // Writes the given character to the output as UTF-8, escaping ALL // characters (even when they are ASCII). This does NO checking of the -// validity of the unicode characters; the caller should ensure that the value +// validity of the Unicode characters; the caller should ensure that the value // it is appending is valid to append. 
inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) { DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output); @@ -260,7 +260,7 @@ // that any following characters are. inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin, int length, CanonOutput* output) { - // UTF-16 input. Readchar16 will handle invalid characters for us and give + // UTF-16 input. ReadUTFChar will handle invalid characters for us and give // us the kUnicodeReplacementCharacter, so we don't have to do special // checking after failure, just pass through the failure to the caller. unsigned char_value;
diff --git a/url_canon_internal_file.h b/url_canon_internal_file.h index 6903098..26a3eae 100644 --- a/url_canon_internal_file.h +++ b/url_canon_internal_file.h
@@ -113,15 +113,15 @@ new_parsed->path.begin = output->length(); output->push_back('/'); - // Copies and normalizes the "c:" at the beginning, if present. + // Copy and normalize the "c:" at the beginning, if present. int after_drive = FileDoDriveSpec(source.path, parsed.path.begin, parsed.path.end(), output); - // Copies the rest of the path + // Copy the rest of the path. FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output); new_parsed->path.len = output->length() - new_parsed->path.begin; - // Things following the path we can use the standard canonicalizers for. + // For things following the path, we can use the standard canonicalizers. success &= URLCanonInternal<CHAR, UCHAR>::DoQuery( source.query, parsed.query, output, &new_parsed->query); success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
diff --git a/url_canon_ip.cc b/url_canon_ip.cc index 45f95de..87c30c7 100644 --- a/url_canon_ip.cc +++ b/url_canon_ip.cc
@@ -4,9 +4,10 @@ #include "url/url_canon_ip.h" +#include <stdint.h> #include <stdlib.h> +#include <limits> -#include "base/basictypes.h" #include "base/logging.h" #include "url/url_canon_internal.h" @@ -92,7 +93,7 @@ template<typename CHAR> CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec, const Component& component, - uint32* number) { + uint32_t* number) { // Figure out the base SharedCharTypes base; int base_prefix_len = 0; // Size of the prefix for this base. @@ -118,7 +119,7 @@ base_prefix_len++; // Put the component, minus any base prefix, into a NULL-terminated buffer so - // we can call the standard library. Because leading zeros have already been + // we can call the standard library. Because leading zeros have already been // discarded, filling the entire buffer is guaranteed to trigger the 32-bit // overflow check. const int kMaxComponentLen = 16; @@ -133,7 +134,7 @@ if (!IsCharOfType(input, base)) return CanonHostInfo::NEUTRAL; - // Fill the buffer, if there's space remaining. This check allows us to + // Fill the buffer, if there's space remaining. This check allows us to // verify that all characters are numeric, even those that don't fit. if (dest_i < kMaxComponentLen) buf[dest_i++] = input; @@ -143,14 +144,14 @@ // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal // number can overflow a 64-bit number in <= 16 characters). - uint64 num = _strtoui64(buf, NULL, BaseForType(base)); + uint64_t num = _strtoui64(buf, NULL, BaseForType(base)); // Check for 32-bit overflow. - if (num > kuint32max) + if (num > std::numeric_limits<uint32_t>::max()) return CanonHostInfo::BROKEN; - // No overflow. Success! - *number = static_cast<uint32>(num); + // No overflow. Success! + *number = static_cast<uint32_t>(num); return CanonHostInfo::IPV4; } @@ -167,10 +168,10 @@ // Convert existing components to digits. Values up to // |existing_components| will be valid. 
- uint32 component_values[4]; + uint32_t component_values[4]; int existing_components = 0; - // Set to true if one or more components are BROKEN. BROKEN is only + // Set to true if one or more components are BROKEN. BROKEN is only // returned if all components are IPV4 or BROKEN, so, for example, // 12345678912345.de returns NEUTRAL rather than broken. bool broken = false; @@ -198,7 +199,7 @@ // First, process all components but the last, while making sure each fits // within an 8-bit field. for (int i = 0; i < existing_components - 1; i++) { - if (component_values[i] > kuint8max) + if (component_values[i] > std::numeric_limits<uint8_t>::max()) return CanonHostInfo::BROKEN; address[i] = static_cast<unsigned char>(component_values[i]); } @@ -209,7 +210,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif - uint32 last_value = component_values[existing_components - 1]; + uint32_t last_value = component_values[existing_components - 1]; #if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4) #pragma GCC diagnostic pop #endif @@ -440,11 +441,12 @@ return true; } -// Converts a hex comonent into a number. This cannot fail since the caller has +// Converts a hex component into a number. This cannot fail since the caller has // already verified that each character in the string was a hex digit, and // that there were no more than 4 characters. -template<typename CHAR> -uint16 IPv6HexComponentToNumber(const CHAR* spec, const Component& component) { +template <typename CHAR> +uint16_t IPv6HexComponentToNumber(const CHAR* spec, + const Component& component) { DCHECK(component.len <= 4); // Copy the hex string into a C-string. @@ -455,7 +457,7 @@ // Convert it to a number (overflow is not possible, since with 4 hex // characters we can at most have a 16 bit number). 
- return static_cast<uint16>(_strtoui64(buf, NULL, 16)); + return static_cast<uint16_t>(_strtoui64(buf, NULL, 16)); } // Converts an IPv6 address to a 128-bit number (network byte order), returning @@ -497,7 +499,7 @@ // Append the hex component's value. if (i != ipv6_parsed.num_hex_components) { // Get the 16-bit value for this hex component. - uint16 number = IPv6HexComponentToNumber<CHAR>( + uint16_t number = IPv6HexComponentToNumber<CHAR>( spec, ipv6_parsed.hex_components[i]); // Append to |address|, in network byte order. address[cur_index_in_address++] = (number & 0xFF00) >> 8; @@ -576,7 +578,7 @@ } } - // No invalid characters. Could still be IPv4 or a hostname. + // No invalid characters. Could still be IPv4 or a hostname. host_info->family = CanonHostInfo::NEUTRAL; return false; }
diff --git a/url_canon_ip.h b/url_canon_ip.h index 19ecfdb..937bd46 100644 --- a/url_canon_ip.h +++ b/url_canon_ip.h
@@ -6,9 +6,9 @@ #define URL_URL_CANON_IP_H_ #include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_canon.h" #include "url/url_export.h" -#include "url/url_parse.h" namespace url { @@ -30,14 +30,14 @@ // Not all components may exist. If there are only 3 components, for example, // the last one will have a length of -1 or 0 to indicate it does not exist. // -// Note that many platform's inet_addr will ignore everything after a space -// in certain curcumstances if the stuff before the space looks like an IP +// Note that many platforms' inet_addr will ignore everything after a space +// in certain circumstances if the stuff before the space looks like an IP // address. IE6 is included in this. We do NOT handle this case. In many cases, // the browser's canonicalization will get run before this which converts -// spaces to %20 (in the case of IE7) or rejects them (in the case of -// Mozilla), so this code path never gets hit. Our host canonicalization will -// notice these spaces and escape them, which will make IP address finding -// fail. This seems like better behavior than stripping after a space. +// spaces to %20 (in the case of IE7) or rejects them (in the case of Mozilla), +// so this code path never gets hit. Our host canonicalization will notice +// these spaces and escape them, which will make IP address finding fail. This +// seems like better behavior than stripping after a space. URL_EXPORT bool FindIPv4Components(const char* spec, const Component& host, Component components[4]);
diff --git a/url_canon_mailtourl.cc b/url_canon_mailtourl.cc index 7c48b95..fb6bc9a 100644 --- a/url_canon_mailtourl.cc +++ b/url_canon_mailtourl.cc
@@ -55,7 +55,7 @@ new_parsed->path.reset(); } - // Query -- always use the default utf8 charset converter. + // Query -- always use the default UTF8 charset converter. CanonicalizeQuery(source.query, parsed.query, NULL, output, &new_parsed->query);
diff --git a/url_canon_path.cc b/url_canon_path.cc index ceff689..ee1cd96 100644 --- a/url_canon_path.cc +++ b/url_canon_path.cc
@@ -173,7 +173,7 @@ // copied to the output. // // We do not collapse multiple slashes in a row to a single slash. It seems -// no web browsers do this, and we don't want incompababilities, even though +// no web browsers do this, and we don't want incompatibilities, even though // it would be correct for most systems. template<typename CHAR, typename UCHAR> bool DoPartialPath(const CHAR* spec, @@ -200,7 +200,7 @@ // Needs special handling of some sort. int dotlen; if ((dotlen = IsDot(spec, i, end)) > 0) { - // See if this dot was preceeded by a slash in the output. We + // See if this dot was preceded by a slash in the output. We // assume that when canonicalizing paths, they will always // start with a slash and not a dot, so we don't have to // bounds check the output. @@ -230,7 +230,7 @@ break; } } else { - // This dot is not preceeded by a slash, it is just part of some + // This dot is not preceded by a slash, it is just part of some // file name. output->push_back('.'); i += dotlen - 1;
diff --git a/url_canon_pathurl.cc b/url_canon_pathurl.cc index 0d23ccb..494fbda 100644 --- a/url_canon_pathurl.cc +++ b/url_canon_pathurl.cc
@@ -14,7 +14,7 @@ namespace { // Canonicalize the given |component| from |source| into |output| and -// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput| +// |new_component|. If |separator| is non-zero, it is pre-pended to |output| // prior to the canonicalized component; i.e. for the '?' or '#' characters. template<typename CHAR, typename UCHAR> bool DoCanonicalizePathComponent(const CHAR* source,
diff --git a/url_canon_query.cc b/url_canon_query.cc index 5494ddf..bf59d10 100644 --- a/url_canon_query.cc +++ b/url_canon_query.cc
@@ -80,7 +80,7 @@ } // Runs the converter with the given UTF-16 input. We don't have to do -// anything, but this overriddden function allows us to use the same code +// anything, but this overridden function allows us to use the same code // for both UTF-8 and UTF-16 input. void RunConverter(const base::char16* spec, const Component& query,
diff --git a/url_canon_relative.cc b/url_canon_relative.cc index 06ca99c..c2e94e4 100644 --- a/url_canon_relative.cc +++ b/url_canon_relative.cc
@@ -17,14 +17,14 @@ namespace { // Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug -// 379034), whereas IE is case-insensetive. +// 379034), whereas IE is case-insensitive. // // We choose to be more permissive like IE. We don't need to worry about // unescaping or anything here: neither IE or Firefox allow this. We also // don't have to worry about invalid scheme characters since we are comparing // against the canonical scheme of the base. // -// The base URL should always be canonical, therefore is ASCII. +// The base URL should always be canonical, therefore it should be ASCII. template<typename CHAR> bool AreSchemesEqual(const char* base, const Component& base_scheme, @@ -82,7 +82,7 @@ #ifdef WIN32 // We special case paths like "C:\foo" so they can link directly to the - // file on Windows (IE compatability). The security domain stuff should + // file on Windows (IE compatibility). The security domain stuff should // prevent a link like this from actually being followed if its on a // web page. // @@ -91,22 +91,22 @@ // is a file and the answer will still be correct. // // We require strict backslashes when detecting UNC since two forward - // shashes should be treated a a relative URL with a hostname. + // slashes should be treated as a relative URL with a hostname. if (DoesBeginWindowsDriveSpec(url, begin, url_len) || DoesBeginUNCPath(url, begin, url_len, true)) return true; #endif // WIN32 // See if we've got a scheme, if not, we know this is a relative URL. - // BUT: Just because we have a scheme, doesn't make it absolute. + // BUT, just because we have a scheme, doesn't make it absolute. // "http:foo.html" is a relative URL with path "foo.html". If the scheme is - // empty, we treat it as relative (":foo") like IE does. + // empty, we treat it as relative (":foo"), like IE does. 
Component scheme; const bool scheme_is_empty = !ExtractScheme(url, url_len, &scheme) || scheme.len == 0; if (scheme_is_empty) { if (url[begin] == '#') { - // |url| is a bare fragement (e.g. "#foo"). This can be resolved against + // |url| is a bare fragment (e.g. "#foo"). This can be resolved against // any base. Fall-through. } else if (!is_base_hierarchical) { // Don't allow relative URLs if the base scheme doesn't support it. @@ -145,7 +145,7 @@ int colon_offset = scheme.end(); // If it's a filesystem URL, the only valid way to make it relative is not to - // supply a scheme. There's no equivalent to e.g. http:index.html. + // supply a scheme. There's no equivalent to e.g. http:index.html. if (CompareSchemeComponent(url, scheme, kFileSystemScheme)) return true; @@ -394,7 +394,7 @@ query_converter, output, out_parsed); } -// Resolves a relative URL that happens to be an absolute file path. Examples +// Resolves a relative URL that happens to be an absolute file path. Examples // include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo". template<typename CHAR> bool DoResolveAbsoluteFile(const CHAR* relative_url, @@ -460,7 +460,7 @@ // how strict the UNC finder is). // // We also allow Windows absolute drive specs on any scheme (for example - // "c:\foo") like IE does. There must be no preceeding slashes in this + // "c:\foo") like IE does. There must be no preceding slashes in this // case (we reject anything like "/c:/foo") because that should be treated // as a path. For file URLs, we allow any number of slashes since that would // be setting the path.
diff --git a/url_canon_stdurl.cc b/url_canon_stdurl.cc index 7a61de8..7d1758b 100644 --- a/url_canon_stdurl.cc +++ b/url_canon_stdurl.cc
@@ -169,7 +169,7 @@ } // For 16-bit replacements, we turn all the replacements into UTF-8 so the -// regular codepath can be used. +// regular code path can be used. bool ReplaceStandardURL(const char* base, const Parsed& base_parsed, const Replacements<base::char16>& replacements,
diff --git a/url_canon_unittest.cc b/url_canon_unittest.cc index 3ab8710..0ccd6c9 100644 --- a/url_canon_unittest.cc +++ b/url_canon_unittest.cc
@@ -6,10 +6,10 @@ #include "base/macros.h" #include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_canon.h" #include "url/url_canon_internal.h" #include "url/url_canon_stdstring.h" -#include "url/url_parse.h" #include "url/url_test_utils.h" namespace url { @@ -38,7 +38,7 @@ bool expected_success; }; -// Test cases for CanonicalizeIPAddress(). The inputs are identical to +// Test cases for CanonicalizeIPAddress(). The inputs are identical to // DualComponentCase, but the output has extra CanonHostInfo fields. struct IPAddressCase { const char* input8; @@ -127,7 +127,7 @@ #if defined(GTEST_HAS_DEATH_TEST) // TODO(mattm): Can't run this in debug mode for now, since the DCHECK will -// cause the Chromium stacktrace dialog to appear and hang the test. +// cause the Chromium stack trace dialog to appear and hang the test. // See http://crbug.com/49580. #if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON) #define MAYBE_DoAppendUTF8Invalid DoAppendUTF8Invalid @@ -157,10 +157,10 @@ } utf_cases[] = { // Valid canonical input should get passed through & escaped. {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"}, - // Test a characer that takes > 16 bits (U+10300 = old italic letter A) + // Test a character that takes > 16 bits (U+10300 = old italic letter A) {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"}, - // Non-shortest-form UTF-8 are invalid. The bad char should be replaced - // with the invalid character (EF BF DB in UTF-8). + // Non-shortest-form UTF-8 characters are invalid. The bad character + // should be replaced with the invalid character (EF BF BD in UTF-8). {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"}, // Invalid UTF-8 sequences should be marked as invalid (the first // sequence is truncated). 
@@ -259,7 +259,7 @@ EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin); EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); - // Now try the wide version + // Now try the wide version. out_str.clear(); StdStringCanonOutput output2(&out_str); @@ -275,7 +275,7 @@ EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len); } - // Test the case where the scheme is declared nonexistant, it should be + // Test the case where the scheme is declared nonexistent, it should be // converted into an empty scheme. Component out_comp; out_str.clear(); @@ -638,7 +638,7 @@ {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"}, {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"}, {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"}, - // Old trunctations tests. They're all "BROKEN" now. + // Old truncation tests. They're all "BROKEN" now. {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""}, {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""}, {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""}, @@ -754,16 +754,17 @@ {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"}, - // Can only have one "::" contraction in an IPv6 string literal. + // Can only have one "::" contraction in an IPv6 string literal. {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, - // No more than 2 consecutive ':'s. + // No more than 2 consecutive ':'s. {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, - // Non-IP addresses due to invalid characters. + // Non-IP addresses due to invalid characters. 
{"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, - // If there are not enough components, the last one should fill them out. + // If there are not enough components, the last one should fill them out. // ... omitted at this time ... - // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses. + // Too many components means not an IP address. Similarly, with too few + // if using IPv4 compat or mapped addresses. {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""}, @@ -887,7 +888,7 @@ {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true}, {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true }, - // IE7 compatability: old versions allowed backslashes in usernames, but + // IE7 compatibility: old versions allowed backslashes in usernames, but // IE7 does not. We disallow it as well. {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true}, }; @@ -943,7 +944,7 @@ // buffer. The parser unit tests will test scanning the number correctly. // // Note that the CanonicalizePort will always prepend a colon to the output - // to separate it from the colon that it assumes preceeds it. + // to separate it from the colon that it assumes precedes it. struct PortCase { const char* input; int default_port; @@ -1329,7 +1330,7 @@ {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"}, // Replace nothing {"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"}, - // Replace scheme with filesystem. 
The result is garbage, but you asked + // Replace scheme with filesystem. The result is garbage, but you asked // for it. {"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"}, }; @@ -1594,7 +1595,7 @@ {"file:", "file:///", true, Component(), Component(7, 1)}, {"file:UNChost/path", "file://unchost/path", true, Component(7, 7), Component(14, 5)}, // CanonicalizeFileURL supports absolute Windows style paths for IE - // compatability. Note that the caller must decide that this is a file + // compatibility. Note that the caller must decide that this is a file // URL itself so it can call the file canonicalizer. This is usually // done automatically as part of relative URL resolving. {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)}, @@ -1605,7 +1606,7 @@ {"\\\\server\\file", "file://server/file", true, Component(7, 6), Component(13, 5)}, {"/\\server/file", "file://server/file", true, Component(7, 6), Component(13, 5)}, // We should preserve the number of slashes after the colon for IE - // compatability, except when there is none, in which case we should + // compatibility, except when there is none, in which case we should // add one. {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), Component(7, 16)}, {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, Component(), Component(7, 19)}, @@ -1807,7 +1808,7 @@ TEST(URLCanonTest, _itoa_s) { // We fill the buffer with 0xff to ensure that it's getting properly - // null-terminated. We also allocate one byte more than what we tell + // null-terminated. We also allocate one byte more than what we tell // _itoa_s about, and ensure that the extra byte is untouched. char buf[6]; memset(buf, 0xff, sizeof(buf)); @@ -1846,7 +1847,7 @@ TEST(URLCanonTest, _itow_s) { // We fill the buffer with 0xff to ensure that it's getting properly - // null-terminated. 
We also allocate one byte more than what we tell + // null-terminated. We also allocate one byte more than what we tell // _itoa_s about, and ensure that the extra byte is untouched. base::char16 buf[6]; const char fill_mem = 0xff; @@ -2022,7 +2023,7 @@ // which is what is required. {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"}, // Filesystem URL tests; filesystem URLs are only valid and relative if - // they have no scheme, e.g. "./index.html". There's no valid equivalent + // they have no scheme, e.g. "./index.html". There's no valid equivalent // to http:index.html. {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL}, {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL}, @@ -2090,10 +2091,10 @@ } } -// It used to be when we did a replacement with a long buffer of UTF-16 -// characters, we would get invalid data in the URL. This is because the buffer -// it used to hold the UTF-8 data was resized, while some pointers were still -// kept to the old buffer that was removed. +// It used to be the case that when we did a replacement with a long buffer of +// UTF-16 characters, we would get invalid data in the URL. This is because the +// buffer that it used to hold the UTF-8 data was resized, while some pointers +// were still kept to the old buffer that was removed. TEST(URLCanonTest, ReplacementOverflow) { const char src[] = "file:///C:/foo/bar"; int src_len = static_cast<int>(strlen(src)); @@ -2101,7 +2102,7 @@ ParseFileURL(src, src_len, &parsed); // Override two components, the path with something short, and the query with - // sonething long enough to trigger the bug. + // something long enough to trigger the bug. Replacements<base::char16> repl; base::string16 new_query; for (int i = 0; i < 4800; i++)
diff --git a/url_parse.h b/url_parse.h deleted file mode 100644 index 3b9c546..0000000 --- a/url_parse.h +++ /dev/null
@@ -1,11 +0,0 @@ -// Copyright 2013 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef URL_URL_PARSE_H_ -#define URL_URL_PARSE_H_ - -// TODO(tfarina): Remove this file when the callers are updated. -#include "url/third_party/mozilla/url_parse.h" - -#endif // URL_URL_PARSE_H_
diff --git a/url_parse_file.cc b/url_parse_file.cc index c08ddc6..fcbb12d 100644 --- a/url_parse_file.cc +++ b/url_parse_file.cc
@@ -3,8 +3,8 @@ // found in the LICENSE file. #include "base/logging.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_file.h" -#include "url/url_parse.h" #include "url/url_parse_internal.h" // Interesting IE file:isms...
diff --git a/url_parse_internal.h b/url_parse_internal.h index 4070b7e..7630878 100644 --- a/url_parse_internal.h +++ b/url_parse_internal.h
@@ -7,11 +7,11 @@ // Contains common inline helper functions used by the URL parsing routines. -#include "url/url_parse.h" +#include "url/third_party/mozilla/url_parse.h" namespace url { -// We treat slashes and backslashes the same for IE compatability. +// We treat slashes and backslashes the same for IE compatibility. inline bool IsURLSlash(base::char16 ch) { return ch == '/' || ch == '\\'; }
diff --git a/url_parse_unittest.cc b/url_parse_unittest.cc index 71b2438..6bf536e 100644 --- a/url_parse_unittest.cc +++ b/url_parse_unittest.cc
@@ -2,11 +2,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#include "url/url_parse.h" +#include "url/third_party/mozilla/url_parse.h" #include "base/macros.h" #include "testing/gtest/include/gtest/gtest.h" -#include "url/url_parse.h" +#include "url/third_party/mozilla/url_parse.h" // Interesting IE file:isms... // @@ -90,13 +90,13 @@ bool ComponentMatches(const char* input, const char* reference, const Component& component) { - // If the component is nonexistant (length == -1), it should begin at 0. + // If the component is nonexistent (length == -1), it should begin at 0. EXPECT_TRUE(component.len >= 0 || component.len == -1); // Begin should be valid. EXPECT_LE(0, component.begin); - // A NULL reference means the component should be nonexistant. + // A NULL reference means the component should be nonexistent. if (!reference) return component.len == -1; if (component.len < 0) @@ -345,7 +345,7 @@ TEST(URLParser, PathURL) { // Declared outside for loop to try to catch cases in init() where we forget - // to reset something that is reset by the construtor. + // to reset something that is reset by the constructor. Parsed parsed; for (size_t i = 0; i < arraysize(path_cases); i++) { const char* url = path_cases[i].input; @@ -356,7 +356,7 @@ EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent())) << i; - // The remaining components are never used for path urls. + // The remaining components are never used for path URLs. ExpectInvalidComponent(parsed.username); ExpectInvalidComponent(parsed.password); ExpectInvalidComponent(parsed.host); @@ -537,7 +537,7 @@ Component key, value; if (!ExtractQueryKeyValue(url, &query, &key, &value)) { if (parameter >= i && !expected_key) - return true; // Expected nonexistant key, got one. + return true; // Expected nonexistent key, got one. return false; // Not enough keys. 
} @@ -613,7 +613,7 @@ TEST(URLParser, MailtoUrl) { // Declared outside for loop to try to catch cases in init() where we forget - // to reset something that is reset by the construtor. + // to reset something that is reset by the constructor. Parsed parsed; for (size_t i = 0; i < arraysize(mailto_cases); ++i) { const char* url = mailto_cases[i].input; @@ -625,7 +625,7 @@ EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query)); EXPECT_EQ(PORT_UNSPECIFIED, port); - // The remaining components are never used for mailto urls. + // The remaining components are never used for mailto URLs. ExpectInvalidComponent(parsed.username); ExpectInvalidComponent(parsed.password); ExpectInvalidComponent(parsed.port); @@ -645,7 +645,7 @@ TEST(URLParser, FileSystemURL) { // Declared outside for loop to try to catch cases in init() where we forget - // to reset something that is reset by the construtor. + // to reset something that is reset by the constructor. Parsed parsed; for (size_t i = 0; i < arraysize(filesystem_cases); i++) { const FileSystemURLParseCase* parsecase = &filesystem_cases[i]; @@ -667,7 +667,7 @@ int port = ParsePort(url, parsed.inner_parsed()->port); EXPECT_EQ(parsecase->inner_port, port); - // The remaining components are never used for filesystem urls. + // The remaining components are never used for filesystem URLs. ExpectInvalidComponent(parsed.inner_parsed()->query); ExpectInvalidComponent(parsed.inner_parsed()->ref); } @@ -676,7 +676,7 @@ EXPECT_TRUE(ComponentMatches(url, parsecase->query, parsed.query)); EXPECT_TRUE(ComponentMatches(url, parsecase->ref, parsed.ref)); - // The remaining components are never used for filesystem urls. + // The remaining components are never used for filesystem URLs. ExpectInvalidComponent(parsed.username); ExpectInvalidComponent(parsed.password); ExpectInvalidComponent(parsed.host);
diff --git a/url_test_utils.h b/url_test_utils.h index 6400bac..156c428 100644 --- a/url_test_utils.h +++ b/url_test_utils.h
@@ -19,7 +19,7 @@ namespace test_utils { // Converts a UTF-16 string from native wchar_t format to char16, by -// truncating the high 32 bits. This is not meant to handle true UTF-32 +// truncating the high 32 bits. This is not meant to handle true UTF-32 // encoded strings. inline base::string16 WStringToUTF16(const wchar_t* src) { base::string16 str; @@ -30,7 +30,7 @@ return str; } -// Converts a string from UTF-8 to UTF-16 +// Converts a string from UTF-8 to UTF-16. inline base::string16 ConvertUTF8ToUTF16(const std::string& src) { int length = static_cast<int>(src.length()); EXPECT_LT(length, 1024); @@ -39,7 +39,7 @@ return base::string16(output.data(), output.length()); } -// Converts a string from UTF-16 to UTF-8 +// Converts a string from UTF-16 to UTF-8. inline std::string ConvertUTF16ToUTF8(const base::string16& src) { std::string str; StdStringCanonOutput output(&str);
diff --git a/url_util.cc b/url_util.cc index 008a5e4..279ab7e 100644 --- a/url_util.cc +++ b/url_util.cc
@@ -9,6 +9,7 @@ #include "base/debug/leak_annotations.h" #include "base/logging.h" +#include "base/strings/string_util.h" #include "url/url_canon_internal.h" #include "url/url_file.h" #include "url/url_util_internal.h" @@ -17,28 +18,11 @@ namespace { -// ASCII-specific tolower. The standard library's tolower is locale sensitive, -// so we don't want to use it here. -template<class Char> -inline Char ToLowerASCII(Char c) { - return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; -} - -// Backend for LowerCaseEqualsASCII. -template<typename Iter> -inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) { - for (Iter it = a_begin; it != a_end; ++it, ++b) { - if (!*b || ToLowerASCII(*it) != *b) - return false; - } - return *b == 0; -} - const int kNumStandardURLSchemes = 8; const char* kStandardURLSchemes[kNumStandardURLSchemes] = { kHttpScheme, kHttpsScheme, - kFileScheme, // Yes, file urls can have a hostname! + kFileScheme, // Yes, file URLs can have a hostname! kFtpScheme, kGopherScheme, kWsScheme, // WebSocket. @@ -54,6 +38,17 @@ // See the LockStandardSchemes declaration in the header. bool standard_schemes_locked = false; +// This template converts a given character type to the corresponding +// StringPiece type. +template<typename CHAR> struct CharToStringPiece { +}; +template<> struct CharToStringPiece<char> { + typedef base::StringPiece Piece; +}; +template<> struct CharToStringPiece<base::char16> { + typedef base::StringPiece16 Piece; +}; + // Ensures that the standard_schemes list is initialized, does nothing if it // already has values. void InitStandardSchemes() { @@ -72,9 +67,10 @@ const char* compare_to) { if (!component.is_nonempty()) return compare_to[0] == 0; // When component is empty, match empty scheme. 
- return LowerCaseEqualsASCII(&spec[component.begin], - &spec[component.end()], - compare_to); + return base::LowerCaseEqualsASCII( + typename CharToStringPiece<CHAR>::Piece( + &spec[component.begin], component.len), + compare_to); } // Returns true if the given scheme identified by |scheme| within |spec| is one @@ -86,8 +82,10 @@ InitStandardSchemes(); for (size_t i = 0; i < standard_schemes->size(); i++) { - if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()], - standard_schemes->at(i))) + if (base::LowerCaseEqualsASCII( + typename CharToStringPiece<CHAR>::Piece( + &spec[scheme.begin], scheme.len), + standard_schemes->at(i))) return true; } return false; @@ -134,7 +132,7 @@ Parsed parsed_input; #ifdef WIN32 // For Windows, we allow things that look like absolute Windows paths to be - // fixed up magically to file URLs. This is done for IE compatability. For + // fixed up magically to file URLs. This is done for IE compatibility. For // example, this will change "c:/foo" into a file URL rather than treating // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt"). // There is similar logic in url_canon_relative.cc for @@ -177,13 +175,14 @@ charset_converter, output, output_parsed); } else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) { - // Mailto are treated like a standard url with only a scheme, path, query + // Mailto URLs are treated like standard URLs, with only a scheme, path, + // and query. ParseMailtoURL(spec, spec_len, &parsed_input); success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output, output_parsed); } else { - // "Weird" URLs like data: and javascript: + // "Weird" URLs like data: and javascript:. 
ParsePathURL(spec, spec_len, trim_path_end, &parsed_input); success = CanonicalizePathURL(spec, spec_len, parsed_input, output, output_parsed); @@ -273,7 +272,7 @@ CanonOutput* output, Parsed* out_parsed) { // If the scheme is overridden, just do a simple string substitution and - // reparse the whole thing. There are lots of edge cases that we really don't + // re-parse the whole thing. There are lots of edge cases that we really don't // want to deal with. Like what happens if I replace "http://e:8080/foo" // with a file. Does it become "file:///E:/8080/foo" where the port number // becomes part of the path? Parsing that string as a file URL says "yes" @@ -320,7 +319,7 @@ // getting replaced here. If ReplaceComponents didn't re-check everything, // we wouldn't know if something *not* getting replaced is a problem. // If the scheme-specific replacers are made more intelligent so they don't - // re-check everything, we should instead recanonicalize the whole thing + // re-check everything, we should instead re-canonicalize the whole thing // after this call to check validity (this assumes replacing the scheme is // much much less common than other types of replacements, like clearing the // ref). @@ -373,7 +372,7 @@ // // This normally means you're trying to set up a new standard scheme too late // in your application's init process. Locate where your app does this - // initialization and calls LockStandardScheme, and add your new standard + // initialization and calls LockStandardSchemes, and add your new standard // scheme there. DCHECK(!standard_schemes_locked) << "Trying to add a standard scheme after the list has been locked."; @@ -382,7 +381,7 @@ if (scheme_len == 0) return; - // Dulicate the scheme into a new buffer and add it to the list of standard + // Duplicate the scheme into a new buffer and add it to the list of standard // schemes. This pointer will be leaked on shutdown. 
char* dup_scheme = new char[scheme_len + 1]; ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme); @@ -486,31 +485,6 @@ charset_converter, output, out_parsed); } -// Front-ends for LowerCaseEqualsASCII. -bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b) { - return DoLowerCaseEqualsASCII(a_begin, a_end, b); -} - -bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b_begin, - const char* b_end) { - while (a_begin != a_end && b_begin != b_end && - ToLowerASCII(*a_begin) == *b_begin) { - a_begin++; - b_begin++; - } - return a_begin == a_end && b_begin == b_end; -} - -bool LowerCaseEqualsASCII(const base::char16* a_begin, - const base::char16* a_end, - const char* b) { - return DoLowerCaseEqualsASCII(a_begin, a_end, b); -} - void DecodeURLEscapeSequences(const char* input, int length, CanonOutputW* output) {
diff --git a/url_util.h b/url_util.h index 458d1e8..5817044 100644 --- a/url_util.h +++ b/url_util.h
@@ -8,10 +8,10 @@ #include <string> #include "base/strings/string16.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_canon.h" #include "url/url_constants.h" #include "url/url_export.h" -#include "url/url_parse.h" namespace url { @@ -20,14 +20,13 @@ // Initialization is NOT required, it will be implicitly initialized when first // used. However, this implicit initialization is NOT threadsafe. If you are // using this library in a threaded environment and don't have a consistent -// "first call" (an example might be calling "AddStandardScheme" with your -// special application-specific schemes) then you will want to call initialize -// before spawning any threads. +// "first call" (an example might be calling AddStandardScheme with your special +// application-specific schemes) then you will want to call initialize before +// spawning any threads. // -// It is OK to call this function more than once, subsequent calls will simply -// "noop", unless Shutdown() was called in the mean time. This will also be a -// "noop" if other calls to the library have forced an initialization -// beforehand. +// It is OK to call this function more than once, subsequent calls will be +// no-ops, unless Shutdown was called in the mean time. This will also be a +// no-op if other calls to the library have forced an initialization beforehand. URL_EXPORT void Initialize(); // Cleanup is not required, except some strings may leak. For most user @@ -38,10 +37,13 @@ // Schemes -------------------------------------------------------------------- -// Adds an application-defined scheme to the internal list of "standard" URL -// schemes. This function is not threadsafe and can not be called concurrently -// with any other url_util function. It will assert if the list of standard -// schemes has been locked (see LockStandardSchemes). +// Adds an application-defined scheme to the internal list of "standard-format" +// URL schemes. 
A standard-format scheme adheres to what RFC 3986 calls "generic +// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). +// +// This function is not threadsafe and can not be called concurrently with any +// other url_util function. It will assert if the list of standard schemes has +// been locked (see LockStandardSchemes). URL_EXPORT void AddStandardScheme(const char* new_scheme); // Sets a flag to prevent future calls to AddStandardScheme from succeeding. @@ -85,19 +87,11 @@ compare, found_scheme); } -// Returns true if the given string represents a standard URL. This means that -// either the scheme is in the list of known standard schemes. +// Returns true if the given string represents a URL whose scheme is in the list +// of known standard-format schemes (see AddStandardScheme). URL_EXPORT bool IsStandard(const char* spec, const Component& scheme); URL_EXPORT bool IsStandard(const base::char16* spec, const Component& scheme); -// TODO(brettw) remove this. This is a temporary compatibility hack to avoid -// breaking the WebKit build when this version is synced via Chrome. -inline bool IsStandard(const char* spec, - int spec_len, - const Component& scheme) { - return IsStandard(spec, scheme); -} - // URL library wrappers ------------------------------------------------------- // Parses the given spec according to the extracted scheme type. Normal users @@ -150,7 +144,7 @@ CanonOutput* output, Parsed* output_parsed); -// Replaces components in the given VALID input url. The new canonical URL info +// Replaces components in the given VALID input URL. The new canonical URL info // is written to output and out_parsed. // // Returns true if the resulting URL is valid. @@ -172,29 +166,12 @@ // String helper functions ---------------------------------------------------- -// Compare the lower-case form of the given string against the given ASCII -// string. 
This is useful for doing checking if an input string matches some -// token, and it is optimized to avoid intermediate string copies. -// -// The versions of this function that don't take a b_end assume that the b -// string is NULL terminated. -URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b); -URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, - const char* a_end, - const char* b_begin, - const char* b_end); -URL_EXPORT bool LowerCaseEqualsASCII(const base::char16* a_begin, - const base::char16* a_end, - const char* b); - // Unescapes the given string using URL escaping rules. URL_EXPORT void DecodeURLEscapeSequences(const char* input, int length, CanonOutputW* output); -// Escapes the given string as defined by the JS method encodeURIComponent. See +// Escapes the given string as defined by the JS method encodeURIComponent. See // https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent URL_EXPORT void EncodeURIComponent(const char* input, int length,
diff --git a/url_util_internal.h b/url_util_internal.h index c72598f..756c736 100644 --- a/url_util_internal.h +++ b/url_util_internal.h
@@ -8,7 +8,7 @@ #include <string> #include "base/strings/string16.h" -#include "url/url_parse.h" +#include "url/third_party/mozilla/url_parse.h" namespace url {
diff --git a/url_util_unittest.cc b/url_util_unittest.cc index 73ff93b..9297765 100644 --- a/url_util_unittest.cc +++ b/url_util_unittest.cc
@@ -4,9 +4,9 @@ #include "base/macros.h" #include "testing/gtest/include/gtest/gtest.h" +#include "url/third_party/mozilla/url_parse.h" #include "url/url_canon.h" #include "url/url_canon_stdstring.h" -#include "url/url_parse.h" #include "url/url_test_utils.h" #include "url/url_util.h" @@ -44,7 +44,7 @@ EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme)); EXPECT_TRUE(found_scheme == Component()); - // When there is a whitespace char in scheme, it should canonicalize the url + // When there is a whitespace char in scheme, it should canonicalize the URL // before comparison. const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)"; EXPECT_TRUE(FindAndCompareScheme(whtspc_str, @@ -305,8 +305,8 @@ } TEST(URLUtilTest, TestNoRefComponent) { - // The hash-mark must be ignored when mailto: scheme is - // parsed, even if the url has a base and relative part. + // The hash-mark must be ignored when mailto: scheme is parsed, + // even if the URL has a base and relative part. const char* base = "mailto://to/"; const char* rel = "any#body";