Update to Chromium //url at Chromium commit 79dc59ac7602413181079ecb463873e29a1d7d0a.
I think the most significant change is that url::Origin is now actually
an origin.
TBR=jamesr@chromium.org
Review URL: https://codereview.chromium.org/2029803003 .
diff --git a/BUILD.gn b/BUILD.gn
index 4f6b637..34dff8a 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -24,6 +24,8 @@
"gurl.h",
"origin.cc",
"origin.h",
+ "scheme_host_port.cc",
+ "scheme_host_port.h",
"third_party/mozilla/url_parse.cc",
"third_party/mozilla/url_parse.h",
"url_canon.h",
@@ -91,10 +93,20 @@
# TODO(dpranke): crbug.com/360936. Get this to build and run on Android.
if (!is_android) {
+ # TODO(GYP): Delete this after we've converted everything to GN.
+ # The _run targets exist only for compatibility w/ GYP.
+ group("url_unittests_run") {
+ testonly = true
+ deps = [
+ ":url_unittests",
+ ]
+ }
+
test("url_unittests") {
sources = [
"gurl_unittest.cc",
"origin_unittest.cc",
+ "scheme_host_port_unittest.cc",
"url_canon_icu_unittest.cc",
"url_canon_unittest.cc",
"url_parse_unittest.cc",
diff --git a/android/java/src/org/chromium/url/IDNStringUtil.java b/android/java/src/org/chromium/url/IDNStringUtil.java
index 32000fd..37d77dc 100644
--- a/android/java/src/org/chromium/url/IDNStringUtil.java
+++ b/android/java/src/org/chromium/url/IDNStringUtil.java
@@ -4,8 +4,8 @@
package org.chromium.url;
-import org.chromium.base.CalledByNative;
-import org.chromium.base.JNINamespace;
+import org.chromium.base.annotations.CalledByNative;
+import org.chromium.base.annotations.JNINamespace;
import java.net.IDN;
diff --git a/gurl.cc b/gurl.cc
index 46ca408..c22236f 100644
--- a/gurl.cc
+++ b/gurl.cc
@@ -14,6 +14,8 @@
#include "url/gurl.h"
#include "base/logging.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_util.h"
#include "url/url_canon_stdstring.h"
#include "url/url_util.h"
@@ -59,7 +61,7 @@
#endif // WIN32
-} // namespace
+} // namespace
GURL::GURL() : is_valid_(false) {
}
@@ -130,7 +132,7 @@
#ifndef NDEBUG
// For testing purposes, check that the parsed canonical URL is identical to
// what we would have produced. Skip checking for invalid URLs have no meaning
- // and we can't always canonicalize then reproducabely.
+ // and we can't always canonicalize then reproducibly.
if (is_valid_) {
url::Component scheme;
// We can't do this check on the inner_url of a filesystem URL, as
@@ -193,17 +195,8 @@
return spec_ > other.spec_;
}
-GURL GURL::Resolve(const std::string& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-GURL GURL::Resolve(const base::string16& relative) const {
- return ResolveWithCharsetConverter(relative, NULL);
-}
-
// Note: code duplicated below (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const std::string& relative,
- url::CharsetConverter* charset_converter) const {
+GURL GURL::Resolve(const std::string& relative) const {
// Not allowed for invalid URLs.
if (!is_valid_)
return GURL();
@@ -218,7 +211,7 @@
if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
parsed_, relative.data(),
static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
+ nullptr, &output, &result.parsed_)) {
// Error resolving, return an empty URL.
return GURL();
}
@@ -234,9 +227,7 @@
}
// Note: code duplicated above (it's inconvenient to use a template here).
-GURL GURL::ResolveWithCharsetConverter(
- const base::string16& relative,
- url::CharsetConverter* charset_converter) const {
+GURL GURL::Resolve(const base::string16& relative) const {
// Not allowed for invalid URLs.
if (!is_valid_)
return GURL();
@@ -251,7 +242,7 @@
if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
parsed_, relative.data(),
static_cast<int>(relative.length()),
- charset_converter, &output, &result.parsed_)) {
+ nullptr, &output, &result.parsed_)) {
// Error resolving, return an empty URL.
return GURL();
}
@@ -320,7 +311,7 @@
GURL GURL::GetOrigin() const {
// This doesn't make sense for invalid or nonstandard URLs, so return
- // the empty URL
+ // the empty URL.
if (!is_valid_ || !IsStandard())
return GURL();
@@ -382,9 +373,10 @@
bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
if (parsed_.scheme.len <= 0)
return lower_ascii_scheme == NULL;
- return url::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
- spec_.data() + parsed_.scheme.end(),
- lower_ascii_scheme);
+ return base::LowerCaseEqualsASCII(
+ base::StringPiece(spec_.data() + parsed_.scheme.begin,
+ parsed_.scheme.len),
+ lower_ascii_scheme);
}
bool GURL::SchemeIsHTTPOrHTTPS() const {
@@ -416,16 +408,17 @@
}
std::string GURL::PathForRequest() const {
- DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
+ DCHECK(parsed_.path.len > 0)
+ << "Canonical path for requests should be non-empty";
if (parsed_.ref.len >= 0) {
- // Clip off the reference when it exists. The reference starts after the #
- // sign, so we have to subtract one to also remove it.
+ // Clip off the reference when it exists. The reference starts after the
+ // #-sign, so we have to subtract one to also remove it.
return std::string(spec_, parsed_.path.begin,
parsed_.ref.begin - parsed_.path.begin - 1);
}
// Compute the actual path length, rather than depending on the spec's
- // terminator. If we're an inner_url, our spec continues on into our outer
- // url's path/query/ref.
+ // terminator. If we're an inner_url, our spec continues on into our outer
+ // URL's path/query/ref.
int path_len = parsed_.path.len;
if (parsed_.query.is_valid())
path_len = parsed_.query.end() - parsed_.path.begin;
@@ -490,48 +483,45 @@
#endif // WIN32
-bool GURL::DomainIs(const char* lower_ascii_domain,
- int domain_len) const {
- // Return false if this URL is not valid or domain is empty.
- if (!is_valid_ || !domain_len)
+bool GURL::DomainIs(base::StringPiece lower_ascii_domain) const {
+ if (!is_valid_ || lower_ascii_domain.empty())
return false;
// FileSystem URLs have empty parsed_.host, so check this first.
if (SchemeIsFileSystem() && inner_url_)
- return inner_url_->DomainIs(lower_ascii_domain, domain_len);
+ return inner_url_->DomainIs(lower_ascii_domain);
if (!parsed_.host.is_nonempty())
return false;
- // Check whether the host name is end with a dot. If yes, treat it
- // the same as no-dot unless the input comparison domain is end
- // with dot.
- const char* last_pos = spec_.data() + parsed_.host.end() - 1;
+ // If the host name ends with a dot but the input domain doesn't,
+ // then we ignore the dot in the host name.
+ const char* host_last_pos = spec_.data() + parsed_.host.end() - 1;
int host_len = parsed_.host.len;
- if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
- last_pos--;
+ int domain_len = lower_ascii_domain.length();
+ if ('.' == *host_last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
+ host_last_pos--;
host_len--;
}
- // Return false if host's length is less than domain's length.
if (host_len < domain_len)
return false;
- // Compare this url whether belong specific domain.
- const char* start_pos = spec_.data() + parsed_.host.begin +
- host_len - domain_len;
+ // |host_first_pos| is the start of the compared part of the host name, not
+ // the start of the whole host name.
+ const char* host_first_pos = spec_.data() + parsed_.host.begin +
+ host_len - domain_len;
- if (!url::LowerCaseEqualsASCII(start_pos,
- last_pos + 1,
- lower_ascii_domain,
- lower_ascii_domain + domain_len))
+ if (!base::LowerCaseEqualsASCII(
+ base::StringPiece(host_first_pos, domain_len), lower_ascii_domain))
return false;
- // Check whether host has right domain start with dot, make sure we got
- // right domain range. For example www.google.com has domain
- // "google.com" but www.iamnotgoogle.com does not.
+ // Make sure there aren't extra characters in host before the compared part;
+ // if the host name is longer than the input domain name, then the character
+ // immediately before the compared part should be a dot. For example,
+ // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
- '.' != *(start_pos - 1))
+ '.' != *(host_first_pos - 1))
return false;
return true;
diff --git a/gurl.h b/gurl.h
index 566fc5e..dccfec4 100644
--- a/gurl.h
+++ b/gurl.h
@@ -10,11 +10,12 @@
#include "base/memory/scoped_ptr.h"
#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
#include "url/url_constants.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
class URL_EXPORT GURL {
public:
@@ -91,7 +92,7 @@
// Returns the potentially invalid spec for a the URL. This spec MUST NOT be
// modified or sent over the network. It is designed to be displayed in error
- // messages to the user, as the apperance of the spec may explain the error.
+ // messages to the user, as the appearance of the spec may explain the error.
// If the spec is valid, the valid spec will be returned.
//
// The returned string is guaranteed to be valid UTF-8.
@@ -124,9 +125,8 @@
// pages.
//
// It may be impossible to resolve the URLs properly. If the input is not
- // "standard" (SchemeIsStandard() == false) and the input looks relative, we
- // can't resolve it. In these cases, the result will be an empty, invalid
- // GURL.
+ // "standard" (IsStandard() == false) and the input looks relative, we can't
+ // resolve it. In these cases, the result will be an empty, invalid GURL.
//
// The result may also be a nonempty, invalid URL if the input has some kind
// of encoding error. In these cases, we will try to construct a "good" URL
@@ -137,20 +137,6 @@
GURL Resolve(const std::string& relative) const;
GURL Resolve(const base::string16& relative) const;
- // Like Resolve() above but takes a character set encoder which will be used
- // for any query text specified in the input. The charset converter parameter
- // may be NULL, in which case it will be treated as UTF-8.
- //
- // TODO(brettw): These should be replaced with versions that take something
- // more friendly than a raw CharsetConverter (maybe like an ICU character set
- // name).
- GURL ResolveWithCharsetConverter(
- const std::string& relative,
- url::CharsetConverter* charset_converter) const;
- GURL ResolveWithCharsetConverter(
- const base::string16& relative,
- url::CharsetConverter* charset_converter) const;
-
// Creates a new GURL by replacing the current URL's components with the
// supplied versions. See the Replacements class in url_canon.h for more.
//
@@ -194,10 +180,11 @@
// returned.
GURL GetAsReferrer() const;
- // Returns true if the scheme for the current URL is a known "standard"
- // scheme. Standard schemes have an authority and a path section. This
- // includes file: and filesystem:, which some callers may want to filter out
- // explicitly by calling SchemeIsFile[System].
+ // Returns true if the scheme for the current URL is a known "standard-format"
+ // scheme. A standard-format scheme adheres to what RFC 3986 calls "generic
+ // URI syntax" (https://tools.ietf.org/html/rfc3986#section-3). This includes
+ // file: and filesystem:, which some callers may want to filter out explicitly
+ // by calling SchemeIsFile[System].
bool IsStandard() const;
// Returns true if the given parameter (should be lower-case ASCII to match
@@ -223,10 +210,32 @@
return SchemeIs(url::kFileSystemScheme);
}
- // If the scheme indicates a secure connection
+ // Returns true if the scheme indicates a secure connection.
+ //
+ // NOTE: This function is deprecated. You probably want
+ // |SchemeIsCryptographic| (if you just want to know if a scheme uses TLS for
+ // network transport) or Chromium's |IsOriginSecure| for a higher-level test
+ // about an origin's security. See those functions' documentation for more
+ // detail.
+ //
+ // TODO(palmer): Audit callers and change them to |SchemeIsCryptographic| or
+ // |IsOriginSecure|, as appropriate. Then remove |SchemeIsSecure|.
+ // crbug.com/362214
bool SchemeIsSecure() const {
return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme) ||
- (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure());
+ (SchemeIsFileSystem() && inner_url() &&
+ inner_url()->SchemeIsSecure());
+ }
+
+ // Returns true if the scheme indicates a network connection that uses TLS or
+ // some other cryptographic protocol (e.g. QUIC) for security.
+ //
+ // This function is not a complete test of whether or not an origin's code
+ // is minimally trustworthy. For that, see Chromium's |IsOriginSecure|, which
+ // has higher-level and more complete semantics. See that function's
+ // documentation for more detail.
+ bool SchemeIsCryptographic() const {
+ return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme);
}
// Returns true if the scheme is "blob".
@@ -235,13 +244,12 @@
}
// The "content" of the URL is everything after the scheme (skipping the
- // scheme delimiting colon). It is an error to get the origin of an invalid
- // URL. The result will be an empty string.
+ // scheme delimiting colon). It is an error to get the content of an invalid
+ // URL: the result will be an empty string.
std::string GetContent() const;
// Returns true if the hostname is an IP address. Note: this function isn't
// as cheap as a simple getter because it re-parses the hostname to verify.
- // This currently identifies only IPv4 addresses (bug 822685).
bool HostIsIPAddress() const;
// Getters for various components of the URL. The returned string will be
@@ -274,8 +282,8 @@
return ComponentString(parsed_.ref);
}
- // Existance querying. These functions will return true if the corresponding
- // URL component exists in this URL. Note that existance is different than
+ // Existence querying. These functions will return true if the corresponding
+ // URL component exists in this URL. Note that existence is different than
// being nonempty. http://www.google.com/? has a query that just happens to
// be empty, and has_query() will return true.
bool has_scheme() const {
@@ -288,7 +296,7 @@
return parsed_.password.len >= 0;
}
bool has_host() const {
- // Note that hosts are special, absense of host means length 0.
+ // Note that hosts are special, absence of host means length 0.
return parsed_.host.len > 0;
}
bool has_port() const {
@@ -310,7 +318,7 @@
// values defined in Parsed for ExtractPort.
int IntPort() const;
- // Returns the port number of the url, or the default port number.
+ // Returns the port number of the URL, or the default port number.
// If the scheme has no concept of port (or unknown default) returns
// PORT_UNSPECIFIED.
int EffectiveIntPort() const;
@@ -324,29 +332,21 @@
std::string PathForRequest() const;
// Returns the host, excluding the square brackets surrounding IPv6 address
- // literals. This can be useful for passing to getaddrinfo().
+ // literals. This can be useful for passing to getaddrinfo().
std::string HostNoBrackets() const;
// Returns true if this URL's host matches or is in the same domain as
- // the given input string. For example if this URL was "www.google.com",
- // this would match "com", "google.com", and "www.google.com
- // (input domain should be lower-case ASCII to match the canonicalized
- // scheme). This call is more efficient than getting the host and check
+ // the given input string. For example, if the hostname of the URL is
+ // "www.google.com", this will return true for "com", "google.com", and
+ // "www.google.com".
+ //
+ // The input domain should be lower-case ASCII to match the canonicalized
+ // host. This call is more efficient than getting the host and checking
// whether host has the specific domain or not because no copies or
// object constructions are done.
- //
- // If function DomainIs has parameter domain_len, which means the parameter
- // lower_ascii_domain does not gurantee to terminate with NULL character.
- bool DomainIs(const char* lower_ascii_domain, int domain_len) const;
+ bool DomainIs(base::StringPiece lower_ascii_domain) const;
- // If function DomainIs only has parameter lower_ascii_domain, which means
- // domain string should be terminate with NULL character.
- bool DomainIs(const char* lower_ascii_domain) const {
- return DomainIs(lower_ascii_domain,
- static_cast<int>(strlen(lower_ascii_domain)));
- }
-
- // Swaps the contents of this GURL object with the argument without doing
+ // Swaps the contents of this GURL object with |other|, without doing
// any memory allocations.
void Swap(GURL* other);
@@ -363,8 +363,8 @@
private:
// Variant of the string parsing constructor that allows the caller to elect
- // retain trailing whitespace, if any, on the passed URL spec but only if the
- // scheme is one that allows trailing whitespace. The primary use-case is
+ // retain trailing whitespace, if any, on the passed URL spec, but only if
+ // the scheme is one that allows trailing whitespace. The primary use-case is
// for data: URLs. In most cases, you want to use the single parameter
// constructor above.
enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE };
diff --git a/gurl_unittest.cc b/gurl_unittest.cc
index bea1a0c..18aa2ae 100644
--- a/gurl_unittest.cc
+++ b/gurl_unittest.cc
@@ -45,14 +45,15 @@
EXPECT_EQ("something:///HOSTNAME.com/",
TypesTestCase("something:///HOSTNAME.com/"));
- // In the reverse, known schemes should always trigger standard URL handling.
+ // Conversely, URLs with known schemes should always trigger standard URL
+ // handling.
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:/HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http://HOSTNAME.com"));
EXPECT_EQ("http://hostname.com/", TypesTestCase("http:///HOSTNAME.com"));
#ifdef WIN32
- // URLs that look like absolute Windows drive specs.
+ // URLs that look like Windows absolute path specs.
EXPECT_EQ("file:///C:/foo.txt", TypesTestCase("c:\\foo.txt"));
EXPECT_EQ("file:///Z:/foo.txt", TypesTestCase("Z|foo.txt"));
EXPECT_EQ("file://server/foo.txt", TypesTestCase("\\\\server\\foo.txt"));
@@ -60,7 +61,7 @@
#endif
}
-// Test the basic creation and querying of components in a GURL. We assume
+// Test the basic creation and querying of components in a GURL. We assume that
// the parser is already tested and works, so we are mostly interested if the
// object does the right thing with the results.
TEST(GURLTest, Components) {
@@ -175,7 +176,7 @@
EXPECT_EQ("", invalid2.ref());
}
-// This is a regression test for http://crbug.com/309975 .
+// This is a regression test for http://crbug.com/309975.
TEST(GURLTest, SelfAssign) {
GURL a("filesystem:http://example.com/temporary/");
// This should not crash.
@@ -245,9 +246,9 @@
}
TEST(GURLTest, ExtraSlashesBeforeAuthority) {
- // According to RFC3986, the hier-part for URI with an authority must use only
- // two slashes, GURL intentionally just ignores slashes more than 2 and parses
- // the following part as an authority.
+ // According to RFC3986, the hierarchical part for URI with an authority
+ // must use only two slashes; GURL intentionally just ignores extra slashes
+ // if there are more than 2, and parses the following part as an authority.
GURL url("http:///host");
EXPECT_EQ("host", url.host());
EXPECT_EQ("/", url.path());
@@ -378,7 +379,7 @@
}
TEST(GURLTest, Replacements) {
- // The url canonicalizer replacement test will handle most of these case.
+ // The URL canonicalizer replacement test will handle most of these cases.
// The most important thing to do here is to check that the proper
// canonicalizer gets called based on the scheme of the input.
struct ReplaceCase {
@@ -395,7 +396,7 @@
} replace_cases[] = {
{"http://www.google.com/foo/bar.html?foo#bar", NULL, NULL, NULL, NULL, NULL, "/", "", "", "http://www.google.com/"},
{"http://www.google.com/foo/bar.html?foo#bar", "javascript", "", "", "", "", "window.open('foo');", "", "", "javascript:window.open('foo');"},
- {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo","search", "ref", "http://www.google.com:99/foo?search#ref"},
+ {"file:///C:/foo/bar.txt", "http", NULL, NULL, "www.google.com", "99", "/foo", "search", "ref", "http://www.google.com:99/foo?search#ref"},
#ifdef WIN32
{"http://www.google.com/foo/bar.html?foo#bar", "file", "", "", "", "", "c:\\", "", "", "file:///C:/"},
#endif
@@ -435,7 +436,7 @@
EXPECT_EQ("data: one ? two ", url_no_ref.spec());
- // Importing a parsed url via this constructor overload will retain trailing
+ // Importing a parsed URL via this constructor overload will retain trailing
// whitespace.
GURL import_url(url_no_ref.spec(),
url_no_ref.parsed_for_possibly_invalid_spec(),
@@ -561,43 +562,56 @@
}
TEST(GURLTest, DomainIs) {
- const char google_domain[] = "google.com";
+ GURL url_1("http://google.com/foo");
+ EXPECT_TRUE(url_1.DomainIs("google.com"));
- GURL url_1("http://www.google.com:99/foo");
- EXPECT_TRUE(url_1.DomainIs(google_domain));
+ // Subdomain and port are ignored.
+ GURL url_2("http://www.google.com:99/foo");
+ EXPECT_TRUE(url_2.DomainIs("google.com"));
- GURL url_2("http://google.com:99/foo");
- EXPECT_TRUE(url_2.DomainIs(google_domain));
+ // Different top-level domain.
+ GURL url_3("http://www.google.com.cn/foo");
+ EXPECT_FALSE(url_3.DomainIs("google.com"));
- GURL url_3("http://google.com./foo");
- EXPECT_TRUE(url_3.DomainIs(google_domain));
+ // Different host name.
+ GURL url_4("http://www.iamnotgoogle.com/foo");
+ EXPECT_FALSE(url_4.DomainIs("google.com"));
- GURL url_4("http://google.com/foo");
- EXPECT_FALSE(url_4.DomainIs("google.com."));
+ // The input must be lower-cased; otherwise DomainIs returns false.
+ GURL url_5("http://www.google.com/foo");
+ EXPECT_FALSE(url_5.DomainIs("Google.com"));
- GURL url_5("http://google.com./foo");
- EXPECT_TRUE(url_5.DomainIs("google.com."));
+ // If the URL is invalid, DomainIs returns false.
+ GURL invalid_url("google.com");
+ EXPECT_FALSE(invalid_url.is_valid());
+ EXPECT_FALSE(invalid_url.DomainIs("google.com"));
+}
- GURL url_6("http://www.google.com./foo");
- EXPECT_TRUE(url_6.DomainIs(".com."));
+TEST(GURLTest, DomainIsTerminatingDotBehavior) {
+ // If the host part ends with a dot, it matches input domains
+ // with or without a dot.
+ GURL url_with_dot("http://www.google.com./foo");
+ EXPECT_TRUE(url_with_dot.DomainIs("google.com"));
+ EXPECT_TRUE(url_with_dot.DomainIs("google.com."));
+ EXPECT_TRUE(url_with_dot.DomainIs(".com"));
+ EXPECT_TRUE(url_with_dot.DomainIs(".com."));
- GURL url_7("http://www.balabala.com/foo");
- EXPECT_FALSE(url_7.DomainIs(google_domain));
+ // But, if the host name doesn't end with a dot and the input
+ // domain does, then it's considered to not match.
+ GURL url_without_dot("http://google.com/foo");
+ EXPECT_FALSE(url_without_dot.DomainIs("google.com."));
- GURL url_8("http://www.google.com.cn/foo");
- EXPECT_FALSE(url_8.DomainIs(google_domain));
+ // If the host name ends with two dots, it doesn't match.
+ GURL url_with_two_dots("http://www.google.com../foo");
+ EXPECT_FALSE(url_with_two_dots.DomainIs("google.com"));
+}
- GURL url_9("http://www.iamnotgoogle.com/foo");
- EXPECT_FALSE(url_9.DomainIs(google_domain));
+TEST(GURLTest, DomainIsWithFilesystemScheme) {
+ GURL url_1("filesystem:http://www.google.com:99/foo/");
+ EXPECT_TRUE(url_1.DomainIs("google.com"));
- GURL url_10("http://www.iamnotgoogle.com../foo");
- EXPECT_FALSE(url_10.DomainIs(".com"));
-
- GURL url_11("filesystem:http://www.google.com:99/foo/");
- EXPECT_TRUE(url_11.DomainIs(google_domain));
-
- GURL url_12("filesystem:http://www.iamnotgoogle.com/foo/");
- EXPECT_FALSE(url_12.DomainIs(google_domain));
+ GURL url_2("filesystem:http://www.iamnotgoogle.com/foo/");
+ EXPECT_FALSE(url_2.DomainIs("google.com"));
}
// Newlines should be stripped from inputs.
@@ -642,4 +656,29 @@
EXPECT_FALSE(GURL("http://bar/").SchemeIsBlob());
}
+TEST(GURLTest, ContentAndPathForNonStandardURLs) {
+ struct TestCase {
+ const char* url;
+ const char* expected;
+ } cases[] = {
+ {"null", ""},
+ {"not-a-standard-scheme:this is arbitrary content",
+ "this is arbitrary content"},
+ {"view-source:http://example.com/path", "http://example.com/path"},
+ {"blob:http://example.com/GUID", "http://example.com/GUID"},
+ {"blob://http://example.com/GUID", "//http://example.com/GUID"},
+ {"blob:http://user:password@example.com/GUID",
+ "http://user:password@example.com/GUID"},
+
+ // TODO(mkwst): This seems like a bug. https://crbug.com/513600
+ {"filesystem:http://example.com/path", "/"},
+ };
+
+ for (const auto& test : cases) {
+ GURL url(test.url);
+ EXPECT_EQ(test.expected, url.path()) << test.url;
+ EXPECT_EQ(test.expected, url.GetContent()) << test.url;
+ }
+}
+
} // namespace url
diff --git a/origin.cc b/origin.cc
index cebf5dd..9d0c4f0 100644
--- a/origin.cc
+++ b/origin.cc
@@ -1,20 +1,82 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "url/origin.h"
+#include <string.h>
+
#include "base/logging.h"
-#include "base/strings/pattern.h"
+#include "base/strings/string_number_conversions.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
namespace url {
-Origin::Origin() : string_("null") {}
+Origin::Origin() : unique_(true) {
+}
-Origin::Origin(const std::string& origin) : string_(origin) {
- DCHECK(origin == "null" || base::MatchPattern(origin, "?*://?*"));
- DCHECK_GT(origin.size(), 0u);
- DCHECK(origin == "file://" || origin[origin.size() - 1] != '/');
+Origin::Origin(const GURL& url) : unique_(true) {
+ if (!url.is_valid() || (!url.IsStandard() && !url.SchemeIsBlob()))
+ return;
+
+ if (url.SchemeIsFileSystem()) {
+ tuple_ = SchemeHostPort(*url.inner_url());
+ } else if (url.SchemeIsBlob()) {
+ // If we're dealing with a 'blob:' URL, https://url.spec.whatwg.org/#origin
+ // defines the origin as the origin of the URL which results from parsing
+ // the "path", which boils down to everything after the scheme. GURL's
+ // 'GetContent()' gives us exactly that.
+ tuple_ = SchemeHostPort(GURL(url.GetContent()));
+ } else {
+ tuple_ = SchemeHostPort(url);
+ }
+
+ unique_ = tuple_.IsInvalid();
+}
+
+Origin::Origin(base::StringPiece scheme, base::StringPiece host, uint16 port)
+ : tuple_(scheme, host, port) {
+ unique_ = tuple_.IsInvalid();
+}
+
+Origin::~Origin() {
+}
+
+// static
+Origin Origin::UnsafelyCreateOriginWithoutNormalization(
+ base::StringPiece scheme,
+ base::StringPiece host,
+ uint16 port) {
+ return Origin(scheme, host, port);
+}
+
+std::string Origin::Serialize() const {
+ if (unique())
+ return "null";
+
+ if (scheme() == kFileScheme)
+ return "file://";
+
+ return tuple_.Serialize();
+}
+
+bool Origin::IsSameOriginWith(const Origin& other) const {
+ if (unique_ || other.unique_)
+ return false;
+
+ return tuple_.Equals(other.tuple_);
+}
+
+bool Origin::operator<(const Origin& other) const {
+ return tuple_ < other.tuple_;
+}
+
+std::ostream& operator<<(std::ostream& out, const url::Origin& origin) {
+ return out << origin.Serialize();
}
} // namespace url
diff --git a/origin.h b/origin.h
index 777e4e1..c94c38c 100644
--- a/origin.h
+++ b/origin.h
@@ -1,4 +1,4 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -7,27 +7,130 @@
#include <string>
+#include "base/strings/string16.h"
+#include "base/strings/string_piece.h"
+#include "url/scheme_host_port.h"
+#include "url/third_party/mozilla/url_parse.h"
+#include "url/url_canon.h"
+#include "url/url_constants.h"
#include "url/url_export.h"
+class GURL;
+
namespace url {
-// Origin represents a Web Origin serialized to a string.
-// See RFC6454 for details.
+// An Origin is a tuple of (scheme, host, port), as described in RFC 6454.
+//
+// TL;DR: If you need to make a security-relevant decision, use 'url::Origin'.
+// If you only need to extract the bits of a URL which are relevant for a
+// network connection, use 'url::SchemeHostPort'.
+//
+// STL;SDR: If you aren't making actual network connections, use 'url::Origin'.
+//
+// 'Origin', like 'SchemeHostPort', is composed of a tuple of (scheme, host,
+// port), but contains a number of additional concepts which make it appropriate
+// for use as a security boundary and access control mechanism between contexts.
+//
+// This class ought to be used when code needs to determine if two resources
+// are "same-origin", and when a canonical serialization of an origin is
+// required. Note that some origins are "unique", meaning that they are not
+// same-origin with any other origin (including themselves).
+//
+// There are a few subtleties to note:
+//
+// * Invalid and non-standard GURLs are parsed as unique origins. This includes
+// non-hierarchical URLs like 'data:text/html,...' and 'javascript:alert(1)'.
+//
+// * GURLs with schemes of 'filesystem' or 'blob' parse the origin out of the
+// internals of the URL. That is, 'filesystem:https://example.com/temporary/f'
+// is parsed as ('https', 'example.com', 443).
+//
+// * Unique origins all serialize to the string "null"; this means that the
+// serializations of two unique origins are identical to each other, though
+// the origins themselves are not "the same". This means that origins'
+// serializations must not be relied upon for security checks.
+//
+// * GURLs with a 'file' scheme are tricky. They are parsed as ('file', '', 0),
+// but their behavior may differ from embedder to embedder.
+//
+// * The host component of an IPv6 address includes brackets, just like the URL
+// representation.
+//
+// Usage:
+//
+// * Origins are generally constructed from an already-canonicalized GURL:
+//
+// GURL url("https://example.com/");
+// url::Origin origin(url);
+// origin.scheme(); // "https"
+// origin.host(); // "example.com"
+// origin.port(); // 443
+// origin.unique(); // false
+//
+// * To answer the question "Are |this| and |that| "same-origin" with each
+// other?", use |Origin::IsSameOriginWith|:
+//
+// if (this.IsSameOriginWith(that)) {
+// // Amazingness goes here.
+// }
class URL_EXPORT Origin {
public:
+ // Creates a unique Origin.
Origin();
- explicit Origin(const std::string& origin);
- const std::string& string() const { return string_; }
+ // Creates an Origin from |url|, as described at
+ // https://url.spec.whatwg.org/#origin, with the following additions:
+ //
+ // 1. If |url| is invalid or non-standard, a unique Origin is constructed.
+ // 2. 'filesystem' URLs behave as 'blob' URLs (that is, the origin is parsed
+ // out of everything in the URL which follows the scheme).
+ // 3. 'file' URLs all parse as ("file", "", 0).
+ explicit Origin(const GURL& url);
- bool IsSameAs(const Origin& that) const {
- return string_ == that.string_;
- }
+ // Creates an Origin from a |scheme|, |host|, and |port|. All the parameters
+ // must be valid and canonicalized. In particular, note that this cannot be
+ // used to create unique origins; 'url::Origin()' is the right way to do that.
+ //
+ // This function should be used in order to pass 'Origin' objects back and
+ // forth over IPC (as transitioning through GURL would risk potentially
+ // dangerous recanonicalization); other potential callers should prefer the
+ // 'GURL'-based constructor.
+ static Origin UnsafelyCreateOriginWithoutNormalization(
+ base::StringPiece scheme,
+ base::StringPiece host,
+ uint16 port);
+
+ ~Origin();
+
+ // For unique origins, these return ("", "", 0).
+ const std::string& scheme() const { return tuple_.scheme(); }
+ const std::string& host() const { return tuple_.host(); }
+ uint16 port() const { return tuple_.port(); }
+
+ bool unique() const { return unique_; }
+
+ // An ASCII serialization of the Origin as per Section 6.2 of RFC 6454, with
+ // the addition that all Origins with a 'file' scheme serialize to "file://".
+ std::string Serialize() const;
+
+ // Two Origins are "same-origin" if their schemes, hosts, and ports are exact
+ // matches; and neither is unique.
+ bool IsSameOriginWith(const Origin& other) const;
+
+ // Allows Origin to be used as a key in STL containers (for example, a
+ // std::set or std::map).
+ bool operator<(const Origin& other) const;
private:
- std::string string_;
+ Origin(base::StringPiece scheme, base::StringPiece host, uint16 port);
+
+ SchemeHostPort tuple_;
+ bool unique_;
};
+URL_EXPORT std::ostream& operator<<(std::ostream& out,
+ const Origin& origin);
+
} // namespace url
#endif // URL_ORIGIN_H_
diff --git a/origin_unittest.cc b/origin_unittest.cc
index c094ee6..ec4ec65 100644
--- a/origin_unittest.cc
+++ b/origin_unittest.cc
@@ -1,41 +1,251 @@
-// Copyright 2014 The Chromium Authors. All rights reserved.
+// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "testing/gtest/include/gtest/gtest.h"
+#include "base/logging.h"
#include "url/origin.h"
-
-namespace url {
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
namespace {
-// Each test examines the Origin is constructed correctly without
-// violating DCHECKs.
-TEST(OriginTest, constructEmpty) {
- Origin origin;
- EXPECT_EQ("null", origin.string());
+TEST(OriginTest, UniqueOriginComparison) {
+ url::Origin unique_origin;
+ EXPECT_EQ("", unique_origin.scheme());
+ EXPECT_EQ("", unique_origin.host());
+ EXPECT_EQ(0, unique_origin.port());
+ EXPECT_TRUE(unique_origin.unique());
+ EXPECT_FALSE(unique_origin.IsSameOriginWith(unique_origin));
+
+ const char* const urls[] = {"data:text/html,Hello!",
+ "javascript:alert(1)",
+ "file://example.com:443/etc/passwd",
+ "yay",
+ "http::///invalid.example.com/"};
+
+ for (const auto& test_url : urls) {
+ SCOPED_TRACE(test_url);
+ GURL url(test_url);
+ url::Origin origin(url);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(unique_origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(origin.IsSameOriginWith(unique_origin));
+ }
}
-TEST(OriginTest, constructNull) {
- Origin origin("null");
- EXPECT_EQ("null", origin.string());
+TEST(OriginTest, ConstructFromGURL) {
+ url::Origin different_origin(GURL("https://not-in-the-list.test/"));
+
+ struct TestCases {
+ const char* const url;
+ const char* const expected_scheme;
+ const char* const expected_host;
+ const uint16 expected_port;
+ } cases[] = {
+ // IP Addresses
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+
+ // Punycode
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"blob:http://☃.net/", "http", "xn--n3h.net", 80},
+
+ // Generic URLs
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"http://user:pass@example.com/", "http", "example.com", 80},
+ {"http://example.com:123/?query", "http", "example.com", 123},
+ {"https://example.com/#1234", "https", "example.com", 443},
+ {"https://u:p@example.com:123/?query#1234", "https", "example.com", 123},
+
+ // Registered URLs
+ {"ftp://example.com/", "ftp", "example.com", 21},
+ {"gopher://example.com/", "gopher", "example.com", 70},
+ {"ws://example.com/", "ws", "example.com", 80},
+ {"wss://example.com/", "wss", "example.com", 443},
+
+ // file: URLs
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+
+ // Filesystem:
+ {"filesystem:http://example.com/type/", "http", "example.com", 80},
+ {"filesystem:http://example.com:123/type/", "http", "example.com", 123},
+ {"filesystem:https://example.com/type/", "https", "example.com", 443},
+ {"filesystem:https://example.com:123/type/", "https", "example.com", 123},
+
+ // Blob:
+ {"blob:http://example.com/guid-goes-here", "http", "example.com", 80},
+ {"blob:http://example.com:123/guid-goes-here", "http", "example.com", 123},
+ {"blob:https://example.com/guid-goes-here", "https", "example.com", 443},
+ {"blob:http://u:p@example.com/guid-goes-here", "http", "example.com", 80},
+ };
+
+ for (const auto& test_case : cases) {
+ SCOPED_TRACE(test_case.url);
+ GURL url(test_case.url);
+ EXPECT_TRUE(url.is_valid());
+ url::Origin origin(url);
+ EXPECT_EQ(test_case.expected_scheme, origin.scheme());
+ EXPECT_EQ(test_case.expected_host, origin.host());
+ EXPECT_EQ(test_case.expected_port, origin.port());
+ EXPECT_FALSE(origin.unique());
+ EXPECT_TRUE(origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(different_origin.IsSameOriginWith(origin));
+ EXPECT_FALSE(origin.IsSameOriginWith(different_origin));
+ }
}
-TEST(OriginTest, constructValidOrigin) {
- Origin origin("http://example.com:8080");
- EXPECT_EQ("http://example.com:8080", origin.string());
+TEST(OriginTest, Serialization) {
+ struct TestCases {
+ const char* const url;
+ const char* const expected;
+ } cases[] = {
+ {"http://192.168.9.1/", "http://192.168.9.1"},
+ {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+ {"http://☃.net/", "http://xn--n3h.net"},
+ {"http://example.com/", "http://example.com"},
+ {"http://example.com:123/", "http://example.com:123"},
+ {"https://example.com/", "https://example.com"},
+ {"https://example.com:123/", "https://example.com:123"},
+ {"file:///etc/passwd", "file://"},
+ {"file://example.com/etc/passwd", "file://"},
+ };
+
+ for (const auto& test_case : cases) {
+ SCOPED_TRACE(test_case.url);
+ GURL url(test_case.url);
+ EXPECT_TRUE(url.is_valid());
+ url::Origin origin(url);
+ EXPECT_EQ(test_case.expected, origin.Serialize());
+
+ // The '<<' operator should produce the same serialization as Serialize().
+ std::stringstream out;
+ out << origin;
+ EXPECT_EQ(test_case.expected, out.str());
+ }
}
-TEST(OriginTest, constructValidFileOrigin) {
- Origin origin("file://");
- EXPECT_EQ("file://", origin.string());
+TEST(OriginTest, Comparison) {
+ // These URLs are arranged in increasing order:
+ const char* const urls[] = {
+ "data:uniqueness",
+ "http://a:80",
+ "http://b:80",
+ "https://a:80",
+ "https://b:80",
+ "http://a:81",
+ "http://b:81",
+ "https://a:81",
+ "https://b:81",
+ };
+
+ for (size_t i = 0; i < arraysize(urls); i++) {
+ GURL current_url(urls[i]);
+ url::Origin current(current_url);
+ for (size_t j = i; j < arraysize(urls); j++) {
+ GURL compare_url(urls[j]);
+ url::Origin to_compare(compare_url);
+ EXPECT_EQ(i < j, current < to_compare) << i << " < " << j;
+ EXPECT_EQ(j < i, to_compare < current) << j << " < " << i;
+ }
+ }
}
-TEST(OriginTest, constructValidOriginWithoutPort) {
- Origin origin("wss://example2.com");
- EXPECT_EQ("wss://example2.com", origin.string());
+TEST(OriginTest, UnsafelyCreate) {
+ struct TestCase {
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } cases[] = {
+ {"http", "example.com", 80},
+ {"http", "example.com", 123},
+ {"https", "example.com", 443},
+ {"https", "example.com", 123},
+ {"file", "", 0},
+ {"file", "example.com", 0},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ test.scheme, test.host, test.port);
+ EXPECT_EQ(test.scheme, origin.scheme());
+ EXPECT_EQ(test.host, origin.host());
+ EXPECT_EQ(test.port, origin.port());
+ EXPECT_FALSE(origin.unique());
+ EXPECT_TRUE(origin.IsSameOriginWith(origin));
+ }
}
-} // namespace
+TEST(OriginTest, UnsafelyCreateUniqueOnInvalidInput) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } cases[] = {{"", "", 0},
+ {"data", "", 0},
+ {"blob", "", 0},
+ {"filesystem", "", 0},
+ {"data", "example.com", 80},
+ {"http", "☃.net", 80},
+ {"http\nmore", "example.com", 80},
+ {"http\rmore", "example.com", 80},
+ {"http\n", "example.com", 80},
+ {"http\r", "example.com", 80},
+ {"http", "example.com\nnot-example.com", 80},
+ {"http", "example.com\rnot-example.com", 80},
+ {"http", "example.com\n", 80},
+ {"http", "example.com\r", 80},
+ {"http", "example.com", 0},
+ {"file", "", 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ test.scheme, test.host, test.port);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ }
+}
+
+TEST(OriginTest, UnsafelyCreateUniqueViaEmbeddedNulls) {
+ struct TestCases {
+ const char* scheme;
+ size_t scheme_length;
+ const char* host;
+ size_t host_length;
+ uint16 port;
+ } cases[] = {{"http\0more", 9, "example.com", 11, 80},
+ {"http\0", 5, "example.com", 11, 80},
+ {"\0http", 5, "example.com", 11, 80},
+ {"http", 4, "example.com\0not-example.com", 27, 80},
+ {"http", 4, "example.com\0", 12, 80},
+ {"http", 4, "\0example.com", 12, 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::Origin origin = url::Origin::UnsafelyCreateOriginWithoutNormalization(
+ std::string(test.scheme, test.scheme_length),
+ std::string(test.host, test.host_length), test.port);
+ EXPECT_EQ("", origin.scheme());
+ EXPECT_EQ("", origin.host());
+ EXPECT_EQ(0, origin.port());
+ EXPECT_TRUE(origin.unique());
+ EXPECT_FALSE(origin.IsSameOriginWith(origin));
+ }
+}
} // namespace url
diff --git a/scheme_host_port.cc b/scheme_host_port.cc
new file mode 100644
index 0000000..c2fe830
--- /dev/null
+++ b/scheme_host_port.cc
@@ -0,0 +1,129 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "url/scheme_host_port.h"
+
+#include <string.h>
+
+#include "base/logging.h"
+#include "base/strings/string_number_conversions.h"
+#include "url/gurl.h"
+#include "url/url_canon.h"
+#include "url/url_canon_stdstring.h"
+#include "url/url_constants.h"
+#include "url/url_util.h"
+
+namespace url {
+
+SchemeHostPort::SchemeHostPort() : port_(0) {
+}
+
+SchemeHostPort::SchemeHostPort(base::StringPiece scheme,
+ base::StringPiece host,
+ uint16 port)
+ : scheme_(scheme.data(), scheme.length()),
+ host_(host.data(), host.length()),
+ port_(port) {
+ // Try to canonicalize the host (copy/pasted from net/base. :( ).
+ const url::Component raw_host_component(0, static_cast<int>(host.length()));
+ std::string canon_host;
+ url::StdStringCanonOutput canon_host_output(&canon_host);
+ url::CanonHostInfo host_info;
+ url::CanonicalizeHostVerbose(host.data(), raw_host_component,
+ &canon_host_output, &host_info);
+
+ if (host_info.out_host.is_nonempty() &&
+ host_info.family != url::CanonHostInfo::BROKEN) {
+ // Success! Assert that there's no extra garbage.
+ canon_host_output.Complete();
+ DCHECK_EQ(host_info.out_host.len, static_cast<int>(canon_host.length()));
+ } else {
+ // Empty host, or canonicalization failed.
+ canon_host.clear();
+ }
+
+ // Return an invalid SchemeHostPort object if any of the following conditions
+ // hold:
+ //
+ // 1. The provided scheme is non-standard, 'blob:', or 'filesystem:'.
+ // 2. The provided host is non-canonical.
+ // 3. The scheme is 'file' and the port is non-zero.
+ // 4. The scheme is not 'file', and the port is zero or the host is empty.
+ bool isUnsupportedScheme =
+ !url::IsStandard(scheme.data(),
+ url::Component(0, static_cast<int>(scheme.length()))) ||
+ scheme == kFileSystemScheme || scheme == kBlobScheme;
+ bool isNoncanonicalHost = host != canon_host;
+ bool isFileSchemeWithPort = scheme == kFileScheme && port != 0;
+ bool isNonFileSchemeWithoutPortOrHost =
+ scheme != kFileScheme && (port == 0 || host.empty());
+ if (isUnsupportedScheme || isNoncanonicalHost || isFileSchemeWithPort ||
+ isNonFileSchemeWithoutPortOrHost) {
+ scheme_.clear();
+ host_.clear();
+ port_ = 0;
+ }
+}
+
+SchemeHostPort::SchemeHostPort(const GURL& url) : port_(0) {
+ if (!url.is_valid() || !url.IsStandard())
+ return;
+
+ // These schemes do not follow the generic URL syntax, so we treat them as
+ // invalid (scheme, host, port) tuples (even though such URLs' _Origin_ might
+ // have a (scheme, host, port) tuple, they themselves do not).
+ if (url.SchemeIsBlob() || url.SchemeIsFileSystem())
+ return;
+
+ scheme_ = url.scheme();
+ host_ = url.host();
+ port_ = url.EffectiveIntPort() == url::PORT_UNSPECIFIED
+ ? 0
+ : url.EffectiveIntPort();
+}
+
+SchemeHostPort::~SchemeHostPort() {
+}
+
+bool SchemeHostPort::IsInvalid() const {
+ return scheme_.empty() && host_.empty() && !port_;
+}
+
+std::string SchemeHostPort::Serialize() const {
+ std::string result;
+ if (IsInvalid())
+ return result;
+
+ bool is_default_port =
+ port_ == url::DefaultPortForScheme(scheme_.data(),
+ static_cast<int>(scheme_.length()));
+
+ result.append(scheme_);
+ result.append(kStandardSchemeSeparator);
+ result.append(host_);
+
+ if (scheme_ != kFileScheme && !is_default_port) {
+ result.push_back(':');
+ result.append(base::IntToString(port_));
+ }
+
+ return result;
+}
+
+bool SchemeHostPort::Equals(const SchemeHostPort& other) const {
+ return port_ == other.port() && scheme_ == other.scheme() &&
+ host_ == other.host();
+}
+
+bool SchemeHostPort::operator<(const SchemeHostPort& other) const {
+ if (port_ != other.port_)
+ return port_ < other.port_;
+ if (scheme_ != other.scheme_)
+ return scheme_ < other.scheme_;
+ if (host_ != other.host_)
+ return host_ < other.host_;
+ return false;
+}
+
+} // namespace url
diff --git a/scheme_host_port.h b/scheme_host_port.h
new file mode 100644
index 0000000..2cc9e07
--- /dev/null
+++ b/scheme_host_port.h
@@ -0,0 +1,132 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef URL_SCHEME_HOST_PORT_H_
+#define URL_SCHEME_HOST_PORT_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/strings/string_piece.h"
+#include "url/url_export.h"
+
+class GURL;
+
+namespace url {
+
+// This class represents a (scheme, host, port) tuple extracted from a URL.
+//
+// The primary purpose of this class is to represent relevant network-authority
+// information for a URL. It is _not_ an Origin, as described in RFC 6454. In
+// particular, it is generally NOT the right thing to use for security
+// decisions.
+//
+// Instead, this class is a mechanism for simplifying URLs with standard schemes
+// (that is, those which follow the generic syntax of RFC 3986) down to the
+// uniquely identifying information necessary for network fetches. This makes it
+// suitable as a cache key for a collection of active connections, for instance.
+// It may, however, be inappropriate to use as a cache key for persistent
+// storage associated with a host.
+//
+// In particular, note that:
+//
+// * SchemeHostPort can only represent schemes which follow the RFC 3986 syntax
+// (e.g. those registered with GURL as "standard schemes"). Non-standard
+// schemes such as "blob", "filesystem", "data", and "javascript" can only be
+// represented as invalid SchemeHostPort objects.
+//
+// * The "file" scheme follows the standard syntax, but it is important to note
+// that the authority portion (host, port) is optional. URLs without an
+// authority portion will be represented with an empty string for the host,
+// and a port of 0 (e.g. "file:///etc/hosts" => ("file", "", 0)), and URLs
+// with a host-only authority portion will be represented with a port of 0
+// (e.g. "file://example.com/etc/hosts" => ("file", "example.com", 0)). See
+// Section 3 of RFC 3986 to better understand these constructs.
+//
+// * SchemeHostPort has no notion of the Origin concept (RFC 6454), and in
+// particular, it has no notion of a "unique" Origin. If you need to take
+// uniqueness into account (and, if you're making security-relevant decisions
+// then you absolutely do), please use 'url::Origin' instead[1].
+//
+// [1]: // TODO(mkwst): Land 'url::Origin'. :)
+//
+// Usage:
+//
+// * SchemeHostPort objects are commonly created from GURL objects:
+//
+// GURL url("https://example.com/");
+// url::SchemeHostPort tuple(url);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// * Objects may also be explicitly created and compared:
+//
+// url::SchemeHostPort tuple(url::kHttpsScheme, "example.com", 443);
+// tuple.scheme(); // "https"
+// tuple.host(); // "example.com"
+// tuple.port(); // 443
+//
+// GURL url("https://example.com/");
+// tuple.Equals(url::SchemeHostPort(url)); // true
+class URL_EXPORT SchemeHostPort {
+ public:
+ // Creates an invalid (scheme, host, port) tuple, which represents an invalid
+ // or non-standard URL.
+ SchemeHostPort();
+
+ // Creates a (scheme, host, port) tuple. |host| must be a canonicalized
+ // A-label (that is, '☃.net' must be provided as 'xn--n3h.net'). |scheme|
+ // must be a standard scheme. |port| must not be 0, unless |scheme| does not
+ // support ports (e.g. 'file'). In that case, |port| must be 0.
+ //
+ // Copies the data in |scheme| and |host|.
+ SchemeHostPort(base::StringPiece scheme, base::StringPiece host, uint16 port);
+
+ // Creates a (scheme, host, port) tuple from |url|, as described at
+ // https://tools.ietf.org/html/rfc6454#section-4
+ //
+ // If |url| is invalid or non-standard, the result will be an invalid
+ // SchemeHostPort object.
+ explicit SchemeHostPort(const GURL& url);
+
+ ~SchemeHostPort();
+
+ // Returns the host component, in URL form. That is, all IDN domain names
+ // will be expressed as A-Labels ('☃.net' will be returned as 'xn--n3h.net'),
+ // and all IPv6 addresses will be enclosed in brackets ("[2001:db8::1]").
+ const std::string& host() const { return host_; }
+ const std::string& scheme() const { return scheme_; }
+ uint16 port() const { return port_; }
+ bool IsInvalid() const;
+
+ // Serializes the SchemeHostPort tuple to a canonical form.
+ //
+ // While this string form resembles the Origin serialization specified in
+ // Section 6.2 of RFC 6454, it is important to note that invalid
+ // SchemeHostPort tuples serialize to the empty string, rather than being
+ // serialized as a unique Origin.
+ std::string Serialize() const;
+
+ // Two SchemeHostPort objects are "equal" iff their schemes, hosts, and ports
+ // are exact matches.
+ //
+ // Note that this comparison is _not_ the same as an origin-based comparison.
+ // In particular, invalid SchemeHostPort objects match each other (and
+ // themselves). Unique origins, on the other hand, would not.
+ bool Equals(const SchemeHostPort& other) const;
+
+ // Allows SchemeHostPort to be used as a key in STL (for example, a std::set
+ // or std::map).
+ bool operator<(const SchemeHostPort& other) const;
+
+ private:
+ std::string scheme_;
+ std::string host_;
+ uint16 port_;
+};
+
+} // namespace url
+
+#endif // URL_SCHEME_HOST_PORT_H_
diff --git a/scheme_host_port_unittest.cc b/scheme_host_port_unittest.cc
new file mode 100644
index 0000000..817631d
--- /dev/null
+++ b/scheme_host_port_unittest.cc
@@ -0,0 +1,215 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+#include "url/scheme_host_port.h"
+
+namespace {
+
+TEST(SchemeHostPortTest, Invalid) {
+ url::SchemeHostPort invalid;
+ EXPECT_EQ("", invalid.scheme());
+ EXPECT_EQ("", invalid.host());
+ EXPECT_EQ(0, invalid.port());
+ EXPECT_TRUE(invalid.IsInvalid());
+ EXPECT_TRUE(invalid.Equals(invalid));
+
+ const char* urls[] = {"data:text/html,Hello!",
+ "javascript:alert(1)",
+ "file://example.com:443/etc/passwd",
+ "blob:https://example.com/uuid-goes-here",
+ "filesystem:https://example.com/temporary/yay.png"};
+
+ for (const auto& test : urls) {
+ SCOPED_TRACE(test);
+ GURL url(test);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ EXPECT_TRUE(tuple.Equals(invalid));
+ EXPECT_TRUE(invalid.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, ExplicitConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } cases[] = {
+ {"http", "example.com", 80},
+ {"http", "example.com", 123},
+ {"https", "example.com", 443},
+ {"https", "example.com", 123},
+ {"file", "", 0},
+ {"file", "example.com", 0},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, InvalidConstruction) {
+ struct TestCases {
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } cases[] = {{"", "", 0},
+ {"data", "", 0},
+ {"blob", "", 0},
+ {"filesystem", "", 0},
+ {"http", "", 80},
+ {"data", "example.com", 80},
+ {"http", "☃.net", 80},
+ {"http\nmore", "example.com", 80},
+ {"http\rmore", "example.com", 80},
+ {"http\n", "example.com", 80},
+ {"http\r", "example.com", 80},
+ {"http", "example.com\nnot-example.com", 80},
+ {"http", "example.com\rnot-example.com", 80},
+ {"http", "example.com\n", 80},
+ {"http", "example.com\r", 80},
+ {"http", "example.com", 0},
+ {"file", "", 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(test.scheme, test.host, test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, InvalidConstructionWithEmbeddedNulls) {
+ struct TestCases {
+ const char* scheme;
+ size_t scheme_length;
+ const char* host;
+ size_t host_length;
+ uint16 port;
+ } cases[] = {{"http\0more", 9, "example.com", 11, 80},
+ {"http\0", 5, "example.com", 11, 80},
+ {"\0http", 5, "example.com", 11, 80},
+ {"http", 4, "example.com\0not-example.com", 27, 80},
+ {"http", 4, "example.com\0", 12, 80},
+ {"http", 4, "\0example.com", 12, 80}};
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(testing::Message() << test.scheme << "://" << test.host << ":"
+ << test.port);
+ url::SchemeHostPort tuple(std::string(test.scheme, test.scheme_length),
+ std::string(test.host, test.host_length),
+ test.port);
+ EXPECT_EQ("", tuple.scheme());
+ EXPECT_EQ("", tuple.host());
+ EXPECT_EQ(0, tuple.port());
+ EXPECT_TRUE(tuple.IsInvalid());
+ }
+}
+
+TEST(SchemeHostPortTest, GURLConstruction) {
+ struct TestCases {
+ const char* url;
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } cases[] = {
+ {"http://192.168.9.1/", "http", "192.168.9.1", 80},
+ {"http://[2001:db8::1]/", "http", "[2001:db8::1]", 80},
+ {"http://☃.net/", "http", "xn--n3h.net", 80},
+ {"http://example.com/", "http", "example.com", 80},
+ {"http://example.com:123/", "http", "example.com", 123},
+ {"https://example.com/", "https", "example.com", 443},
+ {"https://example.com:123/", "https", "example.com", 123},
+ {"file:///etc/passwd", "file", "", 0},
+ {"file://example.com/etc/passwd", "file", "example.com", 0},
+ {"http://u:p@example.com/", "http", "example.com", 80},
+ {"http://u:p@example.com/path", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123", "http", "example.com", 80},
+ {"http://u:p@example.com/path?123#hash", "http", "example.com", 80},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ EXPECT_TRUE(url.is_valid());
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.scheme, tuple.scheme());
+ EXPECT_EQ(test.host, tuple.host());
+ EXPECT_EQ(test.port, tuple.port());
+ EXPECT_FALSE(tuple.IsInvalid());
+ EXPECT_TRUE(tuple.Equals(tuple));
+ }
+}
+
+TEST(SchemeHostPortTest, Serialization) {
+ struct TestCases {
+ const char* url;
+ const char* expected;
+ } cases[] = {
+ {"http://192.168.9.1/", "http://192.168.9.1"},
+ {"http://[2001:db8::1]/", "http://[2001:db8::1]"},
+ {"http://☃.net/", "http://xn--n3h.net"},
+ {"http://example.com/", "http://example.com"},
+ {"http://example.com:123/", "http://example.com:123"},
+ {"https://example.com/", "https://example.com"},
+ {"https://example.com:123/", "https://example.com:123"},
+ {"file:///etc/passwd", "file://"},
+ {"file://example.com/etc/passwd", "file://example.com"},
+ };
+
+ for (const auto& test : cases) {
+ SCOPED_TRACE(test.url);
+ GURL url(test.url);
+ url::SchemeHostPort tuple(url);
+ EXPECT_EQ(test.expected, tuple.Serialize());
+ }
+}
+
+TEST(SchemeHostPortTest, Comparison) {
+ // These tuples are arranged in increasing order:
+ struct SchemeHostPorts {
+ const char* scheme;
+ const char* host;
+ uint16 port;
+ } tuples[] = {
+ {"http", "a", 80},
+ {"http", "b", 80},
+ {"https", "a", 80},
+ {"https", "b", 80},
+ {"http", "a", 81},
+ {"http", "b", 81},
+ {"https", "a", 81},
+ {"https", "b", 81},
+ };
+
+ for (size_t i = 0; i < arraysize(tuples); i++) {
+ url::SchemeHostPort current(tuples[i].scheme, tuples[i].host,
+ tuples[i].port);
+ for (size_t j = i; j < arraysize(tuples); j++) {
+ url::SchemeHostPort to_compare(tuples[j].scheme, tuples[j].host,
+ tuples[j].port);
+ EXPECT_EQ(i < j, current < to_compare) << i << " < " << j;
+ EXPECT_EQ(j < i, to_compare < current) << j << " < " << i;
+ }
+ }
+}
+
+} // namespace
diff --git a/third_party/mozilla/url_parse.h b/third_party/mozilla/url_parse.h
index 71dbb78..7bfcdc8 100644
--- a/third_party/mozilla/url_parse.h
+++ b/third_party/mozilla/url_parse.h
@@ -5,9 +5,6 @@
#ifndef URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
#define URL_THIRD_PARTY_MOZILLA_URL_PARSE_H_
-#include <string>
-
-#include "base/basictypes.h"
#include "base/strings/string16.h"
#include "url/url_export.h"
diff --git a/url_canon.h b/url_canon.h
index 432f291..95d5345 100644
--- a/url_canon.h
+++ b/url_canon.h
@@ -9,8 +9,8 @@
#include <string.h>
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -285,7 +285,7 @@
// User info: username/password. If present, this will add the delimiters so
// the output will be "<username>:<password>@" or "<username>@". Empty
// username/password pairs, or empty passwords, will get converted to
-// nonexistant in the canonical version.
+// nonexistent in the canonical version.
//
// The components for the username and password refer to ranges in the
// respective source strings. Usually, these will be the same string, which
@@ -317,13 +317,13 @@
// This field summarizes how the input was classified by the canonicalizer.
enum Family {
- NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
+ NEUTRAL, // - Doesn't resemble an IP address. As far as the IP
// canonicalizer is concerned, it should be treated as a
// hostname.
- BROKEN, // - Almost an IP, but was not canonicalized. This could be an
+ BROKEN, // - Almost an IP, but was not canonicalized. This could be an
// IPv4 address where truncation occurred, or something
// containing the special characters :[] which did not parse
- // as an IPv6 address. Never attempt to connect to this
+ // as an IPv6 address. Never attempt to connect to this
// address, because it might actually succeed!
IPV4, // - Successfully canonicalized as an IPv4 address.
IPV6, // - Successfully canonicalized as an IPv6 address.
@@ -331,7 +331,7 @@
Family family;
// If |family| is IPV4, then this is the number of nonempty dot-separated
- // components in the input text, from 1 to 4. If |family| is not IPV4,
+ // components in the input text, from 1 to 4. If |family| is not IPV4,
// this value is undefined.
int num_ipv4_components;
@@ -355,7 +355,7 @@
// Host.
//
-// The 8-bit version requires UTF-8 encoding. Use this version when you only
+// The 8-bit version requires UTF-8 encoding. Use this version when you only
// need to know whether canonicalization succeeded.
URL_EXPORT bool CanonicalizeHost(const char* spec,
const Component& host,
@@ -368,7 +368,7 @@
// Extended version of CanonicalizeHost, which returns additional information.
// Use this when you need to know whether the hostname was an IP address.
-// A successful return is indicated by host_info->family != BROKEN. See the
+// A successful return is indicated by host_info->family != BROKEN. See the
// definition of CanonHostInfo above for details.
URL_EXPORT void CanonicalizeHostVerbose(const char* spec,
const Component& host,
@@ -554,7 +554,7 @@
CanonOutput* output,
Parsed* new_parsed);
-// Use for mailto URLs. This "canonicalizes" the url into a path and query
+// Use for mailto URLs. This "canonicalizes" the URL into a path and query
// component. It does not attempt to merge "to" fields. It uses UTF-8 for
// the query encoding if there is a query. This is because a mailto URL is
// really intended for an external mail program, and the encoding of a page,
@@ -578,9 +578,9 @@
// treated on the same code path as regular canonicalization (the same string
// for each component).
//
-// A Parsed structure usually goes along with this. Those
-// components identify offsets within these strings, so that they can all be
-// in the same string, or spread arbitrarily across different ones.
+// A Parsed structure usually goes along with this. Those components identify
+// offsets within these strings, so that they can all be in the same string,
+// or spread arbitrarily across different ones.
//
// This structures does not own any data. It is the caller's responsibility to
// ensure that the data the pointers point to stays in scope and is not
@@ -725,7 +725,7 @@
}
bool IsRefOverridden() const { return sources_.ref != NULL; }
- // Getters for the itnernal data. See the variables below for how the
+ // Getters for the internal data. See the variables below for how the
// information is encoded.
const URLComponentSource<CHAR>& sources() const { return sources_; }
const Parsed& components() const { return components_; }
@@ -863,7 +863,7 @@
// The base URL should be canonical and have a host (may be empty for file
// URLs) and a path. If it doesn't have these, we can't resolve relative
// URLs off of it and will return the base as the output with an error flag.
-// Becausee it is canonical is should also be ASCII.
+// Because it is canonical it should also be ASCII.
//
// The query charset converter follows the same rules as CanonicalizeQuery.
//
diff --git a/url_canon_etc.cc b/url_canon_etc.cc
index 7409efd..e9da94c 100644
--- a/url_canon_etc.cc
+++ b/url_canon_etc.cc
@@ -95,9 +95,9 @@
// The output scheme starts from the current position.
out_scheme->begin = output->length();
- // Danger: it's important that this code does not strip any characters: it
- // only emits the canonical version (be it valid or escaped) of each of
- // the input characters. Stripping would put it out of sync with
+ // Danger: it's important that this code does not strip any characters;
+ // it only emits the canonical version (be it valid or escaped) for each
+ // of the input characters. Stripping would put it out of sync with
// FindAndCompareScheme, which could cause some security checks on
// schemes to be incorrect.
bool success = true;
@@ -218,7 +218,7 @@
char buf[buf_size];
WritePortInt(buf, buf_size, port_num);
- // Append the port number to the output, preceeded by a colon.
+ // Append the port number to the output, preceded by a colon.
output->push_back(':');
out_port->begin = output->length();
for (int i = 0; i < buf_size && buf[i]; i++)
diff --git a/url_canon_host.cc b/url_canon_host.cc
index 513248a..fce4d3a 100644
--- a/url_canon_host.cc
+++ b/url_canon_host.cc
@@ -34,7 +34,7 @@
// NOTE: I didn't actually test all the control characters. Some may be
// disallowed in the input, but they are all accepted escaped except for 0.
// I also didn't test if characters affecting HTML parsing are allowed
-// unescaped, eg. (") or (#), which would indicate the beginning of the path.
+// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
// Surprisingly, space is accepted in the input and always escaped.
// This table lists the canonical version of all characters we allow in the
@@ -316,11 +316,11 @@
}
if (!success) {
- // Canonicalization failed. Set BROKEN to notify the caller.
+ // Canonicalization failed. Set BROKEN to notify the caller.
host_info->family = CanonHostInfo::BROKEN;
} else {
// After all the other canonicalization, check if we ended up with an IP
- // address. IP addresses are small, so writing into this temporary buffer
+ // address. IP addresses are small, so writing into this temporary buffer
// should not cause an allocation.
RawCanonOutput<64> canon_ip;
CanonicalizeIPAddress(output->data(),
@@ -328,7 +328,7 @@
&canon_ip, host_info);
// If we got an IPv4/IPv6 address, copy the canonical form back to the
- // real buffer. Otherwise, it's a hostname or broken IP, in which case
+ // real buffer. Otherwise, it's a hostname or broken IP, in which case
// we just leave it in place.
if (host_info->IsIPAddress()) {
output->set_length(output_begin);
diff --git a/url_canon_icu.cc b/url_canon_icu.cc
index 741bed2..8a80d71 100644
--- a/url_canon_icu.cc
+++ b/url_canon_icu.cc
@@ -99,8 +99,10 @@
// TODO(jungshik): Change options as different parties (browsers,
// registrars, search engines) converge toward a consensus.
value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);
- if (U_FAILURE(err))
+ if (U_FAILURE(err)) {
+ CHECK(false) << "failed to open UTS46 data with error: " << err;
value = NULL;
+ }
}
UIDNA* value;
diff --git a/url_canon_internal.cc b/url_canon_internal.cc
index 1554814..164c6cf 100644
--- a/url_canon_internal.cc
+++ b/url_canon_internal.cc
@@ -249,9 +249,9 @@
bool ReadUTFChar(const char* str, int* begin, int length,
unsigned* code_point_out) {
- // This depends on ints and int32s being the same thing. If they're not, it
+ // This depends on ints and int32s being the same thing. If they're not, it
// will fail to compile.
- // TODO(mmenke): This should probably be fixed.
+ // TODO(mmenke): This should probably be fixed.
if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
!base::IsValidCharacter(*code_point_out)) {
*code_point_out = kUnicodeReplacementCharacter;
@@ -262,9 +262,9 @@
bool ReadUTFChar(const base::char16* str, int* begin, int length,
unsigned* code_point_out) {
- // This depends on ints and int32s being the same thing. If they're not, it
+ // This depends on ints and int32s being the same thing. If they're not, it
// will fail to compile.
- // TODO(mmenke): This should probably be fixed.
+ // TODO(mmenke): This should probably be fixed.
if (!base::ReadUnicodeCharacter(str, length, begin, code_point_out) ||
!base::IsValidCharacter(*code_point_out)) {
*code_point_out = kUnicodeReplacementCharacter;
diff --git a/url_canon_internal.h b/url_canon_internal.h
index 71bfc40..8a926b6 100644
--- a/url_canon_internal.h
+++ b/url_canon_internal.h
@@ -7,7 +7,7 @@
// This file is intended to be included in another C++ file where the character
// types are defined. This allows us to write mostly generic code, but not have
-// templace bloat because everything is inlined when anybody calls any of our
+// template bloat because everything is inlined when anybody calls any of our
// functions.
#include <stdlib.h>
@@ -41,7 +41,7 @@
// Valid in an ASCII-representation of an octal digit.
CHAR_OCT = 32,
- // Characters that do not require escaping in encodeURIComponent. Characters
+ // Characters that do not require escaping in encodeURIComponent. Characters
// that do not have this flag will be escaped; see url_util.cc.
CHAR_COMPONENT = 64,
};
@@ -175,7 +175,7 @@
output);
Appender(static_cast<unsigned char>(0x80 | (char_value & 0x3f)),
output);
- } else if (char_value <= 0x10FFFF) { // Max unicode code point.
+ } else if (char_value <= 0x10FFFF) { // Max Unicode code point.
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
Appender(static_cast<unsigned char>(0xf0 | (char_value >> 18)),
output);
@@ -199,7 +199,7 @@
}
// Writes the given character to the output as UTF-8. This does NO checking
-// of the validity of the unicode characters; the caller should ensure that
+// of the validity of the Unicode characters; the caller should ensure that
// the value it is appending is valid to append.
inline void AppendUTF8Value(unsigned char_value, CanonOutput* output) {
DoAppendUTF8<CanonOutput, AppendCharToOutput>(char_value, output);
@@ -207,7 +207,7 @@
// Writes the given character to the output as UTF-8, escaping ALL
// characters (even when they are ASCII). This does NO checking of the
-// validity of the unicode characters; the caller should ensure that the value
+// validity of the Unicode characters; the caller should ensure that the value
// it is appending is valid to append.
inline void AppendUTF8EscapedValue(unsigned char_value, CanonOutput* output) {
DoAppendUTF8<CanonOutput, AppendEscapedChar>(char_value, output);
@@ -260,7 +260,7 @@
// that any following characters are.
inline bool AppendUTF8EscapedChar(const base::char16* str, int* begin,
int length, CanonOutput* output) {
- // UTF-16 input. Readchar16 will handle invalid characters for us and give
+ // UTF-16 input. ReadUTFChar will handle invalid characters for us and give
// us the kUnicodeReplacementCharacter, so we don't have to do special
// checking after failure, just pass through the failure to the caller.
unsigned char_value;
diff --git a/url_canon_internal_file.h b/url_canon_internal_file.h
index 6903098..26a3eae 100644
--- a/url_canon_internal_file.h
+++ b/url_canon_internal_file.h
@@ -113,15 +113,15 @@
new_parsed->path.begin = output->length();
output->push_back('/');
- // Copies and normalizes the "c:" at the beginning, if present.
+ // Copy and normalize the "c:" at the beginning, if present.
int after_drive = FileDoDriveSpec(source.path, parsed.path.begin,
parsed.path.end(), output);
- // Copies the rest of the path
+ // Copy the rest of the path.
FileDoPath<CHAR, UCHAR>(source.path, after_drive, parsed.path.end(), output);
new_parsed->path.len = output->length() - new_parsed->path.begin;
- // Things following the path we can use the standard canonicalizers for.
+ // For things following the path, we can use the standard canonicalizers.
success &= URLCanonInternal<CHAR, UCHAR>::DoQuery(
source.query, parsed.query, output, &new_parsed->query);
success &= URLCanonInternal<CHAR, UCHAR>::DoRef(
diff --git a/url_canon_ip.cc b/url_canon_ip.cc
index 45f95de..87c30c7 100644
--- a/url_canon_ip.cc
+++ b/url_canon_ip.cc
@@ -4,9 +4,10 @@
#include "url/url_canon_ip.h"
+#include <stdint.h>
#include <stdlib.h>
+#include <limits>
-#include "base/basictypes.h"
#include "base/logging.h"
#include "url/url_canon_internal.h"
@@ -92,7 +93,7 @@
template<typename CHAR>
CanonHostInfo::Family IPv4ComponentToNumber(const CHAR* spec,
const Component& component,
- uint32* number) {
+ uint32_t* number) {
// Figure out the base
SharedCharTypes base;
int base_prefix_len = 0; // Size of the prefix for this base.
@@ -118,7 +119,7 @@
base_prefix_len++;
// Put the component, minus any base prefix, into a NULL-terminated buffer so
- // we can call the standard library. Because leading zeros have already been
+ // we can call the standard library. Because leading zeros have already been
// discarded, filling the entire buffer is guaranteed to trigger the 32-bit
// overflow check.
const int kMaxComponentLen = 16;
@@ -133,7 +134,7 @@
if (!IsCharOfType(input, base))
return CanonHostInfo::NEUTRAL;
- // Fill the buffer, if there's space remaining. This check allows us to
+ // Fill the buffer, if there's space remaining. This check allows us to
// verify that all characters are numeric, even those that don't fit.
if (dest_i < kMaxComponentLen)
buf[dest_i++] = input;
@@ -143,14 +144,14 @@
// Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
// number can overflow a 64-bit number in <= 16 characters).
- uint64 num = _strtoui64(buf, NULL, BaseForType(base));
+ uint64_t num = _strtoui64(buf, NULL, BaseForType(base));
// Check for 32-bit overflow.
- if (num > kuint32max)
+ if (num > std::numeric_limits<uint32_t>::max())
return CanonHostInfo::BROKEN;
- // No overflow. Success!
- *number = static_cast<uint32>(num);
+ // No overflow. Success!
+ *number = static_cast<uint32_t>(num);
return CanonHostInfo::IPV4;
}
@@ -167,10 +168,10 @@
// Convert existing components to digits. Values up to
// |existing_components| will be valid.
- uint32 component_values[4];
+ uint32_t component_values[4];
int existing_components = 0;
- // Set to true if one or more components are BROKEN. BROKEN is only
+ // Set to true if one or more components are BROKEN. BROKEN is only
// returned if all components are IPV4 or BROKEN, so, for example,
// 12345678912345.de returns NEUTRAL rather than broken.
bool broken = false;
@@ -198,7 +199,7 @@
// First, process all components but the last, while making sure each fits
// within an 8-bit field.
for (int i = 0; i < existing_components - 1; i++) {
- if (component_values[i] > kuint8max)
+ if (component_values[i] > std::numeric_limits<uint8_t>::max())
return CanonHostInfo::BROKEN;
address[i] = static_cast<unsigned char>(component_values[i]);
}
@@ -209,7 +210,7 @@
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
- uint32 last_value = component_values[existing_components - 1];
+ uint32_t last_value = component_values[existing_components - 1];
#if ((__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4)
#pragma GCC diagnostic pop
#endif
@@ -440,11 +441,12 @@
return true;
}
-// Converts a hex comonent into a number. This cannot fail since the caller has
+// Converts a hex component into a number. This cannot fail since the caller has
// already verified that each character in the string was a hex digit, and
// that there were no more than 4 characters.
-template<typename CHAR>
-uint16 IPv6HexComponentToNumber(const CHAR* spec, const Component& component) {
+template <typename CHAR>
+uint16_t IPv6HexComponentToNumber(const CHAR* spec,
+ const Component& component) {
DCHECK(component.len <= 4);
// Copy the hex string into a C-string.
@@ -455,7 +457,7 @@
// Convert it to a number (overflow is not possible, since with 4 hex
// characters we can at most have a 16 bit number).
- return static_cast<uint16>(_strtoui64(buf, NULL, 16));
+ return static_cast<uint16_t>(_strtoui64(buf, NULL, 16));
}
// Converts an IPv6 address to a 128-bit number (network byte order), returning
@@ -497,7 +499,7 @@
// Append the hex component's value.
if (i != ipv6_parsed.num_hex_components) {
// Get the 16-bit value for this hex component.
- uint16 number = IPv6HexComponentToNumber<CHAR>(
+ uint16_t number = IPv6HexComponentToNumber<CHAR>(
spec, ipv6_parsed.hex_components[i]);
// Append to |address|, in network byte order.
address[cur_index_in_address++] = (number & 0xFF00) >> 8;
@@ -576,7 +578,7 @@
}
}
- // No invalid characters. Could still be IPv4 or a hostname.
+ // No invalid characters. Could still be IPv4 or a hostname.
host_info->family = CanonHostInfo::NEUTRAL;
return false;
}
diff --git a/url_canon_ip.h b/url_canon_ip.h
index 19ecfdb..937bd46 100644
--- a/url_canon_ip.h
+++ b/url_canon_ip.h
@@ -6,9 +6,9 @@
#define URL_URL_CANON_IP_H_
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -30,14 +30,14 @@
// Not all components may exist. If there are only 3 components, for example,
// the last one will have a length of -1 or 0 to indicate it does not exist.
//
-// Note that many platform's inet_addr will ignore everything after a space
-// in certain curcumstances if the stuff before the space looks like an IP
+// Note that many platforms' inet_addr will ignore everything after a space
+// in certain circumstances if the stuff before the space looks like an IP
// address. IE6 is included in this. We do NOT handle this case. In many cases,
// the browser's canonicalization will get run before this which converts
-// spaces to %20 (in the case of IE7) or rejects them (in the case of
-// Mozilla), so this code path never gets hit. Our host canonicalization will
-// notice these spaces and escape them, which will make IP address finding
-// fail. This seems like better behavior than stripping after a space.
+// spaces to %20 (in the case of IE7) or rejects them (in the case of Mozilla),
+// so this code path never gets hit. Our host canonicalization will notice
+// these spaces and escape them, which will make IP address finding fail. This
+// seems like better behavior than stripping after a space.
URL_EXPORT bool FindIPv4Components(const char* spec,
const Component& host,
Component components[4]);
diff --git a/url_canon_mailtourl.cc b/url_canon_mailtourl.cc
index 7c48b95..fb6bc9a 100644
--- a/url_canon_mailtourl.cc
+++ b/url_canon_mailtourl.cc
@@ -55,7 +55,7 @@
new_parsed->path.reset();
}
- // Query -- always use the default utf8 charset converter.
+ // Query -- always use the default UTF-8 charset converter.
CanonicalizeQuery(source.query, parsed.query, NULL,
output, &new_parsed->query);
diff --git a/url_canon_path.cc b/url_canon_path.cc
index ceff689..ee1cd96 100644
--- a/url_canon_path.cc
+++ b/url_canon_path.cc
@@ -173,7 +173,7 @@
// copied to the output.
//
// We do not collapse multiple slashes in a row to a single slash. It seems
-// no web browsers do this, and we don't want incompababilities, even though
+// no web browsers do this, and we don't want incompatibilities, even though
// it would be correct for most systems.
template<typename CHAR, typename UCHAR>
bool DoPartialPath(const CHAR* spec,
@@ -200,7 +200,7 @@
// Needs special handling of some sort.
int dotlen;
if ((dotlen = IsDot(spec, i, end)) > 0) {
- // See if this dot was preceeded by a slash in the output. We
+ // See if this dot was preceded by a slash in the output. We
// assume that when canonicalizing paths, they will always
// start with a slash and not a dot, so we don't have to
// bounds check the output.
@@ -230,7 +230,7 @@
break;
}
} else {
- // This dot is not preceeded by a slash, it is just part of some
+ // This dot is not preceded by a slash, it is just part of some
// file name.
output->push_back('.');
i += dotlen - 1;
diff --git a/url_canon_pathurl.cc b/url_canon_pathurl.cc
index 0d23ccb..494fbda 100644
--- a/url_canon_pathurl.cc
+++ b/url_canon_pathurl.cc
@@ -14,7 +14,7 @@
namespace {
// Canonicalize the given |component| from |source| into |output| and
-// |new_component|. If |separator| is non-zero, it is pre-pended to |ouput|
+// |new_component|. If |separator| is non-zero, it is pre-pended to |output|
// prior to the canonicalized component; i.e. for the '?' or '#' characters.
template<typename CHAR, typename UCHAR>
bool DoCanonicalizePathComponent(const CHAR* source,
diff --git a/url_canon_query.cc b/url_canon_query.cc
index 5494ddf..bf59d10 100644
--- a/url_canon_query.cc
+++ b/url_canon_query.cc
@@ -80,7 +80,7 @@
}
// Runs the converter with the given UTF-16 input. We don't have to do
-// anything, but this overriddden function allows us to use the same code
+// anything, but this overridden function allows us to use the same code
// for both UTF-8 and UTF-16 input.
void RunConverter(const base::char16* spec,
const Component& query,
diff --git a/url_canon_relative.cc b/url_canon_relative.cc
index 06ca99c..c2e94e4 100644
--- a/url_canon_relative.cc
+++ b/url_canon_relative.cc
@@ -17,14 +17,14 @@
namespace {
// Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
-// 379034), whereas IE is case-insensetive.
+// 379034), whereas IE is case-insensitive.
//
// We choose to be more permissive like IE. We don't need to worry about
// unescaping or anything here: neither IE or Firefox allow this. We also
// don't have to worry about invalid scheme characters since we are comparing
// against the canonical scheme of the base.
//
-// The base URL should always be canonical, therefore is ASCII.
+// The base URL should always be canonical, therefore it should be ASCII.
template<typename CHAR>
bool AreSchemesEqual(const char* base,
const Component& base_scheme,
@@ -82,7 +82,7 @@
#ifdef WIN32
// We special case paths like "C:\foo" so they can link directly to the
- // file on Windows (IE compatability). The security domain stuff should
+ // file on Windows (IE compatibility). The security domain stuff should
// prevent a link like this from actually being followed if its on a
// web page.
//
@@ -91,22 +91,22 @@
// is a file and the answer will still be correct.
//
// We require strict backslashes when detecting UNC since two forward
- // shashes should be treated a a relative URL with a hostname.
+ // slashes should be treated as a relative URL with a hostname.
if (DoesBeginWindowsDriveSpec(url, begin, url_len) ||
DoesBeginUNCPath(url, begin, url_len, true))
return true;
#endif // WIN32
// See if we've got a scheme, if not, we know this is a relative URL.
- // BUT: Just because we have a scheme, doesn't make it absolute.
+ // BUT, just because we have a scheme, doesn't make it absolute.
// "http:foo.html" is a relative URL with path "foo.html". If the scheme is
- // empty, we treat it as relative (":foo") like IE does.
+ // empty, we treat it as relative (":foo"), like IE does.
Component scheme;
const bool scheme_is_empty =
!ExtractScheme(url, url_len, &scheme) || scheme.len == 0;
if (scheme_is_empty) {
if (url[begin] == '#') {
- // |url| is a bare fragement (e.g. "#foo"). This can be resolved against
+ // |url| is a bare fragment (e.g. "#foo"). This can be resolved against
// any base. Fall-through.
} else if (!is_base_hierarchical) {
// Don't allow relative URLs if the base scheme doesn't support it.
@@ -145,7 +145,7 @@
int colon_offset = scheme.end();
// If it's a filesystem URL, the only valid way to make it relative is not to
- // supply a scheme. There's no equivalent to e.g. http:index.html.
+ // supply a scheme. There's no equivalent to e.g. http:index.html.
if (CompareSchemeComponent(url, scheme, kFileSystemScheme))
return true;
@@ -394,7 +394,7 @@
query_converter, output, out_parsed);
}
-// Resolves a relative URL that happens to be an absolute file path. Examples
+// Resolves a relative URL that happens to be an absolute file path. Examples
// include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
template<typename CHAR>
bool DoResolveAbsoluteFile(const CHAR* relative_url,
@@ -460,7 +460,7 @@
// how strict the UNC finder is).
//
// We also allow Windows absolute drive specs on any scheme (for example
- // "c:\foo") like IE does. There must be no preceeding slashes in this
+ // "c:\foo") like IE does. There must be no preceding slashes in this
// case (we reject anything like "/c:/foo") because that should be treated
// as a path. For file URLs, we allow any number of slashes since that would
// be setting the path.
diff --git a/url_canon_stdurl.cc b/url_canon_stdurl.cc
index 7a61de8..7d1758b 100644
--- a/url_canon_stdurl.cc
+++ b/url_canon_stdurl.cc
@@ -169,7 +169,7 @@
}
// For 16-bit replacements, we turn all the replacements into UTF-8 so the
-// regular codepath can be used.
+// regular code path can be used.
bool ReplaceStandardURL(const char* base,
const Parsed& base_parsed,
const Replacements<base::char16>& replacements,
diff --git a/url_canon_unittest.cc b/url_canon_unittest.cc
index 3ab8710..0ccd6c9 100644
--- a/url_canon_unittest.cc
+++ b/url_canon_unittest.cc
@@ -6,10 +6,10 @@
#include "base/macros.h"
#include "testing/gtest/include/gtest/gtest.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
-#include "url/url_parse.h"
#include "url/url_test_utils.h"
namespace url {
@@ -38,7 +38,7 @@
bool expected_success;
};
-// Test cases for CanonicalizeIPAddress(). The inputs are identical to
+// Test cases for CanonicalizeIPAddress(). The inputs are identical to
// DualComponentCase, but the output has extra CanonHostInfo fields.
struct IPAddressCase {
const char* input8;
@@ -127,7 +127,7 @@
#if defined(GTEST_HAS_DEATH_TEST)
// TODO(mattm): Can't run this in debug mode for now, since the DCHECK will
-// cause the Chromium stacktrace dialog to appear and hang the test.
+// cause the Chromium stack trace dialog to appear and hang the test.
// See http://crbug.com/49580.
#if defined(NDEBUG) && !defined(DCHECK_ALWAYS_ON)
#define MAYBE_DoAppendUTF8Invalid DoAppendUTF8Invalid
@@ -157,10 +157,10 @@
} utf_cases[] = {
// Valid canonical input should get passed through & escaped.
{"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
- // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
+ // Test a character that takes > 16 bits (U+10300 = old italic letter A)
{"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
- // Non-shortest-form UTF-8 are invalid. The bad char should be replaced
- // with the invalid character (EF BF DB in UTF-8).
+ // Non-shortest-form UTF-8 characters are invalid. The bad character
+ // should be replaced with the invalid character (EF BF BD in UTF-8).
{"\xf0\x84\xbd\xa0\xe5\xa5\xbd", NULL, false, "%EF%BF%BD%E5%A5%BD"},
// Invalid UTF-8 sequences should be marked as invalid (the first
// sequence is truncated).
@@ -259,7 +259,7 @@
EXPECT_EQ(scheme_cases[i].expected_component.begin, out_comp.begin);
EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
- // Now try the wide version
+ // Now try the wide version.
out_str.clear();
StdStringCanonOutput output2(&out_str);
@@ -275,7 +275,7 @@
EXPECT_EQ(scheme_cases[i].expected_component.len, out_comp.len);
}
- // Test the case where the scheme is declared nonexistant, it should be
+ // Test the case where the scheme is declared nonexistent; it should be
// converted into an empty scheme.
Component out_comp;
out_str.clear();
@@ -638,7 +638,7 @@
{"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
{"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
{"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
- // Old trunctations tests. They're all "BROKEN" now.
+ // Old truncation tests. They're all "BROKEN" now.
{"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
@@ -754,16 +754,17 @@
{"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0,13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
- // Can only have one "::" contraction in an IPv6 string literal.
+ // Can only have one "::" contraction in an IPv6 string literal.
{"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // No more than 2 consecutive ':'s.
+ // No more than 2 consecutive ':'s.
{"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // Non-IP addresses due to invalid characters.
+ // Non-IP addresses due to invalid characters.
{"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
- // If there are not enough components, the last one should fill them out.
+ // If there are not enough components, the last one should fill them out.
// ... omitted at this time ...
- // Too many components means not an IP address. Similarly with too few if using IPv4 compat or mapped addresses.
+ // Too many components means not an IP address. Similarly, with too few
+ // if using IPv4 compat or mapped addresses.
{"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
{"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
@@ -887,7 +888,7 @@
{"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
{"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true },
- // IE7 compatability: old versions allowed backslashes in usernames, but
+ // IE7 compatibility: old versions allowed backslashes in usernames, but
// IE7 does not. We disallow it as well.
{"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true},
};
@@ -943,7 +944,7 @@
// buffer. The parser unit tests will test scanning the number correctly.
//
// Note that the CanonicalizePort will always prepend a colon to the output
- // to separate it from the colon that it assumes preceeds it.
+ // to separate it from the colon that it assumes precedes it.
struct PortCase {
const char* input;
int default_port;
@@ -1329,7 +1330,7 @@
{"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw", "host.com", "99", "/path", "query", "ref", "https://me:pw@host.com:99/path?query#ref"},
// Replace nothing
{"http://a:b@google.com:22/foo?baz@cat", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "http://a:b@google.com:22/foo?baz@cat"},
- // Replace scheme with filesystem. The result is garbage, but you asked
+ // Replace scheme with filesystem. The result is garbage, but you asked
// for it.
{"http://a:b@google.com:22/foo?baz@cat", "filesystem", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "filesystem://a:b@google.com:22/foo?baz@cat"},
};
@@ -1594,7 +1595,7 @@
{"file:", "file:///", true, Component(), Component(7, 1)},
{"file:UNChost/path", "file://unchost/path", true, Component(7, 7), Component(14, 5)},
// CanonicalizeFileURL supports absolute Windows style paths for IE
- // compatability. Note that the caller must decide that this is a file
+ // compatibility. Note that the caller must decide that this is a file
// URL itself so it can call the file canonicalizer. This is usually
// done automatically as part of relative URL resolving.
{"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
@@ -1605,7 +1606,7 @@
{"\\\\server\\file", "file://server/file", true, Component(7, 6), Component(13, 5)},
{"/\\server/file", "file://server/file", true, Component(7, 6), Component(13, 5)},
// We should preserve the number of slashes after the colon for IE
- // compatability, except when there is none, in which case we should
+ // compatibility, except when there is none, in which case we should
// add one.
{"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(), Component(7, 16)},
{"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true, Component(), Component(7, 19)},
@@ -1807,7 +1808,7 @@
TEST(URLCanonTest, _itoa_s) {
// We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
+ // null-terminated. We also allocate one byte more than what we tell
// _itoa_s about, and ensure that the extra byte is untouched.
char buf[6];
memset(buf, 0xff, sizeof(buf));
@@ -1846,7 +1847,7 @@
TEST(URLCanonTest, _itow_s) {
// We fill the buffer with 0xff to ensure that it's getting properly
- // null-terminated. We also allocate one byte more than what we tell
+ // null-terminated. We also allocate one byte more than what we tell
// _itoa_s about, and ensure that the extra byte is untouched.
base::char16 buf[6];
const char fill_mem = 0xff;
@@ -2022,7 +2023,7 @@
// which is what is required.
{"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"},
// Filesystem URL tests; filesystem URLs are only valid and relative if
- // they have no scheme, e.g. "./index.html". There's no valid equivalent
+ // they have no scheme, e.g. "./index.html". There's no valid equivalent
// to http:index.html.
{"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, NULL},
{"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, NULL},
@@ -2090,10 +2091,10 @@
}
}
-// It used to be when we did a replacement with a long buffer of UTF-16
-// characters, we would get invalid data in the URL. This is because the buffer
-// it used to hold the UTF-8 data was resized, while some pointers were still
-// kept to the old buffer that was removed.
+// It used to be the case that when we did a replacement with a long buffer of
+// UTF-16 characters, we would get invalid data in the URL. This is because the
+// buffer that it used to hold the UTF-8 data was resized, while some pointers
+// were still kept to the old buffer that was removed.
TEST(URLCanonTest, ReplacementOverflow) {
const char src[] = "file:///C:/foo/bar";
int src_len = static_cast<int>(strlen(src));
@@ -2101,7 +2102,7 @@
ParseFileURL(src, src_len, &parsed);
// Override two components, the path with something short, and the query with
- // sonething long enough to trigger the bug.
+ // something long enough to trigger the bug.
Replacements<base::char16> repl;
base::string16 new_query;
for (int i = 0; i < 4800; i++)
diff --git a/url_parse.h b/url_parse.h
deleted file mode 100644
index 3b9c546..0000000
--- a/url_parse.h
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright 2013 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef URL_URL_PARSE_H_
-#define URL_URL_PARSE_H_
-
-// TODO(tfarina): Remove this file when the callers are updated.
-#include "url/third_party/mozilla/url_parse.h"
-
-#endif // URL_URL_PARSE_H_
diff --git a/url_parse_file.cc b/url_parse_file.cc
index c08ddc6..fcbb12d 100644
--- a/url_parse_file.cc
+++ b/url_parse_file.cc
@@ -3,8 +3,8 @@
// found in the LICENSE file.
#include "base/logging.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_file.h"
-#include "url/url_parse.h"
#include "url/url_parse_internal.h"
// Interesting IE file:isms...
diff --git a/url_parse_internal.h b/url_parse_internal.h
index 4070b7e..7630878 100644
--- a/url_parse_internal.h
+++ b/url_parse_internal.h
@@ -7,11 +7,11 @@
// Contains common inline helper functions used by the URL parsing routines.
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
namespace url {
-// We treat slashes and backslashes the same for IE compatability.
+// We treat slashes and backslashes the same for IE compatibility.
inline bool IsURLSlash(base::char16 ch) {
return ch == '/' || ch == '\\';
}
diff --git a/url_parse_unittest.cc b/url_parse_unittest.cc
index 71b2438..6bf536e 100644
--- a/url_parse_unittest.cc
+++ b/url_parse_unittest.cc
@@ -2,11 +2,11 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "base/macros.h"
#include "testing/gtest/include/gtest/gtest.h"
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
// Interesting IE file:isms...
//
@@ -90,13 +90,13 @@
bool ComponentMatches(const char* input,
const char* reference,
const Component& component) {
- // If the component is nonexistant (length == -1), it should begin at 0.
+ // If the component is nonexistent (length == -1), it should begin at 0.
EXPECT_TRUE(component.len >= 0 || component.len == -1);
// Begin should be valid.
EXPECT_LE(0, component.begin);
- // A NULL reference means the component should be nonexistant.
+ // A NULL reference means the component should be nonexistent.
if (!reference)
return component.len == -1;
if (component.len < 0)
@@ -345,7 +345,7 @@
TEST(URLParser, PathURL) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(path_cases); i++) {
const char* url = path_cases[i].input;
@@ -356,7 +356,7 @@
EXPECT_TRUE(ComponentMatches(url, path_cases[i].path, parsed.GetContent()))
<< i;
- // The remaining components are never used for path urls.
+ // The remaining components are never used for path URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.host);
@@ -537,7 +537,7 @@
Component key, value;
if (!ExtractQueryKeyValue(url, &query, &key, &value)) {
if (parameter >= i && !expected_key)
- return true; // Expected nonexistant key, got one.
+ return true; // Expected nonexistent key, got one.
return false; // Not enough keys.
}
@@ -613,7 +613,7 @@
TEST(URLParser, MailtoUrl) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(mailto_cases); ++i) {
const char* url = mailto_cases[i].input;
@@ -625,7 +625,7 @@
EXPECT_TRUE(ComponentMatches(url, mailto_cases[i].query, parsed.query));
EXPECT_EQ(PORT_UNSPECIFIED, port);
- // The remaining components are never used for mailto urls.
+ // The remaining components are never used for mailto URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.port);
@@ -645,7 +645,7 @@
TEST(URLParser, FileSystemURL) {
// Declared outside for loop to try to catch cases in init() where we forget
- // to reset something that is reset by the construtor.
+ // to reset something that is reset by the constructor.
Parsed parsed;
for (size_t i = 0; i < arraysize(filesystem_cases); i++) {
const FileSystemURLParseCase* parsecase = &filesystem_cases[i];
@@ -667,7 +667,7 @@
int port = ParsePort(url, parsed.inner_parsed()->port);
EXPECT_EQ(parsecase->inner_port, port);
- // The remaining components are never used for filesystem urls.
+ // The remaining components are never used for filesystem URLs.
ExpectInvalidComponent(parsed.inner_parsed()->query);
ExpectInvalidComponent(parsed.inner_parsed()->ref);
}
@@ -676,7 +676,7 @@
EXPECT_TRUE(ComponentMatches(url, parsecase->query, parsed.query));
EXPECT_TRUE(ComponentMatches(url, parsecase->ref, parsed.ref));
- // The remaining components are never used for filesystem urls.
+ // The remaining components are never used for filesystem URLs.
ExpectInvalidComponent(parsed.username);
ExpectInvalidComponent(parsed.password);
ExpectInvalidComponent(parsed.host);
diff --git a/url_test_utils.h b/url_test_utils.h
index 6400bac..156c428 100644
--- a/url_test_utils.h
+++ b/url_test_utils.h
@@ -19,7 +19,7 @@
namespace test_utils {
// Converts a UTF-16 string from native wchar_t format to char16, by
-// truncating the high 32 bits. This is not meant to handle true UTF-32
+// truncating the high 32 bits. This is not meant to handle true UTF-32
// encoded strings.
inline base::string16 WStringToUTF16(const wchar_t* src) {
base::string16 str;
@@ -30,7 +30,7 @@
return str;
}
-// Converts a string from UTF-8 to UTF-16
+// Converts a string from UTF-8 to UTF-16.
inline base::string16 ConvertUTF8ToUTF16(const std::string& src) {
int length = static_cast<int>(src.length());
EXPECT_LT(length, 1024);
@@ -39,7 +39,7 @@
return base::string16(output.data(), output.length());
}
-// Converts a string from UTF-16 to UTF-8
+// Converts a string from UTF-16 to UTF-8.
inline std::string ConvertUTF16ToUTF8(const base::string16& src) {
std::string str;
StdStringCanonOutput output(&str);
diff --git a/url_util.cc b/url_util.cc
index 008a5e4..279ab7e 100644
--- a/url_util.cc
+++ b/url_util.cc
@@ -9,6 +9,7 @@
#include "base/debug/leak_annotations.h"
#include "base/logging.h"
+#include "base/strings/string_util.h"
#include "url/url_canon_internal.h"
#include "url/url_file.h"
#include "url/url_util_internal.h"
@@ -17,28 +18,11 @@
namespace {
-// ASCII-specific tolower. The standard library's tolower is locale sensitive,
-// so we don't want to use it here.
-template<class Char>
-inline Char ToLowerASCII(Char c) {
- return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c;
-}
-
-// Backend for LowerCaseEqualsASCII.
-template<typename Iter>
-inline bool DoLowerCaseEqualsASCII(Iter a_begin, Iter a_end, const char* b) {
- for (Iter it = a_begin; it != a_end; ++it, ++b) {
- if (!*b || ToLowerASCII(*it) != *b)
- return false;
- }
- return *b == 0;
-}
-
const int kNumStandardURLSchemes = 8;
const char* kStandardURLSchemes[kNumStandardURLSchemes] = {
kHttpScheme,
kHttpsScheme,
- kFileScheme, // Yes, file urls can have a hostname!
+ kFileScheme, // Yes, file URLs can have a hostname!
kFtpScheme,
kGopherScheme,
kWsScheme, // WebSocket.
@@ -54,6 +38,17 @@
// See the LockStandardSchemes declaration in the header.
bool standard_schemes_locked = false;
+// This template converts a given character type to the corresponding
+// StringPiece type.
+template<typename CHAR> struct CharToStringPiece {
+};
+template<> struct CharToStringPiece<char> {
+ typedef base::StringPiece Piece;
+};
+template<> struct CharToStringPiece<base::char16> {
+ typedef base::StringPiece16 Piece;
+};
+
// Ensures that the standard_schemes list is initialized, does nothing if it
// already has values.
void InitStandardSchemes() {
@@ -72,9 +67,10 @@
const char* compare_to) {
if (!component.is_nonempty())
return compare_to[0] == 0; // When component is empty, match empty scheme.
- return LowerCaseEqualsASCII(&spec[component.begin],
- &spec[component.end()],
- compare_to);
+ return base::LowerCaseEqualsASCII(
+ typename CharToStringPiece<CHAR>::Piece(
+ &spec[component.begin], component.len),
+ compare_to);
}
// Returns true if the given scheme identified by |scheme| within |spec| is one
@@ -86,8 +82,10 @@
InitStandardSchemes();
for (size_t i = 0; i < standard_schemes->size(); i++) {
- if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],
- standard_schemes->at(i)))
+ if (base::LowerCaseEqualsASCII(
+ typename CharToStringPiece<CHAR>::Piece(
+ &spec[scheme.begin], scheme.len),
+ standard_schemes->at(i)))
return true;
}
return false;
@@ -134,7 +132,7 @@
Parsed parsed_input;
#ifdef WIN32
// For Windows, we allow things that look like absolute Windows paths to be
- // fixed up magically to file URLs. This is done for IE compatability. For
+ // fixed up magically to file URLs. This is done for IE compatibility. For
// example, this will change "c:/foo" into a file URL rather than treating
// it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
// There is similar logic in url_canon_relative.cc for
@@ -177,13 +175,14 @@
charset_converter, output, output_parsed);
} else if (DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
- // Mailto are treated like a standard url with only a scheme, path, query
+ // Mailto URLs are treated like standard URLs, with only a scheme, path,
+ // and query.
ParseMailtoURL(spec, spec_len, &parsed_input);
success = CanonicalizeMailtoURL(spec, spec_len, parsed_input, output,
output_parsed);
} else {
- // "Weird" URLs like data: and javascript:
+ // "Weird" URLs like data: and javascript:.
ParsePathURL(spec, spec_len, trim_path_end, &parsed_input);
success = CanonicalizePathURL(spec, spec_len, parsed_input, output,
output_parsed);
@@ -273,7 +272,7 @@
CanonOutput* output,
Parsed* out_parsed) {
// If the scheme is overridden, just do a simple string substitution and
- // reparse the whole thing. There are lots of edge cases that we really don't
+ // re-parse the whole thing. There are lots of edge cases that we really don't
// want to deal with. Like what happens if I replace "http://e:8080/foo"
// with a file. Does it become "file:///E:/8080/foo" where the port number
// becomes part of the path? Parsing that string as a file URL says "yes"
@@ -320,7 +319,7 @@
// getting replaced here. If ReplaceComponents didn't re-check everything,
// we wouldn't know if something *not* getting replaced is a problem.
// If the scheme-specific replacers are made more intelligent so they don't
- // re-check everything, we should instead recanonicalize the whole thing
+ // re-check everything, we should instead re-canonicalize the whole thing
// after this call to check validity (this assumes replacing the scheme is
// much much less common than other types of replacements, like clearing the
// ref).
@@ -373,7 +372,7 @@
//
// This normally means you're trying to set up a new standard scheme too late
// in your application's init process. Locate where your app does this
- // initialization and calls LockStandardScheme, and add your new standard
+ // initialization and calls LockStandardSchemes, and add your new standard
// scheme there.
DCHECK(!standard_schemes_locked) <<
"Trying to add a standard scheme after the list has been locked.";
@@ -382,7 +381,7 @@
if (scheme_len == 0)
return;
- // Dulicate the scheme into a new buffer and add it to the list of standard
+ // Duplicate the scheme into a new buffer and add it to the list of standard
// schemes. This pointer will be leaked on shutdown.
char* dup_scheme = new char[scheme_len + 1];
ANNOTATE_LEAKING_OBJECT_PTR(dup_scheme);
@@ -486,31 +485,6 @@
charset_converter, output, out_parsed);
}
-// Front-ends for LowerCaseEqualsASCII.
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
-bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end) {
- while (a_begin != a_end && b_begin != b_end &&
- ToLowerASCII(*a_begin) == *b_begin) {
- a_begin++;
- b_begin++;
- }
- return a_begin == a_end && b_begin == b_end;
-}
-
-bool LowerCaseEqualsASCII(const base::char16* a_begin,
- const base::char16* a_end,
- const char* b) {
- return DoLowerCaseEqualsASCII(a_begin, a_end, b);
-}
-
void DecodeURLEscapeSequences(const char* input,
int length,
CanonOutputW* output) {
diff --git a/url_util.h b/url_util.h
index 458d1e8..5817044 100644
--- a/url_util.h
+++ b/url_util.h
@@ -8,10 +8,10 @@
#include <string>
#include "base/strings/string16.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_constants.h"
#include "url/url_export.h"
-#include "url/url_parse.h"
namespace url {
@@ -20,14 +20,13 @@
// Initialization is NOT required, it will be implicitly initialized when first
// used. However, this implicit initialization is NOT threadsafe. If you are
// using this library in a threaded environment and don't have a consistent
-// "first call" (an example might be calling "AddStandardScheme" with your
-// special application-specific schemes) then you will want to call initialize
-// before spawning any threads.
+// "first call" (an example might be calling AddStandardScheme with your special
+// application-specific schemes) then you will want to call Initialize before
+// spawning any threads.
//
-// It is OK to call this function more than once, subsequent calls will simply
-// "noop", unless Shutdown() was called in the mean time. This will also be a
-// "noop" if other calls to the library have forced an initialization
-// beforehand.
+// It is OK to call this function more than once; subsequent calls will be
+// no-ops, unless Shutdown was called in the meantime. This will also be a
+// no-op if other calls to the library have forced an initialization beforehand.
URL_EXPORT void Initialize();
// Cleanup is not required, except some strings may leak. For most user
@@ -38,10 +37,13 @@
// Schemes --------------------------------------------------------------------
-// Adds an application-defined scheme to the internal list of "standard" URL
-// schemes. This function is not threadsafe and can not be called concurrently
-// with any other url_util function. It will assert if the list of standard
-// schemes has been locked (see LockStandardSchemes).
+// Adds an application-defined scheme to the internal list of "standard-format"
+// URL schemes. A standard-format scheme adheres to what RFC 3986 calls "generic
+// URI syntax" (https://tools.ietf.org/html/rfc3986#section-3).
+//
+// This function is not threadsafe and cannot be called concurrently with any
+// other url_util function. It will assert if the list of standard schemes has
+// been locked (see LockStandardSchemes).
URL_EXPORT void AddStandardScheme(const char* new_scheme);
// Sets a flag to prevent future calls to AddStandardScheme from succeeding.
@@ -85,19 +87,11 @@
compare, found_scheme);
}
-// Returns true if the given string represents a standard URL. This means that
-// either the scheme is in the list of known standard schemes.
+// Returns true if the given string represents a URL whose scheme is in the list
+// of known standard-format schemes (see AddStandardScheme).
URL_EXPORT bool IsStandard(const char* spec, const Component& scheme);
URL_EXPORT bool IsStandard(const base::char16* spec, const Component& scheme);
-// TODO(brettw) remove this. This is a temporary compatibility hack to avoid
-// breaking the WebKit build when this version is synced via Chrome.
-inline bool IsStandard(const char* spec,
- int spec_len,
- const Component& scheme) {
- return IsStandard(spec, scheme);
-}
-
// URL library wrappers -------------------------------------------------------
// Parses the given spec according to the extracted scheme type. Normal users
@@ -150,7 +144,7 @@
CanonOutput* output,
Parsed* output_parsed);
-// Replaces components in the given VALID input url. The new canonical URL info
+// Replaces components in the given VALID input URL. The new canonical URL info
// is written to output and out_parsed.
//
// Returns true if the resulting URL is valid.
@@ -172,29 +166,12 @@
// String helper functions ----------------------------------------------------
-// Compare the lower-case form of the given string against the given ASCII
-// string. This is useful for doing checking if an input string matches some
-// token, and it is optimized to avoid intermediate string copies.
-//
-// The versions of this function that don't take a b_end assume that the b
-// string is NULL terminated.
-URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b);
-URL_EXPORT bool LowerCaseEqualsASCII(const char* a_begin,
- const char* a_end,
- const char* b_begin,
- const char* b_end);
-URL_EXPORT bool LowerCaseEqualsASCII(const base::char16* a_begin,
- const base::char16* a_end,
- const char* b);
-
// Unescapes the given string using URL escaping rules.
URL_EXPORT void DecodeURLEscapeSequences(const char* input,
int length,
CanonOutputW* output);
-// Escapes the given string as defined by the JS method encodeURIComponent. See
+// Escapes the given string as defined by the JS method encodeURIComponent. See
// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/encodeURIComponent
URL_EXPORT void EncodeURIComponent(const char* input,
int length,
diff --git a/url_util_internal.h b/url_util_internal.h
index c72598f..756c736 100644
--- a/url_util_internal.h
+++ b/url_util_internal.h
@@ -8,7 +8,7 @@
#include <string>
#include "base/strings/string16.h"
-#include "url/url_parse.h"
+#include "url/third_party/mozilla/url_parse.h"
namespace url {
diff --git a/url_util_unittest.cc b/url_util_unittest.cc
index 73ff93b..9297765 100644
--- a/url_util_unittest.cc
+++ b/url_util_unittest.cc
@@ -4,9 +4,9 @@
#include "base/macros.h"
#include "testing/gtest/include/gtest/gtest.h"
+#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
-#include "url/url_parse.h"
#include "url/url_test_utils.h"
#include "url/url_util.h"
@@ -44,7 +44,7 @@
EXPECT_FALSE(FindAndCompareScheme("", 0, "", &found_scheme));
EXPECT_TRUE(found_scheme == Component());
- // When there is a whitespace char in scheme, it should canonicalize the url
+ // When there is a whitespace char in scheme, it should canonicalize the URL
// before comparison.
const char whtspc_str[] = " \r\n\tjav\ra\nscri\tpt:alert(1)";
EXPECT_TRUE(FindAndCompareScheme(whtspc_str,
@@ -305,8 +305,8 @@
}
TEST(URLUtilTest, TestNoRefComponent) {
- // The hash-mark must be ignored when mailto: scheme is
- // parsed, even if the url has a base and relative part.
+ // The hash-mark must be ignored when mailto: scheme is parsed,
+ // even if the URL has a base and relative part.
const char* base = "mailto://to/";
const char* rel = "any#body";