James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #ifndef URL_GURL_H_ |
| 6 | #define URL_GURL_H_ |
| 7 | |
#include <stddef.h>
#include <string.h>

#include <iosfwd>
#include <string>

#include "base/memory/scoped_ptr.h"
#include "base/strings/string16.h"
#include "url/url_canon.h"
#include "url/url_canon_stdstring.h"
#include "url/url_constants.h"
#include "url/url_export.h"
#include "url/url_parse.h"
| 18 | |
| 19 | class URL_EXPORT GURL { |
| 20 | public: |
James Robinson | 7b766f4 | 2015-02-06 15:14:04 -0800 | [diff] [blame] | 21 | typedef url::StringPieceReplacements<std::string> Replacements; |
| 22 | typedef url::StringPieceReplacements<base::string16> ReplacementsW; |
James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 23 | |
| 24 | // Creates an empty, invalid URL. |
| 25 | GURL(); |
| 26 | |
| 27 | // Copy construction is relatively inexpensive, with most of the time going |
| 28 | // to reallocating the string. It does not re-parse. |
| 29 | GURL(const GURL& other); |
| 30 | |
| 31 | // The narrow version requires the input be UTF-8. Invalid UTF-8 input will |
| 32 | // result in an invalid URL. |
| 33 | // |
| 34 | // The wide version should also take an encoding parameter so we know how to |
| 35 | // encode the query parameters. It is probably sufficient for the narrow |
| 36 | // version to assume the query parameter encoding should be the same as the |
| 37 | // input encoding. |
| 38 | explicit GURL(const std::string& url_string /*, output_param_encoding*/); |
| 39 | explicit GURL(const base::string16& url_string /*, output_param_encoding*/); |
| 40 | |
| 41 | // Constructor for URLs that have already been parsed and canonicalized. This |
| 42 | // is used for conversions from KURL, for example. The caller must supply all |
| 43 | // information associated with the URL, which must be correct and consistent. |
| 44 | GURL(const char* canonical_spec, |
| 45 | size_t canonical_spec_len, |
| 46 | const url::Parsed& parsed, |
| 47 | bool is_valid); |
| 48 | // Notice that we take the canonical_spec by value so that we can convert |
| 49 | // from WebURL without copying the string. When we call this constructor |
| 50 | // we pass in a temporary std::string, which lets the compiler skip the |
| 51 | // copy and just move the std::string into the function argument. In the |
| 52 | // implementation, we use swap to move the data into the GURL itself, |
| 53 | // which means we end up with zero copies. |
| 54 | GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid); |
| 55 | |
| 56 | ~GURL(); |
| 57 | |
| 58 | GURL& operator=(GURL other); |
| 59 | |
| 60 | // Returns true when this object represents a valid parsed URL. When not |
| 61 | // valid, other functions will still succeed, but you will not get canonical |
| 62 | // data out in the format you may be expecting. Instead, we keep something |
| 63 | // "reasonable looking" so that the user can see how it's busted if |
| 64 | // displayed to them. |
| 65 | bool is_valid() const { |
| 66 | return is_valid_; |
| 67 | } |
| 68 | |
| 69 | // Returns true if the URL is zero-length. Note that empty URLs are also |
| 70 | // invalid, and is_valid() will return false for them. This is provided |
| 71 | // because some users may want to treat the empty case differently. |
| 72 | bool is_empty() const { |
| 73 | return spec_.empty(); |
| 74 | } |
| 75 | |
| 76 | // Returns the raw spec, i.e., the full text of the URL, in canonical UTF-8, |
| 77 | // if the URL is valid. If the URL is not valid, this will assert and return |
| 78 | // the empty string (for safety in release builds, to keep them from being |
| 79 | // misused which might be a security problem). |
| 80 | // |
| 81 | // The URL will be ASCII except the reference fragment, which may be UTF-8. |
| 82 | // It is guaranteed to be valid UTF-8. |
| 83 | // |
| 84 | // The exception is for empty() URLs (which are !is_valid()) but this will |
| 85 | // return the empty string without asserting. |
| 86 | // |
| 87 | // Used invalid_spec() below to get the unusable spec of an invalid URL. This |
| 88 | // separation is designed to prevent errors that may cause security problems |
| 89 | // that could result from the mistaken use of an invalid URL. |
| 90 | const std::string& spec() const; |
| 91 | |
| 92 | // Returns the potentially invalid spec for a the URL. This spec MUST NOT be |
| 93 | // modified or sent over the network. It is designed to be displayed in error |
| 94 | // messages to the user, as the apperance of the spec may explain the error. |
| 95 | // If the spec is valid, the valid spec will be returned. |
| 96 | // |
| 97 | // The returned string is guaranteed to be valid UTF-8. |
| 98 | const std::string& possibly_invalid_spec() const { |
| 99 | return spec_; |
| 100 | } |
| 101 | |
| 102 | // Getter for the raw parsed structure. This allows callers to locate parts |
| 103 | // of the URL within the spec themselves. Most callers should consider using |
| 104 | // the individual component getters below. |
| 105 | // |
| 106 | // The returned parsed structure will reference into the raw spec, which may |
| 107 | // or may not be valid. If you are using this to index into the spec, BE |
| 108 | // SURE YOU ARE USING possibly_invalid_spec() to get the spec, and that you |
| 109 | // don't do anything "important" with invalid specs. |
| 110 | const url::Parsed& parsed_for_possibly_invalid_spec() const { |
| 111 | return parsed_; |
| 112 | } |
| 113 | |
| 114 | // Defiant equality operator! |
James Robinson | d2015d9 | 2014-12-08 13:45:40 -0800 | [diff] [blame] | 115 | bool operator==(const GURL& other) const; |
| 116 | bool operator!=(const GURL& other) const; |
James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 117 | |
| 118 | // Allows GURL to used as a key in STL (for example, a std::set or std::map). |
James Robinson | d2015d9 | 2014-12-08 13:45:40 -0800 | [diff] [blame] | 119 | bool operator<(const GURL& other) const; |
| 120 | bool operator>(const GURL& other) const; |
James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 121 | |
| 122 | // Resolves a URL that's possibly relative to this object's URL, and returns |
| 123 | // it. Absolute URLs are also handled according to the rules of URLs on web |
| 124 | // pages. |
| 125 | // |
| 126 | // It may be impossible to resolve the URLs properly. If the input is not |
| 127 | // "standard" (SchemeIsStandard() == false) and the input looks relative, we |
| 128 | // can't resolve it. In these cases, the result will be an empty, invalid |
| 129 | // GURL. |
| 130 | // |
| 131 | // The result may also be a nonempty, invalid URL if the input has some kind |
| 132 | // of encoding error. In these cases, we will try to construct a "good" URL |
| 133 | // that may have meaning to the user, but it will be marked invalid. |
| 134 | // |
| 135 | // It is an error to resolve a URL relative to an invalid URL. The result |
| 136 | // will be the empty URL. |
| 137 | GURL Resolve(const std::string& relative) const; |
| 138 | GURL Resolve(const base::string16& relative) const; |
| 139 | |
| 140 | // Like Resolve() above but takes a character set encoder which will be used |
| 141 | // for any query text specified in the input. The charset converter parameter |
| 142 | // may be NULL, in which case it will be treated as UTF-8. |
| 143 | // |
| 144 | // TODO(brettw): These should be replaced with versions that take something |
| 145 | // more friendly than a raw CharsetConverter (maybe like an ICU character set |
| 146 | // name). |
| 147 | GURL ResolveWithCharsetConverter( |
| 148 | const std::string& relative, |
| 149 | url::CharsetConverter* charset_converter) const; |
| 150 | GURL ResolveWithCharsetConverter( |
| 151 | const base::string16& relative, |
| 152 | url::CharsetConverter* charset_converter) const; |
| 153 | |
| 154 | // Creates a new GURL by replacing the current URL's components with the |
| 155 | // supplied versions. See the Replacements class in url_canon.h for more. |
| 156 | // |
| 157 | // These are not particularly quick, so avoid doing mutations when possible. |
| 158 | // Prefer the 8-bit version when possible. |
| 159 | // |
| 160 | // It is an error to replace components of an invalid URL. The result will |
| 161 | // be the empty URL. |
| 162 | // |
| 163 | // Note that we use the more general url::Replacements type to give |
| 164 | // callers extra flexibility rather than our override. |
| 165 | GURL ReplaceComponents(const url::Replacements<char>& replacements) const; |
| 166 | GURL ReplaceComponents( |
| 167 | const url::Replacements<base::char16>& replacements) const; |
| 168 | |
| 169 | // A helper function that is equivalent to replacing the path with a slash |
| 170 | // and clearing out everything after that. We sometimes need to know just the |
| 171 | // scheme and the authority. If this URL is not a standard URL (it doesn't |
| 172 | // have the regular authority and path sections), then the result will be |
| 173 | // an empty, invalid GURL. Note that this *does* work for file: URLs, which |
| 174 | // some callers may want to filter out before calling this. |
| 175 | // |
| 176 | // It is an error to get an empty path on an invalid URL. The result |
| 177 | // will be the empty URL. |
| 178 | GURL GetWithEmptyPath() const; |
| 179 | |
| 180 | // A helper function to return a GURL containing just the scheme, host, |
| 181 | // and port from a URL. Equivalent to clearing any username and password, |
| 182 | // replacing the path with a slash, and clearing everything after that. If |
| 183 | // this URL is not a standard URL, then the result will be an empty, |
| 184 | // invalid GURL. If the URL has neither username nor password, this |
| 185 | // degenerates to GetWithEmptyPath(). |
| 186 | // |
| 187 | // It is an error to get the origin of an invalid URL. The result |
| 188 | // will be the empty URL. |
| 189 | GURL GetOrigin() const; |
| 190 | |
| 191 | // A helper function to return a GURL stripped from the elements that are not |
| 192 | // supposed to be sent as HTTP referrer: username, password and ref fragment. |
James Robinson | 6a64b81 | 2014-12-03 13:38:42 -0800 | [diff] [blame] | 193 | // For invalid URLs or URLs that no valid referrers, an empty URL will be |
| 194 | // returned. |
James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 195 | GURL GetAsReferrer() const; |
| 196 | |
| 197 | // Returns true if the scheme for the current URL is a known "standard" |
| 198 | // scheme. Standard schemes have an authority and a path section. This |
| 199 | // includes file: and filesystem:, which some callers may want to filter out |
| 200 | // explicitly by calling SchemeIsFile[System]. |
| 201 | bool IsStandard() const; |
| 202 | |
| 203 | // Returns true if the given parameter (should be lower-case ASCII to match |
| 204 | // the canonicalized scheme) is the scheme for this URL. This call is more |
| 205 | // efficient than getting the scheme and comparing it because no copies or |
| 206 | // object constructions are done. |
| 207 | bool SchemeIs(const char* lower_ascii_scheme) const; |
| 208 | |
| 209 | // Returns true if the scheme is "http" or "https". |
| 210 | bool SchemeIsHTTPOrHTTPS() const; |
| 211 | |
| 212 | // Returns true is the scheme is "ws" or "wss". |
| 213 | bool SchemeIsWSOrWSS() const; |
| 214 | |
| 215 | // We often need to know if this is a file URL. File URLs are "standard", but |
| 216 | // are often treated separately by some programs. |
| 217 | bool SchemeIsFile() const { |
| 218 | return SchemeIs(url::kFileScheme); |
| 219 | } |
| 220 | |
| 221 | // FileSystem URLs need to be treated differently in some cases. |
| 222 | bool SchemeIsFileSystem() const { |
| 223 | return SchemeIs(url::kFileSystemScheme); |
| 224 | } |
| 225 | |
| 226 | // If the scheme indicates a secure connection |
| 227 | bool SchemeIsSecure() const { |
| 228 | return SchemeIs(url::kHttpsScheme) || SchemeIs(url::kWssScheme) || |
| 229 | (SchemeIsFileSystem() && inner_url() && inner_url()->SchemeIsSecure()); |
| 230 | } |
| 231 | |
| 232 | // Returns true if the scheme is "blob". |
| 233 | bool SchemeIsBlob() const { |
| 234 | return SchemeIs(url::kBlobScheme); |
| 235 | } |
| 236 | |
| 237 | // The "content" of the URL is everything after the scheme (skipping the |
| 238 | // scheme delimiting colon). It is an error to get the origin of an invalid |
| 239 | // URL. The result will be an empty string. |
| 240 | std::string GetContent() const; |
| 241 | |
| 242 | // Returns true if the hostname is an IP address. Note: this function isn't |
| 243 | // as cheap as a simple getter because it re-parses the hostname to verify. |
| 244 | // This currently identifies only IPv4 addresses (bug 822685). |
| 245 | bool HostIsIPAddress() const; |
| 246 | |
| 247 | // Getters for various components of the URL. The returned string will be |
| 248 | // empty if the component is empty or is not present. |
| 249 | std::string scheme() const { // Not including the colon. See also SchemeIs. |
| 250 | return ComponentString(parsed_.scheme); |
| 251 | } |
| 252 | std::string username() const { |
| 253 | return ComponentString(parsed_.username); |
| 254 | } |
| 255 | std::string password() const { |
| 256 | return ComponentString(parsed_.password); |
| 257 | } |
| 258 | // Note that this may be a hostname, an IPv4 address, or an IPv6 literal |
| 259 | // surrounded by square brackets, like "[2001:db8::1]". To exclude these |
| 260 | // brackets, use HostNoBrackets() below. |
| 261 | std::string host() const { |
| 262 | return ComponentString(parsed_.host); |
| 263 | } |
| 264 | std::string port() const { // Returns -1 if "default" |
| 265 | return ComponentString(parsed_.port); |
| 266 | } |
| 267 | std::string path() const { // Including first slash following host |
| 268 | return ComponentString(parsed_.path); |
| 269 | } |
| 270 | std::string query() const { // Stuff following '?' |
| 271 | return ComponentString(parsed_.query); |
| 272 | } |
| 273 | std::string ref() const { // Stuff following '#' |
| 274 | return ComponentString(parsed_.ref); |
| 275 | } |
| 276 | |
| 277 | // Existance querying. These functions will return true if the corresponding |
| 278 | // URL component exists in this URL. Note that existance is different than |
| 279 | // being nonempty. http://www.google.com/? has a query that just happens to |
| 280 | // be empty, and has_query() will return true. |
| 281 | bool has_scheme() const { |
| 282 | return parsed_.scheme.len >= 0; |
| 283 | } |
| 284 | bool has_username() const { |
| 285 | return parsed_.username.len >= 0; |
| 286 | } |
| 287 | bool has_password() const { |
| 288 | return parsed_.password.len >= 0; |
| 289 | } |
| 290 | bool has_host() const { |
| 291 | // Note that hosts are special, absense of host means length 0. |
| 292 | return parsed_.host.len > 0; |
| 293 | } |
| 294 | bool has_port() const { |
| 295 | return parsed_.port.len >= 0; |
| 296 | } |
| 297 | bool has_path() const { |
| 298 | // Note that http://www.google.com/" has a path, the path is "/". This can |
| 299 | // return false only for invalid or nonstandard URLs. |
| 300 | return parsed_.path.len >= 0; |
| 301 | } |
| 302 | bool has_query() const { |
| 303 | return parsed_.query.len >= 0; |
| 304 | } |
| 305 | bool has_ref() const { |
| 306 | return parsed_.ref.len >= 0; |
| 307 | } |
| 308 | |
| 309 | // Returns a parsed version of the port. Can also be any of the special |
| 310 | // values defined in Parsed for ExtractPort. |
| 311 | int IntPort() const; |
| 312 | |
| 313 | // Returns the port number of the url, or the default port number. |
| 314 | // If the scheme has no concept of port (or unknown default) returns |
| 315 | // PORT_UNSPECIFIED. |
| 316 | int EffectiveIntPort() const; |
| 317 | |
| 318 | // Extracts the filename portion of the path and returns it. The filename |
| 319 | // is everything after the last slash in the path. This may be empty. |
| 320 | std::string ExtractFileName() const; |
| 321 | |
| 322 | // Returns the path that should be sent to the server. This is the path, |
| 323 | // parameter, and query portions of the URL. It is guaranteed to be ASCII. |
| 324 | std::string PathForRequest() const; |
| 325 | |
| 326 | // Returns the host, excluding the square brackets surrounding IPv6 address |
| 327 | // literals. This can be useful for passing to getaddrinfo(). |
| 328 | std::string HostNoBrackets() const; |
| 329 | |
| 330 | // Returns true if this URL's host matches or is in the same domain as |
| 331 | // the given input string. For example if this URL was "www.google.com", |
| 332 | // this would match "com", "google.com", and "www.google.com |
| 333 | // (input domain should be lower-case ASCII to match the canonicalized |
| 334 | // scheme). This call is more efficient than getting the host and check |
| 335 | // whether host has the specific domain or not because no copies or |
| 336 | // object constructions are done. |
| 337 | // |
| 338 | // If function DomainIs has parameter domain_len, which means the parameter |
| 339 | // lower_ascii_domain does not gurantee to terminate with NULL character. |
| 340 | bool DomainIs(const char* lower_ascii_domain, int domain_len) const; |
| 341 | |
| 342 | // If function DomainIs only has parameter lower_ascii_domain, which means |
| 343 | // domain string should be terminate with NULL character. |
| 344 | bool DomainIs(const char* lower_ascii_domain) const { |
| 345 | return DomainIs(lower_ascii_domain, |
| 346 | static_cast<int>(strlen(lower_ascii_domain))); |
| 347 | } |
| 348 | |
| 349 | // Swaps the contents of this GURL object with the argument without doing |
| 350 | // any memory allocations. |
| 351 | void Swap(GURL* other); |
| 352 | |
| 353 | // Returns a reference to a singleton empty GURL. This object is for callers |
| 354 | // who return references but don't have anything to return in some cases. |
| 355 | // This function may be called from any thread. |
| 356 | static const GURL& EmptyGURL(); |
| 357 | |
| 358 | // Returns the inner URL of a nested URL [currently only non-null for |
| 359 | // filesystem: URLs]. |
| 360 | const GURL* inner_url() const { |
| 361 | return inner_url_.get(); |
| 362 | } |
| 363 | |
| 364 | private: |
| 365 | // Variant of the string parsing constructor that allows the caller to elect |
| 366 | // retain trailing whitespace, if any, on the passed URL spec but only if the |
| 367 | // scheme is one that allows trailing whitespace. The primary use-case is |
| 368 | // for data: URLs. In most cases, you want to use the single parameter |
| 369 | // constructor above. |
| 370 | enum RetainWhiteSpaceSelector { RETAIN_TRAILING_PATH_WHITEPACE }; |
| 371 | GURL(const std::string& url_string, RetainWhiteSpaceSelector); |
| 372 | |
| 373 | template<typename STR> |
| 374 | void InitCanonical(const STR& input_spec, bool trim_path_end); |
| 375 | |
| 376 | void InitializeFromCanonicalSpec(); |
| 377 | |
| 378 | // Returns the substring of the input identified by the given component. |
| 379 | std::string ComponentString(const url::Component& comp) const { |
| 380 | if (comp.len <= 0) |
| 381 | return std::string(); |
| 382 | return std::string(spec_, comp.begin, comp.len); |
| 383 | } |
| 384 | |
| 385 | // The actual text of the URL, in canonical ASCII form. |
| 386 | std::string spec_; |
| 387 | |
| 388 | // Set when the given URL is valid. Otherwise, we may still have a spec and |
| 389 | // components, but they may not identify valid resources (for example, an |
| 390 | // invalid port number, invalid characters in the scheme, etc.). |
| 391 | bool is_valid_; |
| 392 | |
| 393 | // Identified components of the canonical spec. |
| 394 | url::Parsed parsed_; |
| 395 | |
| 396 | // Used for nested schemes [currently only filesystem:]. |
| 397 | scoped_ptr<GURL> inner_url_; |
| 398 | |
| 399 | // TODO bug 684583: Add encoding for query params. |
| 400 | }; |
| 401 | |
| 402 | // Stream operator so GURL can be used in assertion statements. |
| 403 | URL_EXPORT std::ostream& operator<<(std::ostream& out, const GURL& url); |
| 404 | |
| 405 | #endif // URL_GURL_H_ |