|  | /* Based on nsURLParsers.cc from Mozilla | 
|  | * ------------------------------------- | 
|  | * The contents of this file are subject to the Mozilla Public License Version | 
|  | * 1.1 (the "License"); you may not use this file except in compliance with | 
|  | * the License. You may obtain a copy of the License at | 
|  | * http://www.mozilla.org/MPL/ | 
|  | * | 
|  | * Software distributed under the License is distributed on an "AS IS" basis, | 
|  | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | 
|  | * for the specific language governing rights and limitations under the | 
|  | * License. | 
|  | * | 
|  | * The Original Code is mozilla.org code. | 
|  | * | 
|  | * The Initial Developer of the Original Code is | 
|  | * Netscape Communications Corporation. | 
|  | * Portions created by the Initial Developer are Copyright (C) 1998 | 
|  | * the Initial Developer. All Rights Reserved. | 
|  | * | 
|  | * Contributor(s): | 
|  | *   Darin Fisher (original author) | 
|  | * | 
|  | * Alternatively, the contents of this file may be used under the terms of | 
|  | * either the GNU General Public License Version 2 or later (the "GPL"), or | 
|  | * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), | 
|  | * in which case the provisions of the GPL or the LGPL are applicable instead | 
|  | * of those above. If you wish to allow use of your version of this file only | 
|  | * under the terms of either the GPL or the LGPL, and not to allow others to | 
|  | * use your version of this file under the terms of the MPL, indicate your | 
|  | * decision by deleting the provisions above and replace them with the notice | 
|  | * and other provisions required by the GPL or the LGPL. If you do not delete | 
|  | * the provisions above, a recipient may use your version of this file under | 
|  | * the terms of any one of the MPL, the GPL or the LGPL. | 
|  | * | 
|  | * ***** END LICENSE BLOCK ***** */ | 
|  |  | 
|  | #include "url/third_party/mozilla/url_parse.h" | 
|  |  | 
|  | #include <stdlib.h> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "url/url_parse_internal.h" | 
|  | #include "url/url_util.h" | 
|  | #include "url/url_util_internal.h" | 
|  |  | 
|  | namespace url { | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Returns true if the given character is a valid digit to use in a port. | 
|  | inline bool IsPortDigit(base::char16 ch) { | 
|  | return ch >= '0' && ch <= '9'; | 
|  | } | 
|  |  | 
// Scans |spec| forward from |start_offset| and returns the index of the first
// character for which IsAuthorityTerminator() is true. When no such character
// exists before |spec_len|, the result is |spec_len|.
template<typename CHAR>
int FindNextAuthorityTerminator(const CHAR* spec,
                                int start_offset,
                                int spec_len) {
  int pos = start_offset;
  while (pos < spec_len && !IsAuthorityTerminator(spec[pos]))
    ++pos;

  // Either the scan stopped on a terminator, or nothing was found (which also
  // covers a start offset that was already at or past the end).
  return pos < spec_len ? pos : spec_len;
}
|  |  | 
|  | template<typename CHAR> | 
|  | void ParseUserInfo(const CHAR* spec, | 
|  | const Component& user, | 
|  | Component* username, | 
|  | Component* password) { | 
|  | // Find the first colon in the user section, which separates the username and | 
|  | // password. | 
|  | int colon_offset = 0; | 
|  | while (colon_offset < user.len && spec[user.begin + colon_offset] != ':') | 
|  | colon_offset++; | 
|  |  | 
|  | if (colon_offset < user.len) { | 
|  | // Found separator: <username>:<password> | 
|  | *username = Component(user.begin, colon_offset); | 
|  | *password = MakeRange(user.begin + colon_offset + 1, | 
|  | user.begin + user.len); | 
|  | } else { | 
|  | // No separator, treat everything as the username | 
|  | *username = user; | 
|  | *password = Component(); | 
|  | } | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | void ParseServerInfo(const CHAR* spec, | 
|  | const Component& serverinfo, | 
|  | Component* hostname, | 
|  | Component* port_num) { | 
|  | if (serverinfo.len == 0) { | 
|  | // No server info, host name is empty. | 
|  | hostname->reset(); | 
|  | port_num->reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // If the host starts with a left-bracket, assume the entire host is an | 
|  | // IPv6 literal.  Otherwise, assume none of the host is an IPv6 literal. | 
|  | // This assumption will be overridden if we find a right-bracket. | 
|  | // | 
|  | // Our IPv6 address canonicalization code requires both brackets to exist, | 
|  | // but the ability to locate an incomplete address can still be useful. | 
|  | int ipv6_terminator = spec[serverinfo.begin] == '[' ? serverinfo.end() : -1; | 
|  | int colon = -1; | 
|  |  | 
|  | // Find the last right-bracket, and the last colon. | 
|  | for (int i = serverinfo.begin; i < serverinfo.end(); i++) { | 
|  | switch (spec[i]) { | 
|  | case ']': | 
|  | ipv6_terminator = i; | 
|  | break; | 
|  | case ':': | 
|  | colon = i; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (colon > ipv6_terminator) { | 
|  | // Found a port number: <hostname>:<port> | 
|  | *hostname = MakeRange(serverinfo.begin, colon); | 
|  | if (hostname->len == 0) | 
|  | hostname->reset(); | 
|  | *port_num = MakeRange(colon + 1, serverinfo.end()); | 
|  | } else { | 
|  | // No port: <hostname> | 
|  | *hostname = serverinfo; | 
|  | port_num->reset(); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Given an already-identified auth section, breaks it into its consituent | 
|  | // parts. The port number will be parsed and the resulting integer will be | 
|  | // filled into the given *port variable, or -1 if there is no port number or it | 
|  | // is invalid. | 
|  | template<typename CHAR> | 
|  | void DoParseAuthority(const CHAR* spec, | 
|  | const Component& auth, | 
|  | Component* username, | 
|  | Component* password, | 
|  | Component* hostname, | 
|  | Component* port_num) { | 
|  | DCHECK(auth.is_valid()) << "We should always get an authority"; | 
|  | if (auth.len == 0) { | 
|  | username->reset(); | 
|  | password->reset(); | 
|  | hostname->reset(); | 
|  | port_num->reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Search backwards for @, which is the separator between the user info and | 
|  | // the server info. | 
|  | int i = auth.begin + auth.len - 1; | 
|  | while (i > auth.begin && spec[i] != '@') | 
|  | i--; | 
|  |  | 
|  | if (spec[i] == '@') { | 
|  | // Found user info: <user-info>@<server-info> | 
|  | ParseUserInfo(spec, Component(auth.begin, i - auth.begin), | 
|  | username, password); | 
|  | ParseServerInfo(spec, MakeRange(i + 1, auth.begin + auth.len), | 
|  | hostname, port_num); | 
|  | } else { | 
|  | // No user info, everything is server info. | 
|  | username->reset(); | 
|  | password->reset(); | 
|  | ParseServerInfo(spec, auth, hostname, port_num); | 
|  | } | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | void ParsePath(const CHAR* spec, | 
|  | const Component& path, | 
|  | Component* filepath, | 
|  | Component* query, | 
|  | Component* ref) { | 
|  | // path = [/]<segment1>/<segment2>/<...>/<segmentN>;<param>?<query>#<ref> | 
|  |  | 
|  | // Special case when there is no path. | 
|  | if (path.len == -1) { | 
|  | filepath->reset(); | 
|  | query->reset(); | 
|  | ref->reset(); | 
|  | return; | 
|  | } | 
|  | DCHECK(path.len > 0) << "We should never have 0 length paths"; | 
|  |  | 
|  | // Search for first occurrence of either ? or #. | 
|  | int path_end = path.begin + path.len; | 
|  |  | 
|  | int query_separator = -1;  // Index of the '?' | 
|  | int ref_separator = -1;    // Index of the '#' | 
|  | for (int i = path.begin; i < path_end; i++) { | 
|  | switch (spec[i]) { | 
|  | case '?': | 
|  | // Only match the query string if it precedes the reference fragment | 
|  | // and when we haven't found one already. | 
|  | if (ref_separator < 0 && query_separator < 0) | 
|  | query_separator = i; | 
|  | break; | 
|  | case '#': | 
|  | // Record the first # sign only. | 
|  | if (ref_separator < 0) | 
|  | ref_separator = i; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Markers pointing to the character after each of these corresponding | 
|  | // components. The code below words from the end back to the beginning, | 
|  | // and will update these indices as it finds components that exist. | 
|  | int file_end, query_end; | 
|  |  | 
|  | // Ref fragment: from the # to the end of the path. | 
|  | if (ref_separator >= 0) { | 
|  | file_end = query_end = ref_separator; | 
|  | *ref = MakeRange(ref_separator + 1, path_end); | 
|  | } else { | 
|  | file_end = query_end = path_end; | 
|  | ref->reset(); | 
|  | } | 
|  |  | 
|  | // Query fragment: everything from the ? to the next boundary (either the end | 
|  | // of the path or the ref fragment). | 
|  | if (query_separator >= 0) { | 
|  | file_end = query_separator; | 
|  | *query = MakeRange(query_separator + 1, query_end); | 
|  | } else { | 
|  | query->reset(); | 
|  | } | 
|  |  | 
|  | // File path: treat an empty file path as no file path. | 
|  | if (file_end != path.begin) | 
|  | *filepath = MakeRange(path.begin, file_end); | 
|  | else | 
|  | filepath->reset(); | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | bool DoExtractScheme(const CHAR* url, | 
|  | int url_len, | 
|  | Component* scheme) { | 
|  | // Skip leading whitespace and control characters. | 
|  | int begin = 0; | 
|  | while (begin < url_len && ShouldTrimFromURL(url[begin])) | 
|  | begin++; | 
|  | if (begin == url_len) | 
|  | return false;  // Input is empty or all whitespace. | 
|  |  | 
|  | // Find the first colon character. | 
|  | for (int i = begin; i < url_len; i++) { | 
|  | if (url[i] == ':') { | 
|  | *scheme = MakeRange(begin, i); | 
|  | return true; | 
|  | } | 
|  | } | 
|  | return false;  // No colon found: no scheme | 
|  | } | 
|  |  | 
|  | // Fills in all members of the Parsed structure except for the scheme. | 
|  | // | 
|  | // |spec| is the full spec being parsed, of length |spec_len|. | 
|  | // |after_scheme| is the character immediately following the scheme (after the | 
|  | //   colon) where we'll begin parsing. | 
|  | // | 
|  | // Compatability data points. I list "host", "path" extracted: | 
|  | // Input                IE6             Firefox                Us | 
|  | // -----                --------------  --------------         -------------- | 
|  | // http://foo.com/      "foo.com", "/"  "foo.com", "/"         "foo.com", "/" | 
|  | // http:foo.com/        "foo.com", "/"  "foo.com", "/"         "foo.com", "/" | 
|  | // http:/foo.com/       fail(*)         "foo.com", "/"         "foo.com", "/" | 
|  | // http:\foo.com/       fail(*)         "\foo.com", "/"(fail)  "foo.com", "/" | 
|  | // http:////foo.com/    "foo.com", "/"  "foo.com", "/"         "foo.com", "/" | 
|  | // | 
|  | // (*) Interestingly, although IE fails to load these URLs, its history | 
|  | // canonicalizer handles them, meaning if you've been to the corresponding | 
|  | // "http://foo.com/" link, it will be colored. | 
|  | template <typename CHAR> | 
|  | void DoParseAfterScheme(const CHAR* spec, | 
|  | int spec_len, | 
|  | int after_scheme, | 
|  | Parsed* parsed) { | 
|  | int num_slashes = CountConsecutiveSlashes(spec, after_scheme, spec_len); | 
|  | int after_slashes = after_scheme + num_slashes; | 
|  |  | 
|  | // First split into two main parts, the authority (username, password, host, | 
|  | // and port) and the full path (path, query, and reference). | 
|  | Component authority; | 
|  | Component full_path; | 
|  |  | 
|  | // Found "//<some data>", looks like an authority section. Treat everything | 
|  | // from there to the next slash (or end of spec) to be the authority. Note | 
|  | // that we ignore the number of slashes and treat it as the authority. | 
|  | int end_auth = FindNextAuthorityTerminator(spec, after_slashes, spec_len); | 
|  | authority = Component(after_slashes, end_auth - after_slashes); | 
|  |  | 
|  | if (end_auth == spec_len)  // No beginning of path found. | 
|  | full_path = Component(); | 
|  | else  // Everything starting from the slash to the end is the path. | 
|  | full_path = Component(end_auth, spec_len - end_auth); | 
|  |  | 
|  | // Now parse those two sub-parts. | 
|  | DoParseAuthority(spec, authority, &parsed->username, &parsed->password, | 
|  | &parsed->host, &parsed->port); | 
|  | ParsePath(spec, full_path, &parsed->path, &parsed->query, &parsed->ref); | 
|  | } | 
|  |  | 
|  | // The main parsing function for standard URLs. Standard URLs have a scheme, | 
|  | // host, path, etc. | 
|  | template<typename CHAR> | 
|  | void DoParseStandardURL(const CHAR* spec, int spec_len, Parsed* parsed) { | 
|  | DCHECK(spec_len >= 0); | 
|  |  | 
|  | // Strip leading & trailing spaces and control characters. | 
|  | int begin = 0; | 
|  | TrimURL(spec, &begin, &spec_len); | 
|  |  | 
|  | int after_scheme; | 
|  | if (DoExtractScheme(spec, spec_len, &parsed->scheme)) { | 
|  | after_scheme = parsed->scheme.end() + 1;  // Skip past the colon. | 
|  | } else { | 
|  | // Say there's no scheme when there is no colon. We could also say that | 
|  | // everything is the scheme. Both would produce an invalid URL, but this way | 
|  | // seems less wrong in more cases. | 
|  | parsed->scheme.reset(); | 
|  | after_scheme = begin; | 
|  | } | 
|  | DoParseAfterScheme(spec, spec_len, after_scheme, parsed); | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | void DoParseFileSystemURL(const CHAR* spec, int spec_len, Parsed* parsed) { | 
|  | DCHECK(spec_len >= 0); | 
|  |  | 
|  | // Get the unused parts of the URL out of the way. | 
|  | parsed->username.reset(); | 
|  | parsed->password.reset(); | 
|  | parsed->host.reset(); | 
|  | parsed->port.reset(); | 
|  | parsed->path.reset();   // May use this; reset for convenience. | 
|  | parsed->ref.reset();    // May use this; reset for convenience. | 
|  | parsed->query.reset();  // May use this; reset for convenience. | 
|  | parsed->clear_inner_parsed();  // May use this; reset for convenience. | 
|  |  | 
|  | // Strip leading & trailing spaces and control characters. | 
|  | int begin = 0; | 
|  | TrimURL(spec, &begin, &spec_len); | 
|  |  | 
|  | // Handle empty specs or ones that contain only whitespace or control chars. | 
|  | if (begin == spec_len) { | 
|  | parsed->scheme.reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | int inner_start = -1; | 
|  |  | 
|  | // Extract the scheme.  We also handle the case where there is no scheme. | 
|  | if (DoExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { | 
|  | // Offset the results since we gave ExtractScheme a substring. | 
|  | parsed->scheme.begin += begin; | 
|  |  | 
|  | if (parsed->scheme.end() == spec_len - 1) | 
|  | return; | 
|  |  | 
|  | inner_start = parsed->scheme.end() + 1; | 
|  | } else { | 
|  | // No scheme found; that's not valid for filesystem URLs. | 
|  | parsed->scheme.reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | Component inner_scheme; | 
|  | const CHAR* inner_spec = &spec[inner_start]; | 
|  | int inner_spec_len = spec_len - inner_start; | 
|  |  | 
|  | if (DoExtractScheme(inner_spec, inner_spec_len, &inner_scheme)) { | 
|  | // Offset the results since we gave ExtractScheme a substring. | 
|  | inner_scheme.begin += inner_start; | 
|  |  | 
|  | if (inner_scheme.end() == spec_len - 1) | 
|  | return; | 
|  | } else { | 
|  | // No scheme found; that's not valid for filesystem URLs. | 
|  | // The best we can do is return "filesystem://". | 
|  | return; | 
|  | } | 
|  |  | 
|  | Parsed inner_parsed; | 
|  |  | 
|  | if (CompareSchemeComponent(spec, inner_scheme, kFileScheme)) { | 
|  | // File URLs are special. | 
|  | ParseFileURL(inner_spec, inner_spec_len, &inner_parsed); | 
|  | } else if (CompareSchemeComponent(spec, inner_scheme, kFileSystemScheme)) { | 
|  | // Filesystem URLs don't nest. | 
|  | return; | 
|  | } else if (IsStandard(spec, inner_scheme)) { | 
|  | // All "normal" URLs. | 
|  | DoParseStandardURL(inner_spec, inner_spec_len, &inner_parsed); | 
|  | } else { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // All members of inner_parsed need to be offset by inner_start. | 
|  | // If we had any scheme that supported nesting more than one level deep, | 
|  | // we'd have to recurse into the inner_parsed's inner_parsed when | 
|  | // adjusting by inner_start. | 
|  | inner_parsed.scheme.begin += inner_start; | 
|  | inner_parsed.username.begin += inner_start; | 
|  | inner_parsed.password.begin += inner_start; | 
|  | inner_parsed.host.begin += inner_start; | 
|  | inner_parsed.port.begin += inner_start; | 
|  | inner_parsed.query.begin += inner_start; | 
|  | inner_parsed.ref.begin += inner_start; | 
|  | inner_parsed.path.begin += inner_start; | 
|  |  | 
|  | // Query and ref move from inner_parsed to parsed. | 
|  | parsed->query = inner_parsed.query; | 
|  | inner_parsed.query.reset(); | 
|  | parsed->ref = inner_parsed.ref; | 
|  | inner_parsed.ref.reset(); | 
|  |  | 
|  | parsed->set_inner_parsed(inner_parsed); | 
|  | if (!inner_parsed.scheme.is_valid() || !inner_parsed.path.is_valid() || | 
|  | inner_parsed.inner_parsed()) { | 
|  | return; | 
|  | } | 
|  |  | 
|  | // The path in inner_parsed should start with a slash, then have a filesystem | 
|  | // type followed by a slash.  From the first slash up to but excluding the | 
|  | // second should be what it keeps; the rest goes to parsed.  If the path ends | 
|  | // before the second slash, it's still pretty clear what the user meant, so | 
|  | // we'll let that through. | 
|  | if (!IsURLSlash(spec[inner_parsed.path.begin])) { | 
|  | return; | 
|  | } | 
|  | int inner_path_end = inner_parsed.path.begin + 1;  // skip the leading slash | 
|  | while (inner_path_end < spec_len && | 
|  | !IsURLSlash(spec[inner_path_end])) | 
|  | ++inner_path_end; | 
|  | parsed->path.begin = inner_path_end; | 
|  | int new_inner_path_length = inner_path_end - inner_parsed.path.begin; | 
|  | parsed->path.len = inner_parsed.path.len - new_inner_path_length; | 
|  | parsed->inner_parsed()->path.len = new_inner_path_length; | 
|  | } | 
|  |  | 
|  | // Initializes a path URL which is merely a scheme followed by a path. Examples | 
|  | // include "about:foo" and "javascript:alert('bar');" | 
|  | template<typename CHAR> | 
|  | void DoParsePathURL(const CHAR* spec, int spec_len, | 
|  | bool trim_path_end, | 
|  | Parsed* parsed) { | 
|  | // Get the non-path and non-scheme parts of the URL out of the way, we never | 
|  | // use them. | 
|  | parsed->username.reset(); | 
|  | parsed->password.reset(); | 
|  | parsed->host.reset(); | 
|  | parsed->port.reset(); | 
|  | parsed->path.reset(); | 
|  | parsed->query.reset(); | 
|  | parsed->ref.reset(); | 
|  |  | 
|  | // Strip leading & trailing spaces and control characters. | 
|  | int scheme_begin = 0; | 
|  | TrimURL(spec, &scheme_begin, &spec_len, trim_path_end); | 
|  |  | 
|  | // Handle empty specs or ones that contain only whitespace or control chars. | 
|  | if (scheme_begin == spec_len) { | 
|  | parsed->scheme.reset(); | 
|  | parsed->path.reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | int path_begin; | 
|  | // Extract the scheme, with the path being everything following. We also | 
|  | // handle the case where there is no scheme. | 
|  | if (ExtractScheme(&spec[scheme_begin], spec_len - scheme_begin, | 
|  | &parsed->scheme)) { | 
|  | // Offset the results since we gave ExtractScheme a substring. | 
|  | parsed->scheme.begin += scheme_begin; | 
|  | path_begin = parsed->scheme.end() + 1; | 
|  | } else { | 
|  | // No scheme case. | 
|  | parsed->scheme.reset(); | 
|  | path_begin = scheme_begin; | 
|  | } | 
|  |  | 
|  | if (path_begin == spec_len) | 
|  | return; | 
|  | DCHECK_LT(path_begin, spec_len); | 
|  |  | 
|  | ParsePath(spec, | 
|  | MakeRange(path_begin, spec_len), | 
|  | &parsed->path, | 
|  | &parsed->query, | 
|  | &parsed->ref); | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | void DoParseMailtoURL(const CHAR* spec, int spec_len, Parsed* parsed) { | 
|  | DCHECK(spec_len >= 0); | 
|  |  | 
|  | // Get the non-path and non-scheme parts of the URL out of the way, we never | 
|  | // use them. | 
|  | parsed->username.reset(); | 
|  | parsed->password.reset(); | 
|  | parsed->host.reset(); | 
|  | parsed->port.reset(); | 
|  | parsed->ref.reset(); | 
|  | parsed->query.reset();  // May use this; reset for convenience. | 
|  |  | 
|  | // Strip leading & trailing spaces and control characters. | 
|  | int begin = 0; | 
|  | TrimURL(spec, &begin, &spec_len); | 
|  |  | 
|  | // Handle empty specs or ones that contain only whitespace or control chars. | 
|  | if (begin == spec_len) { | 
|  | parsed->scheme.reset(); | 
|  | parsed->path.reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | int path_begin = -1; | 
|  | int path_end = -1; | 
|  |  | 
|  | // Extract the scheme, with the path being everything following. We also | 
|  | // handle the case where there is no scheme. | 
|  | if (ExtractScheme(&spec[begin], spec_len - begin, &parsed->scheme)) { | 
|  | // Offset the results since we gave ExtractScheme a substring. | 
|  | parsed->scheme.begin += begin; | 
|  |  | 
|  | if (parsed->scheme.end() != spec_len - 1) { | 
|  | path_begin = parsed->scheme.end() + 1; | 
|  | path_end = spec_len; | 
|  | } | 
|  | } else { | 
|  | // No scheme found, just path. | 
|  | parsed->scheme.reset(); | 
|  | path_begin = begin; | 
|  | path_end = spec_len; | 
|  | } | 
|  |  | 
|  | // Split [path_begin, path_end) into a path + query. | 
|  | for (int i = path_begin; i < path_end; ++i) { | 
|  | if (spec[i] == '?') { | 
|  | parsed->query = MakeRange(i + 1, path_end); | 
|  | path_end = i; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | // For compatability with the standard URL parser, treat no path as | 
|  | // -1, rather than having a length of 0 | 
|  | if (path_begin == path_end) { | 
|  | parsed->path.reset(); | 
|  | } else { | 
|  | parsed->path = MakeRange(path_begin, path_end); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Converts a port number in a string to an integer. We'd like to just call | 
|  | // sscanf but our input is not NULL-terminated, which sscanf requires. Instead, | 
|  | // we copy the digits to a small stack buffer (since we know the maximum number | 
|  | // of digits in a valid port number) that we can NULL terminate. | 
|  | template<typename CHAR> | 
|  | int DoParsePort(const CHAR* spec, const Component& component) { | 
|  | // Easy success case when there is no port. | 
|  | const int kMaxDigits = 5; | 
|  | if (!component.is_nonempty()) | 
|  | return PORT_UNSPECIFIED; | 
|  |  | 
|  | // Skip over any leading 0s. | 
|  | Component digits_comp(component.end(), 0); | 
|  | for (int i = 0; i < component.len; i++) { | 
|  | if (spec[component.begin + i] != '0') { | 
|  | digits_comp = MakeRange(component.begin + i, component.end()); | 
|  | break; | 
|  | } | 
|  | } | 
|  | if (digits_comp.len == 0) | 
|  | return 0;  // All digits were 0. | 
|  |  | 
|  | // Verify we don't have too many digits (we'll be copying to our buffer so | 
|  | // we need to double-check). | 
|  | if (digits_comp.len > kMaxDigits) | 
|  | return PORT_INVALID; | 
|  |  | 
|  | // Copy valid digits to the buffer. | 
|  | char digits[kMaxDigits + 1];  // +1 for null terminator | 
|  | for (int i = 0; i < digits_comp.len; i++) { | 
|  | CHAR ch = spec[digits_comp.begin + i]; | 
|  | if (!IsPortDigit(ch)) { | 
|  | // Invalid port digit, fail. | 
|  | return PORT_INVALID; | 
|  | } | 
|  | digits[i] = static_cast<char>(ch); | 
|  | } | 
|  |  | 
|  | // Null-terminate the string and convert to integer. Since we guarantee | 
|  | // only digits, atoi's lack of error handling is OK. | 
|  | digits[digits_comp.len] = 0; | 
|  | int port = atoi(digits); | 
|  | if (port > 65535) | 
|  | return PORT_INVALID;  // Out of range. | 
|  | return port; | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | void DoExtractFileName(const CHAR* spec, | 
|  | const Component& path, | 
|  | Component* file_name) { | 
|  | // Handle empty paths: they have no file names. | 
|  | if (!path.is_nonempty()) { | 
|  | file_name->reset(); | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Extract the filename range from the path which is between | 
|  | // the last slash and the following semicolon. | 
|  | int file_end = path.end(); | 
|  | for (int i = path.end() - 1; i >= path.begin; i--) { | 
|  | if (spec[i] == ';') { | 
|  | file_end = i; | 
|  | } else if (IsURLSlash(spec[i])) { | 
|  | // File name is everything following this character to the end | 
|  | *file_name = MakeRange(i + 1, file_end); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | // No slash found, this means the input was degenerate (generally paths | 
|  | // will start with a slash). Let's call everything the file name. | 
|  | *file_name = MakeRange(path.begin, file_end); | 
|  | return; | 
|  | } | 
|  |  | 
|  | template<typename CHAR> | 
|  | bool DoExtractQueryKeyValue(const CHAR* spec, | 
|  | Component* query, | 
|  | Component* key, | 
|  | Component* value) { | 
|  | if (!query->is_nonempty()) | 
|  | return false; | 
|  |  | 
|  | int start = query->begin; | 
|  | int cur = start; | 
|  | int end = query->end(); | 
|  |  | 
|  | // We assume the beginning of the input is the beginning of the "key" and we | 
|  | // skip to the end of it. | 
|  | key->begin = cur; | 
|  | while (cur < end && spec[cur] != '&' && spec[cur] != '=') | 
|  | cur++; | 
|  | key->len = cur - key->begin; | 
|  |  | 
|  | // Skip the separator after the key (if any). | 
|  | if (cur < end && spec[cur] == '=') | 
|  | cur++; | 
|  |  | 
|  | // Find the value part. | 
|  | value->begin = cur; | 
|  | while (cur < end && spec[cur] != '&') | 
|  | cur++; | 
|  | value->len = cur - value->begin; | 
|  |  | 
|  | // Finally skip the next separator if any | 
|  | if (cur < end && spec[cur] == '&') | 
|  | cur++; | 
|  |  | 
|  | // Save the new query | 
|  | *query = MakeRange(cur, end); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | Parsed::Parsed() : inner_parsed_(NULL) { | 
|  | } | 
|  |  | 
|  | Parsed::Parsed(const Parsed& other) : | 
|  | scheme(other.scheme), | 
|  | username(other.username), | 
|  | password(other.password), | 
|  | host(other.host), | 
|  | port(other.port), | 
|  | path(other.path), | 
|  | query(other.query), | 
|  | ref(other.ref), | 
|  | inner_parsed_(NULL) { | 
|  | if (other.inner_parsed_) | 
|  | set_inner_parsed(*other.inner_parsed_); | 
|  | } | 
|  |  | 
|  | Parsed& Parsed::operator=(const Parsed& other) { | 
|  | if (this != &other) { | 
|  | scheme = other.scheme; | 
|  | username = other.username; | 
|  | password = other.password; | 
|  | host = other.host; | 
|  | port = other.port; | 
|  | path = other.path; | 
|  | query = other.query; | 
|  | ref = other.ref; | 
|  | if (other.inner_parsed_) | 
|  | set_inner_parsed(*other.inner_parsed_); | 
|  | else | 
|  | clear_inner_parsed(); | 
|  | } | 
|  | return *this; | 
|  | } | 
|  |  | 
|  | Parsed::~Parsed() { | 
|  | delete inner_parsed_; | 
|  | } | 
|  |  | 
|  | int Parsed::Length() const { | 
|  | if (ref.is_valid()) | 
|  | return ref.end(); | 
|  | return CountCharactersBefore(REF, false); | 
|  | } | 
|  |  | 
|  | int Parsed::CountCharactersBefore(ComponentType type, | 
|  | bool include_delimiter) const { | 
|  | if (type == SCHEME) | 
|  | return scheme.begin; | 
|  |  | 
|  | // There will be some characters after the scheme like "://" and we don't | 
|  | // know how many. Search forwards for the next thing until we find one. | 
|  | int cur = 0; | 
|  | if (scheme.is_valid()) | 
|  | cur = scheme.end() + 1;  // Advance over the ':' at the end of the scheme. | 
|  |  | 
|  | if (username.is_valid()) { | 
|  | if (type <= USERNAME) | 
|  | return username.begin; | 
|  | cur = username.end() + 1;  // Advance over the '@' or ':' at the end. | 
|  | } | 
|  |  | 
|  | if (password.is_valid()) { | 
|  | if (type <= PASSWORD) | 
|  | return password.begin; | 
|  | cur = password.end() + 1;  // Advance over the '@' at the end. | 
|  | } | 
|  |  | 
|  | if (host.is_valid()) { | 
|  | if (type <= HOST) | 
|  | return host.begin; | 
|  | cur = host.end(); | 
|  | } | 
|  |  | 
|  | if (port.is_valid()) { | 
|  | if (type < PORT || (type == PORT && include_delimiter)) | 
|  | return port.begin - 1;  // Back over delimiter. | 
|  | if (type == PORT) | 
|  | return port.begin;  // Don't want delimiter counted. | 
|  | cur = port.end(); | 
|  | } | 
|  |  | 
|  | if (path.is_valid()) { | 
|  | if (type <= PATH) | 
|  | return path.begin; | 
|  | cur = path.end(); | 
|  | } | 
|  |  | 
|  | if (query.is_valid()) { | 
|  | if (type < QUERY || (type == QUERY && include_delimiter)) | 
|  | return query.begin - 1;  // Back over delimiter. | 
|  | if (type == QUERY) | 
|  | return query.begin;  // Don't want delimiter counted. | 
|  | cur = query.end(); | 
|  | } | 
|  |  | 
|  | if (ref.is_valid()) { | 
|  | if (type == REF && !include_delimiter) | 
|  | return ref.begin;  // Back over delimiter. | 
|  |  | 
|  | // When there is a ref and we get here, the component we wanted was before | 
|  | // this and not found, so we always know the beginning of the ref is right. | 
|  | return ref.begin - 1;  // Don't want delimiter counted. | 
|  | } | 
|  |  | 
|  | return cur; | 
|  | } | 
|  |  | 
|  | Component Parsed::GetContent() const { | 
|  | const int begin = CountCharactersBefore(USERNAME, false); | 
|  | const int len = Length() - begin; | 
|  | // For compatability with the standard URL parser, we treat no content as | 
|  | // -1, rather than having a length of 0 (we normally wouldn't care so | 
|  | // much for these non-standard URLs). | 
|  | return len ? Component(begin, len) : Component(); | 
|  | } | 
|  |  | 
|  | bool ExtractScheme(const char* url, int url_len, Component* scheme) { | 
|  | return DoExtractScheme(url, url_len, scheme); | 
|  | } | 
|  |  | 
|  | bool ExtractScheme(const base::char16* url, int url_len, Component* scheme) { | 
|  | return DoExtractScheme(url, url_len, scheme); | 
|  | } | 
|  |  | 
|  | // This handles everything that may be an authority terminator, including | 
|  | // backslash. For special backslash handling see DoParseAfterScheme. | 
|  | bool IsAuthorityTerminator(base::char16 ch) { | 
|  | return IsURLSlash(ch) || ch == '?' || ch == '#'; | 
|  | } | 
|  |  | 
|  | void ExtractFileName(const char* url, | 
|  | const Component& path, | 
|  | Component* file_name) { | 
|  | DoExtractFileName(url, path, file_name); | 
|  | } | 
|  |  | 
|  | void ExtractFileName(const base::char16* url, | 
|  | const Component& path, | 
|  | Component* file_name) { | 
|  | DoExtractFileName(url, path, file_name); | 
|  | } | 
|  |  | 
|  | bool ExtractQueryKeyValue(const char* url, | 
|  | Component* query, | 
|  | Component* key, | 
|  | Component* value) { | 
|  | return DoExtractQueryKeyValue(url, query, key, value); | 
|  | } | 
|  |  | 
|  | bool ExtractQueryKeyValue(const base::char16* url, | 
|  | Component* query, | 
|  | Component* key, | 
|  | Component* value) { | 
|  | return DoExtractQueryKeyValue(url, query, key, value); | 
|  | } | 
|  |  | 
|  | void ParseAuthority(const char* spec, | 
|  | const Component& auth, | 
|  | Component* username, | 
|  | Component* password, | 
|  | Component* hostname, | 
|  | Component* port_num) { | 
|  | DoParseAuthority(spec, auth, username, password, hostname, port_num); | 
|  | } | 
|  |  | 
|  | void ParseAuthority(const base::char16* spec, | 
|  | const Component& auth, | 
|  | Component* username, | 
|  | Component* password, | 
|  | Component* hostname, | 
|  | Component* port_num) { | 
|  | DoParseAuthority(spec, auth, username, password, hostname, port_num); | 
|  | } | 
|  |  | 
|  | int ParsePort(const char* url, const Component& port) { | 
|  | return DoParsePort(url, port); | 
|  | } | 
|  |  | 
|  | int ParsePort(const base::char16* url, const Component& port) { | 
|  | return DoParsePort(url, port); | 
|  | } | 
|  |  | 
|  | void ParseStandardURL(const char* url, int url_len, Parsed* parsed) { | 
|  | DoParseStandardURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParseStandardURL(const base::char16* url, int url_len, Parsed* parsed) { | 
|  | DoParseStandardURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParsePathURL(const char* url, | 
|  | int url_len, | 
|  | bool trim_path_end, | 
|  | Parsed* parsed) { | 
|  | DoParsePathURL(url, url_len, trim_path_end, parsed); | 
|  | } | 
|  |  | 
|  | void ParsePathURL(const base::char16* url, | 
|  | int url_len, | 
|  | bool trim_path_end, | 
|  | Parsed* parsed) { | 
|  | DoParsePathURL(url, url_len, trim_path_end, parsed); | 
|  | } | 
|  |  | 
|  | void ParseFileSystemURL(const char* url, int url_len, Parsed* parsed) { | 
|  | DoParseFileSystemURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParseFileSystemURL(const base::char16* url, int url_len, Parsed* parsed) { | 
|  | DoParseFileSystemURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParseMailtoURL(const char* url, int url_len, Parsed* parsed) { | 
|  | DoParseMailtoURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParseMailtoURL(const base::char16* url, int url_len, Parsed* parsed) { | 
|  | DoParseMailtoURL(url, url_len, parsed); | 
|  | } | 
|  |  | 
|  | void ParsePathInternal(const char* spec, | 
|  | const Component& path, | 
|  | Component* filepath, | 
|  | Component* query, | 
|  | Component* ref) { | 
|  | ParsePath(spec, path, filepath, query, ref); | 
|  | } | 
|  |  | 
|  | void ParsePathInternal(const base::char16* spec, | 
|  | const Component& path, | 
|  | Component* filepath, | 
|  | Component* query, | 
|  | Component* ref) { | 
|  | ParsePath(spec, path, filepath, query, ref); | 
|  | } | 
|  |  | 
|  | void ParseAfterScheme(const char* spec, | 
|  | int spec_len, | 
|  | int after_scheme, | 
|  | Parsed* parsed) { | 
|  | DoParseAfterScheme(spec, spec_len, after_scheme, parsed); | 
|  | } | 
|  |  | 
|  | void ParseAfterScheme(const base::char16* spec, | 
|  | int spec_len, | 
|  | int after_scheme, | 
|  | Parsed* parsed) { | 
|  | DoParseAfterScheme(spec, spec_len, after_scheme, parsed); | 
|  | } | 
|  |  | 
|  | }  // namespace url |