Blame - url/url_canon_path.cc - mojo-tools

blob: ceff689631990f0a50f0e5d3509cebcd337f46a4 [file] [log] [blame]

James Robinson	646469d	2014-10-03 15:33:28 -0700	[diff] [blame]	1	// Copyright 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "base/logging.h"
				6	#include "url/url_canon.h"
				7	#include "url/url_canon_internal.h"
				8	#include "url/url_parse_internal.h"
				9
				10	namespace url {
				11
				12	namespace {
				13
				14	enum CharacterFlags {
				15	// Pass through unchanged, whether escaped or unescaped. This doesn't
				16	// actually set anything so you can't OR it to check, it's just to make the
				17	// table below more clear when neither ESCAPE or UNESCAPE is set.
				18	PASS = 0,
				19
				20	// This character requires special handling in DoPartialPath. Doing this test
				21	// first allows us to filter out the common cases of regular characters that
				22	// can be directly copied.
				23	SPECIAL = 1,
				24
				25	// This character must be escaped in the canonical output. Note that all
				26	// escaped chars also have the "special" bit set so that the code that looks
				27	// for this is triggered. Not valid with PASS or ESCAPE
				28	ESCAPE_BIT = 2,
				29	ESCAPE = ESCAPE_BIT \| SPECIAL,
				30
				31	// This character must be unescaped in canonical output. Not valid with
				32	// ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
				33	// characters unescaped, they should just be copied.
				34	UNESCAPE = 4,
				35
				36	// This character is disallowed in URLs. Note that the "special" bit is also
				37	// set to trigger handling.
				38	INVALID_BIT = 8,
				39	INVALID = INVALID_BIT \| SPECIAL,
				40	};
				41
				42	// This table contains one of the above flag values. Note some flags are more
				43	// than one bits because they also turn on the "special" flag. Special is the
				44	// only flag that may be combined with others.
				45	//
				46	// This table is designed to match exactly what IE does with the characters.
				47	//
				48	// Dot is even more special, and the escaped version is handled specially by
				49	// IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
				50	// bit is never handled (we just need the "special") bit.
				51	const unsigned char kPathCharLookup[0x100] = {
				52	// NULL control chars...
				53	INVALID, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				54	// control chars...
				55	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				56	// ' ' ! " # $ % & ' ( ) * + , - . /
				57	ESCAPE, PASS, ESCAPE, ESCAPE, PASS, ESCAPE, PASS, PASS, PASS, PASS, PASS, PASS, PASS, UNESCAPE,SPECIAL, PASS,
				58	// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
				59	UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, PASS, ESCAPE, PASS, ESCAPE, ESCAPE,
				60	// @ A B C D E F G H I J K L M N O
				61	PASS, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
				62	// P Q R S T U V W X Y Z [ \ ] ^ _
				63	UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,PASS, ESCAPE, PASS, ESCAPE, UNESCAPE,
				64	// ` a b c d e f g h i j k l m n o
				65	ESCAPE, UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,
				66	// p q r s t u v w x y z { \| } ~ <NBSP>
				67	UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,UNESCAPE,ESCAPE, ESCAPE, ESCAPE, UNESCAPE,ESCAPE,
				68	// ...all the high-bit characters are escaped
				69	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				70	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				71	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				72	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				73	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				74	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				75	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE,
				76	ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE, ESCAPE};
				77
				78	enum DotDisposition {
				79	// The given dot is just part of a filename and is not special.
				80	NOT_A_DIRECTORY,
				81
				82	// The given dot is the current directory.
				83	DIRECTORY_CUR,
				84
				85	// The given dot is the first of a double dot that should take us up one.
				86	DIRECTORY_UP
				87	};
				88
				89	// When the path resolver finds a dot, this function is called with the
				90	// character following that dot to see what it is. The return value
				91	// indicates what type this dot is (see above). This code handles the case
				92	// where the dot is at the end of the input.
				93	//
				94	// \|*consumed_len\| will contain the number of characters in the input that
				95	// express what we found.
				96	//
				97	// If the input is "../foo", \|after_dot\| = 1, \|end\| = 6, and
				98	// at the end, \|*consumed_len\| = 2 for the "./" this function consumed. The
				99	// original dot length should be handled by the caller.
				100	template<typename CHAR>
				101	DotDisposition ClassifyAfterDot(const CHAR* spec, int after_dot,
				102	int end, int* consumed_len) {
				103	if (after_dot == end) {
				104	// Single dot at the end.
				105	*consumed_len = 0;
				106	return DIRECTORY_CUR;
				107	}
				108	if (IsURLSlash(spec[after_dot])) {
				109	// Single dot followed by a slash.
				110	*consumed_len = 1; // Consume the slash
				111	return DIRECTORY_CUR;
				112	}
				113
				114	int second_dot_len = IsDot(spec, after_dot, end);
				115	if (second_dot_len) {
				116	int after_second_dot = after_dot + second_dot_len;
				117	if (after_second_dot == end) {
				118	// Double dot at the end.
				119	*consumed_len = second_dot_len;
				120	return DIRECTORY_UP;
				121	}
				122	if (IsURLSlash(spec[after_second_dot])) {
				123	// Double dot followed by a slash.
				124	*consumed_len = second_dot_len + 1;
				125	return DIRECTORY_UP;
				126	}
				127	}
				128
				129	// The dots are followed by something else, not a directory.
				130	*consumed_len = 0;
				131	return NOT_A_DIRECTORY;
				132	}
				133
				134	// Rewinds the output to the previous slash. It is assumed that the output
				135	// ends with a slash and this doesn't count (we call this when we are
				136	// appending directory paths, so the previous path component has and ending
				137	// slash).
				138	//
				139	// This will stop at the first slash (assumed to be at position
				140	// \|path_begin_in_output\| and not go any higher than that. Some web pages
				141	// do ".." too many times, so we need to handle that brokenness.
				142	//
				143	// It searches for a literal slash rather than including a backslash as well
				144	// because it is run only on the canonical output.
				145	//
				146	// The output is guaranteed to end in a slash when this function completes.
				147	void BackUpToPreviousSlash(int path_begin_in_output,
				148	CanonOutput* output) {
				149	DCHECK(output->length() > 0);
				150
				151	int i = output->length() - 1;
				152	DCHECK(output->at(i) == '/');
				153	if (i == path_begin_in_output)
				154	return; // We're at the first slash, nothing to do.
				155
				156	// Now back up (skipping the trailing slash) until we find another slash.
				157	i--;
				158	while (output->at(i) != '/' && i > path_begin_in_output)
				159	i--;
				160
				161	// Now shrink the output to just include that last slash we found.
				162	output->set_length(i + 1);
				163	}
				164
				165	// Appends the given path to the output. It assumes that if the input path
				166	// starts with a slash, it should be copied to the output. If no path has
				167	// already been appended to the output (the case when not resolving
				168	// relative URLs), the path should begin with a slash.
				169	//
				170	// If there are already path components (this mode is used when appending
				171	// relative paths for resolving), it assumes that the output already has
				172	// a trailing slash and that if the input begins with a slash, it should be
				173	// copied to the output.
				174	//
				175	// We do not collapse multiple slashes in a row to a single slash. It seems
				176	// no web browsers do this, and we don't want incompababilities, even though
				177	// it would be correct for most systems.
				178	template<typename CHAR, typename UCHAR>
				179	bool DoPartialPath(const CHAR* spec,
				180	const Component& path,
				181	int path_begin_in_output,
				182	CanonOutput* output) {
				183	int end = path.end();
				184
				185	bool success = true;
				186	for (int i = path.begin; i < end; i++) {
				187	UCHAR uch = static_cast<UCHAR>(spec[i]);
				188	if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) {
				189	// We only need to test wide input for having non-ASCII characters. For
				190	// narrow input, we'll always just use the lookup table. We don't try to
				191	// do anything tricky with decoding/validating UTF-8. This function will
				192	// read one or two UTF-16 characters and append the output as UTF-8. This
				193	// call will be removed in 8-bit mode.
				194	success &= AppendUTF8EscapedChar(spec, &i, end, output);
				195	} else {
				196	// Normal ASCII character or 8-bit input, use the lookup table.
				197	unsigned char out_ch = static_cast<unsigned char>(uch);
				198	unsigned char flags = kPathCharLookup[out_ch];
				199	if (flags & SPECIAL) {
				200	// Needs special handling of some sort.
				201	int dotlen;
				202	if ((dotlen = IsDot(spec, i, end)) > 0) {
				203	// See if this dot was preceeded by a slash in the output. We
				204	// assume that when canonicalizing paths, they will always
				205	// start with a slash and not a dot, so we don't have to
				206	// bounds check the output.
				207	//
				208	// Note that we check this in the case of dots so we don't have to
				209	// special case slashes. Since slashes are much more common than
				210	// dots, this actually increases performance measurably (though
				211	// slightly).
				212	DCHECK(output->length() > path_begin_in_output);
				213	if (output->length() > path_begin_in_output &&
				214	output->at(output->length() - 1) == '/') {
				215	// Slash followed by a dot, check to see if this is means relative
				216	int consumed_len;
				217	switch (ClassifyAfterDot<CHAR>(spec, i + dotlen, end,
				218	&consumed_len)) {
				219	case NOT_A_DIRECTORY:
				220	// Copy the dot to the output, it means nothing special.
				221	output->push_back('.');
				222	i += dotlen - 1;
				223	break;
				224	case DIRECTORY_CUR: // Current directory, just skip the input.
				225	i += dotlen + consumed_len - 1;
				226	break;
				227	case DIRECTORY_UP:
				228	BackUpToPreviousSlash(path_begin_in_output, output);
				229	i += dotlen + consumed_len - 1;
				230	break;
				231	}
				232	} else {
				233	// This dot is not preceeded by a slash, it is just part of some
				234	// file name.
				235	output->push_back('.');
				236	i += dotlen - 1;
				237	}
				238
				239	} else if (out_ch == '\\') {
				240	// Convert backslashes to forward slashes
				241	output->push_back('/');
				242
				243	} else if (out_ch == '%') {
				244	// Handle escape sequences.
				245	unsigned char unescaped_value;
				246	if (DecodeEscaped(spec, &i, end, &unescaped_value)) {
				247	// Valid escape sequence, see if we keep, reject, or unescape it.
				248	char unescaped_flags = kPathCharLookup[unescaped_value];
				249
				250	if (unescaped_flags & UNESCAPE) {
				251	// This escaped value shouldn't be escaped, copy it.
				252	output->push_back(unescaped_value);
				253	} else if (unescaped_flags & INVALID_BIT) {
				254	// Invalid escaped character, copy it and remember the error.
				255	output->push_back('%');
				256	output->push_back(static_cast<char>(spec[i - 1]));
				257	output->push_back(static_cast<char>(spec[i]));
				258	success = false;
				259	} else {
				260	// Valid escaped character but we should keep it escaped. We
				261	// don't want to change the case of any hex letters in case
				262	// the server is sensitive to that, so we just copy the two
				263	// characters without checking (DecodeEscape will have advanced
				264	// to the last character of the pair).
				265	output->push_back('%');
				266	output->push_back(static_cast<char>(spec[i - 1]));
				267	output->push_back(static_cast<char>(spec[i]));
				268	}
				269	} else {
				270	// Invalid escape sequence. IE7 rejects any URLs with such
				271	// sequences, while Firefox, IE6, and Safari all pass it through
				272	// unchanged. We are more permissive unlike IE7. I don't think this
				273	// can cause significant problems, if it does, we should change
				274	// to be more like IE7.
				275	output->push_back('%');
				276	}
				277
				278	} else if (flags & INVALID_BIT) {
				279	// For NULLs, etc. fail.
				280	AppendEscapedChar(out_ch, output);
				281	success = false;
				282
				283	} else if (flags & ESCAPE_BIT) {
				284	// This character should be escaped.
				285	AppendEscapedChar(out_ch, output);
				286	}
				287	} else {
				288	// Nothing special about this character, just append it.
				289	output->push_back(out_ch);
				290	}
				291	}
				292	}
				293	return success;
				294	}
				295
				296	template<typename CHAR, typename UCHAR>
				297	bool DoPath(const CHAR* spec,
				298	const Component& path,
				299	CanonOutput* output,
				300	Component* out_path) {
				301	bool success = true;
				302	out_path->begin = output->length();
				303	if (path.len > 0) {
				304	// Write out an initial slash if the input has none. If we just parse a URL
				305	// and then canonicalize it, it will of course have a slash already. This
				306	// check is for the replacement and relative URL resolving cases of file
				307	// URLs.
				308	if (!IsURLSlash(spec[path.begin]))
				309	output->push_back('/');
				310
				311	success = DoPartialPath<CHAR, UCHAR>(spec, path, out_path->begin, output);
				312	} else {
				313	// No input, canonical path is a slash.
				314	output->push_back('/');
				315	}
				316	out_path->len = output->length() - out_path->begin;
				317	return success;
				318	}
				319
				320	} // namespace
				321
				322	bool CanonicalizePath(const char* spec,
				323	const Component& path,
				324	CanonOutput* output,
				325	Component* out_path) {
				326	return DoPath<char, unsigned char>(spec, path, output, out_path);
				327	}
				328
				329	bool CanonicalizePath(const base::char16* spec,
				330	const Component& path,
				331	CanonOutput* output,
				332	Component* out_path) {
				333	return DoPath<base::char16, base::char16>(spec, path, output, out_path);
				334	}
				335
				336	bool CanonicalizePartialPath(const char* spec,
				337	const Component& path,
				338	int path_begin_in_output,
				339	CanonOutput* output) {
				340	return DoPartialPath<char, unsigned char>(spec, path, path_begin_in_output,
				341	output);
				342	}
				343
				344	bool CanonicalizePartialPath(const base::char16* spec,
				345	const Component& path,
				346	int path_begin_in_output,
				347	CanonOutput* output) {
				348	return DoPartialPath<base::char16, base::char16>(spec, path,
				349	path_begin_in_output,
				350	output);
				351	}
				352
				353	} // namespace url