Blame - url/url_canon_host.cc - mojo-tools

blob: 513248a3804fbbda5e263939823a90c035f90ec5 [file] [log] [blame]

James Robinson	646469d	2014-10-03 15:33:28 -0700	[diff] [blame]	1	// Copyright 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "base/logging.h"
				6	#include "url/url_canon.h"
				7	#include "url/url_canon_internal.h"
				8
				9	namespace url {
				10
				11	namespace {
				12
				13	// For reference, here's what IE supports:
				14	// Key: 0 (disallowed: failure if present in the input)
				15	// + (allowed either escaped or unescaped, and unmodified)
				16	// U (allowed escaped or unescaped but always unescaped if present in
				17	// escaped form)
				18	// E (allowed escaped or unescaped but always escaped if present in
				19	// unescaped form)
				20	// % (only allowed escaped in the input, will be unmodified).
				21	// I left blank alpha numeric characters.
				22	//
				23	//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
				24	//    -----------------------------------------------
				25	// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
				26	// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
				27	// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
				28	// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
				29	// 4   %
				30	// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
				31	// 6   E                                               <-- That's  `
				32	// 7                                    E  E  E  U  E  <-- Those are  { | } ~ (UNPRINTABLE)
				33	//
				34	// NOTE: I didn't actually test all the control characters. Some may be
				35	// disallowed in the input, but they are all accepted escaped except for 0.
				36	// I also didn't test if characters affecting HTML parsing are allowed
				37	// unescaped, eg. (") or (#), which would indicate the beginning of the path.
				38	// Surprisingly, space is accepted in the input and always escaped.
				39
				// Canonical form of every 7-bit character that may appear in a host name,
				// indexed by character code. Each entry is one of:
				//   0     - the character is not allowed in the input at all.
				//   kEsc  - the character is allowed, but must be written percent-escaped.
				//   other - the character to emit; upper-case ASCII letters map to their
				//           lower-case form, everything else maps to itself.
				// We are a little more restrictive than IE, but less restrictive than
				// Firefox.
				//
				// '%' is deliberately marked as disallowed. It is still accepted as the start
				// of an escape sequence, but a literal percent (i.e. "%25") is rejected even
				// though IE allows it. If it were allowed, an invalid sequence such as "%zz"
				// would be written out as "%25zz" and reported as a failure, yet feeding that
				// output back in would succeed. Validity would then depend on how many times
				// the canonicalizer was run; rejecting '%' keeps the verdict stable.
				const unsigned char kEsc = 0xff;
				const unsigned char kHostCharLookup[0x80] = {
				    // 0x00 - 0x0f: control characters, all invalid.
				    0, 0, 0, 0, 0, 0, 0, 0,
				    0, 0, 0, 0, 0, 0, 0, 0,
				    // 0x10 - 0x1f: control characters, all invalid.
				    0, 0, 0, 0, 0, 0, 0, 0,
				    0, 0, 0, 0, 0, 0, 0, 0,
				    // 0x20 - 0x27: ' ' ! " # $ % & '
				    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
				    // 0x28 - 0x2f: ( ) * + , - . /
				    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
				    // 0x30 - 0x37: 0 1 2 3 4 5 6 7
				    '0', '1', '2', '3', '4', '5', '6', '7',
				    // 0x38 - 0x3f: 8 9 : ; < = > ?
				    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
				    // 0x40 - 0x47: @ A B C D E F G
				    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
				    // 0x48 - 0x4f: H I J K L M N O
				    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
				    // 0x50 - 0x57: P Q R S T U V W
				    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
				    // 0x58 - 0x5f: X Y Z [ (backslash) ] ^ _
				    'x', 'y', 'z', '[', 0, ']', 0, '_',
				    // 0x60 - 0x67: ` a b c d e f g
				    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
				    // 0x68 - 0x6f: h i j k l m n o
				    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
				    // 0x70 - 0x77: p q r s t u v w
				    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
				    // 0x78 - 0x7f: x y z { | } ~ (DEL)
				    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0,
				};
				69
				70	const int kTempHostBufferLen = 1024;
				71	typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
				72	typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
				73
				74	// Scans a host name and fills in the output flags according to what we find.
				75	// \|has_non_ascii\| will be true if there are any non-7-bit characters, and
				76	// \|has_escaped\| will be true if there is a percent sign.
				77	template<typename CHAR, typename UCHAR>
				78	void ScanHostname(const CHAR* spec,
				79	const Component& host,
				80	bool* has_non_ascii,
				81	bool* has_escaped) {
				82	int end = host.end();
				83	*has_non_ascii = false;
				84	*has_escaped = false;
				85	for (int i = host.begin; i < end; i++) {
				86	if (static_cast<UCHAR>(spec[i]) >= 0x80)
				87	*has_non_ascii = true;
				88	else if (spec[i] == '%')
				89	*has_escaped = true;
				90	}
				91	}
				92
				93	// Canonicalizes a host name that is entirely 8-bit characters (even though
				94	// the type holding them may be 16 bits. Escaped characters will be unescaped.
				95	// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
				96	//
				97	// The \|*has_non_ascii\| flag will be true if there are non-7-bit characters in
				98	// the output.
				99	//
				100	// This function is used in two situations:
				101	//
				102	// * When the caller knows there is no non-ASCII or percent escaped
				103	// characters. This is what DoHost does. The result will be a completely
				104	// canonicalized host since we know nothing weird can happen (escaped
				105	// characters could be unescaped to non-7-bit, so they have to be treated
				106	// with suspicion at this point). It does not use the \|has_non_ascii\| flag.
				107	//
				108	// * When the caller has an 8-bit string that may need unescaping.
				109	// DoComplexHost calls us this situation to do unescaping and validation.
				110	// After this, it may do other IDN operations depending on the value of the
				111	// \|*has_non_ascii\| flag.
				112	//
				113	// The return value indicates if the output is a potentially valid host name.
				114	template<typename INCHAR, typename OUTCHAR>
				115	bool DoSimpleHost(const INCHAR* host,
				116	int host_len,
				117	CanonOutputT<OUTCHAR>* output,
				118	bool* has_non_ascii) {
				119	*has_non_ascii = false;
				120
				121	bool success = true;
				122	for (int i = 0; i < host_len; ++i) {
				123	unsigned int source = host[i];
				124	if (source == '%') {
				125	// Unescape first, if possible.
				126	// Source will be used only if decode operation was successful.
				127	if (!DecodeEscaped(host, &i, host_len,
				128	reinterpret_cast<unsigned char*>(&source))) {
				129	// Invalid escaped character. There is nothing that can make this
				130	// host valid. We append an escaped percent so the URL looks reasonable
				131	// and mark as failed.
				132	AppendEscapedChar('%', output);
				133	success = false;
				134	continue;
				135	}
				136	}
				137
				138	if (source < 0x80) {
				139	// We have ASCII input, we can use our lookup table.
				140	unsigned char replacement = kHostCharLookup[source];
				141	if (!replacement) {
				142	// Invalid character, add it as percent-escaped and mark as failed.
				143	AppendEscapedChar(source, output);
				144	success = false;
				145	} else if (replacement == kEsc) {
				146	// This character is valid but should be escaped.
				147	AppendEscapedChar(source, output);
				148	} else {
				149	// Common case, the given character is valid in a hostname, the lookup
				150	// table tells us the canonical representation of that character (lower
				151	// cased).
				152	output->push_back(replacement);
				153	}
				154	} else {
				155	// It's a non-ascii char. Just push it to the output.
				156	// In case where we have char16 input, and char output it's safe to
				157	// cast char16->char only if input string was converted to ASCII.
				158	output->push_back(static_cast<OUTCHAR>(source));
				159	*has_non_ascii = true;
				160	}
				161	}
				162
				163	return success;
				164	}
				165
				166	// Canonicalizes a host that requires IDN conversion. Returns true on success
				167	bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
				168	// We need to escape URL before doing IDN conversion, since punicode strings
				169	// cannot be escaped after they are created.
				170	RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
				171	bool has_non_ascii;
				172	DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
				173
				174	StackBufferW wide_output;
				175	if (!IDNToASCII(url_escaped_host.data(),
				176	url_escaped_host.length(),
				177	&wide_output)) {
				178	// Some error, give up. This will write some reasonable looking
				179	// representation of the string to the output.
				180	AppendInvalidNarrowString(src, 0, src_len, output);
				181	return false;
				182	}
				183
				184	// Now we check the ASCII output like a normal host. It will also handle
				185	// unescaping. Although we unescaped everything before this function call, if
				186	// somebody does %00 as fullwidth, ICU will convert this to ASCII.
				187	bool success = DoSimpleHost(wide_output.data(),
				188	wide_output.length(),
				189	output, &has_non_ascii);
				190	DCHECK(!has_non_ascii);
				191	return success;
				192	}
				193
				194	// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
				195	// UTF-16. The has_escaped flag should be set if the input string requires
				196	// unescaping.
				197	bool DoComplexHost(const char* host, int host_len,
				198	bool has_non_ascii, bool has_escaped, CanonOutput* output) {
				199	// Save the current position in the output. We may write stuff and rewind it
				200	// below, so we need to know where to rewind to.
				201	int begin_length = output->length();
				202
				203	// Points to the UTF-8 data we want to convert. This will either be the
				204	// input or the unescaped version written to \|*output\| if necessary.
				205	const char* utf8_source;
				206	int utf8_source_len;
				207	if (has_escaped) {
				208	// Unescape before converting to UTF-16 for IDN. We write this into the
				209	// output because it most likely does not require IDNization, and we can
				210	// save another huge stack buffer. It will be replaced below if it requires
				211	// IDN. This will also update our non-ASCII flag so we know whether the
				212	// unescaped input requires IDN.
				213	if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
				214	// Error with some escape sequence. We'll call the current output
				215	// complete. DoSimpleHost will have written some "reasonable" output.
				216	return false;
				217	}
				218
				219	// Unescaping may have left us with ASCII input, in which case the
				220	// unescaped version we wrote to output is complete.
				221	if (!has_non_ascii) {
				222	return true;
				223	}
				224
				225	// Save the pointer into the data was just converted (it may be appended to
				226	// other data in the output buffer).
				227	utf8_source = &output->data()[begin_length];
				228	utf8_source_len = output->length() - begin_length;
				229	} else {
				230	// We don't need to unescape, use input for IDNization later. (We know the
				231	// input has non-ASCII, or the simple version would have been called
				232	// instead of us.)
				233	utf8_source = host;
				234	utf8_source_len = host_len;
				235	}
				236
				237	// Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
				238	// Above, we may have used the output to write the unescaped values to, so
				239	// we have to rewind it to where we started after we convert it to UTF-16.
				240	StackBufferW utf16;
				241	if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
				242	// In this error case, the input may or may not be the output.
				243	StackBuffer utf8;
				244	for (int i = 0; i < utf8_source_len; i++)
				245	utf8.push_back(utf8_source[i]);
				246	output->set_length(begin_length);
				247	AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
				248	return false;
				249	}
				250	output->set_length(begin_length);
				251
				252	// This will call DoSimpleHost which will do normal ASCII canonicalization
				253	// and also check for IP addresses in the outpt.
				254	return DoIDNHost(utf16.data(), utf16.length(), output);
				255	}
				256
				257	// UTF-16 convert host to its ASCII version. The set up is already ready for
				258	// the backend, so we just pass through. The has_escaped flag should be set if
				259	// the input string requires unescaping.
				260	bool DoComplexHost(const base::char16* host, int host_len,
				261	bool has_non_ascii, bool has_escaped, CanonOutput* output) {
				262	if (has_escaped) {
				263	// Yikes, we have escaped characters with wide input. The escaped
				264	// characters should be interpreted as UTF-8. To solve this problem,
				265	// we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
				266	//
				267	// We don't bother to optimize the conversion in the ASCII case (which
				268	// could just be a copy) and use the UTF-8 path, because it should be
				269	// very rare that host names have escaped characters, and it is relatively
				270	// fast to do the conversion anyway.
				271	StackBuffer utf8;
				272	if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
				273	AppendInvalidNarrowString(host, 0, host_len, output);
				274	return false;
				275	}
				276
				277	// Once we convert to UTF-8, we can use the 8-bit version of the complex
				278	// host handling code above.
				279	return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
				280	has_escaped, output);
				281	}
				282
				283	// No unescaping necessary, we can safely pass the input to ICU. This
				284	// function will only get called if we either have escaped or non-ascii
				285	// input, so it's safe to just use ICU now. Even if the input is ASCII,
				286	// this function will do the right thing (just slower than we could).
				287	return DoIDNHost(host, host_len, output);
				288	}
				289
				290	template<typename CHAR, typename UCHAR>
				291	void DoHost(const CHAR* spec,
				292	const Component& host,
				293	CanonOutput* output,
				294	CanonHostInfo* host_info) {
				295	if (host.len <= 0) {
				296	// Empty hosts don't need anything.
				297	host_info->family = CanonHostInfo::NEUTRAL;
				298	host_info->out_host = Component();
				299	return;
				300	}
				301
				302	bool has_non_ascii, has_escaped;
				303	ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
				304
				305	// Keep track of output's initial length, so we can rewind later.
				306	const int output_begin = output->length();
				307
				308	bool success;
				309	if (!has_non_ascii && !has_escaped) {
				310	success = DoSimpleHost(&spec[host.begin], host.len,
				311	output, &has_non_ascii);
				312	DCHECK(!has_non_ascii);
				313	} else {
				314	success = DoComplexHost(&spec[host.begin], host.len,
				315	has_non_ascii, has_escaped, output);
				316	}
				317
				318	if (!success) {
				319	// Canonicalization failed. Set BROKEN to notify the caller.
				320	host_info->family = CanonHostInfo::BROKEN;
				321	} else {
				322	// After all the other canonicalization, check if we ended up with an IP
				323	// address. IP addresses are small, so writing into this temporary buffer
				324	// should not cause an allocation.
				325	RawCanonOutput<64> canon_ip;
				326	CanonicalizeIPAddress(output->data(),
				327	MakeRange(output_begin, output->length()),
				328	&canon_ip, host_info);
				329
				330	// If we got an IPv4/IPv6 address, copy the canonical form back to the
				331	// real buffer. Otherwise, it's a hostname or broken IP, in which case
				332	// we just leave it in place.
				333	if (host_info->IsIPAddress()) {
				334	output->set_length(output_begin);
				335	output->Append(canon_ip.data(), canon_ip.length());
				336	}
				337	}
				338
				339	host_info->out_host = MakeRange(output_begin, output->length());
				340	}
				341
				342	} // namespace
				343
				344	bool CanonicalizeHost(const char* spec,
				345	const Component& host,
				346	CanonOutput* output,
				347	Component* out_host) {
				348	CanonHostInfo host_info;
				349	DoHost<char, unsigned char>(spec, host, output, &host_info);
				350	*out_host = host_info.out_host;
				351	return (host_info.family != CanonHostInfo::BROKEN);
				352	}
				353
				354	bool CanonicalizeHost(const base::char16* spec,
				355	const Component& host,
				356	CanonOutput* output,
				357	Component* out_host) {
				358	CanonHostInfo host_info;
				359	DoHost<base::char16, base::char16>(spec, host, output, &host_info);
				360	*out_host = host_info.out_host;
				361	return (host_info.family != CanonHostInfo::BROKEN);
				362	}
				363
				364	void CanonicalizeHostVerbose(const char* spec,
				365	const Component& host,
				366	CanonOutput* output,
				367	CanonHostInfo* host_info) {
				368	DoHost<char, unsigned char>(spec, host, output, host_info);
				369	}
				370
				371	void CanonicalizeHostVerbose(const base::char16* spec,
				372	const Component& host,
				373	CanonOutput* output,
				374	CanonHostInfo* host_info) {
				375	DoHost<base::char16, base::char16>(spec, host, output, host_info);
				376	}
				377
				378	} // namespace url