James Robinson | 646469d | 2014-10-03 15:33:28 -0700 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | // ICU integration functions. |
| 6 | |
| 7 | #include <stdlib.h> |
| 8 | #include <string.h> |
| 9 | |
| 10 | #include "base/lazy_instance.h" |
| 11 | #include "base/logging.h" |
| 12 | #include "third_party/icu/source/common/unicode/ucnv.h" |
| 13 | #include "third_party/icu/source/common/unicode/ucnv_cb.h" |
| 14 | #include "third_party/icu/source/common/unicode/uidna.h" |
| 15 | #include "url/url_canon_icu.h" |
| 16 | #include "url/url_canon_internal.h" // for _itoa_s |
| 17 | |
| 18 | namespace url { |
| 19 | |
| 20 | namespace { |
| 21 | |
| 22 | // Called when converting a character that can not be represented, this will |
| 23 | // append an escaped version of the numerical character reference for that code |
| 24 | // point. It is of the form "Ӓ" and we will escape the non-digits to |
| 25 | // "%26%231234%3B". Why? This is what Netscape did back in the olden days. |
| 26 | void appendURLEscapedChar(const void* context, |
| 27 | UConverterFromUnicodeArgs* from_args, |
| 28 | const UChar* code_units, |
| 29 | int32_t length, |
| 30 | UChar32 code_point, |
| 31 | UConverterCallbackReason reason, |
| 32 | UErrorCode* err) { |
| 33 | if (reason == UCNV_UNASSIGNED) { |
| 34 | *err = U_ZERO_ERROR; |
| 35 | |
| 36 | const static int prefix_len = 6; |
| 37 | const static char prefix[prefix_len + 1] = "%26%23"; // "&#" percent-escaped |
| 38 | ucnv_cbFromUWriteBytes(from_args, prefix, prefix_len, 0, err); |
| 39 | |
| 40 | DCHECK(code_point < 0x110000); |
| 41 | char number[8]; // Max Unicode code point is 7 digits. |
| 42 | _itoa_s(code_point, number, 10); |
| 43 | int number_len = static_cast<int>(strlen(number)); |
| 44 | ucnv_cbFromUWriteBytes(from_args, number, number_len, 0, err); |
| 45 | |
| 46 | const static int postfix_len = 3; |
| 47 | const static char postfix[postfix_len + 1] = "%3B"; // ";" percent-escaped |
| 48 | ucnv_cbFromUWriteBytes(from_args, postfix, postfix_len, 0, err); |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | // A class for scoping the installation of the invalid character callback. |
| 53 | class AppendHandlerInstaller { |
| 54 | public: |
| 55 | // The owner of this object must ensure that the converter is alive for the |
| 56 | // duration of this object's lifetime. |
| 57 | AppendHandlerInstaller(UConverter* converter) : converter_(converter) { |
| 58 | UErrorCode err = U_ZERO_ERROR; |
| 59 | ucnv_setFromUCallBack(converter_, appendURLEscapedChar, 0, |
| 60 | &old_callback_, &old_context_, &err); |
| 61 | } |
| 62 | |
| 63 | ~AppendHandlerInstaller() { |
| 64 | UErrorCode err = U_ZERO_ERROR; |
| 65 | ucnv_setFromUCallBack(converter_, old_callback_, old_context_, 0, 0, &err); |
| 66 | } |
| 67 | |
| 68 | private: |
| 69 | UConverter* converter_; |
| 70 | |
| 71 | UConverterFromUCallback old_callback_; |
| 72 | const void* old_context_; |
| 73 | }; |
| 74 | |
| 75 | // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to |
| 76 | // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46(). |
| 77 | // |
| 78 | // We use UTS46 with BiDiCheck to migrate from IDNA 2003 (with unassigned |
| 79 | // code points allowed) to IDNA 2008 with |
| 80 | // the backward compatibility in mind. What it does: |
| 81 | // |
| 82 | // 1. Use the up-to-date Unicode data. |
| 83 | // 2. Define a case folding/mapping with the up-to-date Unicode data as |
| 84 | // in IDNA 2003. |
| 85 | // 3. Use transitional mechanism for 4 deviation characters (sharp-s, |
| 86 | // final sigma, ZWJ and ZWNJ) for now. |
| 87 | // 4. Continue to allow symbols and punctuations. |
| 88 | // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules. |
| 89 | // 6. Do not apply STD3 rules |
| 90 | // 7. Do not allow unassigned code points. |
| 91 | // |
| 92 | // It also closely matches what IE 10 does except for the BiDi check ( |
| 93 | // http://goo.gl/3XBhqw ). |
| 94 | // See http://http://unicode.org/reports/tr46/ and references therein |
| 95 | // for more details. |
| 96 | struct UIDNAWrapper { |
| 97 | UIDNAWrapper() { |
| 98 | UErrorCode err = U_ZERO_ERROR; |
| 99 | // TODO(jungshik): Change options as different parties (browsers, |
| 100 | // registrars, search engines) converge toward a consensus. |
| 101 | value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err); |
| 102 | if (U_FAILURE(err)) |
| 103 | value = NULL; |
| 104 | } |
| 105 | |
| 106 | UIDNA* value; |
| 107 | }; |
| 108 | |
| 109 | } // namespace |
| 110 | |
| 111 | ICUCharsetConverter::ICUCharsetConverter(UConverter* converter) |
| 112 | : converter_(converter) { |
| 113 | } |
| 114 | |
| 115 | ICUCharsetConverter::~ICUCharsetConverter() { |
| 116 | } |
| 117 | |
| 118 | void ICUCharsetConverter::ConvertFromUTF16(const base::char16* input, |
| 119 | int input_len, |
| 120 | CanonOutput* output) { |
| 121 | // Install our error handler. It will be called for character that can not |
| 122 | // be represented in the destination character set. |
| 123 | AppendHandlerInstaller handler(converter_); |
| 124 | |
| 125 | int begin_offset = output->length(); |
| 126 | int dest_capacity = output->capacity() - begin_offset; |
| 127 | output->set_length(output->length()); |
| 128 | |
| 129 | do { |
| 130 | UErrorCode err = U_ZERO_ERROR; |
| 131 | char* dest = &output->data()[begin_offset]; |
| 132 | int required_capacity = ucnv_fromUChars(converter_, dest, dest_capacity, |
| 133 | input, input_len, &err); |
| 134 | if (err != U_BUFFER_OVERFLOW_ERROR) { |
| 135 | output->set_length(begin_offset + required_capacity); |
| 136 | return; |
| 137 | } |
| 138 | |
| 139 | // Output didn't fit, expand |
| 140 | dest_capacity = required_capacity; |
| 141 | output->Resize(begin_offset + dest_capacity); |
| 142 | } while (true); |
| 143 | } |
| 144 | |
| 145 | static base::LazyInstance<UIDNAWrapper>::Leaky |
| 146 | g_uidna = LAZY_INSTANCE_INITIALIZER; |
| 147 | |
| 148 | // Converts the Unicode input representing a hostname to ASCII using IDN rules. |
| 149 | // The output must be ASCII, but is represented as wide characters. |
| 150 | // |
| 151 | // On success, the output will be filled with the ASCII host name and it will |
| 152 | // return true. Unlike most other canonicalization functions, this assumes that |
| 153 | // the output is empty. The beginning of the host will be at offset 0, and |
| 154 | // the length of the output will be set to the length of the new host name. |
| 155 | // |
| 156 | // On error, this will return false. The output in this case is undefined. |
| 157 | // TODO(jungshik): use UTF-8/ASCII version of nameToASCII. |
| 158 | // Change the function signature and callers accordingly to avoid unnecessary |
| 159 | // conversions in our code. In addition, consider using icu::IDNA's UTF-8/ASCII |
| 160 | // version with StringByteSink. That way, we can avoid C wrappers and additional |
| 161 | // string conversion. |
| 162 | bool IDNToASCII(const base::char16* src, int src_len, CanonOutputW* output) { |
| 163 | DCHECK(output->length() == 0); // Output buffer is assumed empty. |
| 164 | |
| 165 | UIDNA* uidna = g_uidna.Get().value; |
| 166 | DCHECK(uidna != NULL); |
| 167 | while (true) { |
| 168 | UErrorCode err = U_ZERO_ERROR; |
| 169 | UIDNAInfo info = UIDNA_INFO_INITIALIZER; |
| 170 | int output_length = uidna_nameToASCII(uidna, src, src_len, output->data(), |
| 171 | output->capacity(), &info, &err); |
| 172 | if (U_SUCCESS(err) && info.errors == 0) { |
| 173 | output->set_length(output_length); |
| 174 | return true; |
| 175 | } |
| 176 | |
| 177 | // TODO(jungshik): Look at info.errors to handle them case-by-case basis |
| 178 | // if necessary. |
| 179 | if (err != U_BUFFER_OVERFLOW_ERROR || info.errors != 0) |
| 180 | return false; // Unknown error, give up. |
| 181 | |
| 182 | // Not enough room in our buffer, expand. |
| 183 | output->Resize(output_length); |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | } // namespace url |