|  | // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include <stdlib.h> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "base/strings/string_util.h" | 
|  | #include "net/base/net_util.h" | 
|  | #include "net/tools/dump_cache/url_to_filename_encoder.h" | 
|  |  | 
|  | using std::string; | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Returns 1 if buf is prefixed by "num_digits" of hex digits | 
|  | // Teturns 0 otherwise. | 
|  | // The function checks for '\0' for string termination. | 
|  | int HexDigitsPrefix(const char* buf, int num_digits) { | 
|  | for (int i = 0; i < num_digits; i++) { | 
|  | if (!IsHexDigit(buf[i])) | 
|  | return 0;  // This also detects end of string as '\0' is not xdigit. | 
|  | } | 
|  | return 1; | 
|  | } | 
|  |  | 
|  | #ifdef WIN32 | 
|  | #define strtoull _strtoui64 | 
|  | #endif | 
|  |  | 
|  | // A simple parser for long long values. Returns the parsed value if a | 
|  | // valid integer is found; else returns deflt | 
|  | // UInt64 and Int64 cannot handle decimal numbers with leading 0s. | 
|  | uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { | 
|  | char *error = NULL; | 
|  | const uint64 value = strtoull(str, &error, 16); | 
|  | return (error == str) ? deflt : value; | 
|  | } | 
|  |  | 
|  | } | 
|  |  | 
|  | namespace net { | 
|  |  | 
|  | // The escape character choice is made here -- all code and tests in this | 
|  | // directory are based off of this constant.  However, our testdata | 
|  | // has tons of dependencies on this, so it cannot be changed without | 
|  | // re-running those tests and fixing them. | 
|  | const char UrlToFilenameEncoder::kEscapeChar = ','; | 
|  | const char UrlToFilenameEncoder::kTruncationChar = '-'; | 
|  | const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128; | 
|  |  | 
|  | void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) { | 
|  | CHECK(!segment->empty()); | 
|  | if ((*segment == ".") || (*segment == "..")) { | 
|  | dest->append(1, kEscapeChar); | 
|  | dest->append(*segment); | 
|  | segment->clear(); | 
|  | } else { | 
|  | size_t segment_size = segment->size(); | 
|  | if (segment_size > kMaximumSubdirectoryLength) { | 
|  | // We need to inject ",-" at the end of the segment to signify that | 
|  | // we are inserting an artificial '/'.  This means we have to chop | 
|  | // off at least two characters to make room. | 
|  | segment_size = kMaximumSubdirectoryLength - 2; | 
|  |  | 
|  | // But we don't want to break up an escape sequence that happens to lie at | 
|  | // the end.  Escape sequences are at most 2 characters. | 
|  | if ((*segment)[segment_size - 1] == kEscapeChar) { | 
|  | segment_size -= 1; | 
|  | } else if ((*segment)[segment_size - 2] == kEscapeChar) { | 
|  | segment_size -= 2; | 
|  | } | 
|  | dest->append(segment->data(), segment_size); | 
|  | dest->append(1, kEscapeChar); | 
|  | dest->append(1, kTruncationChar); | 
|  | segment->erase(0, segment_size); | 
|  |  | 
|  | // At this point, if we had segment_size=3, and segment="abcd", | 
|  | // then after this erase, we will have written "abc,-" and set segment="d" | 
|  | } else { | 
|  | dest->append(*segment); | 
|  | segment->clear(); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix, | 
|  | const string& escaped_ending, | 
|  | char dir_separator, | 
|  | string* encoded_filename) { | 
|  | string filename_ending = UrlUtilities::Unescape(escaped_ending); | 
|  |  | 
|  | char encoded[3]; | 
|  | int encoded_len; | 
|  | string segment; | 
|  |  | 
|  | // TODO(jmarantz): This code would be a bit simpler if we disallowed | 
|  | // Instaweb allowing filename_prefix to not end in "/".  We could | 
|  | // then change the is routine to just take one input string. | 
|  | size_t start_of_segment = filename_prefix.find_last_of(dir_separator); | 
|  | if (start_of_segment == string::npos) { | 
|  | segment = filename_prefix; | 
|  | } else { | 
|  | segment = filename_prefix.substr(start_of_segment + 1); | 
|  | *encoded_filename = filename_prefix.substr(0, start_of_segment + 1); | 
|  | } | 
|  |  | 
|  | size_t index = 0; | 
|  | // Special case the first / to avoid adding a leading kEscapeChar. | 
|  | if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) { | 
|  | encoded_filename->append(segment); | 
|  | segment.clear(); | 
|  | encoded_filename->append(1, dir_separator); | 
|  | ++index; | 
|  | } | 
|  |  | 
|  | for (; index < filename_ending.length(); ++index) { | 
|  | unsigned char ch = static_cast<unsigned char>(filename_ending[index]); | 
|  |  | 
|  | // Note: instead of outputing an empty segment, we let the second slash | 
|  | // be escaped below. | 
|  | if ((ch == dir_separator) && !segment.empty()) { | 
|  | AppendSegment(&segment, encoded_filename); | 
|  | encoded_filename->append(1, dir_separator); | 
|  | segment.clear(); | 
|  | } else { | 
|  | // After removing unsafe chars the only safe ones are _.=+- and alphanums. | 
|  | if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') || | 
|  | (ch == '-') || (('0' <= ch) && (ch <= '9')) || | 
|  | (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) { | 
|  | encoded[0] = ch; | 
|  | encoded_len = 1; | 
|  | } else { | 
|  | encoded[0] = kEscapeChar; | 
|  | encoded[1] = ch / 16; | 
|  | encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | 
|  | encoded[2] = ch % 16; | 
|  | encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | 
|  | encoded_len = 3; | 
|  | } | 
|  | segment.append(encoded, encoded_len); | 
|  |  | 
|  | // If segment is too big, we must chop it into chunks. | 
|  | if (segment.size() > kMaximumSubdirectoryLength) { | 
|  | AppendSegment(&segment, encoded_filename); | 
|  | encoded_filename->append(1, dir_separator); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Append "," to the leaf filename so the leaf can also be a branch., e.g. | 
|  | // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and | 
|  | // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed | 
|  | // us over the 128 char limit, then we will need to append "/" and the | 
|  | // remaining chars. | 
|  | segment += kEscapeChar; | 
|  | AppendSegment(&segment, encoded_filename); | 
|  | if (!segment.empty()) { | 
|  | // The last overflow segment is special, because we appended in | 
|  | // kEscapeChar above.  We won't need to check it again for size | 
|  | // or further escaping. | 
|  | encoded_filename->append(1, dir_separator); | 
|  | encoded_filename->append(segment); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Note: this decoder is not the exact inverse of the EncodeSegment above, | 
|  | // because it does not take into account a prefix. | 
|  | bool UrlToFilenameEncoder::Decode(const string& encoded_filename, | 
|  | char dir_separator, | 
|  | string* decoded_url) { | 
|  | enum State { | 
|  | kStart, | 
|  | kEscape, | 
|  | kFirstDigit, | 
|  | kTruncate, | 
|  | kEscapeDot | 
|  | }; | 
|  | State state = kStart; | 
|  | char hex_buffer[3]; | 
|  | hex_buffer[2] = '\0'; | 
|  | for (size_t i = 0; i < encoded_filename.size(); ++i) { | 
|  | char ch = encoded_filename[i]; | 
|  | switch (state) { | 
|  | case kStart: | 
|  | if (ch == kEscapeChar) { | 
|  | state = kEscape; | 
|  | } else if (ch == dir_separator) { | 
|  | decoded_url->append(1, '/');  // URLs only use '/' not '\\' | 
|  | } else { | 
|  | decoded_url->append(1, ch); | 
|  | } | 
|  | break; | 
|  | case kEscape: | 
|  | if (HexDigitsPrefix(&ch, 1) == 1) { | 
|  | hex_buffer[0] = ch; | 
|  | state = kFirstDigit; | 
|  | } else if (ch == kTruncationChar) { | 
|  | state = kTruncate; | 
|  | } else if (ch == '.') { | 
|  | decoded_url->append(1, '.'); | 
|  | state = kEscapeDot;  // Look for at most one more dot. | 
|  | } else if (ch == dir_separator) { | 
|  | // Consider url "//x".  This was once encoded to "/,/x,". | 
|  | // This code is what skips the first Escape. | 
|  | decoded_url->append(1, '/');  // URLs only use '/' not '\\' | 
|  | state = kStart; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | break; | 
|  | case kFirstDigit: | 
|  | if (HexDigitsPrefix(&ch, 1) == 1) { | 
|  | hex_buffer[1] = ch; | 
|  | uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0); | 
|  | decoded_url->append(1, static_cast<char>(hex_value)); | 
|  | state = kStart; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | break; | 
|  | case kTruncate: | 
|  | if (ch == dir_separator) { | 
|  | // Skip this separator, it was only put in to break up long | 
|  | // path segments, but is not part of the URL. | 
|  | state = kStart; | 
|  | } else { | 
|  | return false; | 
|  | } | 
|  | break; | 
|  | case kEscapeDot: | 
|  | decoded_url->append(1, ch); | 
|  | state = kStart; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | // All legal encoded filenames end in kEscapeChar. | 
|  | return (state == kEscape); | 
|  | } | 
|  |  | 
|  | // Escape the given input |path| and chop any individual components | 
|  | // of the path which are greater than kMaximumSubdirectoryLength characters | 
|  | // into two chunks. | 
|  | // | 
|  | // This legacy version has several issues with aliasing of different URLs, | 
|  | // inability to represent both /a/b/c and /a/b/c/d, and inability to decode | 
|  | // the filenames back into URLs. | 
|  | // | 
|  | // But there is a large body of slurped data which depends on this format, | 
|  | // so leave it as the default for spdy_in_mem_edsm_server. | 
|  | string UrlToFilenameEncoder::LegacyEscape(const string& path) { | 
|  | string output; | 
|  |  | 
|  | // Note:  We also chop paths into medium sized 'chunks'. | 
|  | //        This is due to the incompetence of the windows | 
|  | //        filesystem, which still hasn't figured out how | 
|  | //        to deal with long filenames. | 
|  | int last_slash = 0; | 
|  | for (size_t index = 0; index < path.length(); index++) { | 
|  | char ch = path[index]; | 
|  | if (ch == 0x5C) | 
|  | last_slash = index; | 
|  | if ((ch == 0x2D) ||                    // hyphen | 
|  | (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore | 
|  | ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9] | 
|  | ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z] | 
|  | ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z] | 
|  | output.append(&path[index], 1); | 
|  | } else { | 
|  | char encoded[3]; | 
|  | encoded[0] = 'x'; | 
|  | encoded[1] = ch / 16; | 
|  | encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0'; | 
|  | encoded[2] = ch % 16; | 
|  | encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0'; | 
|  | output.append(encoded, 3); | 
|  | } | 
|  | if (index - last_slash > kMaximumSubdirectoryLength) { | 
|  | #ifdef WIN32 | 
|  | char slash = '\\'; | 
|  | #else | 
|  | char slash = '/'; | 
|  | #endif | 
|  | output.append(&slash, 1); | 
|  | last_slash = index; | 
|  | } | 
|  | } | 
|  | return output; | 
|  | } | 
|  |  | 
|  | }  // namespace net |