// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "google/protobuf/json/internal/writer.h"

#include <cmath>
#include <cstdint>
#include <limits>
#include <utility>

#include "absl/algorithm/container.h"
#include "absl/log/absl_check.h"

// Must be included last.
#include "google/protobuf/port_def.inc"

namespace google {
namespace protobuf {
namespace json_internal {

// Writes the JSON string form of a non-finite double.
//
// Emits "Infinity", "-Infinity" or "NaN" (including the surrounding quotes)
// when `val` is +inf, -inf or NaN respectively and returns true. For any
// finite value nothing is written and false is returned, so the caller can
// fall back to ordinary number formatting.
bool JsonWriter::MaybeWriteSpecialFp(double val) {
  if (val == std::numeric_limits<double>::infinity()) {
    Write("\"Infinity\"");
  } else if (val == -std::numeric_limits<double>::infinity()) {
    Write("\"-Infinity\"");
  } else if (std::isnan(val)) {
    Write("\"NaN\"");
  } else {
    return false;
  }
  return true;
}

// Writes `str` as a double-quoted, '='-padded base64 string.
//
// Input is consumed three bytes at a time, each group producing four output
// characters; a trailing group of one or two bytes is padded with '=' so the
// encoded length is always a multiple of four. An empty `str` produces `""`.
void JsonWriter::WriteBase64(absl::string_view str) {
  // This is the regular base64, not the "web-safe" version.
  constexpr absl::string_view kBase64 =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  const char* ptr = str.data();
  const char* end = ptr + str.size();

  // Reads the `n`th character off of `ptr` while gracefully avoiding
  // sign extension due to implicit conversions: `char` may be signed, so the
  // byte is first narrowed to uint8_t and only then widened to an index.
  auto read = [&](size_t n) {
    return static_cast<size_t>(static_cast<uint8_t>(ptr[n]));
  };

  // `view` aliases `buf`, so refilling `buf` and calling Write(view) emits
  // the freshly encoded quartet each time.
  char buf[4];
  absl::string_view view(buf, sizeof(buf));
  Write("\"");

  // Full 3-byte groups: 24 input bits -> four 6-bit alphabet indices.
  while (end - ptr >= 3) {
    buf[0] = kBase64[read(0) >> 2];
    buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
    buf[2] = kBase64[((read(1) & 0xf) << 2) | (read(2) >> 6)];
    buf[3] = kBase64[read(2) & 0x3f];
    Write(view);
    ptr += 3;
  }

  // Trailing partial group, if any. Zero remaining bytes needs no output.
  switch (end - ptr) {
    case 2:
      // 16 bits -> three characters plus one pad.
      buf[0] = kBase64[read(0) >> 2];
      buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
      buf[2] = kBase64[(read(1) & 0xf) << 2];
      buf[3] = '=';
      Write(view);
      break;
    case 1:
      // 8 bits -> two characters plus two pads.
      buf[0] = kBase64[read(0) >> 2];
      buf[1] = kBase64[((read(0) & 0x3) << 4)];
      buf[2] = '=';
      buf[3] = '=';
      Write(view);
      break;
  }

  Write("\"");
}

// The minimum value of a unicode high-surrogate code unit in the utf-16
// encoding. A high-surrogate is also known as a leading-surrogate.
// See http://www.unicode.org/glossary/#high_surrogate_code_unit
static constexpr uint16_t kMinHighSurrogate = 0xd800;

// The minimum value of a unicode low-surrogate code unit in the utf-16
// encoding. A low-surrogate is also known as a trailing-surrogate.
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMinLowSurrogate = 0xdc00;

// The maximum value of a unicode low-surrogate code unit in the utf-16
// encoding. A low-surrogate is also known as a trailing surrogate.
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMaxLowSurrogate = 0xdfff;

// The minimum value of a unicode supplementary code point.
// See http://www.unicode.org/glossary/#supplementary_code_point
static constexpr uint32_t kMinSupplementaryCodePoint = 0x010000;

// The maximum value of a unicode code point.
// See http://www.unicode.org/glossary/#code_point
static constexpr uint32_t kMaxCodePoint = 0x10ffff;

// Indicates decoding failure; not a valid Unicode scalar.
static constexpr uint32_t kErrorSentinel = 0xaaaaaaaa;

// A Unicode Scalar encoded two ways.
struct Utf8Scalar {
  // The Unicode scalar value as a 32-bit integer. If decoding failed, this
  // is equal to kErrorSentinel.
  uint32_t u32;
  // The Unicode scalar value encoded as UTF-8 bytes. May not reflect the
  // contents of `u32` if it is kErrorSentinel.
  absl::string_view utf8;
};

// Parses a single UTF-8-encoded Unicode scalar from the front of `str`,
// advancing `str` past the bytes that were consumed. Returns the decoded
// scalar together with a view of the consumed bytes.
//
// `str` must not be empty.
//
// On failure the returned `u32` is kErrorSentinel (NOT U+FFFD) and an
// unspecified number of bytes has been consumed; in particular, a byte that
// turns out not to be a continuation byte is consumed along with the
// truncated sequence it terminated.
//
// Validation is limited to the lead/continuation bit patterns and the
// kMaxCodePoint upper bound: overlong encodings and encoded surrogate code
// points are not rejected here.
static Utf8Scalar ConsumeUtf8Scalar(absl::string_view& str) {
  ABSL_DCHECK(!str.empty());

  // Go through uint8_t so that bytes >= 0x80 are not sign-extended.
  uint32_t scalar = static_cast<uint8_t>(str[0]);
  const char* start = str.data();
  size_t len = 1;

  str = str.substr(1);

  // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
  // one of the following (big-endian) patterns:
  //
  // 0b0xxxxxxx
  // 0b110xxxxx'10xxxxxx
  // 0b1110xxxx'10xxxxxx'10xxxxxx
  // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
  //
  // The number of leading one bits in the first byte selects the pattern;
  // the payload bits of the lead byte are kept and the marker bits dropped.
  int lookahead = 0;
  switch (absl::countl_one(static_cast<uint8_t>(scalar))) {
    case 0:
      break;
    case 2:
      lookahead = 1;
      scalar &= (1 << 5) - 1;
      break;
    case 3:
      lookahead = 2;
      scalar &= (1 << 4) - 1;
      break;
    case 4:
      lookahead = 3;
      scalar &= (1 << 3) - 1;
      break;
    default:
      // One leading one bit is a stray continuation byte; five or more is
      // not a UTF-8 lead byte at all.
      scalar = kErrorSentinel;
      break;
  }

  for (int i = 0; i < lookahead; ++i) {
    if (str.empty()) {
      // Input ended in the middle of a multi-byte sequence.
      scalar = kErrorSentinel;
      break;
    }

    uint8_t next = str[0];
    str = str.substr(1);
    ++len;

    // Looking for top 2 bits are 0b10.
    if (next >> 6 != 2) {
      scalar = kErrorSentinel;
      break;
    }

    // Append the six payload bits of the continuation byte.
    next &= (1 << 6) - 1;
    scalar <<= 6;
    scalar |= next;
  }

  // Four-byte sequences can encode values past U+10FFFF; reject those.
  // (kErrorSentinel itself is above kMaxCodePoint, so it is preserved.)
  if (scalar > kMaxCodePoint) {
    scalar = kErrorSentinel;
  }

  return {scalar, absl::string_view(start, len)};
}

// Decides whether we must escape `scalar`.
//
// Returns true if `scalar` must not be written verbatim. In that case, if
// the given Unicode scalar would not use a \u escape, `custom_escape` will
// be set to a non-empty string holding the exact replacement text; if
// `custom_escape` is left untouched the caller is expected to emit a \u
// escape. When false is returned `custom_escape` is not modified.
static bool MustEscape(uint32_t scalar, absl::string_view& custom_escape) {
  switch (scalar) {
    // These escapes are defined by the JSON spec. We do not escape /.
    case '\n':
      custom_escape = R"(\n)";
      return true;
    case '\r':
      custom_escape = R"(\r)";
      return true;
    case '\t':
      custom_escape = R"(\t)";
      return true;
    case '\"':
      custom_escape = R"(\")";
      return true;
    case '\f':
      custom_escape = R"(\f)";
      return true;
    case '\b':
      custom_escape = R"(\b)";
      return true;
    case '\\':
      custom_escape = R"(\\)";
      return true;

    case kErrorSentinel:
      // Decoding failure turns into spaces, *not* replacement characters. We
      // handle this separately from "normal" spaces so that it follows the
      // escaping code-path.
      //
      // Note that literal replacement characters in the input string DO NOT
      // get turned into spaces; this is only for decoding failures!
      custom_escape = " ";
      return true;

    // These are not required by the JSON spec, but help
    // to prevent security bugs in JavaScript.
    //
    // These were originally present in the ESF parser, so they are kept for
    // legacy compatibility (and because escaping most of these is in good
    // taste, regardless).
    case '<':
    case '>':
    case 0xfeff:      // Zero width no-break space.
    case 0xfff9:      // Interlinear annotation anchor.
    case 0xfffa:      // Interlinear annotation separator.
    case 0xfffb:      // Interlinear annotation terminator.
    case 0x00ad:      // Soft-hyphen.
    case 0x06dd:      // Arabic end of ayah.
    case 0x070f:      // Syriac abbreviation mark.
    case 0x17b4:      // Khmer vowel inherent Aq.
    case 0x17b5:      // Khmer vowel inherent Aa.
    case 0x000e0001:  // Language tag.
      return true;

    default:
      // Inclusive [first, second] ranges of scalars that are always written
      // as \u escapes.
      static constexpr std::pair<uint32_t, uint32_t> kEscapedRanges[] = {
          {0x0000, 0x001f},          // ASCII control.
          {0x007f, 0x009f},          // High ASCII bytes.
          {0x0600, 0x0603},          // Arabic signs.
          {0x200b, 0x200f},          // Zero width etc.
          {0x2028, 0x202e},          // Separators etc.
          {0x2060, 0x2064},          // Invisible etc.
          {0x206a, 0x206f},          // Shaping etc.
          {0x0001d173, 0x0001d17a},  // Music formatting.
          {0x000e0020, 0x000e007f},  // TAG symbols.
      };

      return absl::c_any_of(kEscapedRanges, [scalar](auto range) {
        return range.first <= scalar && scalar <= range.second;
      });
  }
}

// Writes `str` (expected to be UTF-8) with JSON string escaping applied.
// The surrounding quotes are NOT written by this function.
//
// Each scalar is handled in one of three ways:
//   * not escaped: its original UTF-8 bytes are copied through;
//   * custom escape: the short form chosen by MustEscape (e.g. \n, or a
//     single space for a decoding failure) is written;
//   * \u escape: one \uXXXX for scalars below U+10000, otherwise a UTF-16
//     surrogate pair written as two \uXXXX escapes.
void JsonWriter::WriteEscapedUtf8(absl::string_view str) {
  while (!str.empty()) {
    auto scalar = ConsumeUtf8Scalar(str);
    absl::string_view custom_escape;

    if (!MustEscape(scalar.u32, custom_escape)) {
      Write(scalar.utf8);
      continue;
    }

    if (!custom_escape.empty()) {
      Write(custom_escape);
      continue;
    }

    if (scalar.u32 < 0x10000) {
      WriteUEscape(scalar.u32);
      continue;
    }

    // Supplementary plane: split into a surrogate pair. The low surrogate
    // carries the bottom ten bits (mask 0x3ff); the high surrogate carries
    // the remaining bits after subtracting 0x10000, which is folded into the
    // additive constant here.
    uint16_t lo =
        (scalar.u32 & (kMaxLowSurrogate - kMinLowSurrogate)) + kMinLowSurrogate;
    uint16_t hi = (scalar.u32 >> 10) +
                  (kMinHighSurrogate - (kMinSupplementaryCodePoint >> 10));
    WriteUEscape(hi);
    WriteUEscape(lo);
  }
}

// Writes `val` as a JSON \uXXXX escape using four lowercase hex digits.
void JsonWriter::WriteUEscape(uint16_t val) {
  // Six characters of output ("\u" + 4 hex digits) plus the terminating NUL.
  char hex[7];
  int len = absl::SNPrintF(hex, sizeof(hex), R"(\u%04x)", val);
  Write(absl::string_view(hex, static_cast<size_t>(len)));
}

}  // namespace json_internal
}  // namespace protobuf
}  // namespace google