329 lines
10 KiB
C++
329 lines
10 KiB
C++
// Protocol Buffers - Google's data interchange format
|
|
// Copyright 2008 Google Inc. All rights reserved.
|
|
// https://developers.google.com/protocol-buffers/
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following disclaimer
|
|
// in the documentation and/or other materials provided with the
|
|
// distribution.
|
|
// * Neither the name of Google Inc. nor the names of its
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#include "google/protobuf/json/internal/writer.h"
|
|
|
|
#include <cstdint>
|
|
#include <initializer_list>
|
|
#include <limits>
|
|
#include <utility>
|
|
|
|
#include "absl/algorithm/container.h"
|
|
#include "absl/log/absl_check.h"
|
|
|
|
// Must be included last.
|
|
#include "google/protobuf/port_def.inc"
|
|
|
|
namespace google {
|
|
namespace protobuf {
|
|
namespace json_internal {
|
|
|
|
// Tries to write a non-finite double if necessary; returns false if
|
|
// nothing was written.
|
|
bool JsonWriter::MaybeWriteSpecialFp(double val) {
|
|
if (val == std::numeric_limits<double>::infinity()) {
|
|
Write("\"Infinity\"");
|
|
} else if (val == -std::numeric_limits<double>::infinity()) {
|
|
Write("\"-Infinity\"");
|
|
} else if (std::isnan(val)) {
|
|
Write("\"NaN\"");
|
|
} else {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void JsonWriter::WriteBase64(absl::string_view str) {
|
|
// This is the regular base64, not the "web-safe" version.
|
|
constexpr absl::string_view kBase64 =
|
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
const char* ptr = str.data();
|
|
const char* end = ptr + str.size();
|
|
|
|
// Reads the `n`th character off of `ptr` while gracefully avoiding
|
|
// sign extension due to implicit conversions
|
|
auto read = [&](size_t n) {
|
|
return static_cast<size_t>(static_cast<uint8_t>(ptr[n]));
|
|
};
|
|
|
|
char buf[4];
|
|
absl::string_view view(buf, sizeof(buf));
|
|
Write("\"");
|
|
|
|
while (end - ptr >= 3) {
|
|
buf[0] = kBase64[read(0) >> 2];
|
|
buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
|
|
buf[2] = kBase64[((read(1) & 0xf) << 2) | (read(2) >> 6)];
|
|
buf[3] = kBase64[read(2) & 0x3f];
|
|
Write(view);
|
|
ptr += 3;
|
|
}
|
|
|
|
switch (end - ptr) {
|
|
case 2:
|
|
buf[0] = kBase64[read(0) >> 2];
|
|
buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
|
|
buf[2] = kBase64[(read(1) & 0xf) << 2];
|
|
buf[3] = '=';
|
|
Write(view);
|
|
break;
|
|
case 1:
|
|
buf[0] = kBase64[read(0) >> 2];
|
|
buf[1] = kBase64[((read(0) & 0x3) << 4)];
|
|
buf[2] = '=';
|
|
buf[3] = '=';
|
|
Write(view);
|
|
break;
|
|
}
|
|
|
|
Write("\"");
|
|
}
|
|
|
|
// The minimum value of a unicode high-surrogate code unit in the utf-16
|
|
// encoding. A high-surrogate is also known as a leading-surrogate.
|
|
// See http://www.unicode.org/glossary/#high_surrogate_code_unit
|
|
static constexpr uint16_t kMinHighSurrogate = 0xd800;
|
|
|
|
// The minimum value of a unicode low-surrogate code unit in the utf-16
|
|
// encoding. A low-surrogate is also known as a trailing-surrogate.
|
|
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
|
|
static constexpr uint16_t kMinLowSurrogate = 0xdc00;
|
|
|
|
// The maximum value of a unicode low-surrogate code unit in the utf-16
|
|
// encoding. A low-surrogate is also known as a trailing surrogate.
|
|
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
|
|
static constexpr uint16_t kMaxLowSurrogate = 0xdfff;
|
|
|
|
// The minimum value of a unicode supplementary code point.
|
|
// See http://www.unicode.org/glossary/#supplementary_code_point
|
|
static constexpr uint32_t kMinSupplementaryCodePoint = 0x010000;
|
|
|
|
// The maximum value of a unicode code point.
|
|
// See http://www.unicode.org/glossary/#code_point
|
|
static constexpr uint32_t kMaxCodePoint = 0x10ffff;
|
|
|
|
// Indicates decoding failure; not a valid Unicode scalar.
|
|
static constexpr uint32_t kErrorSentinel = 0xaaaaaaaa;
|
|
|
|
// A Unicode Scalar encoded two ways.
|
|
struct Utf8Scalar {
|
|
// The Unicode scalar value as a 32-bit integer. If decoding failed, this
|
|
// is equal to kErrorSentinel.
|
|
uint32_t u32;
|
|
// The Unicode scalar value encoded as UTF-8 bytes. May not reflect the
|
|
// contents of `u32` if it is kErrorSentinel.
|
|
absl::string_view utf8;
|
|
};
|
|
|
|
// Parses a single UTF-8-encoded Unicode scalar from `str`. Returns a pair of
|
|
// the scalar and the UTF-8-encoded content corresponding to it from `str`.
|
|
//
|
|
// Returns U+FFFD on failure, and consumes an unspecified number of bytes in
|
|
// doing so.
|
|
static Utf8Scalar ConsumeUtf8Scalar(absl::string_view& str) {
|
|
ABSL_DCHECK(!str.empty());
|
|
uint32_t scalar = static_cast<uint8_t>(str[0]);
|
|
const char* start = str.data();
|
|
size_t len = 1;
|
|
|
|
str = str.substr(1);
|
|
|
|
// Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
|
|
// one of the following (big-endian) patterns:
|
|
//
|
|
// 0b0xxxxxxx
|
|
// 0b110xxxxx'10xxxxxx
|
|
// 0b1110xxxx'10xxxxxx'10xxxxxx
|
|
// 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
|
|
//
|
|
// We don't need to decode it; just validate it.
|
|
int lookahead = 0;
|
|
switch (absl::countl_one(static_cast<uint8_t>(scalar))) {
|
|
case 0:
|
|
break;
|
|
case 2:
|
|
lookahead = 1;
|
|
scalar &= (1 << 5) - 1;
|
|
break;
|
|
case 3:
|
|
lookahead = 2;
|
|
scalar &= (1 << 4) - 1;
|
|
break;
|
|
case 4:
|
|
lookahead = 3;
|
|
scalar &= (1 << 3) - 1;
|
|
break;
|
|
default:
|
|
scalar = kErrorSentinel;
|
|
break;
|
|
}
|
|
|
|
for (int i = 0; i < lookahead; ++i) {
|
|
if (str.empty()) {
|
|
scalar = kErrorSentinel;
|
|
break;
|
|
}
|
|
|
|
uint8_t next = str[0];
|
|
str = str.substr(1);
|
|
++len;
|
|
|
|
// Looking for top 2 bits are 0b10.
|
|
if (next >> 6 != 2) {
|
|
scalar = kErrorSentinel;
|
|
break;
|
|
}
|
|
next &= (1 << 6) - 1;
|
|
scalar <<= 6;
|
|
scalar |= next;
|
|
}
|
|
|
|
if (scalar > kMaxCodePoint) {
|
|
scalar = kErrorSentinel;
|
|
}
|
|
|
|
return {scalar, absl::string_view(start, len)};
|
|
}
|
|
|
|
// Decides whether we must escape `scalar`.
|
|
//
|
|
// If the given Unicode scalar would not use a \u escape, `custom_escape` will
|
|
// be set to a non-empty string.
|
|
static bool MustEscape(uint32_t scalar, absl::string_view& custom_escape) {
|
|
switch (scalar) {
|
|
// These escapes are defined by the JSON spec. We do not escape /.
|
|
case '\n':
|
|
custom_escape = R"(\n)";
|
|
return true;
|
|
case '\r':
|
|
custom_escape = R"(\r)";
|
|
return true;
|
|
case '\t':
|
|
custom_escape = R"(\t)";
|
|
return true;
|
|
case '\"':
|
|
custom_escape = R"(\")";
|
|
return true;
|
|
case '\f':
|
|
custom_escape = R"(\f)";
|
|
return true;
|
|
case '\b':
|
|
custom_escape = R"(\b)";
|
|
return true;
|
|
case '\\':
|
|
custom_escape = R"(\\)";
|
|
return true;
|
|
|
|
case kErrorSentinel:
|
|
// Decoding failure turns into spaces, *not* replacement characters. We
|
|
// handle this separately from "normal" spaces so that it follows the
|
|
// escaping code-path.
|
|
//
|
|
// Note that literal replacement characters in the input string DO NOT
|
|
// get turned into spaces; this is only for decoding failures!
|
|
custom_escape = " ";
|
|
return true;
|
|
|
|
// These are not required by the JSON spec, but help
|
|
// to prevent security bugs in JavaScript.
|
|
//
|
|
// These were originally present in the ESF parser, so they are kept for
|
|
// legacy compatibility (and because escaping most of these is in good
|
|
// taste, regardless).
|
|
case '<':
|
|
case '>':
|
|
case 0xfeff: // Zero width no-break space.
|
|
case 0xfff9: // Interlinear annotation anchor.
|
|
case 0xfffa: // Interlinear annotation separator.
|
|
case 0xfffb: // Interlinear annotation terminator.
|
|
case 0x00ad: // Soft-hyphen.
|
|
case 0x06dd: // Arabic end of ayah.
|
|
case 0x070f: // Syriac abbreviation mark.
|
|
case 0x17b4: // Khmer vowel inherent Aq.
|
|
case 0x17b5: // Khmer vowel inherent Aa.
|
|
case 0x000e0001: // Language tag.
|
|
return true;
|
|
default:
|
|
static constexpr std::pair<uint32_t, uint32_t> kEscapedRanges[] = {
|
|
{0x0000, 0x001f}, // ASCII control.
|
|
{0x007f, 0x009f}, // High ASCII bytes.
|
|
{0x0600, 0x0603}, // Arabic signs.
|
|
{0x200b, 0x200f}, // Zero width etc.
|
|
{0x2028, 0x202e}, // Separators etc.
|
|
{0x2060, 0x2064}, // Invisible etc.
|
|
{0x206a, 0x206f}, // Shaping etc.
|
|
{0x0001d173, 0x0001d17a}, // Music formatting.
|
|
{0x000e0020, 0x000e007f}, // TAG symbols.
|
|
};
|
|
|
|
return absl::c_any_of(kEscapedRanges, [scalar](auto range) {
|
|
return range.first <= scalar && scalar <= range.second;
|
|
});
|
|
}
|
|
}
|
|
|
|
void JsonWriter::WriteEscapedUtf8(absl::string_view str) {
|
|
while (!str.empty()) {
|
|
auto scalar = ConsumeUtf8Scalar(str);
|
|
absl::string_view custom_escape;
|
|
|
|
if (!MustEscape(scalar.u32, custom_escape)) {
|
|
Write(scalar.utf8);
|
|
continue;
|
|
}
|
|
|
|
if (!custom_escape.empty()) {
|
|
Write(custom_escape);
|
|
continue;
|
|
}
|
|
|
|
if (scalar.u32 < 0x10000) {
|
|
WriteUEscape(scalar.u32);
|
|
continue;
|
|
}
|
|
|
|
uint16_t lo =
|
|
(scalar.u32 & (kMaxLowSurrogate - kMinLowSurrogate)) + kMinLowSurrogate;
|
|
uint16_t hi = (scalar.u32 >> 10) +
|
|
(kMinHighSurrogate - (kMinSupplementaryCodePoint >> 10));
|
|
WriteUEscape(hi);
|
|
WriteUEscape(lo);
|
|
}
|
|
}
|
|
|
|
void JsonWriter::WriteUEscape(uint16_t val) {
|
|
char hex[7];
|
|
int len = absl::SNPrintF(hex, sizeof(hex), R"(\u%04x)", val);
|
|
Write(absl::string_view(hex, static_cast<size_t>(len)));
|
|
}
|
|
} // namespace json_internal
|
|
} // namespace protobuf
|
|
} // namespace google
|