// Protocol Buffers - Google's data interchange format // Copyright 2008 Google Inc. All rights reserved. // https://developers.google.com/protocol-buffers/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "google/protobuf/json/internal/parser.h" #include #include #include #include #include #include #include #include #include "google/protobuf/type.pb.h" #include "google/protobuf/descriptor.h" #include "google/protobuf/dynamic_message.h" #include "google/protobuf/message.h" #include "absl/base/attributes.h" #include "absl/container/flat_hash_set.h" #include "absl/log/absl_check.h" #include "absl/log/absl_log.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_format.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "absl/types/span.h" #include "google/protobuf/io/zero_copy_sink.h" #include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream_impl_lite.h" #include "google/protobuf/json/internal/descriptor_traits.h" #include "google/protobuf/json/internal/lexer.h" #include "google/protobuf/json/internal/parser_traits.h" #include "google/protobuf/util/type_resolver.h" #include "google/protobuf/stubs/status_macros.h" // Must be included last. #include "google/protobuf/port_def.inc" namespace google { namespace protobuf { namespace json_internal { namespace { // This file contains code that drives a JsonLexer to visit a JSON document and // convert it into some form of proto. // // This semantic layer is duplicated: proto2-ish code can deserialize directly // into a message, whereas proto3-ish code deserializes into a byte stream, // using TypeResolvers instead of Descriptors. // // The parsing code is templated over which of these two reflection + output // combinations is used. The traits types that collect the per-instantiation // functionality can be found in json_util2_parser_traits-inl.h. // This table maps an unsigned `char` value, interpreted as an ASCII character, // to a corresponding value in the base64 alphabet (both traditional and // "web-safe" characters are included). // // If a character is not valid base64, it maps to -1; this is used by the bit // operations that assemble a base64-encoded word to determine if an error // occurred, by checking the sign bit. constexpr signed char kBase64Table[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62 /*+*/, -1, 62 /*-*/, -1, 63 /*/ */, 52 /*0*/, 53 /*1*/, 54 /*2*/, 55 /*3*/, 56 /*4*/, 57 /*5*/, 58 /*6*/, 59 /*7*/, 60 /*8*/, 61 /*9*/, -1, -1, -1, -1, -1, -1, -1, 0 /*A*/, 1 /*B*/, 2 /*C*/, 3 /*D*/, 4 /*E*/, 5 /*F*/, 6 /*G*/, 07 /*H*/, 8 /*I*/, 9 /*J*/, 10 /*K*/, 11 /*L*/, 12 /*M*/, 13 /*N*/, 14 /*O*/, 15 /*P*/, 16 /*Q*/, 17 /*R*/, 18 /*S*/, 19 /*T*/, 20 /*U*/, 21 /*V*/, 22 /*W*/, 23 /*X*/, 24 /*Y*/, 25 /*Z*/, -1, -1, -1, -1, 63 /*_*/, -1, 26 /*a*/, 27 /*b*/, 28 /*c*/, 29 /*d*/, 30 /*e*/, 31 /*f*/, 32 /*g*/, 33 /*h*/, 34 /*i*/, 35 /*j*/, 36 /*k*/, 37 /*l*/, 38 /*m*/, 39 /*n*/, 40 /*o*/, 41 /*p*/, 42 /*q*/, 43 /*r*/, 44 /*s*/, 45 /*t*/, 46 /*u*/, 47 /*v*/, 48 /*w*/, 49 /*x*/, 50 /*y*/, 51 /*z*/, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; uint32_t Base64Lookup(char c) { // Sign-extend return value so high bit will be set on any unexpected char. return static_cast(kBase64Table[static_cast(c)]); } // Decodes `base64` in-place, shrinking the length as appropriate. absl::StatusOr> DecodeBase64InPlace(absl::Span base64) { // We decode in place. This is safe because this is a new buffer (not // aliasing the input) and because base64 decoding shrinks 4 bytes into 3. char* out = base64.data(); const char* ptr = base64.data(); const char* end = ptr + base64.size(); const char* end4 = ptr + (base64.size() & ~3u); for (; ptr < end4; ptr += 4, out += 3) { auto val = Base64Lookup(ptr[0]) << 18 | Base64Lookup(ptr[1]) << 12 | Base64Lookup(ptr[2]) << 6 | Base64Lookup(ptr[3]) << 0; if (static_cast(val) < 0) { // Junk chars or padding. Remove trailing padding, if any. if (end - ptr == 4 && ptr[3] == '=') { if (ptr[2] == '=') { end -= 2; } else { end -= 1; } } break; } out[0] = val >> 16; out[1] = (val >> 8) & 0xff; out[2] = val & 0xff; } if (ptr < end) { uint32_t val = ~0u; switch (end - ptr) { case 2: val = Base64Lookup(ptr[0]) << 18 | Base64Lookup(ptr[1]) << 12; out[0] = val >> 16; out += 1; break; case 3: val = Base64Lookup(ptr[0]) << 18 | Base64Lookup(ptr[1]) << 12 | Base64Lookup(ptr[2]) << 6; out[0] = val >> 16; out[1] = (val >> 8) & 0xff; out += 2; break; } if (static_cast(val) < 0) { return absl::InvalidArgumentError("corrupt base64"); } } return absl::Span(base64.data(), static_cast(out - base64.data())); } template absl::StatusOr> ParseIntInner(JsonLexer& lex, double lo, double hi) { absl::StatusOr kind = lex.PeekKind(); RETURN_IF_ERROR(kind.status()); LocationWith n; switch (*kind) { case JsonLexer::kNum: { absl::StatusOr> x = lex.ParseRawNumber(); RETURN_IF_ERROR(x.status()); n.loc = x->loc; if (absl::SimpleAtoi(x->value.AsView(), &n.value)) { break; } double d; if (!absl::SimpleAtod(x->value.AsView(), &d) || !std::isfinite(d)) { return x->loc.Invalid( absl::StrFormat("invalid number: '%s'", x->value.AsView())); } // Conversion overflow here would be UB. if (lo > d || d > hi) { return lex.Invalid("JSON number out of range for int"); } n.value = static_cast(d); if (d - static_cast(n.value) != 0) { return lex.Invalid( "expected integer, but JSON number had fractional part"); } break; } case JsonLexer::kStr: { absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); // SimpleAtoi will ignore leading and trailing whitespace, so we need // to check for it ourselves. for (char c : str->value.AsView()) { if (absl::ascii_isspace(c)) { return lex.Invalid("non-number characters in quoted number"); } } if (!absl::SimpleAtoi(str->value.AsView(), &n.value)) { return str->loc.Invalid("non-number characters in quoted number"); } n.loc = str->loc; break; } default: return lex.Invalid("expected number or string"); } return n; } template absl::StatusOr ParseInt(JsonLexer& lex, Field field) { absl::StatusOr> n = ParseIntInner(lex, -9007199254740992.0, 9007199254740992.0); RETURN_IF_ERROR(n.status()); if (Traits::Is32Bit(field)) { if (std::numeric_limits::min() > n->value || n->value > std::numeric_limits::max()) { return n->loc.Invalid("integer out of range"); } } return n->value; } template absl::StatusOr ParseUInt(JsonLexer& lex, Field field) { absl::StatusOr> n = ParseIntInner(lex, 0, 18014398509481984.0); RETURN_IF_ERROR(n.status()); if (Traits::Is32Bit(field)) { if (n->value > std::numeric_limits::max()) { return n->loc.Invalid("integer out of range"); } } return n->value; } template absl::StatusOr ParseFp(JsonLexer& lex, Field field) { absl::StatusOr kind = lex.PeekKind(); RETURN_IF_ERROR(kind.status()); double n; switch (*kind) { case JsonLexer::kNum: { absl::StatusOr> d = lex.ParseNumber(); RETURN_IF_ERROR(d.status()); n = d->value; break; } case JsonLexer::kStr: { absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); if (str->value == "NaN") { n = NAN; } else if (str->value == "Infinity") { n = INFINITY; } else if (str->value == "-Infinity") { n = -INFINITY; } else if (!absl::SimpleAtod(str->value.AsView(), &n)) { return str->loc.Invalid("non-number characters in quoted number"); } break; } default: return lex.Invalid("expected number or string"); } if (Traits::Is32Bit(field)) { // Detect out-of-range 32-bit floats by seeing whether the conversion result // is still finite. Finite extreme values may have textual representations // that parse to 64-bit values outside the 32-bit range, but which are // closer to the 32-bit extreme than to the "next value with the same // precision". if (std::isfinite(n) && !std::isfinite(static_cast(n))) { return lex.Invalid("float out of range"); } } return n; } template absl::StatusOr ParseStrOrBytes(JsonLexer& lex, Field field) { absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); if (Traits::FieldType(field) == FieldDescriptor::TYPE_BYTES) { std::string& b64 = str->value.ToString(); absl::StatusOr> decoded = DecodeBase64InPlace(absl::MakeSpan(&b64[0], b64.size())); if (!decoded.ok()) { return str->loc.Invalid(decoded.status().message()); } b64.resize(decoded->size()); } return std::move(str->value.ToString()); } template absl::StatusOr> ParseEnumFromStr(JsonLexer& lex, MaybeOwnedString& str, Field field) { absl::StatusOr value = Traits::EnumNumberByName( field, str.AsView(), lex.options().case_insensitive_enum_parsing); if (value.ok()) { return absl::optional(*value); } int32_t i; if (absl::SimpleAtoi(str.AsView(), &i)) { return absl::optional(i); } else if (lex.options().ignore_unknown_fields) { return {absl::nullopt}; } return value.status(); } // Parses an enum; can return nullopt if a quoted enumerator that we don't // know about is received and `ignore_unknown_fields` is set. template absl::StatusOr> ParseEnum(JsonLexer& lex, Field field) { absl::StatusOr kind = lex.PeekKind(); RETURN_IF_ERROR(kind.status()); int32_t n = 0; switch (*kind) { case JsonLexer::kStr: { absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); auto e = ParseEnumFromStr(lex, str->value, field); RETURN_IF_ERROR(e.status()); if (!e->has_value()) { return {absl::nullopt}; } n = **e; break; } case JsonLexer::kNum: return ParseInt(lex, field); default: return lex.Invalid("expected number or string"); } return n; } // Mutually recursive with functions that follow. template absl::Status ParseMessage(JsonLexer& lex, const Desc& desc, Msg& msg, bool any_reparse); template absl::Status ParseField(JsonLexer& lex, const Desc& desc, absl::string_view name, Msg& msg); template absl::Status ParseSingular(JsonLexer& lex, Field field, Msg& msg) { auto field_type = Traits::FieldType(field); if (lex.Peek(JsonLexer::kNull)) { auto message_type = ClassifyMessage(Traits::FieldTypeName(field)); switch (field_type) { case FieldDescriptor::TYPE_ENUM: if (message_type == MessageType::kNull) { Traits::SetEnum(field, msg, 0); } break; case FieldDescriptor::TYPE_MESSAGE: { if (message_type == MessageType::kValue) { return Traits::NewMsg( field, msg, [&](const Desc& type, Msg& msg) -> absl::Status { auto field = Traits::FieldByNumber(type, 1); ABSL_DCHECK(field.has_value()); RETURN_IF_ERROR(lex.Expect("null")); Traits::SetEnum(Traits::MustHaveField(type, 1), msg, 0); return absl::OkStatus(); }); } break; } default: break; } return lex.Expect("null"); } switch (field_type) { case FieldDescriptor::TYPE_FLOAT: { auto x = ParseFp(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetFloat(field, msg, *x); break; } case FieldDescriptor::TYPE_DOUBLE: { auto x = ParseFp(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetDouble(field, msg, *x); break; } case FieldDescriptor::TYPE_SFIXED64: case FieldDescriptor::TYPE_SINT64: case FieldDescriptor::TYPE_INT64: { auto x = ParseInt(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetInt64(field, msg, *x); break; } case FieldDescriptor::TYPE_FIXED64: case FieldDescriptor::TYPE_UINT64: { auto x = ParseUInt(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetUInt64(field, msg, *x); break; } case FieldDescriptor::TYPE_SFIXED32: case FieldDescriptor::TYPE_SINT32: case FieldDescriptor::TYPE_INT32: { auto x = ParseInt(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetInt32(field, msg, static_cast(*x)); break; } case FieldDescriptor::TYPE_FIXED32: case FieldDescriptor::TYPE_UINT32: { auto x = ParseUInt(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetUInt32(field, msg, static_cast(*x)); break; } case FieldDescriptor::TYPE_BOOL: { absl::StatusOr kind = lex.PeekKind(); RETURN_IF_ERROR(kind.status()); switch (*kind) { case JsonLexer::kTrue: RETURN_IF_ERROR(lex.Expect("true")); Traits::SetBool(field, msg, true); break; case JsonLexer::kFalse: RETURN_IF_ERROR(lex.Expect("false")); Traits::SetBool(field, msg, false); break; case JsonLexer::kStr: { if (!lex.options().allow_legacy_syntax) { goto bad; } auto x = lex.ParseUtf8(); RETURN_IF_ERROR(x.status()); bool flag; if (!absl::SimpleAtob(x->value, &flag)) { // Is this error a lie? Do we accept things otyher than "true" and // "false" because SimpleAtob does? Absolutely! return x->loc.Invalid("expected 'true' or 'false'"); } Traits::SetBool(field, msg, flag); break; } bad: default: return lex.Invalid("expected 'true' or 'false'"); } break; } case FieldDescriptor::TYPE_STRING: case FieldDescriptor::TYPE_BYTES: { auto x = ParseStrOrBytes(lex, field); RETURN_IF_ERROR(x.status()); Traits::SetString(field, msg, *x); break; } case FieldDescriptor::TYPE_ENUM: { absl::StatusOr> x = ParseEnum(lex, field); RETURN_IF_ERROR(x.status()); if (x->has_value() || Traits::IsImplicitPresence(field)) { Traits::SetEnum(field, msg, x->value_or(0)); } break; } case FieldDescriptor::TYPE_MESSAGE: case FieldDescriptor::TYPE_GROUP: { return Traits::NewMsg( field, msg, [&](const Desc& type, Msg& msg) -> absl::Status { return ParseMessage(lex, type, msg, /*any_reparse=*/false); }); } default: return lex.Invalid( absl::StrCat("unsupported field type: ", Traits::FieldType(field))); } return absl::OkStatus(); } template absl::Status EmitNull(JsonLexer& lex, Field field, Msg& msg) { switch (Traits::FieldType(field)) { case FieldDescriptor::TYPE_FLOAT: Traits::SetFloat(field, msg, 0); break; case FieldDescriptor::TYPE_DOUBLE: Traits::SetDouble(field, msg, 0); break; case FieldDescriptor::TYPE_SFIXED64: case FieldDescriptor::TYPE_SINT64: case FieldDescriptor::TYPE_INT64: Traits::SetInt64(field, msg, 0); break; case FieldDescriptor::TYPE_FIXED64: case FieldDescriptor::TYPE_UINT64: Traits::SetUInt64(field, msg, 0); break; case FieldDescriptor::TYPE_SFIXED32: case FieldDescriptor::TYPE_SINT32: case FieldDescriptor::TYPE_INT32: Traits::SetInt32(field, msg, 0); break; case FieldDescriptor::TYPE_FIXED32: case FieldDescriptor::TYPE_UINT32: Traits::SetUInt32(field, msg, 0); break; case FieldDescriptor::TYPE_BOOL: Traits::SetBool(field, msg, false); break; case FieldDescriptor::TYPE_STRING: case FieldDescriptor::TYPE_BYTES: Traits::SetString(field, msg, ""); break; case FieldDescriptor::TYPE_ENUM: Traits::SetEnum(field, msg, 0); break; case FieldDescriptor::TYPE_MESSAGE: case FieldDescriptor::TYPE_GROUP: return Traits::NewMsg(field, msg, [](const auto&, const auto&) -> absl::Status { return absl::OkStatus(); }); default: return lex.Invalid( absl::StrCat("unsupported field type: ", Traits::FieldType(field))); } return absl::OkStatus(); } template absl::Status ParseArray(JsonLexer& lex, Field field, Msg& msg) { if (lex.Peek(JsonLexer::kNull)) { return lex.Expect("null"); } return lex.VisitArray([&]() -> absl::Status { lex.path().NextRepeated(); MessageType type = ClassifyMessage(Traits::FieldTypeName(field)); if (lex.Peek(JsonLexer::kNull)) { if (type == MessageType::kValue) { return ParseSingular(lex, field, msg); } if (type == MessageType::kNull) { return ParseSingular(lex, field, msg); } if (lex.options().allow_legacy_syntax) { RETURN_IF_ERROR(lex.Expect("null")); return EmitNull(lex, field, msg); } return lex.Invalid("null cannot occur inside of repeated fields"); } // Note that this is sufficient to catch when we are inside of a ListValue, // because a ListValue's sole field is of type Value. Thus, we only need to // classify cases in which we are inside of an array and parsing messages // that like looking like arrays. // // This will also correctly handle e.g. writing out a ListValue with the // legacy syntax of `{"values": [[0], [1], [2]]}`, which does not go through // the custom parser handler. bool can_flatten = type != MessageType::kValue && type != MessageType::kList; if (can_flatten && lex.options().allow_legacy_syntax && lex.Peek(JsonLexer::kArr)) { // You read that right. In legacy mode, if we encounter an array within // an array, we just flatten it as part of the current array! // // This DOES NOT apply when parsing a google.protobuf.Value or a // google.protobuf.ListValue! return ParseArray(lex, field, msg); } return ParseSingular(lex, field, msg); }); } template absl::Status ParseMap(JsonLexer& lex, Field field, Msg& msg) { if (lex.Peek(JsonLexer::kNull)) { return lex.Expect("null"); } absl::flat_hash_set keys_seen; return lex.VisitObject( [&](LocationWith& key) -> absl::Status { lex.path().NextRepeated(); auto insert_result = keys_seen.emplace(key.value.AsView()); if (!insert_result.second) { return key.loc.Invalid(absl::StrFormat( "got unexpectedly-repeated repeated map key: '%s'", key.value.AsView())); } return Traits::NewMsg( field, msg, [&](const Desc& type, Msg& entry) -> absl::Status { auto key_field = Traits::KeyField(type); switch (Traits::FieldType(key_field)) { case FieldDescriptor::TYPE_INT64: case FieldDescriptor::TYPE_SINT64: case FieldDescriptor::TYPE_SFIXED64: { int64_t n; if (!absl::SimpleAtoi(key.value.AsView(), &n)) { return key.loc.Invalid( "non-number characters in quoted number"); } Traits::SetInt64(key_field, entry, n); break; } case FieldDescriptor::TYPE_UINT64: case FieldDescriptor::TYPE_FIXED64: { uint64_t n; if (!absl::SimpleAtoi(key.value.AsView(), &n)) { return key.loc.Invalid( "non-number characters in quoted number"); } Traits::SetUInt64(key_field, entry, n); break; } case FieldDescriptor::TYPE_INT32: case FieldDescriptor::TYPE_SINT32: case FieldDescriptor::TYPE_SFIXED32: { int32_t n; if (!absl::SimpleAtoi(key.value.AsView(), &n)) { return key.loc.Invalid( "non-number characters in quoted number"); } Traits::SetInt32(key_field, entry, n); break; } case FieldDescriptor::TYPE_UINT32: case FieldDescriptor::TYPE_FIXED32: { uint32_t n; if (!absl::SimpleAtoi(key.value.AsView(), &n)) { return key.loc.Invalid( "non-number characters in quoted number"); } Traits::SetUInt32(key_field, entry, n); break; } case FieldDescriptor::TYPE_BOOL: { if (key.value == "true") { Traits::SetBool(key_field, entry, true); } else if (key.value == "false") { Traits::SetBool(key_field, entry, false); } else { return key.loc.Invalid(absl::StrFormat( "expected bool string, got '%s'", key.value.AsView())); } break; } case FieldDescriptor::TYPE_ENUM: { MaybeOwnedString key_str = key.value; auto e = ParseEnumFromStr(lex, key_str, field); RETURN_IF_ERROR(e.status()); Traits::SetEnum(key_field, entry, e->value_or(0)); break; } case FieldDescriptor::TYPE_STRING: { Traits::SetString(key_field, entry, std::move(key.value.ToString())); break; } default: return lex.Invalid("unsupported map key type"); } return ParseSingular(lex, Traits::ValueField(type), entry); }); }); } absl::optional TakeTimeDigitsWithSuffixAndAdvance( absl::string_view& data, int max_digits, absl::string_view end) { ABSL_DCHECK_LE(max_digits, 9); uint32_t val = 0; int limit = max_digits; while (!data.empty()) { if (limit-- < 0) { return absl::nullopt; } uint32_t digit = data[0] - '0'; if (digit >= 10) { break; } val *= 10; val += digit; data = data.substr(1); } if (!absl::StartsWith(data, end)) { return absl::nullopt; } data = data.substr(end.size()); return val; } absl::optional TakeNanosAndAdvance(absl::string_view& data) { int32_t frac_secs = 0; size_t frac_digits = 0; if (absl::StartsWith(data, ".")) { for (char c : data.substr(1)) { if (!absl::ascii_isdigit(c)) { break; } ++frac_digits; } auto digits = data.substr(1, frac_digits); if (frac_digits == 0 || frac_digits > 9 || !absl::SimpleAtoi(digits, &frac_secs)) { return absl::nullopt; } data = data.substr(frac_digits + 1); } for (int i = 0; i < 9 - frac_digits; ++i) { frac_secs *= 10; } return frac_secs; } template absl::Status ParseTimestamp(JsonLexer& lex, const Desc& desc, Msg& msg) { if (lex.Peek(JsonLexer::kNull)) { return lex.Expect("null"); } absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); absl::string_view data = str->value.AsView(); if (data.size() < 20) { return str->loc.Invalid("timestamp string too short"); } int64_t secs; { /* 1972-01-01T01:00:00 */ auto year = TakeTimeDigitsWithSuffixAndAdvance(data, 4, "-"); if (!year.has_value() || *year == 0) { return str->loc.Invalid("bad year in timestamp"); } auto mon = TakeTimeDigitsWithSuffixAndAdvance(data, 2, "-"); if (!mon.has_value() || *mon == 0) { return str->loc.Invalid("bad month in timestamp"); } auto day = TakeTimeDigitsWithSuffixAndAdvance(data, 2, "T"); if (!day.has_value() || *day == 0) { return str->loc.Invalid("bad day in timestamp"); } auto hour = TakeTimeDigitsWithSuffixAndAdvance(data, 2, ":"); if (!hour.has_value()) { return str->loc.Invalid("bad hours in timestamp"); } auto min = TakeTimeDigitsWithSuffixAndAdvance(data, 2, ":"); if (!min.has_value()) { return str->loc.Invalid("bad minutes in timestamp"); } auto sec = TakeTimeDigitsWithSuffixAndAdvance(data, 2, ""); if (!sec.has_value()) { return str->loc.Invalid("bad seconds in timestamp"); } uint32_t m_adj = *mon - 3; // March-based month. uint32_t carry = m_adj > *mon ? 1 : 0; uint32_t year_base = 4800; // Before min year, multiple of 400. uint32_t y_adj = *year + year_base - carry; uint32_t month_days = ((m_adj + carry * 12) * 62719 + 769) / 2048; uint32_t leap_days = y_adj / 4 - y_adj / 100 + y_adj / 400; int32_t epoch_days = y_adj * 365 + leap_days + month_days + (*day - 1) - 2472632; secs = int64_t{epoch_days} * 86400 + *hour * 3600 + *min * 60 + *sec; } auto nanos = TakeNanosAndAdvance(data); if (!nanos.has_value()) { return str->loc.Invalid("timestamp had bad nanoseconds"); } if (data.empty()) { return str->loc.Invalid("timestamp missing timezone offset"); } { // [+-]hh:mm or Z bool neg = false; switch (data[0]) { case '-': neg = true; ABSL_FALLTHROUGH_INTENDED; case '+': { if (data.size() != 6) { return str->loc.Invalid("timestamp offset of wrong size."); } data = data.substr(1); auto hour = TakeTimeDigitsWithSuffixAndAdvance(data, 2, ":"); auto mins = TakeTimeDigitsWithSuffixAndAdvance(data, 2, ""); if (!hour.has_value() || !mins.has_value()) { return str->loc.Invalid("timestamp offset has bad hours and minutes"); } int64_t offset = (*hour * 60 + *mins) * 60; secs += (neg ? offset : -offset); break; } // Lowercase z is not accepted, per the spec. case 'Z': if (data.size() == 1) { break; } ABSL_FALLTHROUGH_INTENDED; default: return str->loc.Invalid("bad timezone offset"); } } Traits::SetInt64(Traits::MustHaveField(desc, 1), msg, secs); Traits::SetInt32(Traits::MustHaveField(desc, 2), msg, *nanos); return absl::OkStatus(); } template absl::Status ParseDuration(JsonLexer& lex, const Desc& desc, Msg& msg) { if (lex.Peek(JsonLexer::kNull)) { return lex.Expect("null"); } constexpr int64_t kMaxSeconds = int64_t{3652500} * 86400; absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); size_t int_part_end = 0; for (char c : str->value.AsView()) { if (!absl::ascii_isdigit(c) && c != '-') { break; } ++int_part_end; } if (int_part_end == 0) { return str->loc.Invalid("duration must start with an integer"); } absl::string_view sec_digits = str->value.AsView().substr(0, int_part_end); int64_t secs; if (!absl::SimpleAtoi(sec_digits, &secs)) { return str->loc.Invalid("duration had bad seconds"); } if (secs > kMaxSeconds || secs < -kMaxSeconds) { return str->loc.Invalid("duration out of range"); } absl::string_view rest = str->value.AsView().substr(int_part_end); auto nanos = TakeNanosAndAdvance(rest); if (!nanos.has_value()) { return str->loc.Invalid("duration had bad nanoseconds"); } bool isNegative = (secs < 0) || absl::StartsWith(sec_digits, "-"); if (isNegative) { *nanos *= -1; } if (rest != "s") { return str->loc.Invalid("duration must end with a single 's'"); } Traits::SetInt64(Traits::MustHaveField(desc, 1), msg, secs); Traits::SetInt32(Traits::MustHaveField(desc, 2), msg, *nanos); return absl::OkStatus(); } template absl::Status ParseFieldMask(JsonLexer& lex, const Desc& desc, Msg& msg) { absl::StatusOr> str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); auto paths = str->value.AsView(); // The special case of the empty string is not handled correctly below, // because StrSplit("", ',') is [""], not []. if (paths.empty()) { return absl::OkStatus(); } // google.protobuf.FieldMask has a single field with number 1. auto paths_field = Traits::MustHaveField(desc, 1); for (absl::string_view path : absl::StrSplit(paths, ',')) { std::string snake_path; // Assume approximately six-letter words, so add one extra space for an // underscore for every six bytes. snake_path.reserve(path.size() * 7 / 6); for (char c : path) { if (absl::ascii_isdigit(c) || absl::ascii_islower(c) || c == '.') { snake_path.push_back(c); } else if (absl::ascii_isupper(c)) { snake_path.push_back('_'); snake_path.push_back(absl::ascii_tolower(c)); } else if (lex.options().allow_legacy_syntax) { snake_path.push_back(c); } else { return str->loc.Invalid("unexpected character in FieldMask"); } } Traits::SetString(paths_field, msg, snake_path); } return absl::OkStatus(); } template absl::Status ParseAny(JsonLexer& lex, const Desc& desc, Msg& msg) { // Buffer an entire object. Because @type can occur anywhere, we're forced // to do this. RETURN_IF_ERROR(lex.SkipToToken()); auto mark = lex.BeginMark(); // Search for @type, buffering the entire object along the way so we can // reparse it. absl::optional type_url; RETURN_IF_ERROR(lex.VisitObject( [&](const LocationWith& key) -> absl::Status { if (key.value == "@type") { if (type_url.has_value()) { return key.loc.Invalid("repeated @type in Any"); } absl::StatusOr> maybe_url = lex.ParseUtf8(); RETURN_IF_ERROR(maybe_url.status()); type_url = std::move(maybe_url)->value; return absl::OkStatus(); } return lex.SkipValue(); })); // Build a new lexer over the skipped object. absl::string_view any_text = mark.value.UpToUnread(); io::ArrayInputStream in(any_text.data(), any_text.size()); // Copying lex.options() is important; it inherits the recursion // limit. JsonLexer any_lex(&in, lex.options(), &lex.path(), mark.loc); if (!type_url.has_value() && !lex.options().allow_legacy_syntax) { return mark.loc.Invalid("missing @type in Any"); } if (type_url.has_value()) { Traits::SetString(Traits::MustHaveField(desc, 1), msg, type_url->AsView()); return Traits::NewDynamic( Traits::MustHaveField(desc, 2), type_url->ToString(), msg, [&](const Desc& desc, Msg& msg) { auto pop = any_lex.path().Push("", FieldDescriptor::TYPE_MESSAGE, Traits::TypeName(desc)); return ParseMessage(any_lex, desc, msg, /*any_reparse=*/true); }); } else { // Empty {} is accepted in legacy mode. ABSL_DCHECK(lex.options().allow_legacy_syntax); RETURN_IF_ERROR(any_lex.VisitObject([&](auto&) { return mark.loc.Invalid( "in legacy mode, missing @type in Any is only allowed for an empty " "object"); })); return absl::OkStatus(); } } // These are mutually recursive with ParseValue. template absl::Status ParseStructValue(JsonLexer& lex, const Desc& desc, Msg& msg); template absl::Status ParseListValue(JsonLexer& lex, const Desc& desc, Msg& msg); template absl::Status ParseValue(JsonLexer& lex, const Desc& desc, Msg& msg) { auto kind = lex.PeekKind(); RETURN_IF_ERROR(kind.status()); // NOTE: The field numbers 1 through 6 are the numbers of the oneof fields // in google.protobuf.Value. Conformance tests verify the correctness of // these numbers. switch (*kind) { case JsonLexer::kNull: { auto field = Traits::MustHaveField(desc, 1); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); RETURN_IF_ERROR(lex.Expect("null")); Traits::SetEnum(field, msg, 0); break; } case JsonLexer::kNum: { auto field = Traits::MustHaveField(desc, 2); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); auto number = lex.ParseNumber(); RETURN_IF_ERROR(number.status()); Traits::SetDouble(field, msg, number->value); break; } case JsonLexer::kStr: { auto field = Traits::MustHaveField(desc, 3); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); auto str = lex.ParseUtf8(); RETURN_IF_ERROR(str.status()); Traits::SetString(field, msg, std::move(str->value.ToString())); break; } case JsonLexer::kFalse: case JsonLexer::kTrue: { auto field = Traits::MustHaveField(desc, 4); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); // "Quoted" bools, including non-standard Abseil Atob bools, are not // supported, because all strings are treated as genuine JSON strings. if (*kind == JsonLexer::kTrue) { RETURN_IF_ERROR(lex.Expect("true")); Traits::SetBool(field, msg, true); } else { RETURN_IF_ERROR(lex.Expect("false")); Traits::SetBool(field, msg, false); } break; } case JsonLexer::kObj: { auto field = Traits::MustHaveField(desc, 5); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); return Traits::NewMsg(field, msg, [&](auto& desc, auto& msg) { return ParseStructValue(lex, desc, msg); }); } case JsonLexer::kArr: { auto field = Traits::MustHaveField(desc, 6); auto pop = lex.path().Push(Traits::FieldName(field), Traits::FieldType(field), Traits::FieldTypeName(field)); return Traits::NewMsg(field, msg, [&](auto& desc, auto& msg) { return ParseListValue(lex, desc, msg); }); } } return absl::OkStatus(); } template absl::Status ParseStructValue(JsonLexer& lex, const Desc& desc, Msg& msg) { auto entry_field = Traits::MustHaveField(desc, 1); auto pop = lex.path().Push("", FieldDescriptor::TYPE_MESSAGE, Traits::FieldTypeName(entry_field)); // Structs are always cleared even if set to {}. Traits::RecordAsSeen(entry_field, msg); // Parsing a map does the right thing: Struct has a single map field; keys are correctly parsed as strings, and the values // recurse into ParseMessage, which will be routed into ParseValue. This // results in some extra overhead, but performance is not what we're going // for here. return ParseMap(lex, entry_field, msg); } template absl::Status ParseListValue(JsonLexer& lex, const Desc& desc, Msg& msg) { auto entry_field = Traits::MustHaveField(desc, 1); auto pop = lex.path().Push("", FieldDescriptor::TYPE_MESSAGE, Traits::FieldTypeName(entry_field)); // ListValues are always cleared even if set to []. Traits::RecordAsSeen(entry_field, msg); // Parsing an array does the right thing: see the analogous comment in // ParseStructValue. return ParseArray(lex, entry_field, msg); } template absl::Status ParseField(JsonLexer& lex, const Desc& desc, absl::string_view name, Msg& msg) { absl::optional> field; if (absl::StartsWith(name, "[") && absl::EndsWith(name, "]")) { absl::string_view extn_name = name.substr(1, name.size() - 2); field = Traits::ExtensionByName(desc, extn_name); if (field.has_value()) { // The check for whether this is an invalid field occurs below, since it // is combined for both extension and non-extension fields. auto correct_type_name = Traits::TypeName(desc); if (Traits::TypeName(Traits::ContainingType(*field)) != correct_type_name) { return lex.Invalid(absl::StrFormat( "'%s' is a known extension name, but is not an extension " "of '%s' as expected", extn_name, correct_type_name)); } } } else { field = Traits::FieldByName(desc, name); } if (!field.has_value()) { if (!lex.options().ignore_unknown_fields) { return lex.Invalid(absl::StrFormat("no such field: '%s'", name)); } return lex.SkipValue(); } auto pop = lex.path().Push(name, Traits::FieldType(*field), Traits::FieldTypeName(*field)); if (Traits::HasParsed( *field, msg, /*allow_repeated_non_oneof=*/lex.options().allow_legacy_syntax) && !lex.Peek(JsonLexer::kNull)) { return lex.Invalid(absl::StrFormat( "'%s' has already been set (either directly or as part of a oneof)", name)); } if (Traits::IsMap(*field)) { return ParseMap(lex, *field, msg); } if (Traits::IsRepeated(*field)) { if (lex.options().allow_legacy_syntax && !lex.Peek(JsonLexer::kArr)) { // The original ESF parser permits a single element in place of an array // thereof. return ParseSingular(lex, *field, msg); } return ParseArray(lex, *field, msg); } return ParseSingular(lex, *field, msg); } template absl::Status ParseMessage(JsonLexer& lex, const Desc& desc, Msg& msg, bool any_reparse) { MessageType type = ClassifyMessage(Traits::TypeName(desc)); if (!any_reparse) { switch (type) { case MessageType::kAny: return ParseAny(lex, desc, msg); case MessageType::kValue: return ParseValue(lex, desc, msg); case MessageType::kStruct: return ParseStructValue(lex, desc, msg); default: break; } // For some types, the ESF parser permits parsing the "non-special" version. // It is not clear if this counts as out-of-spec, but we're treating it as // such. bool is_upcoming_object = lex.Peek(JsonLexer::kObj); if (!(is_upcoming_object && lex.options().allow_legacy_syntax)) { switch (type) { case MessageType::kList: return ParseListValue(lex, desc, msg); case MessageType::kWrapper: { return ParseSingular(lex, Traits::MustHaveField(desc, 1), msg); } case MessageType::kTimestamp: return ParseTimestamp(lex, desc, msg); case MessageType::kDuration: return ParseDuration(lex, desc, msg); case MessageType::kFieldMask: return ParseFieldMask(lex, desc, msg); default: break; } } } return lex.VisitObject( [&](LocationWith& name) -> absl::Status { // If this is a well-known type, we expect its contents to be inside // of a JSON field named "value". if (any_reparse) { if (name.value == "@type") { RETURN_IF_ERROR(lex.SkipValue()); return absl::OkStatus(); } if (type != MessageType::kNotWellKnown) { if (name.value != "value") { return lex.Invalid( "fields in a well-known-typed Any must be @type or value"); } // Parse the upcoming value as the message itself. This is *not* // an Any reparse because we do not expect to see @type in the // upcoming value. return ParseMessage(lex, desc, msg, /*any_reparse=*/false); } } return ParseField(lex, desc, name.value.AsView(), msg); }); } } // namespace absl::Status JsonStringToMessage(absl::string_view input, Message* message, json_internal::ParseOptions options) { MessagePath path(message->GetDescriptor()->full_name()); if (PROTOBUF_DEBUG) { ABSL_DLOG(INFO) << "json2/input: " << absl::CHexEscape(input); } io::ArrayInputStream in(input.data(), input.size()); JsonLexer lex(&in, options, &path); ParseProto2Descriptor::Msg msg(message); absl::Status s = ParseMessage(lex, *message->GetDescriptor(), msg, /*any_reparse=*/false); if (s.ok() && !lex.AtEof()) { s = absl::InvalidArgumentError( "extraneous characters after end of JSON object"); } if (PROTOBUF_DEBUG) { ABSL_DLOG(INFO) << "json2/status: " << s; ABSL_DLOG(INFO) << "json2/output: " << message->DebugString(); } return s; } absl::Status JsonToBinaryStream(google::protobuf::util::TypeResolver* resolver, const std::string& type_url, io::ZeroCopyInputStream* json_input, io::ZeroCopyOutputStream* binary_output, json_internal::ParseOptions options) { // NOTE: Most of the contortions in this function are to allow for capture of // input and output of the parser in ABSL_DLOG mode. Destruction order is very // critical in this function, because io::ZeroCopy*Stream types usually only // flush on destruction. // For ABSL_DLOG, we would like to print out the input and output, which // requires buffering both instead of doing "zero copy". This block, and the // one at the end of the function, set up and tear down interception of the // input and output streams. std::string copy; std::string out; absl::optional tee_input; absl::optional tee_output; if (PROTOBUF_DEBUG) { const void* data; int len; while (json_input->Next(&data, &len)) { copy.resize(copy.size() + len); std::memcpy(©[copy.size() - len], data, len); } tee_input.emplace(copy.data(), copy.size()); tee_output.emplace(&out); ABSL_DLOG(INFO) << "json2/input: " << absl::CHexEscape(copy); } // This scope forces the CodedOutputStream inside of `msg` to flush before we // possibly handle logging the binary protobuf output. absl::Status s; { MessagePath path(type_url); JsonLexer lex(tee_input.has_value() ? &*tee_input : json_input, options, &path); Msg msg(tee_output.has_value() ? &*tee_output : binary_output); ResolverPool pool(resolver); auto desc = pool.FindMessage(type_url); RETURN_IF_ERROR(desc.status()); s = ParseMessage(lex, **desc, msg, /*any_reparse=*/false); if (s.ok() && !lex.AtEof()) { s = absl::InvalidArgumentError( "extraneous characters after end of JSON object"); } } if (PROTOBUF_DEBUG) { tee_output.reset(); // Flush the output stream. io::zc_sink_internal::ZeroCopyStreamByteSink(binary_output) .Append(out.data(), out.size()); ABSL_DLOG(INFO) << "json2/status: " << s; ABSL_DLOG(INFO) << "json2/output: " << absl::BytesToHexString(out); } return s; } } // namespace json_internal } // namespace protobuf } // namespace google