On 01/23/2015 02:18 AM, Noah Misch wrote:
> On Wed, Jan 21, 2015 at 06:51:34PM -0500, Andrew Dunstan wrote:
>> The following case has just been brought to my attention (look at the
>> differing number of backslashes):
>>
>>     andrew=# select jsonb '"\\u0000"';
>>        jsonb
>>     ----------
>>      "\u0000"
>>     (1 row)
>>
>>     andrew=# select jsonb '"\u0000"';
>>        jsonb
>>     ----------
>>      "\u0000"
>>     (1 row)
>
> A mess indeed.  The input is unambiguous, but the jsonb representation can't
> distinguish "\u0000" from "\\u0000".  Some operations on the original json
> type have similar problems, since they use an in-memory binary representation
> with the same shortcoming:
>
> [local] test=# select json_array_element_text($$["\u0000"]$$, 0) =
> test-# json_array_element_text($$["\\u0000"]$$, 0);
>  ?column?
> ----------
>  t
> (1 row)
>
>> Things get worse, though. On output, '\uabcd' for any four hex digits is
>> recognized as a unicode escape, and thus the backslash is not escaped, so
>> that we get:
>>
>>     andrew=# select jsonb '"\\uabcd"';
>>        jsonb
>>     ----------
>>      "\uabcd"
>>     (1 row)
>>
>> We could probably fix this fairly easily for non-U+0000 cases by having
>> jsonb_to_cstring use a different escape_json routine.
>
> Sounds reasonable.  For 9.4.1, before more people upgrade?
>
>> But it's a mess, sadly, and I'm not sure what a good fix for the U+0000 case
>> would look like.
>
> Agreed.  When a string unescape algorithm removes some kinds of backslash
> escapes and not others, it's nigh inevitable that two semantically-distinct
> inputs can yield the same output.  json_lex_string() fell into that trap by
> making an exception for \u0000.  To fix this, the result needs to be fully
> unescaped (\u0000 converted to the NUL byte) or retain all backslash escapes.
> (Changing that either way is no fun now that an on-disk format is at stake.)
>
>> Maybe we should detect such input and emit a warning of ambiguity? It's
>> likely to be rare enough, but clearly not as rare as we'd like, since this
>> is a report from the field.
>
> Perhaps.  Something like "WARNING:  jsonb cannot represent \\u0000; reading as
> \u0000"?  Alas, but I do prefer that to silent data corruption.

Maybe something like this patch. I have two concerns about it, though. The first is the possible performance impact of looking for the offending string in every jsonb input. The second is that the test isn't quite right: input of \\\u0000 doesn't raise this issue, i.e. the problem only arises when "u0000" is preceded by an even (and nonzero) number of backslashes.
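To make that concrete, here is an untested sketch (the helper name is invented, and it is not part of the patch below) of what the more careful test might look like: find each "u0000" in the input text and count the backslashes immediately before it, treating an even, nonzero count as the ambiguous case.

/*
 * Untested sketch, not part of the patch below: returns true if the input
 * text contains "u0000" preceded by an even, nonzero number of backslashes,
 * i.e. an escaped backslash immediately followed by the literal text
 * "u0000", which jsonb cannot distinguish from the \u0000 escape.
 */
#include <stdbool.h>
#include <string.h>

static bool
contains_ambiguous_u0000(const char *json)
{
	const char *p = json;

	while ((p = strstr(p, "u0000")) != NULL)
	{
		int			nbackslash = 0;
		const char *q = p;

		/* count the backslashes immediately preceding this "u0000" */
		while (q > json && q[-1] == '\\')
		{
			nbackslash++;
			q--;
		}

		/*
		 * An odd count means the last backslash starts a genuine \u0000
		 * escape.  An even, nonzero count means an escaped backslash
		 * followed by literal "u0000": the case the warning should catch.
		 */
		if (nbackslash > 0 && nbackslash % 2 == 0)
			return true;

		p += 5;					/* move past this "u0000" */
	}

	return false;
}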

For the moment, maybe I could commit the fix for the non-U+0000 case for escape_json, and we could think some more about detecting and warning about the problem strings.
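For illustration, here is a small standalone sketch, plain C rather than the backend code, of the escaping rule the patch's escape_jsonb applies on output: the only backslash passed through unescaped is one that starts the literal six characters \u0000, and every other backslash is doubled, so stored text like \uabcd comes back out as \\uabcd instead of being turned back into an escape.

/*
 * Standalone sketch (not the backend functions) of the output rule the
 * patch gives escape_jsonb: leave the \ of a literal "\u0000" alone and
 * double every other backslash.  escape_json currently leaves "\uXXXX"
 * alone for any four hex digits, which is what produces the bad "\uabcd".
 */
#include <stdio.h>
#include <string.h>

static void
print_escaped_jsonb(const char *str)
{
	const char *p;

	putchar('"');
	for (p = str; *p; p++)
	{
		if (*p == '\\' && strncmp(p + 1, "u0000", 5) == 0)
			putchar(*p);		/* pass the \ of \u0000 through unchanged */
		else if (*p == '\\' || *p == '"')
		{
			putchar('\\');		/* escape backslashes and double quotes */
			putchar(*p);
		}
		else
			putchar(*p);		/* other characters copied as-is; the real
								 * code also handles control characters */
	}
	putchar('"');
	putchar('\n');
}

int
main(void)
{
	print_escaped_jsonb("\\u0000");	/* stored NUL placeholder: prints "\u0000" */
	print_escaped_jsonb("\\uabcd");	/* literal text: prints "\\uabcd" */
	return 0;
}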

cheers

andrew
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 3c137ea..8d2b38f 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -94,6 +94,8 @@ static void datum_to_json(Datum val, bool is_null, StringInfo result,
 static void add_json(Datum val, bool is_null, StringInfo result,
 		 Oid val_type, bool key_scalar);
 static text *catenate_stringinfo_string(StringInfo buffer, const char *addon);
+static void  escape_json_work(StringInfo buf, const char *str,
+							  bool jsonb_unicode);
 
 /* the null action object used for pure validation */
 static JsonSemAction nullSemAction =
@@ -2356,6 +2358,18 @@ json_object_two_arg(PG_FUNCTION_ARGS)
 void
 escape_json(StringInfo buf, const char *str)
 {
+	escape_json_work(buf, str, false);
+}
+
+void
+escape_jsonb(StringInfo buf, const char *str)
+{
+	escape_json_work(buf, str, true);
+}
+
+static void
+escape_json_work(StringInfo buf, const char *str, bool jsonb_unicode)
+{
 	const char *p;
 
 	appendStringInfoCharMacro(buf, '\"');
@@ -2398,7 +2412,9 @@ escape_json(StringInfo buf, const char *str)
 				 * unicode escape that should be present is \u0000, all the
 				 * other unicode escapes will have been resolved.
 				 */
-				if (p[1] == 'u' &&
+				if (jsonb_unicode && strncmp(p+1, "u0000", 5) == 0)
+					appendStringInfoCharMacro(buf, *p);
+				else if (!jsonb_unicode && p[1] == 'u' &&
 					isxdigit((unsigned char) p[2]) &&
 					isxdigit((unsigned char) p[3]) &&
 					isxdigit((unsigned char) p[4]) &&
diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c
index 644ea6d..ba1e7f0 100644
--- a/src/backend/utils/adt/jsonb.c
+++ b/src/backend/utils/adt/jsonb.c
@@ -218,6 +218,12 @@ jsonb_from_cstring(char *json, int len)
 	JsonbInState state;
 	JsonSemAction sem;
 
+	if (strstr(json, "\\\\u0000") != NULL)
+		ereport(WARNING,
+				(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+				 errmsg("jsonb cannot represent \\\\u0000; reading as \\u0000"),
+				 errdetail("Due to an implementation restriction, jsonb cannot represent a unicode null immediately preceded by a backslash.")));
+
 	memset(&state, 0, sizeof(state));
 	memset(&sem, 0, sizeof(sem));
 	lex = makeJsonLexContextCstringLen(json, len, true);
@@ -305,7 +313,7 @@ jsonb_put_escaped_value(StringInfo out, JsonbValue *scalarVal)
 			appendBinaryStringInfo(out, "null", 4);
 			break;
 		case jbvString:
-			escape_json(out, pnstrdup(scalarVal->val.string.val, scalarVal->val.string.len));
+			escape_jsonb(out, pnstrdup(scalarVal->val.string.val, scalarVal->val.string.len));
 			break;
 		case jbvNumeric:
 			appendStringInfoString(out,
diff --git a/src/include/utils/json.h b/src/include/utils/json.h
index 6f69403..661d7bd 100644
--- a/src/include/utils/json.h
+++ b/src/include/utils/json.h
@@ -42,7 +42,13 @@ extern Datum json_build_array_noargs(PG_FUNCTION_ARGS);
 extern Datum json_object(PG_FUNCTION_ARGS);
 extern Datum json_object_two_arg(PG_FUNCTION_ARGS);
 
+/*
+ * escape_jsonb is stricter about unicode escapes than escape_json: the
+ * only sequence it leaves unescaped is \u0000, since that is the only
+ * unicode escape that can still be present in a jsonb string value.
+ */
 extern void escape_json(StringInfo buf, const char *str);
+extern void escape_jsonb(StringInfo buf, const char *str);
 
 extern Datum json_typeof(PG_FUNCTION_ARGS);
 