Hi,
The byteain function converts a string input into a bytea type.
The original implementation processes two input formats:
a hex format (starting with \x) and a traditional escaped format.
For the escaped format, the function scans the input string twice
— once to calculate the exact size of the output and allocate memory,
and again to fill the allocated memory with the parsed data.
This double scanning can be inefficient, especially for large inputs.
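So I optimized the function to eliminate the need for two scans
while preserving correctness: it now allocates a buffer sized for the
full input length (an upper bound, since every escape sequence decodes
to a single byte), decodes the input in a single pass, and sets the
actual length of the result at the end. Invalid escape sequences are
still reported with the same error as before.
Please review the patch and share your comments.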
Thanks,
Steven Niu
https://www.highgo.com/
From db0352fb7fa463bd7a02f73f29760d1400cef402 Mon Sep 17 00:00:00 2001
From: Steven Niu <niush...@highgo.com>
Date: Wed, 26 Mar 2025 14:43:43 +0800
Subject: [PATCH] Optimize function byteain() to avoid double scanning
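Decode the traditional escaped format in a single pass instead of
pre-scanning the input to compute the exact output size. The result
buffer is allocated using the input length as an upper bound, and its
actual size is set once decoding is complete. Correctness is preserved:
invalid escape sequences are still rejected with the same error.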
Author: Steven Niu <niush...@gmail.com>
---
src/backend/utils/adt/varlena.c | 66 +++++++++++----------------------
1 file changed, 22 insertions(+), 44 deletions(-)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 95631eb2099..de422cafbd5 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -291,7 +291,6 @@ text_to_cstring_buffer(const text *src, char *dst, size_t
dst_len)
* ereport(ERROR, ...) if bad form.
*
* BUGS:
- * The input is scanned twice.
* The error checking of input is minimal.
*/
Datum
@@ -302,6 +301,7 @@ byteain(PG_FUNCTION_ARGS)
char *tp;
char *rp;
int bc;
+ size_t input_len;
bytea *result;
/* Recognize hex input */
@@ -318,45 +318,28 @@ byteain(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(result);
}
- /* Else, it's the traditional escaped style */
- for (bc = 0, tp = inputText; *tp != '\0'; bc++)
- {
- if (tp[0] != '\\')
- tp++;
- else if ((tp[0] == '\\') &&
- (tp[1] >= '0' && tp[1] <= '3') &&
- (tp[2] >= '0' && tp[2] <= '7') &&
- (tp[3] >= '0' && tp[3] <= '7'))
- tp += 4;
- else if ((tp[0] == '\\') &&
- (tp[1] == '\\'))
- tp += 2;
- else
- {
- /*
- * one backslash, not followed by another or ### valid
octal
- */
- ereturn(escontext, (Datum) 0,
-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
- errmsg("invalid input syntax for type
%s", "bytea")));
- }
- }
-
- bc += VARHDRSZ;
-
- result = (bytea *) palloc(bc);
- SET_VARSIZE(result, bc);
-
- tp = inputText;
+ /* Handle traditional escaped style in a single pass */
+ input_len = strlen(inputText);
+ result = palloc(input_len + VARHDRSZ); /* Allocate max possible size */
rp = VARDATA(result);
+ tp = inputText;
+
while (*tp != '\0')
{
if (tp[0] != '\\')
+ {
*rp++ = *tp++;
- else if ((tp[0] == '\\') &&
- (tp[1] >= '0' && tp[1] <= '3') &&
- (tp[2] >= '0' && tp[2] <= '7') &&
- (tp[3] >= '0' && tp[3] <= '7'))
+ continue;
+ }
+
+ if (tp[1] == '\\')
+ {
+ *rp++ = '\\';
+ tp += 2;
+ }
+ else if ((tp[1] >= '0' && tp[1] <= '3') &&
+ (tp[2] >= '0' && tp[2] <= '7') &&
+ (tp[3] >= '0' && tp[3] <= '7'))
{
bc = VAL(tp[1]);
bc <<= 3;
@@ -366,23 +349,18 @@ byteain(PG_FUNCTION_ARGS)
tp += 4;
}
- else if ((tp[0] == '\\') &&
- (tp[1] == '\\'))
- {
- *rp++ = '\\';
- tp += 2;
- }
else
{
- /*
- * We should never get here. The first pass should not
allow it.
- */
+ /* Invalid escape sequence: report error */
ereturn(escontext, (Datum) 0,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid input syntax for type
%s", "bytea")));
}
}
+ /* Set the actual size of the bytea */
+ SET_VARSIZE(result, (rp - VARDATA(result)) + VARHDRSZ);
+
PG_RETURN_BYTEA_P(result);
}
--
2.43.0