I wrote:
> Yeah.  In a quick scan, it appears that there is only one caller that
> tries to save the result directly.  So I considered making that caller
> do a pstrdup and eliminating the extra thrashing in t_readline itself.
> But it seemed too fragile; somebody would get it wrong and then have
> excess space consumption for their dictionary.

I had a better idea: if we get rid of t_readline() itself, which has
been deprecated for years anyway, we can have the calling layer
tsearch_readline_xxx maintain a StringInfo across the whole file
read process and thus get rid of alloc/free cycles for the StringInfo.

However ... while working on that, I realized that the usage of
the "curline" field in that code is completely unsafe.  We save
a pointer to the result of tsearch_readline() for possible use
by the error context callback, *but the caller is likely to free that
string at some point*.  So there is a window where an error would
result in trying to print already-freed data.

It looks like all of the core-code dictionaries free the result
string at the bottoms of their loops, so that the window for
trouble is pretty much empty.  But contrib/dict_xsyn doesn't
do it like that, and so it's surely at risk.  Any external
dictionaries that have copied that code would also be at risk.

So the attached adds a pstrdup/pfree to ensure that "curline"
has its own storage, putting us right back at two palloc/pfree
cycles per line.  I don't think there's a lot of choice though;
in fact, I'm leaning to the idea that we need to back-patch
that part of this.  The odds of trouble in a production build
probably aren't high, but still...

                        regards, tom lane

diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
index cb0835982d..64c979086d 100644
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -286,11 +286,6 @@ thesaurusRead(const char *filename, DictThesaurus *d)
 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
 					 errmsg("unexpected end of line")));
 
-		/*
-		 * Note: currently, tsearch_readline can't return lines exceeding 4KB,
-		 * so overflow of the word counts is impossible.  But that may not
-		 * always be true, so let's check.
-		 */
 		if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
 			ereport(ERROR,
 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index a916dd6cb6..c85939631e 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -14,6 +14,7 @@
 #include "postgres.h"
 
 #include "catalog/pg_collation.h"
+#include "common/string.h"
 #include "storage/fd.h"
 #include "tsearch/ts_locale.h"
 #include "tsearch/ts_public.h"
@@ -128,6 +129,7 @@ tsearch_readline_begin(tsearch_readline_state *stp,
 		return false;
 	stp->filename = filename;
 	stp->lineno = 0;
+	initStringInfo(&stp->buf);
 	stp->curline = NULL;
 	/* Setup error traceback support for ereport() */
 	stp->cb.callback = tsearch_readline_callback;
@@ -146,11 +148,44 @@ char *
 tsearch_readline(tsearch_readline_state *stp)
 {
 	char	   *result;
+	int			len;
 
+	/* Advance line number to use in error reports */
 	stp->lineno++;
-	stp->curline = NULL;
-	result = t_readline(stp->fp);
-	stp->curline = result;
+
+	/* Clear curline, it's no longer relevant */
+	if (stp->curline)
+	{
+		pfree(stp->curline);
+		stp->curline = NULL;
+	}
+
+	/* Collect next line, if there is one */
+	if (!pg_get_line_buf(stp->fp, &stp->buf))
+		return NULL;
+
+	/* Make sure the input is valid UTF-8 */
+	len = stp->buf.len;
+	(void) pg_verify_mbstr(PG_UTF8, stp->buf.data, len, false);
+
+	/* And convert */
+	result = pg_any_to_server(stp->buf.data, len, PG_UTF8);
+	if (result == stp->buf.data)
+	{
+		/*
+		 * Conversion didn't pstrdup, so we must.  We can use the length of
+		 * the original string, because no conversion was done.
+		 */
+		result = pnstrdup(result, len);
+	}
+
+	/*
+	 * Save a copy of the line for possible use in error reports.  (We cannot
+	 * just save "result", since it's likely to get pfree'd at some point by
+	 * the caller; an error after that would try to access freed data.)
+	 */
+	stp->curline = pstrdup(result);
+
 	return result;
 }
 
@@ -160,7 +195,16 @@ tsearch_readline(tsearch_readline_state *stp)
 void
 tsearch_readline_end(tsearch_readline_state *stp)
 {
+	/* Suppress use of curline in any error reported below */
+	if (stp->curline)
+	{
+		pfree(stp->curline);
+		stp->curline = NULL;
+	}
+
+	pfree(stp->buf.data);
 	FreeFile(stp->fp);
+
 	/* Pop the error context stack */
 	error_context_stack = stp->cb.previous;
 }
@@ -176,8 +220,7 @@ tsearch_readline_callback(void *arg)
 
 	/*
 	 * We can't include the text of the config line for errors that occur
-	 * during t_readline() itself.  This is only partly a consequence of our
-	 * arms-length use of that routine: the major cause of such errors is
+	 * during tsearch_readline() itself.  The major cause of such errors is
 	 * encoding violations, and we daren't try to print error messages
 	 * containing badly-encoded data.
 	 */
@@ -193,43 +236,6 @@ tsearch_readline_callback(void *arg)
 }
 
 
-/*
- * Read the next line from a tsearch data file (expected to be in UTF-8), and
- * convert it to database encoding if needed. The returned string is palloc'd.
- * NULL return means EOF.
- *
- * Note: direct use of this function is now deprecated.  Go through
- * tsearch_readline() to provide better error reporting.
- */
-char *
-t_readline(FILE *fp)
-{
-	int			len;
-	char	   *recoded;
-	char		buf[4096];		/* lines must not be longer than this */
-
-	if (fgets(buf, sizeof(buf), fp) == NULL)
-		return NULL;
-
-	len = strlen(buf);
-
-	/* Make sure the input is valid UTF-8 */
-	(void) pg_verify_mbstr(PG_UTF8, buf, len, false);
-
-	/* And convert */
-	recoded = pg_any_to_server(buf, len, PG_UTF8);
-	if (recoded == buf)
-	{
-		/*
-		 * conversion didn't pstrdup, so we must. We can use the length of the
-		 * original string, because no conversion was done.
-		 */
-		recoded = pnstrdup(recoded, len);
-	}
-
-	return recoded;
-}
-
 /*
  * lowerstr --- fold null-terminated string to lower case
  *
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index cc4bd9ab20..3c2e635f86 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -15,6 +15,7 @@
 #include <ctype.h>
 #include <limits.h>
 
+#include "lib/stringinfo.h"
 #include "mb/pg_wchar.h"
 #include "utils/pg_locale.h"
 
@@ -33,7 +34,8 @@ typedef struct
 	FILE	   *fp;
 	const char *filename;
 	int			lineno;
-	char	   *curline;
+	StringInfoData buf;			/* current input line, in UTF-8 */
+	char	   *curline;		/* input line in DB's encoding, or NULL */
 	ErrorContextCallback cb;
 } tsearch_readline_state;
 
@@ -57,6 +59,4 @@ extern bool tsearch_readline_begin(tsearch_readline_state *stp,
 extern char *tsearch_readline(tsearch_readline_state *stp);
 extern void tsearch_readline_end(tsearch_readline_state *stp);
 
-extern char *t_readline(FILE *fp);
-
 #endif							/* __TSLOCALE_H__ */

Reply via email to