Re: Remove dependence on integer wrapping

Joseph Koshakow Sat, 17 Aug 2024 14:53:20 -0700

>>> SET temp_buffers TO 1000000000;
>>>
>>> CREATE TEMP TABLE t(i int PRIMARY KEY);
>>> INSERT INTO t VALUES(1);
>>>
>>> #4  0x00007f385cdd37f3 in __GI_abort () at ./stdlib/abort.c:79
>>> #5  0x00005620071c4f51 in __addvsi3 ()
>>> #6  0x0000562007143f3c in init_htab (hashp=0x562008facb20,
nelem=610070812) at dynahash.c:720
>>>
>>> (gdb) f 6
>>> #6  0x0000560915207f3c in init_htab (hashp=0x560916039930,
nelem=1000000000) at dynahash.c:720
>>> 720             hctl->high_mask = (nbuckets << 1) - 1;
>>> (gdb) p nbuckets
>>> $1 = 1073741824
>>
>> Here's what it looks like is happening:
>>
>> 1. When inserting into the table, we create a new dynamic hash table
>> and set `nelem` equal to `temp_buffers`, which is 1000000000.
>>
>> 2. `nbuckets` is then set to the the next highest power of 2 from
>>    `nelem`, which is 1073741824.
>>
>>     /*
>>      * Allocate space for the next greater power of two number of
buckets,
>>      * assuming a desired maximum load factor of 1.
>>      */
>>     nbuckets = next_pow2_int(nelem);
>>
>> 3. Shift `nbuckets` to the left by 1. This would equal 2147483648,
>> which is larger than `INT_MAX`, which causes an overflow.
>>
>>     hctl->high_mask = (nbuckets << 1) - 1;
>>
>> The max value allowed for `temp_buffers` is `INT_MAX / 2` (1073741823),
>> So any value of `temp_buffers` in the range (536870912, 1073741823]
>> would cause this overflow. Without `-ftrapv`, `nbuckets` would wrap
>> around to -2147483648, which is likely to cause all sorts of havoc, I'm
>> just not sure what exactly.
>>
>> Also, `nbuckets = next_pow2_int(nelem);`, by itself is a bit sketchy
>> considering that `nelem` is a `long` and `nbuckets` is an `int`.
>> Potentially, the fix here is to just convert `nbuckets` to a `long`. >>
I plan on checking if that's feasible.
> Yeah, the minimum value that triggers the trap is 536870913 and the
maximum
> accepted is 1073741823.
>
> Without -ftrapv, hctl->high_mask is set to 2147483647 on my machine,
> when nbuckets is 1073741824, and the INSERT apparently succeeds.



> I've taken a look at this and my current proposal is to convert
> `nbuckets` to 64 bit integer which would prevent the overflow. I'm
> hoping to look into if this is feasible soon.

I've both figured out why the INSERT still succeeds and a simple
solution to this. After `nbuckets` wraps around to -2147483648, we
subtract 1 which causes it to wrap back around to 2147483647. Which
explains the result seen by Alexander.

By the way,

>> Also, `nbuckets = next_pow2_int(nelem);`, by itself is a bit sketchy
>> considering that `nelem` is a `long` and `nbuckets` is an `int`.

It turns out I was wrong about this, `next_pow2_int` will always return
a value that fits into an `int`.

    hctl->high_mask = (nbuckets << 1) - 1;

This calculation is used to ultimately populate the field
`uint32 high_mask`. I'm not very familiar with this hash table
implementation and I'm not entirely sure what it would take to convert
this to a `uint64`, but from poking around it looks like it would have
a huge blast radius.

The largest possible (theoretical) value for `nbuckets` is
`1073741824`, the largest power of 2 that fits into an `int`. So, the
largest possible value for `nbuckets << 1` is `2147483648`. This can
fully fit in a `uint32`, so the simple fix for this case is to cast
`nbuckets` to a `uint32` before shifting. I've attached this fix,
Alexander if you have time I would appreciate if you were able to test
it.

I noticed another potential issue with next_pow2_int. The
implementation is in dynahash.c and is as follows

    /* calculate first power of 2 >= num, bounded to what will fit in an
int */
    static int
    next_pow2_int(long num)
    {
        if (num > INT_MAX / 2)
            num = INT_MAX / 2;
        return 1 << my_log2(num);
    }

I'm pretty sure that `INT_MAX / 2` is not a power of 2, as `INT_MAX`
is not a power of 2. It should be `num = INT_MAX / 2 + 1;` I've also
attached a patch with this fix.

Thanks,
Joseph Koshakow

From 529452218496b9cd89464e1fad9e48c6279cef86 Mon Sep 17 00:00:00 2001
From: Joseph Koshakow <kosh...@gmail.com>
Date: Sat, 17 Aug 2024 17:50:11 -0400
Subject: [PATCH v25 3/3] Fix next_pow2_int

This commit fixes the `next_pow2_int` function so that it always
returns a power of 2.
---
 src/backend/utils/hash/dynahash.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 5aed981085..ee2bbe5dc8 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -1819,8 +1819,8 @@ next_pow2_long(long num)
 static int
 next_pow2_int(long num)
 {
-	if (num > INT_MAX / 2)
-		num = INT_MAX / 2;
+	if (num > INT_MAX / 2 + 1)
+		num = INT_MAX / 2 + 1;
 	return 1 << my_log2(num);
 }
 
-- 
2.34.1

From 397669bb89e51610928e2404c02a467ac431b280 Mon Sep 17 00:00:00 2001
From: Joseph Koshakow <kosh...@gmail.com>
Date: Sat, 17 Aug 2024 17:47:10 -0400
Subject: [PATCH v25 2/3] Remove dependence on -fwrapv semantics in dynahash

This commit updates the logic for creating a new dynamic hash table to
no longer rely on integer wrapping for correctness.
---
 src/backend/utils/hash/dynahash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 5d9c62b652..5aed981085 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -717,7 +717,7 @@ init_htab(HTAB *hashp, long nelem)
 		nbuckets <<= 1;
 
 	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
-	hctl->high_mask = (nbuckets << 1) - 1;
+	hctl->high_mask = ((uint32) nbuckets << 1) - 1;
 
 	/*
 	 * Figure number of directory segments needed, round up to a power of 2
-- 
2.34.1

From 15e5a79f198a029fc6436c409a00a1592e23a46d Mon Sep 17 00:00:00 2001
From: Matthew Kim <38759997+friendlymatt...@users.noreply.github.com>
Date: Tue, 9 Jul 2024 18:25:10 -0400
Subject: [PATCH v25 1/3] Remove dependence on -fwrapv semantics in some
 date/time functions.

This commit updates a few date/time functions, such as
to_timestamp() and make_date(), to no longer rely on signed integer
wrapping for correctness. This is intended to move us closer towards
removing -fwrapv, which may enable some compiler optimizations.
However, there is presently no plan to actually remove that compiler
option in the near future.
---
 src/backend/utils/adt/date.c           |  5 +++-
 src/backend/utils/adt/formatting.c     | 35 +++++++++++++++++++++++---
 src/test/regress/expected/date.out     |  2 ++
 src/test/regress/expected/horology.out |  4 +++
 src/test/regress/sql/date.sql          |  1 +
 src/test/regress/sql/horology.sql      |  2 ++
 6 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c
index 9c854e0e5c..baa677fb96 100644
--- a/src/backend/utils/adt/date.c
+++ b/src/backend/utils/adt/date.c
@@ -257,7 +257,10 @@ make_date(PG_FUNCTION_ARGS)
 	if (tm.tm_year < 0)
 	{
 		bc = true;
-		tm.tm_year = -tm.tm_year;
+		if (pg_mul_s32_overflow(tm.tm_year, -1, &tm.tm_year))
+			ereport(ERROR,
+					(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+					 errmsg("date field value out of range")));
 	}
 
 	dterr = ValidateDate(DTK_DATE_M, false, false, bc, &tm);
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68069fcfd3..e2cb57a9e1 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -77,6 +77,7 @@
 
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
+#include "common/int.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
@@ -3809,7 +3810,15 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out,
 						ereturn(escontext,,
 								(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
 								 errmsg("invalid input string for \"Y,YYY\"")));
-					years += (millennia * 1000);
+					if (pg_mul_s32_overflow(millennia, 1000, &millennia))
+						ereturn(escontext,,
+								(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
+								 errmsg("invalid input string for \"Y,YYY\"")));
+					if (pg_add_s32_overflow(years, millennia, &years))
+						ereturn(escontext,,
+								(errcode(ERRCODE_INVALID_DATETIME_FORMAT),
+								 errmsg("invalid input string for \"Y,YYY\"")));
+
 					if (!from_char_set_int(&out->year, years, n, escontext))
 						return;
 					out->yysz = 4;
@@ -4797,11 +4806,31 @@ do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 		if (tmfc.bc)
 			tmfc.cc = -tmfc.cc;
 		if (tmfc.cc >= 0)
+		{
 			/* +1 because 21st century started in 2001 */
-			tm->tm_year = (tmfc.cc - 1) * 100 + 1;
+			/* tm->tm_year = (tmfc.cc - 1) * 100 + 1; */
+			if (pg_mul_s32_overflow((tmfc.cc - 1), 100, &tm->tm_year) ||
+				pg_add_s32_overflow(tm->tm_year, 1, &tm->tm_year))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+						 errmsg("date out of range: \"%s\"",
+								text_to_cstring(date_txt))));
+			}
+		}
 		else
+		{
 			/* +1 because year == 599 is 600 BC */
-			tm->tm_year = tmfc.cc * 100 + 1;
+			/* tm->tm_year = tmfc.cc * 100 + 1; */
+			if (pg_mul_s32_overflow(tmfc.cc, 100, &tm->tm_year) ||
+				pg_add_s32_overflow(tm->tm_year, 1, &tm->tm_year))
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+						 errmsg("date out of range: \"%s\"",
+								text_to_cstring(date_txt))));
+			}
+		}
 		fmask |= DTK_M(YEAR);
 	}
 
diff --git a/src/test/regress/expected/date.out b/src/test/regress/expected/date.out
index f5949f3d17..9293e045b0 100644
--- a/src/test/regress/expected/date.out
+++ b/src/test/regress/expected/date.out
@@ -1532,3 +1532,5 @@ select make_time(10, 55, 100.1);
 ERROR:  time field value out of range: 10:55:100.1
 select make_time(24, 0, 2.1);
 ERROR:  time field value out of range: 24:00:2.1
+SELECT make_date(-2147483648, 1, 1);
+ERROR:  date field value out of range
diff --git a/src/test/regress/expected/horology.out b/src/test/regress/expected/horology.out
index 241713cc51..df02d268c0 100644
--- a/src/test/regress/expected/horology.out
+++ b/src/test/regress/expected/horology.out
@@ -3448,6 +3448,8 @@ SELECT i, to_timestamp('2018-11-02 12:34:56.123456', 'YYYY-MM-DD HH24:MI:SS.FF'
 
 SELECT i, to_timestamp('2018-11-02 12:34:56.123456789', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
 ERROR:  date/time field value out of range: "2018-11-02 12:34:56.123456789"
+SELECT to_timestamp('1000000000,999', 'Y,YYY');
+ERROR:  invalid input string for "Y,YYY"
 SELECT to_date('1 4 1902', 'Q MM YYYY');  -- Q is ignored
   to_date   
 ------------
@@ -3778,6 +3780,8 @@ SELECT to_date('0000-02-01','YYYY-MM-DD');  -- allowed, though it shouldn't be
  02-01-0001 BC
 (1 row)
 
+SELECT to_date('100000000', 'CC');
+ERROR:  date out of range: "100000000"
 -- to_char's TZ format code produces zone abbrev if known
 SELECT to_char('2012-12-12 12:00'::timestamptz, 'YYYY-MM-DD HH:MI:SS TZ');
          to_char         
diff --git a/src/test/regress/sql/date.sql b/src/test/regress/sql/date.sql
index 1c58ff6966..9a4e5832b9 100644
--- a/src/test/regress/sql/date.sql
+++ b/src/test/regress/sql/date.sql
@@ -373,3 +373,4 @@ select make_date(2013, 13, 1);
 select make_date(2013, 11, -1);
 select make_time(10, 55, 100.1);
 select make_time(24, 0, 2.1);
+SELECT make_date(-2147483648, 1, 1);
diff --git a/src/test/regress/sql/horology.sql b/src/test/regress/sql/horology.sql
index e5cf12ff63..db532ee3c0 100644
--- a/src/test/regress/sql/horology.sql
+++ b/src/test/regress/sql/horology.sql
@@ -558,6 +558,7 @@ SELECT i, to_timestamp('2018-11-02 12:34:56.1234', 'YYYY-MM-DD HH24:MI:SS.FF' ||
 SELECT i, to_timestamp('2018-11-02 12:34:56.12345', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
 SELECT i, to_timestamp('2018-11-02 12:34:56.123456', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
 SELECT i, to_timestamp('2018-11-02 12:34:56.123456789', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
+SELECT to_timestamp('1000000000,999', 'Y,YYY');
 
 SELECT to_date('1 4 1902', 'Q MM YYYY');  -- Q is ignored
 SELECT to_date('3 4 21 01', 'W MM CC YY');
@@ -660,6 +661,7 @@ SELECT to_date('2016 365', 'YYYY DDD');  -- ok
 SELECT to_date('2016 366', 'YYYY DDD');  -- ok
 SELECT to_date('2016 367', 'YYYY DDD');
 SELECT to_date('0000-02-01','YYYY-MM-DD');  -- allowed, though it shouldn't be
+SELECT to_date('100000000', 'CC');
 
 -- to_char's TZ format code produces zone abbrev if known
 SELECT to_char('2012-12-12 12:00'::timestamptz, 'YYYY-MM-DD HH:MI:SS TZ');
-- 
2.34.1

Re: Remove dependence on integer wrapping

Reply via email to