On Thu, Jun 27, 2019 at 01:26:32PM +0200, Tomas Vondra wrote:
On Thu, Jun 27, 2019 at 12:04:30AM -0400, Tom Lane wrote:
Tomas Vondra <tomas.von...@2ndquadrant.com> writes:
OK. Attached is a patch ditching the alignment in serialized data. I've
ditched the macros to access parts of serialized data, and everything
gets copied.
I lack energy to actually read this patch right now, and I don't currently
have an opinion about whether it's worth another catversion bump to fix
this stuff in v12. But I did test the patch, and I can confirm it gets
through the core regression tests on hppa (both gaur's host environment
with gcc 3.4.6, and the OpenBSD installation with gcc 4.2.1).
Thanks for running it through regression tests, that alone is a very
useful piece of information for me.
As for the catversion bump - I'd probably vote to do it. Not just because
of this serialization stuff, but to fix the pg_mcv_list_items function.
It's not something I'm very enthusiastic about (kinda embarassed about it,
really), but it seems better than shipping something that we'll need to
rework in PG13.
Attached is a slightly improved version of the serialization patch. The
main difference is that when serializing varlena values, the previous
patch version stored
length (uint32) + full varlena (incl. the header)
which is kinda redundant, because the varlena stores the length too. So
now it only stores the length + data, without the varlena header. I
don't think there's a better way to store varlena values without
enforcing alignment (which is what happens in current master).
There's one additional change I failed to mention before - I had to add
another field to DimensionInfo, tracking how much space will be needed
for deserialized data. This is needed because the deserialization
allocates the whole MCV as a single chunk of memory, to reduce palloc
overhead. It could parse the data twice (first to determine the space,
then to actually parse it), this allows doing just a single pass. Which
seems useful for large MCV lists, but maybe it's not worth it?
Barring objections I'll commit this together with the pg_mcv_list_items
fix, posted in a separate thread. Of course, this requires catversion
bump - an alternative would be to keep enforcing the alignment, but
tweak the macros to work on all platforms without SIGBUS.
Considering how troublesome this serialiation part of the patch turner
out to be, I'm not really sure by anything at this point. So I'd welcome
thoughts about the proposed changes.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 5fe61ea0a4..256728a02a 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -52,15 +52,7 @@
* ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
*/
#define ITEM_SIZE(ndims) \
- MAXALIGN((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
-
-/*
- * Macros for convenient access to parts of a serialized MCV item.
- */
-#define ITEM_INDEXES(item) ((uint16 *) (item))
-#define ITEM_NULLS(item,ndims) ((bool *) (ITEM_INDEXES(item) +
(ndims)))
-#define ITEM_FREQUENCY(item,ndims) ((double *) (ITEM_NULLS(item, ndims) +
(ndims)))
-#define ITEM_BASE_FREQUENCY(item,ndims) ((double *)
(ITEM_FREQUENCY(item, ndims) + 1))
+ ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
/*
* Used to compute size of serialized MCV list representation.
@@ -68,10 +60,16 @@
#define MinSizeOfMCVList \
(VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
+/*
+ * Size of the serialized MCV list, exluding the space needed for
+ * deduplicated per-dimension values. The macro is meant to be used
+ * when it's not yet safe to access the serialized info about amount
+ * of data for each column.
+ */
#define SizeOfMCVList(ndims,nitems) \
- (MAXALIGN(MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
- MAXALIGN((ndims) * sizeof(DimensionInfo)) + \
- MAXALIGN((nitems) * ITEM_SIZE(ndims)))
+ ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
+ ((ndims) * sizeof(DimensionInfo)) + \
+ ((nitems) * ITEM_SIZE(ndims)))
static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
@@ -491,19 +489,16 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
int i;
int dim;
int ndims = mcvlist->ndimensions;
- int itemsize = ITEM_SIZE(ndims);
SortSupport ssup;
DimensionInfo *info;
Size total_length;
- /* allocate the item just once */
- char *item = palloc0(itemsize);
-
/* serialized items (indexes into arrays, etc.) */
- char *raw;
+ bytea *raw;
char *ptr;
+ char *endptr PG_USED_FOR_ASSERTS_ONLY;
/* values per dimension (and number of non-NULL values) */
Datum **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
@@ -597,8 +592,30 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
*/
info[dim].nvalues = ndistinct;
- if (info[dim].typlen > 0) /* fixed-length data types */
+ if (info[dim].typbyval) /* by-value data types */
+ {
info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
+
+ /*
+ * We copy the data into the MCV item during
deserialization, so
+ * we don't need to allocate any extra space.
+ */
+ info[dim].nbytes_aligned = 0;
+ }
+ else if (info[dim].typlen > 0) /* fixed-length by-ref
*/
+ {
+ /*
+ * We don't care about alignment in the serialized
data, so we
+ * pack the data as much as possible. But we also track
how much
+ * data will be needed after deserialization, and in
that case
+ * we need to account for alignment of each item.
+ *
+ * Note: As the items are fixed-length, we could easily
compute
+ * this during deserialization, but we do it here
anyway.
+ */
+ info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
+ info[dim].nbytes_aligned = info[dim].nvalues *
MAXALIGN(info[dim].typlen);
+ }
else if (info[dim].typlen == -1) /* varlena */
{
info[dim].nbytes = 0;
@@ -606,10 +623,24 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
{
Size len;
+ /*
+ * For varlena values, we detoast the values
and store the
+ * length and data separately. We don't bother
with alignment
+ * here, which means that during
deserialization we need to
+ * copy the fields and only access the copies.
+ */
values[dim][i] =
PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
- len = VARSIZE_ANY(values[dim][i]);
- info[dim].nbytes += MAXALIGN(len);
+ /* serialized length (uint32 length + data) */
+ len = VARSIZE_ANY_EXHDR(values[dim][i]);
+ info[dim].nbytes += sizeof(uint32); /*
length */
+ info[dim].nbytes += len;
/* value (no header) */
+
+ /*
+ * During deserialization we'll build regular
varlena values
+ * with full headers, and we need to align them
properly.
+ */
+ info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ +
len);
}
}
else if (info[dim].typlen == -2) /* cstring */
@@ -619,11 +650,20 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
{
Size len;
- /* c-strings include terminator, so +1 byte */
- values[dim][i] =
PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
+ /*
+ * For cstring, we do similar thing as for
varlena - first we
+ * store the length as uint32 and then the
data. We don't care
+ * about alignment, which means that during
deserialization we
+ * need to copy the fields and only access the
copies.
+ */
+ /* c-strings include terminator, so +1 byte */
len = strlen(DatumGetCString(values[dim][i])) +
1;
- info[dim].nbytes += MAXALIGN(len);
+ info[dim].nbytes += sizeof(uint32); /*
length */
+ info[dim].nbytes += len;
/* value */
+
+ /* space needed for properly aligned
deserialized copies */
+ info[dim].nbytes_aligned += MAXALIGN(len);
}
}
@@ -635,34 +675,33 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
* Now we can finally compute how much space we'll actually need for the
* whole serialized MCV list (varlena header, MCV header, dimension info
* for each attribute, deduplicated values and items).
- *
- * The header fields are copied one by one, so that we don't need any
- * explicit alignment (we copy them while deserializing). All fields
after
- * this need to be properly aligned, for direct access.
*/
- total_length = MAXALIGN(VARHDRSZ + (3 * sizeof(uint32))
- + sizeof(AttrNumber) +
(ndims * sizeof(Oid)));
+ total_length = (3 * sizeof(uint32)) /* magic + type
+ nitems */
+ + sizeof(AttrNumber) /*
ndimensions */
+ + (ndims * sizeof(Oid)); /*
attribute types */
/* dimension info */
- total_length += MAXALIGN(ndims * sizeof(DimensionInfo));
+ total_length += ndims * sizeof(DimensionInfo);
/* add space for the arrays of deduplicated values */
for (i = 0; i < ndims; i++)
- total_length += MAXALIGN(info[i].nbytes);
+ total_length += info[i].nbytes;
/*
- * And finally the items (no additional alignment needed, we start at
- * proper alignment and the itemsize formula uses MAXALIGN)
+ * And finally account for the items (those are fixed-length, thanks to
+ * replacing values with uint16 indexes into the deduplicated arrays).
*/
- total_length += mcvlist->nitems * itemsize;
+ total_length += mcvlist->nitems * ITEM_SIZE(dim);
/*
* Allocate space for the whole serialized MCV list (we'll skip bytes,
so
* we set them to zero to make the result more compressible).
*/
- raw = palloc0(total_length);
- SET_VARSIZE(raw, total_length);
+ raw = (bytea *) palloc0(VARHDRSZ + total_length);
+ SET_VARSIZE(raw, VARHDRSZ + total_length);
+
ptr = VARDATA(raw);
+ endptr = ptr + total_length;
/* copy the MCV list header fields, one by one */
memcpy(ptr, &mcvlist->magic, sizeof(uint32));
@@ -680,12 +719,9 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
ptr += (sizeof(Oid) * ndims);
- /* the header may not be exactly aligned, so make sure it is */
- ptr = raw + MAXALIGN(ptr - raw);
-
- /* store information about the attributes */
+ /* store information about the attributes (data amounts, ...) */
memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
- ptr += MAXALIGN(sizeof(DimensionInfo) * ndims);
+ ptr += sizeof(DimensionInfo) * ndims;
/* Copy the deduplicated values for all attributes to the output. */
for (dim = 0; dim < ndims; dim++)
@@ -723,17 +759,27 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
}
else if (info[dim].typlen == -1) /* varlena */
{
- int len =
VARSIZE_ANY(value);
+ uint32 len = VARSIZE_ANY_EXHDR(value);
+
+ /* copy the length */
+ memcpy(ptr, &len, sizeof(uint32));
+ ptr += sizeof(uint32);
- memcpy(ptr, DatumGetPointer(value), len);
- ptr += MAXALIGN(len);
+ /* data from the varlena value (without the
header) */
+ memcpy(ptr, VARDATA(DatumGetPointer(value)),
len);
+ ptr += len;
}
else if (info[dim].typlen == -2) /* cstring */
{
- Size len =
strlen(DatumGetCString(value)) + 1; /* terminator */
+ uint32 len = (uint32)
strlen(DatumGetCString(value)) + 1;
+ /* copy the length */
+ memcpy(ptr, &len, sizeof(uint32));
+ ptr += sizeof(uint32);
+
+ /* value */
memcpy(ptr, DatumGetCString(value), len);
- ptr += MAXALIGN(len);
+ ptr += len;
}
/* no underflows or overflows */
@@ -742,62 +788,65 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats
**stats)
/* we should get exactly nbytes of data for this dimension */
Assert((ptr - start) == info[dim].nbytes);
-
- /* make sure the pointer is aligned correctly after each
dimension */
- ptr = raw + MAXALIGN(ptr - raw);
}
/* Serialize the items, with uint16 indexes instead of the values. */
for (i = 0; i < mcvlist->nitems; i++)
{
MCVItem *mcvitem = &mcvlist->items[i];
+ int itemlen = ITEM_SIZE(dim);
/* don't write beyond the allocated space */
- Assert(ptr <= raw + total_length - itemsize);
+ Assert(ptr <= (endptr - itemlen));
+
+ /* copy NULL and frequency flags into the serialized MCV */
+ memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
+ ptr += sizeof(bool) * ndims;
- /* reset the item (we only allocate it once and reuse it) */
- memset(item, 0, itemsize);
+ memcpy(ptr, &mcvitem->frequency, sizeof(double));
+ ptr += sizeof(double);
+ memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
+ ptr += sizeof(double);
+
+ /* store the indexes last */
for (dim = 0; dim < ndims; dim++)
{
+ uint16 index = 0;
Datum *value;
/* do the lookup only for non-NULL values */
- if (mcvitem->isnull[dim])
- continue;
+ if (!mcvitem->isnull[dim])
+ {
+ value = (Datum *)
bsearch_arg(&mcvitem->values[dim], values[dim],
+
info[dim].nvalues, sizeof(Datum),
+
compare_scalars_simple, &ssup[dim]);
- value = (Datum *) bsearch_arg(&mcvitem->values[dim],
values[dim],
-
info[dim].nvalues, sizeof(Datum),
-
compare_scalars_simple, &ssup[dim]);
+ Assert(value != NULL); /* serialization or
deduplication error */
- Assert(value != NULL); /* serialization or
deduplication error */
+ /* compute index within the deduplicated array
*/
+ index = (uint16) (value - values[dim]);
- /* compute index within the array */
- ITEM_INDEXES(item)[dim] = (uint16) (value -
values[dim]);
+ /* check the index is within expected bounds */
+ Assert(index < info[dim].nvalues);
+ }
- /* check the index is within expected bounds */
- Assert(ITEM_INDEXES(item)[dim] < info[dim].nvalues);
+ /* copy the index into the serialized MCV */
+ memcpy(ptr, &index, sizeof(uint16));
+ ptr += sizeof(uint16);
}
- /* copy NULL and frequency flags into the item */
- memcpy(ITEM_NULLS(item, ndims), mcvitem->isnull, sizeof(bool) *
ndims);
- memcpy(ITEM_FREQUENCY(item, ndims), &mcvitem->frequency,
sizeof(double));
- memcpy(ITEM_BASE_FREQUENCY(item, ndims),
&mcvitem->base_frequency, sizeof(double));
-
- /* copy the serialized item into the array */
- memcpy(ptr, item, itemsize);
-
- ptr += itemsize;
+ /* make sure we don't overflow the allocated value */
+ Assert(ptr <= endptr);
}
/* at this point we expect to match the total_length exactly */
- Assert((ptr - raw) == total_length);
+ Assert(ptr == endptr);
- pfree(item);
pfree(values);
pfree(counts);
- return (bytea *) raw;
+ return raw;
}
/*
@@ -816,6 +865,7 @@ statext_mcv_deserialize(bytea *data)
MCVList *mcvlist;
char *raw;
char *ptr;
+ char *endptr PG_USED_FOR_ASSERTS_ONLY;
int ndims,
nitems;
@@ -849,8 +899,9 @@ statext_mcv_deserialize(bytea *data)
mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
/* pointer to the data part (skip the varlena header) */
- ptr = VARDATA_ANY(data);
raw = (char *) data;
+ ptr = VARDATA_ANY(raw);
+ endptr = (char *) raw + VARSIZE_ANY(data);
/* get the header and perform further sanity checks */
memcpy(&mcvlist->magic, ptr, sizeof(uint32));
@@ -909,12 +960,11 @@ statext_mcv_deserialize(bytea *data)
memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
ptr += (sizeof(Oid) * ndims);
- /* ensure alignment of the pointer (after the header fields) */
- ptr = raw + MAXALIGN(ptr - raw);
-
/* Now it's safe to access the dimension info. */
- info = (DimensionInfo *) ptr;
- ptr += MAXALIGN(ndims * sizeof(DimensionInfo));
+ info = palloc(ndims * sizeof(DimensionInfo));
+
+ memcpy(info, ptr, ndims * sizeof(DimensionInfo));
+ ptr += (ndims * sizeof(DimensionInfo));
/* account for the value arrays */
for (dim = 0; dim < ndims; dim++)
@@ -926,7 +976,7 @@ statext_mcv_deserialize(bytea *data)
Assert(info[dim].nvalues >= 0);
Assert(info[dim].nbytes >= 0);
- expected_size += MAXALIGN(info[dim].nbytes);
+ expected_size += info[dim].nbytes;
}
/*
@@ -955,15 +1005,18 @@ statext_mcv_deserialize(bytea *data)
map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
/* space needed for a copy of data for by-ref types */
- if (!info[dim].typbyval)
- datalen += MAXALIGN(info[dim].nbytes);
+ datalen += info[dim].nbytes_aligned;
}
/*
- * Now resize the MCV list so that the allocation includes all the data
+ * Now resize the MCV list so that the allocation includes all the data.
+ *
* Allocate space for a copy of the data, as we can't simply reference
the
- * original data - it may disappear while we're still using the MCV
list,
- * e.g. due to catcache release. Only needed for by-ref types.
+ * serialized data - it's not aligned properly, and it may disappear
while
+ * we're still using the MCV list, e.g. due to catcache release.
+ *
+ * We do care about alignment here, because we will allocate all the
pieces
+ * at once, but then use pointers to different parts.
*/
mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) *
nitems));
@@ -1024,7 +1077,7 @@ statext_mcv_deserialize(bytea *data)
/* just point into the array */
map[dim][i] = PointerGetDatum(dataptr);
- dataptr += info[dim].typlen;
+ dataptr += MAXALIGN(info[dim].typlen);
}
}
else if (info[dim].typlen == -1)
@@ -1032,13 +1085,21 @@ statext_mcv_deserialize(bytea *data)
/* varlena */
for (i = 0; i < info[dim].nvalues; i++)
{
- Size len = VARSIZE_ANY(ptr);
+ uint32 len;
- memcpy(dataptr, ptr, len);
- ptr += MAXALIGN(len);
+ /* read the uint32 length */
+ memcpy(&len, ptr, sizeof(uint32));
+ ptr += sizeof(uint32);
+
+ /* the length is data-only */
+ SET_VARSIZE(dataptr, len + VARHDRSZ);
+ memcpy(VARDATA(dataptr), ptr, len);
+ ptr += len;
/* just point into the array */
map[dim][i] = PointerGetDatum(dataptr);
+
+ /* skip to place of the next
deserialized value */
dataptr += MAXALIGN(len);
}
}
@@ -1047,10 +1108,13 @@ statext_mcv_deserialize(bytea *data)
/* cstring */
for (i = 0; i < info[dim].nvalues; i++)
{
- Size len = (strlen(ptr) +
1); /* don't forget the \0 */
+ uint32 len;
+
+ memcpy(&len, ptr, sizeof(uint32));
+ ptr += sizeof(uint32);
memcpy(dataptr, ptr, len);
- ptr += MAXALIGN(len);
+ ptr += len;
/* just point into the array */
map[dim][i] = PointerGetDatum(dataptr);
@@ -1067,9 +1131,6 @@ statext_mcv_deserialize(bytea *data)
/* check we consumed input data for this dimension exactly */
Assert(ptr == (start + info[dim].nbytes));
-
- /* ensure proper alignment of the data */
- ptr = raw + MAXALIGN(ptr - raw);
}
/* we should have also filled the MCV list exactly */
@@ -1078,7 +1139,6 @@ statext_mcv_deserialize(bytea *data)
/* deserialize the MCV items and translate the indexes to Datums */
for (i = 0; i < nitems; i++)
{
- uint16 *indexes = NULL;
MCVItem *item = &mcvlist->items[i];
item->values = (Datum *) valuesptr;
@@ -1087,27 +1147,35 @@ statext_mcv_deserialize(bytea *data)
item->isnull = (bool *) isnullptr;
isnullptr += MAXALIGN(sizeof(bool) * ndims);
+ memcpy(item->isnull, ptr, sizeof(bool) * ndims);
+ ptr += sizeof(bool) * ndims;
- /* just point to the right place */
- indexes = ITEM_INDEXES(ptr);
+ memcpy(&item->frequency, ptr, sizeof(double));
+ ptr += sizeof(double);
- memcpy(item->isnull, ITEM_NULLS(ptr, ndims), sizeof(bool) *
ndims);
- memcpy(&item->frequency, ITEM_FREQUENCY(ptr, ndims),
sizeof(double));
- memcpy(&item->base_frequency, ITEM_BASE_FREQUENCY(ptr, ndims),
sizeof(double));
+ memcpy(&item->base_frequency, ptr, sizeof(double));
+ ptr += sizeof(double);
- /* translate the values */
+ /* finally translate the indexes (for non-NULL only) */
for (dim = 0; dim < ndims; dim++)
- if (!item->isnull[dim])
- item->values[dim] = map[dim][indexes[dim]];
+ {
+ uint16 index;
+
+ memcpy(&index, ptr, sizeof(uint16));
+ ptr += sizeof(uint16);
- ptr += ITEM_SIZE(ndims);
+ if (item->isnull[dim])
+ continue;
+
+ item->values[dim] = map[dim][index];
+ }
/* check we're not overflowing the input */
- Assert(ptr <= (char *) raw + VARSIZE_ANY(data));
+ Assert(ptr <= endptr);
}
/* check that we processed all the data */
- Assert(ptr == raw + VARSIZE_ANY(data));
+ Assert(ptr == endptr);
/* release the buffers used for mapping */
for (dim = 0; dim < ndims; dim++)
diff --git a/src/include/statistics/extended_stats_internal.h
b/src/include/statistics/extended_stats_internal.h
index fb03c52f50..6778746db1 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -36,6 +36,7 @@ typedef struct DimensionInfo
{
int nvalues; /* number of
deduplicated values */
int nbytes; /* number of bytes
(serialized) */
+ int nbytes_aligned; /* deserialized data with
alignment */
int typlen; /* pg_type.typlen */
bool typbyval; /* pg_type.typbyval */
} DimensionInfo;