Hi, diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index 9fd7b4e019b..97c0125a4ba 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -337,17 +337,75 @@ DecrTupleDescRefCount(TupleDesc tupdesc) { Assert(tupdesc->tdrefcount > 0); - ResourceOwnerForgetTupleDesc(CurrentResourceOwner, tupdesc); + if (CurrentResourceOwner != NULL) + ResourceOwnerForgetTupleDesc(CurrentResourceOwner, tupdesc); if (--tupdesc->tdrefcount == 0) FreeTupleDesc(tupdesc); }
What's this about? CurrentResourceOwner should always be valid here, no? If so, why did that change? I don't think it's good to detach this from the resowner infrastructure... /* - * Compare two TupleDesc structures for logical equality + * Compare two TupleDescs' attributes for logical equality * * Note: we deliberately do not check the attrelid and tdtypmod fields. * This allows typcache.c to use this routine to see if a cached record type * matches a requested type, and is harmless for relcache.c's uses. + */ +bool +equalTupleDescAttrs(Form_pg_attribute attr1, Form_pg_attribute attr2) +{ comment not really accurate, this routine afaik isn't used by typcache.c? /* - * Magic numbers for parallel state sharing. Higher-level code should use - * smaller values, leaving these very large ones for use by this module. + * Magic numbers for per-context parallel state sharing. Higher-level code + * should use smaller values, leaving these very large ones for use by this + * module. */ #define PARALLEL_KEY_FIXED UINT64CONST(0xFFFFFFFFFFFF0001) #define PARALLEL_KEY_ERROR_QUEUE UINT64CONST(0xFFFFFFFFFFFF0002) @@ -63,6 +74,16 @@ #define PARALLEL_KEY_ACTIVE_SNAPSHOT UINT64CONST(0xFFFFFFFFFFFF0007) #define PARALLEL_KEY_TRANSACTION_STATE UINT64CONST(0xFFFFFFFFFFFF0008) #define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSION_DSM UINT64CONST(0xFFFFFFFFFFFF000A) + +/* Magic number for per-session DSM TOC. */ +#define PARALLEL_SESSION_MAGIC 0xabb0fbc9 + +/* + * Magic numbers for parallel state sharing in the per-session DSM area. + */ +#define PARALLEL_KEY_SESSION_DSA UINT64CONST(0xFFFFFFFFFFFF0001) +#define PARALLEL_KEY_RECORD_TYPMOD_REGISTRY UINT64CONST(0xFFFFFFFFFFFF0002) Not this patch's fault, but this infrastructure really isn't great. We should really replace it with a shmem.h style infrastructure, using a dht hashtable as backing... +/* The current per-session DSM segment, if attached. */ +static dsm_segment *current_session_segment = NULL; + I think it'd be better if we had a proper 'SessionState' and 'BackendSessionState' infrastructure that then contains the dsm segment etc. I think we'll otherwise just end up with a bunch of parallel infrastructures. +/* + * A mechanism for sharing record typmods between backends. + */ +struct SharedRecordTypmodRegistry +{ + dht_hash_table_handle atts_index_handle; + dht_hash_table_handle typmod_index_handle; + pg_atomic_uint32 next_typmod; +}; + I think the code needs to explain better how these are intended to be used. IIUC, atts_index is used to find typmods by "identity", and typmod_index by the typmod, right? And we need both to avoid all workers generating different tupledescs, right? Kinda guessable by reading typecache.c, but that shouldn't be needed. +/* + * A flattened/serialized representation of a TupleDesc for use in shared + * memory. Can be converted to and from regular TupleDesc format. Doesn't + * support constraints and doesn't store the actual type OID, because this is + * only for use with RECORD types as created by CreateTupleDesc(). These are + * arranged into a linked list, in the hash table entry corresponding to the + * OIDs of the first 16 attributes, so we'd expect to get more than one entry + * in the list when named and other properties differ. + */ +typedef struct SerializedTupleDesc +{ + dsa_pointer next; /* next with the same same attribute OIDs */ + int natts; /* number of attributes in the tuple */ + int32 typmod; /* typmod for tuple type */ + bool hasoid; /* tuple has oid attribute in its header */ + + /* + * The attributes follow. We only ever access the first + * ATTRIBUTE_FIXED_PART_SIZE bytes of each element, like the code in + * tupdesc.c. + */ + FormData_pg_attribute attributes[FLEXIBLE_ARRAY_MEMBER]; +} SerializedTupleDesc; Not a fan of a separate tupledesc representation, that's just going to lead to divergence over time. I think we should rather change the normal tupledesc representation to be compatible with this, and 'just' have a wrapper struct for the parallel case (with next and such). +/* + * An entry in SharedRecordTypmodRegistry's attribute index. The key is the + * first REC_HASH_KEYS attribute OIDs. That means that collisions are + * possible, but that's OK because SerializedTupleDesc objects are arranged + * into a list. + */ +/* Parameters for SharedRecordTypmodRegistry's attributes hash table. */ +const static dht_parameters srtr_atts_index_params = { + sizeof(Oid) * REC_HASH_KEYS, + sizeof(SRTRAttsIndexEntry), + memcmp, + tag_hash, + LWTRANCHE_SHARED_RECORD_ATTS_INDEX +}; + +/* Parameters for SharedRecordTypmodRegistry's typmod hash table. */ +const static dht_parameters srtr_typmod_index_params = { + sizeof(uint32), + sizeof(SRTRTypmodIndexEntry), + memcmp, + tag_hash, + LWTRANCHE_SHARED_RECORD_TYPMOD_INDEX +}; + I'm very much not a fan of this representation. I know you copied the logic, but I think it's a bad idea. I think the key should just be a dsa_pointer, and then we can have a proper tag_hash that hashes the whole thing, and a proper comparator too. Just have /* * Combine two hash values, resulting in another hash value, with decent bit * mixing. * * Similar to boost's hash_combine(). */ static inline uint32 hash_combine(uint32 a, uint32 b) { a ^= b + 0x9e3779b9 + (a << 6) + (a >> 2); return a; } and then hash everything. +/* + * Make sure that RecordCacheArray is large enough to store 'typmod'. + */ +static void +ensure_record_cache_typmod_slot_exists(int32 typmod) +{ + if (RecordCacheArray == NULL) + { + RecordCacheArray = (TupleDesc *) + MemoryContextAllocZero(CacheMemoryContext, 64 * sizeof(TupleDesc)); + RecordCacheArrayLen = 64; + } + + if (typmod >= RecordCacheArrayLen) + { + int32 newlen = RecordCacheArrayLen * 2; + + while (typmod >= newlen) + newlen *= 2; + + RecordCacheArray = (TupleDesc *) repalloc(RecordCacheArray, + newlen * sizeof(TupleDesc)); + memset(RecordCacheArray + RecordCacheArrayLen, 0, + (newlen - RecordCacheArrayLen) * sizeof(TupleDesc *)); + RecordCacheArrayLen = newlen; + } +} Do we really want to keep this? Could just have an equivalent dynahash for the non-parallel case? /* * lookup_rowtype_tupdesc_internal --- internal routine to lookup a rowtype @@ -1229,15 +1347,49 @@ lookup_rowtype_tupdesc_internal(Oid type_id, int32 typmod, bool noError) /* * It's a transient record type, so look in our record-type table. */ - if (typmod < 0 || typmod >= NextRecordTypmod) + if (typmod >= 0) { - if (!noError) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("record type has not been registered"))); - return NULL; + /* It is already in our local cache? */ + if (typmod < RecordCacheArrayLen && + RecordCacheArray[typmod] != NULL) + return RecordCacheArray[typmod]; + + /* Are we attached to a SharedRecordTypmodRegistry? */ + if (CurrentSharedRecordTypmodRegistry.shared != NULL) Why do we want to do lookups in both? I don't think it's a good idea to have a chance that you could have the same typmod in both the local registry (because it'd been created before the shared one) and in the shared (because it was created in a worker). Ah, that's for caching purposes? If so, see my above point that we shouldn't have a serialized version of typdesc (yesyes, constraints will be a bit ugly). +/* + * If we are attached to a SharedRecordTypmodRegistry, find or create a + * SerializedTupleDesc that matches 'tupdesc', and return its typmod. + * Otherwise return -1. + */ +static int32 +find_or_allocate_shared_record_typmod(TupleDesc tupdesc) +{ + SRTRAttsIndexEntry *atts_index_entry; + SRTRTypmodIndexEntry *typmod_index_entry; + SerializedTupleDesc *serialized; + dsa_pointer serialized_dp; + Oid hashkey[REC_HASH_KEYS]; + bool found; + int32 typmod; + int i; + + /* If not even attached, nothing to do. */ + if (CurrentSharedRecordTypmodRegistry.shared == NULL) + return -1; + + /* Try to find a match. */ + memset(hashkey, 0, sizeof(hashkey)); + for (i = 0; i < tupdesc->natts; ++i) + hashkey[i] = tupdesc->attrs[i]->atttypid; + atts_index_entry = (SRTRAttsIndexEntry *) + dht_find_or_insert(CurrentSharedRecordTypmodRegistry.atts_index, + hashkey, + &found); + if (!found) + { + /* Making a new entry. */ + memcpy(atts_index_entry->leading_attr_oids, + hashkey, + sizeof(hashkey)); + atts_index_entry->serialized_tupdesc = InvalidDsaPointer; + } + + /* Scan the list we found for a matching serialized one. */ + serialized_dp = atts_index_entry->serialized_tupdesc; + while (DsaPointerIsValid(serialized_dp)) + { + serialized = + dsa_get_address(CurrentSharedRecordTypmodRegistry.area, + serialized_dp); + if (serialized_tupledesc_matches(serialized, tupdesc)) + { + /* Found a match, we are finished. */ + typmod = serialized->typmod; + dht_release(CurrentSharedRecordTypmodRegistry.atts_index, + atts_index_entry); + return typmod; + } + serialized_dp = serialized->next; + } + + /* We didn't find a matching entry, so let's allocate a new one. */ + typmod = (int) + pg_atomic_fetch_add_u32(&CurrentSharedRecordTypmodRegistry.shared->next_typmod, + 1); + + /* Allocate shared memory and serialize the TupleDesc. */ + serialized_dp = serialize_tupledesc(CurrentSharedRecordTypmodRegistry.area, + tupdesc); + serialized = (SerializedTupleDesc *) + dsa_get_address(CurrentSharedRecordTypmodRegistry.area, serialized_dp); + serialized->typmod = typmod; + + /* + * While we still hold the atts_index entry locked, add this to + * typmod_index. That's important because we don't want anyone to be able + * to find a typmod via the former that can't yet be looked up in the + * latter. + */ + typmod_index_entry = + dht_find_or_insert(CurrentSharedRecordTypmodRegistry.typmod_index, + &typmod, &found); + if (found) + elog(ERROR, "cannot create duplicate shared record typmod"); + typmod_index_entry->typmod = typmod; + typmod_index_entry->serialized_tupdesc = serialized_dp; + dht_release(CurrentSharedRecordTypmodRegistry.typmod_index, + typmod_index_entry); What if we fail to allocate memory for the entry in typmod_index? - Andres -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers