Hi, On 2021-02-04 23:51:39 +1300, David Rowley wrote:
> I ended up adding just two new API functions to table AM. > > void (*scan_set_tid_range) (TableScanDesc sscan, > ItemPointer mintid, > ItemPointer maxtid); > > and > bool (*scan_tid_range_nextslot) (TableScanDesc sscan, > ScanDirection direction, > TupleTableSlot *slot); > > I added an additional function in tableam.h that does not have a > corresponding API function: > > static inline TableScanDesc > table_tid_range_start(Relation rel, Snapshot snapshot, > ItemPointer mintid, > ItemPointer maxtid) > > This just calls the standard scan_begin then calls scan_set_tid_range > setting the specified mintid and maxtid. Hm. But that means we can't rescan? > I also added 2 new fields to TableScanDesc: > > ItemPointerData rs_mintid; > ItemPointerData rs_maxtid; > > I didn't quite see a need to have a new start and end scan API function. Yea. I guess they're not that large. Avoiding that was one of the two reasons to have a separate scan state somewhere. The other that it seemed like it'd possibly a bit cleaner API wise to deal with rescan. > +bool > +heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, > + TupleTableSlot *slot) > +{ > + HeapScanDesc scan = (HeapScanDesc) sscan; > + ItemPointer mintid = &sscan->rs_mintid; > + ItemPointer maxtid = &sscan->rs_maxtid; > + > + /* Note: no locking manipulations needed */ > + for (;;) > + { > + if (sscan->rs_flags & SO_ALLOW_PAGEMODE) > + heapgettup_pagemode(scan, direction, sscan->rs_nkeys, > sscan->rs_key); > + else > + heapgettup(scan, direction, sscan->rs_nkeys, > sscan->rs_key); > + > + if (scan->rs_ctup.t_data == NULL) > + { > + ExecClearTuple(slot); > + return false; > + } > + > + /* > + * heap_set_tidrange will have used heap_setscanlimits to limit > the > + * range of pages we scan to only ones that can contain the TID > range > + * we're scanning for. Here we must filter out any tuples from > these > + * pages that are outwith that range. > + */ > + if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0) > + { > + ExecClearTuple(slot); > + > + /* > + * When scanning backwards, the TIDs will be in > descending order. > + * Future tuples in this direction will be lower still, > so we can > + * just return false to indicate there will be no more > tuples. > + */ > + if (ScanDirectionIsBackward(direction)) > + return false; > + > + continue; > + } > + > + /* > + * Likewise for the final page, we must filter out TIDs greater > than > + * maxtid. > + */ > + if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0) > + { > + ExecClearTuple(slot); > + > + /* > + * When scanning forward, the TIDs will be in ascending > order. > + * Future tuples in this direction will be higher > still, so we can > + * just return false to indicate there will be no more > tuples. > + */ > + if (ScanDirectionIsForward(direction)) > + return false; > + continue; > + } > + > + break; > + } > + > + /* > + * if we get here it means we have a new current scan tuple, so point to > + * the proper return buffer and return the tuple. > + */ > + pgstat_count_heap_getnext(scan->rs_base.rs_rd); I wonder if there's an argument for counting the misses above via pgstat_count_heap_fetch()? Probably not, right? > +#define IsCTIDVar(node) \ > + ((node) != NULL && \ > + IsA((node), Var) && \ > + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ > + ((Var *) (node))->varlevelsup == 0) > + > +typedef enum > +{ > + TIDEXPR_UPPER_BOUND, > + TIDEXPR_LOWER_BOUND > +} TidExprType; > + > +/* Upper or lower range bound for scan */ > +typedef struct TidOpExpr > +{ > + TidExprType exprtype; /* type of op; lower or upper */ > + ExprState *exprstate; /* ExprState for a TID-yielding subexpr > */ > + bool inclusive; /* whether op is inclusive */ > +} TidOpExpr; > + > +/* > + * For the given 'expr', build and return an appropriate TidOpExpr taking > into > + * account the expr's operator and operand order. > + */ > +static TidOpExpr * > +MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) > +{ > + Node *arg1 = get_leftop((Expr *) expr); > + Node *arg2 = get_rightop((Expr *) expr); > + ExprState *exprstate = NULL; > + bool invert = false; > + TidOpExpr *tidopexpr; > + > + if (IsCTIDVar(arg1)) > + exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps); > + else if (IsCTIDVar(arg2)) > + { > + exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps); > + invert = true; > + } > + else > + elog(ERROR, "could not identify CTID variable"); > + > + tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr)); > + tidopexpr->inclusive = false; /* for now */ > + > + switch (expr->opno) > + { > + case TIDLessEqOperator: > + tidopexpr->inclusive = true; > + /* fall through */ > + case TIDLessOperator: > + tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : > TIDEXPR_UPPER_BOUND; > + break; > + case TIDGreaterEqOperator: > + tidopexpr->inclusive = true; > + /* fall through */ > + case TIDGreaterOperator: > + tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : > TIDEXPR_LOWER_BOUND; > + break; > + default: > + elog(ERROR, "could not identify CTID operator"); > + } > + > + tidopexpr->exprstate = exprstate; > + > + return tidopexpr; > +} > + > +/* > + * Extract the qual subexpressions that yield TIDs to search for, > + * and compile them into ExprStates if they're ordinary expressions. > + */ > +static void > +TidExprListCreate(TidRangeScanState *tidrangestate) > +{ > + TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan; > + List *tidexprs = NIL; > + ListCell *l; > + > + foreach(l, node->tidrangequals) > + { > + OpExpr *opexpr = lfirst(l); > + TidOpExpr *tidopexpr; > + > + if (!IsA(opexpr, OpExpr)) > + elog(ERROR, "could not identify CTID expression"); > + > + tidopexpr = MakeTidOpExpr(opexpr, tidrangestate); > + tidexprs = lappend(tidexprs, tidopexpr); > + } > + > + tidrangestate->trss_tidexprs = tidexprs; > +} Architecturally it feels like this is something that really belongs more into plan time? > +/* > + * table_set_tidrange resets the minimum and maximum TID range to scan for a > + * TableScanDesc created by table_beginscan_tidrange. > + */ > +static inline void > +table_set_tidrange(TableScanDesc sscan, ItemPointer mintid, > + ItemPointer maxtid) > +{ > + /* Ensure table_beginscan_tidrange() was used. */ > + Assert((sscan->rs_flags & SO_TYPE_TIDRANGESCAN) != 0); > + > + sscan->rs_rd->rd_tableam->scan_set_tidrange(sscan, mintid, maxtid); > +} How does this interact with rescans? Greetings, Andres Freund