Index: doc/src/sgml/ref/cluster.sgml
===================================================================
RCS file: /projects/cvsroot/pgsql/doc/src/sgml/ref/cluster.sgml,v
retrieving revision 1.50
diff -c -r1.50 cluster.sgml
*** doc/src/sgml/ref/cluster.sgml	11 May 2010 16:07:42 -0000	1.50
--- doc/src/sgml/ref/cluster.sgml	21 Jul 2010 14:04:11 -0000
***************
*** 128,138 ****
     </para>
  
     <para>
!     During the cluster operation, a temporary copy of the table is created
!     that contains the table data in the index order.  Temporary copies of
!     each index on the table are created as well.  Therefore, you need free
!     space on disk at least equal to the sum of the table size and the index
!     sizes.
     </para>
  
     <para>
--- 128,159 ----
     </para>
  
     <para>
!     Based on:
!     <itemizedlist spacing="compact">
!      <listitem><para>statistics on the table</para></listitem>
!      <listitem><para>resource consumption configuration (see <xref linkend="runtime-config-resource">)</para></listitem>
!      <listitem><para>planner method configuration (see <xref linkend="runtime-config-query-enable">)</para></listitem>
!     </itemizedlist>
!     the planner can choose between two different methods to cluster the table:
!     <itemizedlist spacing="compact">
!      <listitem><para>index scan</para></listitem>
!      <listitem><para>table scan followed by sort</para></listitem>
!     </itemizedlist>
!     In the first case (index scan), during the cluster operation, a temporary
!     copy of the table is created that contains the table data in the index
!     order.
!     Temporary copies of each index on the table are created as well.  Therefore,
!     you need free space on disk at least equal to the sum of the table size and
!     the index sizes.
!     In the second case a full table scan is followed by a sort operation.
!     In addition to the free space needed by the previous case, this approach
!     may also need a temporary disk sort file which can be as big as the original
!     table. 
!     It is advisable to run <xref linkend="sql-analyze"> on the table before
!     clustering it, and to set the resource consumption parameters to sensible
!     values (especially <xref linkend="guc-work-mem">). 
!     Otherwise, the planner might make poor choices of the clustering method to
!     use.
     </para>
  
     <para>
***************
*** 149,184 ****
      Otherwise, the planner might make poor choices of query plans.
     </para>
  
-    <para>
-     There is another way to cluster data. The
-     <command>CLUSTER</command> command reorders the original table by
-     scanning it using the index you specify. This can be slow
-     on large tables because the rows are fetched from the table
-     in index order, and if the table is disordered, the
-     entries are on random pages, so there is one disk page
-     retrieved for every row moved. (<productname>PostgreSQL</productname> has
-     a cache, but the majority of a big table will not fit in the cache.)
-     The other way to cluster a table is to use:
- 
- <programlisting>
- CREATE TABLE <replaceable class="parameter">newtable</replaceable> AS
-     SELECT * FROM <replaceable class="parameter">table</replaceable> ORDER BY <replaceable class="parameter">columnlist</replaceable>;
- </programlisting>
- 
-     which uses the <productname>PostgreSQL</productname> sorting code
-     to produce the desired order;
-     this is usually much faster than an index scan for disordered data.
-     Then you drop the old table, use
-     <command>ALTER TABLE ... RENAME</command>
-     to rename <replaceable class="parameter">newtable</replaceable> to the
-     old name, and recreate the table's indexes.
-     The big disadvantage of this approach is that it does not preserve
-     OIDs, constraints, foreign key relationships, granted privileges, and
-     other ancillary properties of the table &mdash; all such items must be
-     manually recreated.  Another disadvantage is that this way requires a sort
-     temporary file about the same size as the table itself, so peak disk usage
-     is about three times the table size instead of twice the table size.
-    </para>
   </refsect1>
  
   <refsect1>
--- 170,175 ----
Index: src/include/optimizer/cost.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/optimizer/cost.h,v
retrieving revision 1.101
diff -c -r1.101 cost.h
*** src/include/optimizer/cost.h	19 Apr 2010 00:55:26 -0000	1.101
--- src/include/optimizer/cost.h	21 Jul 2010 14:04:17 -0000
***************
*** 110,115 ****
--- 110,117 ----
  extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan);
  extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root);
  extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root);
+ extern void cost_index_scan_vs_seqscansort(Oid tableOid, Oid indexOid,
+ 			  Cost *indexScanCost, Cost *seqScanAndSortCost);
  extern void set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel);
  extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
  						   RelOptInfo *outer_rel,
Index: src/include/utils/tuplesort.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/utils/tuplesort.h,v
retrieving revision 1.36
diff -c -r1.36 tuplesort.h
*** src/include/utils/tuplesort.h	26 Feb 2010 02:01:29 -0000	1.36
--- src/include/utils/tuplesort.h	21 Jul 2010 14:04:17 -0000
***************
*** 64,69 ****
--- 64,72 ----
  extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
  					  Oid sortOperator, bool nullsFirstFlag,
  					  int workMem, bool randomAccess);
+ extern Tuplesortstate *tuplesort_begin_rawheap(Relation indexRel,
+ 												TupleDesc tupDesc,
+ 												int workMem, bool randomAccess);
  
  extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
  
***************
*** 72,77 ****
--- 75,81 ----
  extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);
  extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
  				   bool isNull);
+ extern void tuplesort_putrawtuple(Tuplesortstate *state, HeapTuple tup);
  
  extern void tuplesort_performsort(Tuplesortstate *state);
  
***************
*** 81,86 ****
--- 85,92 ----
  						bool *should_free);
  extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
  				   Datum *val, bool *isNull);
+ extern HeapTuple tuplesort_getrawtuple(Tuplesortstate *state, bool forward,
+ 				   bool *should_free);
  
  extern void tuplesort_end(Tuplesortstate *state);
  
Index: src/backend/optimizer/path/costsize.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v
retrieving revision 1.218
diff -c -r1.218 costsize.c
*** src/backend/optimizer/path/costsize.c	6 Jul 2010 19:18:56 -0000	1.218
--- src/backend/optimizer/path/costsize.c	21 Jul 2010 14:04:14 -0000
***************
*** 76,81 ****
--- 76,82 ----
  #include "optimizer/cost.h"
  #include "optimizer/pathnode.h"
  #include "optimizer/placeholder.h"
+ #include "optimizer/plancat.h"
  #include "optimizer/planmain.h"
  #include "optimizer/restrictinfo.h"
  #include "parser/parsetree.h"
***************
*** 2672,2677 ****
--- 2673,2749 ----
  								  (void *) context);
  }
  
+ /*
+  * cost_index_scan_vs_seqscansort
+  *		Estimate the cost of a full scan of index indexOid on table tableOid
+  *		(*indexScanCost) vs. a seqscan plus sort (*seqScanAndSortCost).
+  */
+ void
+ cost_index_scan_vs_seqscansort(Oid tableOid,
+ 							   Oid indexOid,
+ 							   Cost *indexScanCost,
+ 							   Cost *seqScanAndSortCost)
+ {
+ 	RelOptInfo	   *rel;
+ 	PlannerInfo	   *root;
+ 	Query		   *query;
+ 	PlannerGlobal  *glob;
+ 	RangeTblEntry  *rte;
+ 	ListCell	   *index;
+ 	IndexPath	   *indexScanPath;
+ 	Path		   *seqScanAndSortPath;
+ 
+ 	/* make a dummy planner: only the fields set below are filled in */
+ 	glob = makeNode(PlannerGlobal);
+ 
+ 	query = makeNode(Query);
+ 	query->resultRelation = 0;
+ 
+ 	root = makeNode(PlannerInfo);
+ 	root->parse = query;
+ 	root->glob = glob;
+ 
+ 	rel = makeNode(RelOptInfo);
+ 	rel->reloptkind = RELOPT_BASEREL;
+ 	rel->relid = 1;				/* matches the slot used in simple_rte_array */
+ 	rel->rtekind = RTE_RELATION;
+ 	get_relation_info(root, tableOid, false, rel);
+ 	rel->rows = rel->tuples;	/* the whole table is read */
+ 
+ 	root->total_table_pages = rel->pages;
+ 
+ 	rte = makeNode(RangeTblEntry);
+ 	rte->rtekind = RTE_RELATION;
+ 	rte->relid = tableOid;
+ 
+ 	root->simple_rel_array_size = 2;
+ 	root->simple_rte_array = (RangeTblEntry **)
+ 		palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
+ 	root->simple_rte_array[1] = rte;
+ 
+ 	/* estimate the cost of seq scan + sort (sort cost is added on top) */
+ 	seqScanAndSortPath = create_seqscan_path(NULL, rel);
+ 	cost_sort(seqScanAndSortPath, root, NULL, seqScanAndSortPath->total_cost,
+ 			  rel->tuples, rel->width, -1);
+ 
+ 	/* estimate the cost of index scan: find indexOid among rel's indexes */
+ 	indexScanPath = NULL;
+ 	foreach(index, rel->indexlist)
+ 	{
+ 		IndexOptInfo *indexInfo = (IndexOptInfo*) lfirst(index);
+ 		if (indexInfo->indexoid == indexOid)
+ 		{
+ 			indexScanPath = create_index_path(root, indexInfo, NULL, NULL,
+ 											  ForwardScanDirection, NULL);
+ 			break;
+ 		}
+ 	}
+ 	if (indexScanPath == NULL)	/* an Assert here would vanish in production */
+ 		elog(ERROR, "index %u not found for relation %u", indexOid, tableOid);
+ 	*indexScanCost = indexScanPath->path.total_cost;
+ 	*seqScanAndSortCost = seqScanAndSortPath->total_cost;
+ }
+ 
  
  /*
   * adjust_semi_join
Index: src/backend/commands/cluster.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/commands/cluster.c,v
retrieving revision 1.203
diff -c -r1.203 cluster.c
*** src/backend/commands/cluster.c	28 Apr 2010 16:10:41 -0000	1.203
--- src/backend/commands/cluster.c	21 Jul 2010 14:04:13 -0000
***************
*** 36,41 ****
--- 36,42 ----
  #include "commands/trigger.h"
  #include "commands/vacuum.h"
  #include "miscadmin.h"
+ #include "optimizer/cost.h"
  #include "storage/bufmgr.h"
  #include "storage/procarray.h"
  #include "storage/smgr.h"
***************
*** 49,54 ****
--- 50,56 ----
  #include "utils/snapmgr.h"
  #include "utils/syscache.h"
  #include "utils/tqual.h"
+ #include "utils/tuplesort.h"
  
  
  /*
***************
*** 69,74 ****
--- 71,79 ----
  			   int freeze_min_age, int freeze_table_age,
  			   bool *pSwapToastByContent, TransactionId *pFreezeXid);
  static List *get_tables_to_cluster(MemoryContext cluster_context);
+ static void deform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc,
+ 			   TupleDesc newTupDesc, Datum *values, bool *isnull,
+ 			   bool newRelHasOids, RewriteState rwstate);
  
  
  
***************
*** 757,762 ****
--- 762,769 ----
  	TransactionId OldestXmin;
  	TransactionId FreezeXid;
  	RewriteState rwstate;
+ 	bool 		 use_sort;
+ 	Tuplesortstate *tuplesort;
  
  	/*
  	 * Open the relations we need.
***************
*** 848,876 ****
  	 * tuples that still need to be copied, we scan with SnapshotAny and use
  	 * HeapTupleSatisfiesVacuum for the visibility test.
  	 */
! 	if (OldIndex != NULL)
  	{
  		heapScan = NULL;
  		indexScan = index_beginscan(OldHeap, OldIndex,
  									SnapshotAny, 0, (ScanKey) NULL);
  	}
  	else
  	{
  		heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
  		indexScan = NULL;
  	}
  
  	for (;;)
  	{
  		HeapTuple	tuple;
- 		HeapTuple	copiedTuple;
  		Buffer		buf;
  		bool		isdead;
- 		int			i;
  
  		CHECK_FOR_INTERRUPTS();
  
! 		if (OldIndex != NULL)
  		{
  			tuple = index_getnext(indexScan, ForwardScanDirection);
  			if (tuple == NULL)
--- 855,903 ----
  	 * tuples that still need to be copied, we scan with SnapshotAny and use
  	 * HeapTupleSatisfiesVacuum for the visibility test.
  	 */
! 	if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
! 	{
! 		/*
! 		 * Check to see if a scan+sort would be less expensive than an index
! 		 * scan
! 		 */
! 		Cost 	indexScanCost,
! 				seqScanAndSortCost;
! 
! 		cost_index_scan_vs_seqscansort(OIDOldHeap, OIDOldIndex,
! 									   &indexScanCost, &seqScanAndSortCost);
! 		use_sort = seqScanAndSortCost < indexScanCost;
! 	}
! 	else
! 		use_sort = false;
! 
! 	if (OldIndex != NULL && !use_sort)
  	{
  		heapScan = NULL;
  		indexScan = index_beginscan(OldHeap, OldIndex,
  									SnapshotAny, 0, (ScanKey) NULL);
+ 		tuplesort = NULL;
  	}
  	else
  	{
  		heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
  		indexScan = NULL;
+ 		if (use_sort)
+ 			tuplesort = tuplesort_begin_rawheap(OldIndex, oldTupDesc,
+ 												maintenance_work_mem, false);
+ 		else
+ 			tuplesort = NULL;
  	}
  
  	for (;;)
  	{
  		HeapTuple	tuple;
  		Buffer		buf;
  		bool		isdead;
  
  		CHECK_FOR_INTERRUPTS();
  
! 		if (indexScan != NULL)
  		{
  			tuple = index_getnext(indexScan, ForwardScanDirection);
  			if (tuple == NULL)
***************
*** 949,993 ****
  			continue;
  		}
  
! 		/*
! 		 * We cannot simply copy the tuple as-is, for several reasons:
! 		 *
! 		 * 1. We'd like to squeeze out the values of any dropped columns, both
! 		 * to save space and to ensure we have no corner-case failures. (It's
! 		 * possible for example that the new table hasn't got a TOAST table
! 		 * and so is unable to store any large values of dropped cols.)
! 		 *
! 		 * 2. The tuple might not even be legal for the new table; this is
! 		 * currently only known to happen as an after-effect of ALTER TABLE
! 		 * SET WITHOUT OIDS.
! 		 *
! 		 * So, we must reconstruct the tuple from component Datums.
! 		 */
! 		heap_deform_tuple(tuple, oldTupDesc, values, isnull);
  
! 		/* Be sure to null out any dropped columns */
! 		for (i = 0; i < natts; i++)
! 		{
! 			if (newTupDesc->attrs[i]->attisdropped)
! 				isnull[i] = true;
! 		}
  
! 		copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
  
! 		/* Preserve OID, if any */
! 		if (NewHeap->rd_rel->relhasoids)
! 			HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
  
! 		/* The heap rewrite module does the rest */
! 		rewrite_heap_tuple(rwstate, tuple, copiedTuple);
  
! 		heap_freetuple(copiedTuple);
  	}
  
! 	if (OldIndex != NULL)
  		index_endscan(indexScan);
! 	else
  		heap_endscan(heapScan);
  
  	/* Write out any remaining tuples, and fsync if needed */
  	end_heap_rewrite(rwstate);
--- 976,1022 ----
  			continue;
  		}
  
! 		if (tuplesort != NULL)
! 			tuplesort_putrawtuple(tuplesort, tuple);
! 		else
! 			deform_and_rewrite_tuple(tuple, oldTupDesc, newTupDesc,
! 									 values, isnull,
! 									 NewHeap->rd_rel->relhasoids, rwstate);
! 	}
  
! 	/*
! 	 * In scan-and-sort mode, read all tuples back from the tuplesort in
! 	 * sorted order and write them out into the new relation.
! 	 */
! 	if (tuplesort != NULL)
! 	{
! 		tuplesort_performsort(tuplesort);
  
! 		for (;;)
! 		{
! 			HeapTuple	tuple;
! 			bool		shouldfree;
  
! 			CHECK_FOR_INTERRUPTS();
  
! 			tuple = tuplesort_getrawtuple(tuplesort, true, &shouldfree);
! 			if (tuple == NULL)
! 				break;
  
! 			deform_and_rewrite_tuple(tuple, oldTupDesc, newTupDesc,
! 									 values, isnull,
! 									 NewHeap->rd_rel->relhasoids, rwstate);
! 			if (shouldfree)
! 				heap_freetuple(tuple);
! 		}
  	}
  
! 	if (indexScan != NULL)
  		index_endscan(indexScan);
! 	if (heapScan != NULL)
  		heap_endscan(heapScan);
+ 	if (tuplesort != NULL)
+ 		tuplesort_end(tuplesort);
  
  	/* Write out any remaining tuples, and fsync if needed */
  	end_heap_rewrite(rwstate);
***************
*** 1486,1488 ****
--- 1515,1561 ----
  
  	return rvs;
  }
+ 
+ static void deform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc,
+ 										Datum *values, bool *isnull,	/* caller-provided work arrays */
+ 										bool newRelHasOids, RewriteState rwstate)
+ {
+ 	HeapTuple	copiedTuple;
+ 	int 		i;
+ 
+ 	/*
+ 	 * Rebuild the tuple for the new heap; a plain copy will not do, because:
+ 	 *
+ 	 * 1. We'd like to squeeze out the values of any dropped columns, both
+ 	 * to save space and to ensure we have no corner-case failures. (It's
+ 	 * possible for example that the new table hasn't got a TOAST table
+ 	 * and so is unable to store any large values of dropped cols.)
+ 	 *
+ 	 * 2. The tuple might not even be legal for the new table; this is
+ 	 * currently only known to happen as an after-effect of ALTER TABLE
+ 	 * SET WITHOUT OIDS.
+ 	 *
+ 	 * So, we must reconstruct the tuple from component Datums.
+ 	 */
+ 
+ 	heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+ 
+ 	/* Be sure to null out any dropped columns (as seen by the new descriptor) */
+ 	for (i = 0; i < newTupDesc->natts; i++)
+ 	{
+ 		if (newTupDesc->attrs[i]->attisdropped)
+ 			isnull[i] = true;
+ 	}
+ 
+ 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+ 
+ 	/* Preserve OID, if any */
+ 	if (newRelHasOids)
+ 		HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
+ 
+ 	/* The heap rewrite module does the rest; it gets both old and new tuple */
+ 	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+ 
+ 	heap_freetuple(copiedTuple);
+ }
+ 
Index: src/backend/utils/sort/tuplesort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v
retrieving revision 1.95
diff -c -r1.95 tuplesort.c
*** src/backend/utils/sort/tuplesort.c	26 Feb 2010 02:01:15 -0000	1.95
--- src/backend/utils/sort/tuplesort.c	21 Jul 2010 14:04:16 -0000
***************
*** 104,109 ****
--- 104,110 ----
  #include "access/nbtree.h"
  #include "catalog/pg_amop.h"
  #include "catalog/pg_operator.h"
+ #include "catalog/index.h"
  #include "commands/tablespace.h"
  #include "miscadmin.h"
  #include "pg_trace.h"
***************
*** 115,126 ****
--- 116,129 ----
  #include "utils/rel.h"
  #include "utils/syscache.h"
  #include "utils/tuplesort.h"
+ #include "executor/executor.h"
  
  
  /* sort-type codes for sort__start probes */
  #define HEAP_SORT	0
  #define INDEX_SORT	1
  #define DATUM_SORT	2
+ #define RAWHEAP_SORT	3
  
  /* GUC variables */
  #ifdef TRACE_SORT
***************
*** 366,371 ****
--- 369,378 ----
  	int			datumTypeLen;
  	bool		datumTypeByVal;
  
+ 	/* These are specific to the rawheap subcase: */
+ 	EState 	   *estate;
+ 	IndexInfo  *indexInfo;
+ 
  	/*
  	 * Resource snapshot for time of sort start.
  	 */
***************
*** 450,455 ****
--- 457,468 ----
  static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
  			 int tapenum, unsigned int len);
  static void reversedirection_heap(Tuplesortstate *state);
+ static int comparetup_rawheap(const SortTuple *a, const SortTuple *b,
+ 							  Tuplesortstate *state);
+ static void copytup_rawheap(Tuplesortstate *state, SortTuple *stup, void *tup);
+ static void writetup_rawheap(Tuplesortstate *state, int tapenum, SortTuple *stup);
+ static void readtup_rawheap(Tuplesortstate *state, SortTuple *stup,
+ 							int tapenum, unsigned int len);
  static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
  					   Tuplesortstate *state);
  static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
***************
*** 549,554 ****
--- 562,570 ----
  
  	state->result_tape = -1;	/* flag that result tape has not been formed */
  
+ 	/* set estate to NULL, so we don't try to free it later if not used */
+ 	state->estate = NULL;
+ 
  	MemoryContextSwitchTo(oldcontext);
  
  	return state;
***************
*** 762,767 ****
--- 778,843 ----
  	return state;
  }
  
+ Tuplesortstate *
+ tuplesort_begin_rawheap(Relation indexRel, TupleDesc tupDesc,
+ 						int workMem, bool randomAccess)
+ {
+ 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
+ 	MemoryContext oldcontext;
+ 
+ 	TupleTableSlot *existing_slot;	/* expression-index case only */
+ 	ExprContext	   *econtext;		/* expression-index case only */
+ 
+ 	Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
+ 
+ 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ 
+ #ifdef TRACE_SORT
+ 	if (trace_sort)
+ 		elog(LOG,
+ 			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
+ 			 RelationGetNumberOfAttributes(indexRel), workMem, randomAccess ? 't' : 'f');
+ #endif
+ 
+ 	state->nKeys = RelationGetNumberOfAttributes(indexRel);	/* one key per index column */
+ 
+ 	TRACE_POSTGRESQL_SORT_START(RAWHEAP_SORT, false, state->nKeys, workMem, randomAccess);
+ 
+ 	state->comparetup = comparetup_rawheap;
+ 	state->copytup = copytup_rawheap;
+ 	state->writetup = writetup_rawheap;
+ 	state->readtup = readtup_rawheap;
+ 	state->reversedirection = reversedirection_heap;
+ 
+ 	state->indexInfo = BuildIndexInfo(indexRel);
+ 	state->indexScanKey = _bt_mkscankey_nodata(indexRel);
+ 	state->enforceUnique = false;
+ 
+ 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
+ 
+ 	if (state->indexInfo->ii_Expressions != NULL)
+ 	{
+ 		/* allocate the vars used by FormIndexDatum */
+ 		state->estate = CreateExecutorState();
+ 
+ 		/*
+ 		 * Need a TupleTableSlot to put existing tuples in.
+ 		 *
+ 		 * To use FormIndexDatum, we have to make the econtext's scantuple point
+ 		 * to this slot.  The estate is private to this sort, so the slot stays
+ 		 * installed until tuplesort_end drops it.
+ 		 */
+ 		existing_slot = MakeSingleTupleTableSlot(tupDesc);
+ 
+ 		econtext = GetPerTupleExprContext(state->estate);
+ 		econtext->ecxt_scantuple = existing_slot;
+ 	}
+ 
+ 	MemoryContextSwitchTo(oldcontext);
+ 
+ 	return state;
+ }
+ 
  /*
   * tuplesort_set_bound
   *
***************
*** 849,854 ****
--- 925,933 ----
  	 */
  	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
  #endif
+ 	if (state->estate != NULL)
+ 		ExecDropSingleTupleTableSlot(GetPerTupleExprContext(state->estate)->ecxt_scantuple);
+ 
  
  	MemoryContextSwitchTo(oldcontext);
  
***************
*** 980,985 ****
--- 1059,1086 ----
  }
  
  /*
+  * Accept one raw HeapTuple while collecting input data for a sort that was
+  * started with tuplesort_begin_rawheap.
+  * Note that the input data is always copied; the caller need not save it.
+  */
+ void
+ tuplesort_putrawtuple(Tuplesortstate *state, HeapTuple tup)
+ {
+ 	MemoryContext	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ 	SortTuple		stup;
+ 
+ 	/*
+ 	 * Copy the given tuple into memory we control, and decrease availMem.
+ 	 * Then call the common code.
+ 	 */
+ 	COPYTUP(state, &stup, (void *) tup);
+ 
+ 	puttuple_common(state, &stup);
+ 
+ 	MemoryContextSwitchTo(oldcontext);
+ }
+ 
+ /*
   * Shared code for tuple and datum cases.
   */
  static void
***************
*** 1482,1487 ****
--- 1583,1607 ----
  }
  
  /*
+  * Fetch the next raw HeapTuple in either forward or back direction.  Returns
+  * NULL if no more tuples.  If *should_free is set, the caller must pfree the
+  * returned tuple when done with it.
+  */
+ HeapTuple
+ tuplesort_getrawtuple(Tuplesortstate *state, bool forward, bool *should_free)
+ {
+ 	MemoryContext	oldcontext = MemoryContextSwitchTo(state->sortcontext);
+ 	SortTuple		stup;
+ 
+ 	if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
+ 		stup.tuple = NULL;		/* end of data: report it as a NULL tuple */
+ 
+ 	MemoryContextSwitchTo(oldcontext);
+ 
+ 	return stup.tuple;
+ }
+ 
+ /*
   * tuplesort_merge_order - report merge order we'll use for given memory
   * (note: "merge order" just means the number of input tapes in the merge).
   *
***************
*** 3079,3084 ****
--- 3199,3414 ----
  }
  
  /*
+  * Routines specialized for the raw on-disk HeapTuple case.  These are only
+  * used when we need the visibility info, for things like CLUSTER.  Otherwise
+  * we use the regular *tup_heap methods, which manipulate MinimalTuples.
+  */
+ static int
+ comparetup_rawheap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+ {
+ 	ScanKey		scanKey = state->indexScanKey;
+ 	HeapTuple	ltup;
+ 	HeapTuple	rtup;
+ 	TupleDesc	tupDesc;
+ 	int			nkey;
+ 	int32		compare;
+ 
+ 	/* Allow interrupting long sorts */
+ 	CHECK_FOR_INTERRUPTS();
+ 
+ 	/* Compare the leading sort key, cached in datum1/isnull1 */
+ 	compare = inlineApplySortFunction(&scanKey->sk_func,
+ 									  scanKey->sk_flags,
+ 									  a->datum1, a->isnull1,
+ 									  b->datum1, b->isnull1);
+ 	if (compare != 0)
+ 		return compare;
+ 
+ 	/* Compare additional sort keys */
+ 	ltup = (HeapTuple) a->tuple;
+ 	rtup = (HeapTuple) b->tuple;
+ 	scanKey++;					/* key 0 is done; both branches start at key 1 */
+ 
+ 	if (state->indexInfo->ii_Expressions == NULL)
+ 	{
+ 		/* if not expression index, just get the proper heap_getattr */
+ 
+ 		tupDesc = state->tupDesc;
+ 
+ 		for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++)
+ 		{
+ 			Datum		datum1,
+ 						datum2;
+ 			bool		isnull1,
+ 						isnull2;
+ 
+ 			datum1 = heap_getattr(ltup, state->indexInfo->ii_KeyAttrNumbers[nkey], tupDesc, &isnull1);
+ 			datum2 = heap_getattr(rtup, state->indexInfo->ii_KeyAttrNumbers[nkey], tupDesc, &isnull2);
+ 
+ 			compare = inlineApplySortFunction(&scanKey->sk_func,
+ 											  scanKey->sk_flags,
+ 											  datum1, isnull1,
+ 											  datum2, isnull2);
+ 			if (compare != 0)
+ 				return compare;
+ 		}
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * in the expression index case, we get all the values/nulls:
+ 		 * it would be faster to get only the required ones, but it would mean
+ 		 * copy&paste from FormIndexDatum
+ 		 */
+ 
+ 		Datum		l_existing_values[INDEX_MAX_KEYS];
+ 		bool		l_existing_isnull[INDEX_MAX_KEYS];
+ 		Datum		r_existing_values[INDEX_MAX_KEYS];
+ 		bool		r_existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+ 
+ 		ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   l_existing_values, l_existing_isnull);
+ 
+ 		ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   r_existing_values, r_existing_isnull);
+ 
+ 		for (nkey = 1; nkey < state->nKeys; nkey++, scanKey++)
+ 		{
+ 			compare = inlineApplySortFunction(&scanKey->sk_func,
+ 											  scanKey->sk_flags,
+ 											  l_existing_values[nkey],
+ 											  l_existing_isnull[nkey],
+ 											  r_existing_values[nkey],
+ 											  r_existing_isnull[nkey]);
+ 			if (compare != 0)
+ 				return compare;
+ 		}
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static void
+ copytup_rawheap(Tuplesortstate *state, SortTuple *stup, void *tup)
+ {
+ 	HeapTuple	tuple = (HeapTuple) tup;
+ 
+ 	/* copy the tuple into sort storage and charge its space to the sort */
+ 	stup->tuple = (void *) heap_copytuple(tuple);
+ 	USEMEM(state, GetMemoryChunkSpace(stup->tuple));
+ 	/* cache the first sort key in datum1/isnull1 for comparetup_rawheap */
+ 	if (state->indexInfo->ii_Expressions == NULL)
+ 	{
+ 		/* no expression index, just get the key datum value */
+ 		stup->datum1 = heap_getattr((HeapTuple) stup->tuple,
+ 									state->indexInfo->ii_KeyAttrNumbers[0],
+ 									state->tupDesc,
+ 									&stup->isnull1);
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * Expression index: compute all index column values with
+ 		 * FormIndexDatum even though only the first is kept; slower, but it
+ 		 * avoids duplicating FormIndexDatum.  NOTE(review): a by-reference
+ 		 * datum1 presumably points into estate memory -- confirm lifetime.
+ 		 */
+ 
+ 		Datum		existing_values[INDEX_MAX_KEYS];
+ 		bool		existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+ 		ExecStoreTuple(tuple,	ecxt_scantuple, InvalidBuffer, false);	/* caller's tuple, not the copy */
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   existing_values, existing_isnull);
+ 
+ 		stup->datum1 = existing_values[0];
+ 		stup->isnull1 = existing_isnull[0];
+ 	}
+ }
+ 
+ static void
+ writetup_rawheap(Tuplesortstate *state, int tapenum, SortTuple *stup)
+ {
+ 	HeapTuple	tuple = (HeapTuple) stup->tuple;
+ 	int	    	tuplen = tuple->t_len + HEAPTUPLESIZE;	/* total record size, length word included */
+ 
+ 	LogicalTapeWrite(state->tapeset, tapenum,
+ 					 &tuplen, sizeof(tuplen));	/* leading length word */
+ 	LogicalTapeWrite(state->tapeset, tapenum,
+ 					 (char *) tuple + sizeof(tuplen),	/* skip first word (presumably t_len) */
+ 					 HEAPTUPLESIZE - sizeof(tuplen));	/* rest of the HeapTupleData struct */
+ 	LogicalTapeWrite(state->tapeset, tapenum, tuple->t_data, tuple->t_len);	/* tuple body */
+ 	if (state->randomAccess)	/* need trailing length word? */
+ 		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
+ 
+ 	/* the in-memory copy is no longer needed: release its space and free it */
+ 	FREEMEM(state, GetMemoryChunkSpace(tuple));
+ 	heap_freetuple(tuple);
+ }
+ 
+ static void
+ readtup_rawheap(Tuplesortstate *state, SortTuple *stup,
+ 				int tapenum, unsigned int tuplen)
+ {
+ 	HeapTuple	tuple = (HeapTuple) palloc(tuplen);	/* struct and body in one chunk */
+ 
+ 	USEMEM(state, GetMemoryChunkSpace(tuple));
+ 
+ 	tuple->t_len = tuplen - HEAPTUPLESIZE;	/* inverse of the sum in writetup_rawheap */
+ 	if (LogicalTapeRead(state->tapeset, tapenum,
+ 						 (char *) tuple + sizeof(tuplen),
+ 			HEAPTUPLESIZE - sizeof(tuplen)) != HEAPTUPLESIZE - sizeof(tuplen))
+ 		elog(ERROR, "unexpected end of data");
+ 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);	/* replace the stale on-tape pointer */
+ 	if (LogicalTapeRead(state->tapeset, tapenum,
+ 						tuple->t_data, tuple->t_len) != tuple->t_len)
+ 		elog(ERROR, "unexpected end of data");
+ 	if (state->randomAccess)	/* need trailing length word? */
+ 		if (LogicalTapeRead(state->tapeset, tapenum, &tuplen,
+ 							sizeof(tuplen)) != sizeof(tuplen))
+ 			elog(ERROR, "unexpected end of data");
+ 
+ 	stup->tuple = tuple;
+ 
+ 	/* recompute the cached first sort key, which is not stored on tape */
+ 	if (state->indexInfo->ii_Expressions == NULL)
+ 	{
+ 		/* no expression index, just get the key datum value */
+ 		stup->datum1 = heap_getattr((HeapTuple) stup->tuple,
+ 									state->indexInfo->ii_KeyAttrNumbers[0],
+ 									state->tupDesc,
+ 									&stup->isnull1);
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * Expression index: compute all index column values with
+ 		 * FormIndexDatum even though only the first is kept; slower, but it
+ 		 * avoids duplicating FormIndexDatum.  NOTE(review): a by-reference
+ 		 * datum1 presumably points into estate memory -- confirm lifetime.
+ 		 */
+ 
+ 		Datum		existing_values[INDEX_MAX_KEYS];
+ 		bool		existing_isnull[INDEX_MAX_KEYS];
+ 		TupleTableSlot *ecxt_scantuple;
+ 
+ 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
+ 		ExecStoreTuple(tuple,	ecxt_scantuple, InvalidBuffer, false);
+ 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
+ 					   existing_values, existing_isnull);
+ 
+ 		stup->datum1 = existing_values[0];
+ 		stup->isnull1 = existing_isnull[0];
+ 	}
+ }
+ 
+ /*
   * Convenience routine to free a tuple previously loaded into sort memory
   */
  static void