On Tue, Dec 20, 2011 at 07:08, Tom Lane <t...@sss.pgh.pa.us> wrote: > it'd likely be better if this code ignored unrecognized qual expression > types rather than Assert'ing they're not there.
The patch replaced that Assert with an elog(ERROR) > Hmm. I am reminded of how utterly unreadable "diff -u" format is for > anything longer than single-line changes :-( ... Sorry, the new patch is in context (-C) diff format proper. I also moved around code a bit and removed an unused variable that was left around from the refactoring. > but I think I don't > like this refactoring much. Will take a closer look tomorrow. I was afraid you'd say that, especially for a change that should be backpatched. But I couldn't think of alternative ways to do it that give non-bogus estimates. ---- While writing this patch, the largest dilemma was where to account for the multiple array scans. Given that this code is mostly a heuristic and I lack a deep understanding of GIN indexes, it's likely that I got this part wrong. Currently I'm doing this: partialEntriesInQuals *= array_scans; exactEntriesInQuals *= array_scans; searchEntriesInQuals *= array_scans; Which seems to be the right thing as far as random disk accesses are concerned (successive scans are more likely to hit the cache) and also works well with queries that don't touch most of the index. But this fails spectacularly when multiple full scans are performed e.g. LIKE ANY ('{%,%,%}'). Because index_pages_fetched() ends up removing all of the rescan costs. Another approach is multiplying the total cost from the number of scans. This overestimates random accesses from rescans, but fixes the above case: *indexTotalCost = (*indexStartupCost + dataPagesFetched * spc_random_page_cost) * array_scans; Regards, Marti
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out new file mode 100644 index e7af7d4..250d853 *** a/contrib/pg_trgm/expected/pg_trgm.out --- b/contrib/pg_trgm/expected/pg_trgm.out *************** explain (costs off) *** 3486,3491 **** --- 3486,3501 ---- Index Cond: (t ~~* '%BCD%'::text) (4 rows) + explain (costs off) + select * from test2 where t like any ('{%bcd%,qua%}'); + QUERY PLAN + --------------------------------------------------------- + Bitmap Heap Scan on test2 + Recheck Cond: (t ~~ ANY ('{%bcd%,qua%}'::text[])) + -> Bitmap Index Scan on test2_idx_gin + Index Cond: (t ~~ ANY ('{%bcd%,qua%}'::text[])) + (4 rows) + select * from test2 where t like '%BCD%'; t --- *************** select * from test2 where t ilike 'qua%' *** 3509,3514 **** --- 3519,3531 ---- quark (1 row) + select * from test2 where t like any ('{%bcd%,qua%}'); + t + -------- + abcdef + quark + (2 rows) + drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; *************** explain (costs off) *** 3528,3533 **** --- 3545,3560 ---- Index Cond: (t ~~* '%BCD%'::text) (2 rows) + explain (costs off) + select * from test2 where t like any ('{%bcd%,qua%}'); + QUERY PLAN + --------------------------------------------------------- + Bitmap Heap Scan on test2 + Recheck Cond: (t ~~ ANY ('{%bcd%,qua%}'::text[])) + -> Bitmap Index Scan on test2_idx_gist + Index Cond: (t ~~ ANY ('{%bcd%,qua%}'::text[])) + (4 rows) + select * from test2 where t like '%BCD%'; t --- *************** select * from test2 where t ilike 'qua%' *** 3551,3553 **** --- 3578,3587 ---- quark (1 row) + select * from test2 where t like any ('{%bcd%,qua%}'); + t + -------- + abcdef + quark + (2 rows) + diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql new file mode 100644 index ea902f6..ac969e6 *** a/contrib/pg_trgm/sql/pg_trgm.sql --- b/contrib/pg_trgm/sql/pg_trgm.sql *************** explain (costs off) *** 47,56 **** --- 47,59 ---- select * from test2 where t like '%BCD%'; explain (costs off) select * from test2 where t ilike '%BCD%'; + explain (costs off) + select * from test2 where t like any ('{%bcd%,qua%}'); select * from test2 where t like '%BCD%'; select * from test2 where t like '%bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; + select * from test2 where t like any ('{%bcd%,qua%}'); drop index test2_idx_gin; create index test2_idx_gist on test2 using gist (t gist_trgm_ops); set enable_seqscan=off; *************** explain (costs off) *** 58,64 **** --- 61,70 ---- select * from test2 where t like '%BCD%'; explain (costs off) select * from test2 where t ilike '%BCD%'; + explain (costs off) + select * from test2 where t like any ('{%bcd%,qua%}'); select * from test2 where t like '%BCD%'; select * from test2 where t like '%bcd%'; select * from test2 where t ilike '%BCD%'; select * from test2 where t ilike 'qua%'; + select * from test2 where t like any ('{%bcd%,qua%}'); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c new file mode 100644 index d06809e..4bca715 *** a/src/backend/utils/adt/selfuncs.c --- b/src/backend/utils/adt/selfuncs.c *************** find_index_column(Node *op, IndexOptInfo *** 6591,6596 **** --- 6591,6660 ---- } /* + * Estimate gin costs for a single search value. Returns false if no match + * is possible, true otherwise. All pointer arguments must be initialized by + * the caller. + */ + static bool + gin_costestimate_search(Oid extractProcOid, Datum value, int strategy_op, + double numEntries, double *partialEntriesInQuals, + double *exactEntriesInQuals, + double *searchEntriesInQuals) + { + int32 nentries = 0; + bool *partial_matches = NULL; + Pointer *extra_data = NULL; + bool *nullFlags = NULL; + int32 searchMode = GIN_SEARCH_MODE_DEFAULT; + int32 i; + + OidFunctionCall7(extractProcOid, + value, + PointerGetDatum(&nentries), + UInt16GetDatum(strategy_op), + PointerGetDatum(&partial_matches), + PointerGetDatum(&extra_data), + PointerGetDatum(&nullFlags), + PointerGetDatum(&searchMode)); + + if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) + return false; + + for (i = 0; i < nentries; i++) + { + /* + * For partial match we haven't any information to estimate + * number of matched entries in index, so, we just estimate it + * as 100 + */ + if (partial_matches && partial_matches[i]) + *partialEntriesInQuals += 100; + else + (*exactEntriesInQuals)++; + + (*searchEntriesInQuals)++; + } + + if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) + { + /* Treat "include empty" like an exact-match item */ + (*exactEntriesInQuals)++; + (*searchEntriesInQuals)++; + } + else if (searchMode != GIN_SEARCH_MODE_DEFAULT) + { + /* + * It's GIN_SEARCH_MODE_ALL, full index scan will be required. We + * treat this as if every key in the index had been listed in the + * query; is that reasonable? + */ + *searchEntriesInQuals += numEntries; + } + + return true; + } + + /* * GIN has search behavior completely different from other index types */ Datum *************** gincostestimate(PG_FUNCTION_ARGS) *** 6613,6619 **** numDataPages, numPendingPages, numEntries; ! bool haveFullScan = false; double partialEntriesInQuals = 0.0; double searchEntriesInQuals = 0.0; double exactEntriesInQuals = 0.0; --- 6677,6683 ---- numDataPages, numPendingPages, numEntries; ! bool matchPossible = true; double partialEntriesInQuals = 0.0; double searchEntriesInQuals = 0.0; double exactEntriesInQuals = 0.0; *************** gincostestimate(PG_FUNCTION_ARGS) *** 6623,6629 **** double qual_op_cost, qual_arg_cost, spc_random_page_cost, ! num_scans; QualCost index_qual_cost; Relation indexRel; GinStatsData ginStats; --- 6687,6694 ---- double qual_op_cost, qual_arg_cost, spc_random_page_cost, ! outer_scans, ! array_scans = 1.0; QualCost index_qual_cost; Relation indexRel; GinStatsData ginStats; *************** gincostestimate(PG_FUNCTION_ARGS) *** 6721,6739 **** int strategy_op; Oid lefttype, righttype; - int32 nentries = 0; - bool *partial_matches = NULL; - Pointer *extra_data = NULL; - bool *nullFlags = NULL; - int32 searchMode = GIN_SEARCH_MODE_DEFAULT; int indexcol; Assert(IsA(rinfo, RestrictInfo)); clause = rinfo->clause; ! Assert(IsA(clause, OpExpr)); ! leftop = get_leftop(clause); ! rightop = get_rightop(clause); ! clause_op = ((OpExpr *) clause)->opno; if ((indexcol = find_index_column(leftop, index)) >= 0) { --- 6786,6816 ---- int strategy_op; Oid lefttype, righttype; int indexcol; Assert(IsA(rinfo, RestrictInfo)); clause = rinfo->clause; ! ! if (IsA(clause, OpExpr)) ! { ! leftop = get_leftop(clause); ! rightop = get_rightop(clause); ! clause_op = ((OpExpr *) clause)->opno; ! } ! else if (IsA(clause, ScalarArrayOpExpr)) ! { ! ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; ! ! leftop = (Node *) linitial(saop->args); ! rightop = (Node *) lsecond(saop->args); ! clause_op = saop->opno; ! } ! else ! { ! elog(ERROR, "unsupported GIN indexqual type: %d", ! (int) nodeTag(clause)); ! continue; /* keep compiler quiet */ ! } if ((indexcol = find_index_column(leftop, index)) >= 0) { *************** gincostestimate(PG_FUNCTION_ARGS) *** 6761,6776 **** if (!IsA(operand, Const)) { searchEntriesInQuals++; continue; } /* If Const is null, there can be no matches */ if (((Const *) operand)->constisnull) { ! *indexStartupCost = 0; ! *indexTotalCost = 0; ! *indexSelectivity = 0; ! PG_RETURN_VOID(); } /* --- 6838,6855 ---- if (!IsA(operand, Const)) { searchEntriesInQuals++; + + if (IsA(rinfo->clause, ScalarArrayOpExpr)) + array_scans *= estimate_array_length(operand); + continue; } /* If Const is null, there can be no matches */ if (((Const *) operand)->constisnull) { ! matchPossible = false; ! break; } /* *************** gincostestimate(PG_FUNCTION_ARGS) *** 6800,6856 **** get_rel_name(index->indexoid)); } ! OidFunctionCall7(extractProcOid, ! ((Const *) operand)->constvalue, ! PointerGetDatum(&nentries), ! UInt16GetDatum(strategy_op), ! PointerGetDatum(&partial_matches), ! PointerGetDatum(&extra_data), ! PointerGetDatum(&nullFlags), ! PointerGetDatum(&searchMode)); ! ! if (nentries <= 0 && searchMode == GIN_SEARCH_MODE_DEFAULT) { ! /* No match is possible */ ! *indexStartupCost = 0; ! *indexTotalCost = 0; ! *indexSelectivity = 0; ! PG_RETURN_VOID(); } ! else { ! int32 i; ! for (i = 0; i < nentries; i++) { ! /* ! * For partial match we haven't any information to estimate ! * number of matched entries in index, so, we just estimate it ! * as 100 ! */ ! if (partial_matches && partial_matches[i]) ! partialEntriesInQuals += 100; ! else ! exactEntriesInQuals++; ! searchEntriesInQuals++; } - } ! if (searchMode == GIN_SEARCH_MODE_INCLUDE_EMPTY) ! { ! /* Treat "include empty" like an exact-match item */ ! exactEntriesInQuals++; ! searchEntriesInQuals++; ! } ! else if (searchMode != GIN_SEARCH_MODE_DEFAULT) ! { ! /* It's GIN_SEARCH_MODE_ALL */ ! haveFullScan = true; } } ! if (haveFullScan || indexQuals == NIL) { /* * Full index scan will be required. We treat this as if every key in --- 6879,6977 ---- get_rel_name(index->indexoid)); } ! if (IsA(rinfo->clause, OpExpr)) { ! matchPossible = ! gin_costestimate_search(extractProcOid, ! ((Const *) operand)->constvalue, ! strategy_op, ! numEntries, ! &partialEntriesInQuals, ! &exactEntriesInQuals, ! &searchEntriesInQuals); ! if (!matchPossible) ! break; } ! else if (IsA(rinfo->clause, ScalarArrayOpExpr)) { ! ArrayType *arrayval; ! int16 elmlen; ! bool elmbyval; ! char elmalign; ! int numElems; ! Datum *elemValues; ! bool *elemNulls; ! int numPossible = 0; ! int i; ! double partialEntries = 0.0, ! exactEntries = 0.0, ! searchEntries = 0.0; ! arrayval = DatumGetArrayTypeP(((Const *) operand)->constvalue); ! get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), ! &elmlen, &elmbyval, &elmalign); ! deconstruct_array(arrayval, ! ARR_ELEMTYPE(arrayval), ! elmlen, elmbyval, elmalign, ! &elemValues, &elemNulls, &numElems); ! ! for (i = 0; i < numElems; i++) { ! bool elemMatchPossible; ! /* NULL can't match anything */ ! if (elemNulls[i]) ! continue; ! ! elemMatchPossible = ! gin_costestimate_search(extractProcOid, ! elemValues[i], ! strategy_op, ! numEntries, ! &partialEntries, ! &exactEntries, ! &searchEntries); ! ! /* We only count array elements that can return any results */ ! if (elemMatchPossible) ! numPossible++; } ! if (numPossible == 0) ! { ! /* No useful searches in the array */ ! matchPossible = false; ! break; ! } ! ! /* ! * When there are multiple scalar expressions in an index scan, ! * the executor repeats the index scan for all possible ! * combinations (cartesian product) of input arrays. Instead of ! * doing that here, we only do a single pass over input arrays and ! * calculate the average entries for each scan. ! * ! * Later, we multiply average cost by array_scans (the product of ! * effective array lengths) ! */ ! partialEntriesInQuals += partialEntries / numPossible; ! exactEntriesInQuals += exactEntries / numPossible; ! searchEntriesInQuals += searchEntries / numPossible; ! ! array_scans *= numPossible; } } ! if (!matchPossible) ! { ! /* No match is possible */ ! *indexStartupCost = 0; ! *indexTotalCost = 0; ! *indexSelectivity = 0; ! PG_RETURN_VOID(); ! } ! ! if (indexQuals == NIL) { /* * Full index scan will be required. We treat this as if every key in *************** gincostestimate(PG_FUNCTION_ARGS) *** 6861,6869 **** /* Will we have more than one iteration of a nestloop scan? */ if (outer_rel != NULL && outer_rel->rows > 1) ! num_scans = outer_rel->rows; else ! num_scans = 1; /* * cost to begin scan, first of all, pay attention to pending list. --- 6982,6997 ---- /* Will we have more than one iteration of a nestloop scan? */ if (outer_rel != NULL && outer_rel->rows > 1) ! outer_scans = outer_rel->rows; else ! outer_scans = 1; ! ! /* ! * The cost needs to account for all rescans of ScalarArrayOpExprs ! */ ! partialEntriesInQuals *= array_scans; ! exactEntriesInQuals *= array_scans; ! searchEntriesInQuals *= array_scans; /* * cost to begin scan, first of all, pay attention to pending list. *************** gincostestimate(PG_FUNCTION_ARGS) *** 6888,6900 **** /* * Partial match algorithm reads all data pages before doing actual scan, ! * so it's a startup cost. Again, we havn't any useful stats here, so, * estimate it as proportion */ dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries); /* calculate cache effects */ ! if (num_scans > 1 || searchEntriesInQuals > 1) { entryPagesFetched = index_pages_fetched(entryPagesFetched, (BlockNumber) numEntryPages, --- 7016,7028 ---- /* * Partial match algorithm reads all data pages before doing actual scan, ! * so it's a startup cost. Again, we haven't any useful stats here, so, * estimate it as proportion */ dataPagesFetched = ceil(numDataPages * partialEntriesInQuals / numEntries); /* calculate cache effects */ ! if (outer_scans > 1 || array_scans > 1 || searchEntriesInQuals > 1) { entryPagesFetched = index_pages_fetched(entryPagesFetched, (BlockNumber) numEntryPages, *************** gincostestimate(PG_FUNCTION_ARGS) *** 6932,6938 **** dataPagesFetched = dataPagesFetchedBySel; } ! if (num_scans > 1) dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); --- 7060,7066 ---- dataPagesFetched = dataPagesFetchedBySel; } ! if (outer_scans > 1 || array_scans > 1) dataPagesFetched = index_pages_fetched(dataPagesFetched, (BlockNumber) numDataPages, numDataPages, root); diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out new file mode 100644 index e1d7646..9341dbe *** a/src/test/regress/expected/tsearch.out --- b/src/test/regress/expected/tsearch.out *************** SELECT count(*) FROM test_tsvector WHERE *** 142,147 **** --- 142,153 ---- 494 (1 row) + SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count + ------- + 158 + (1 row) + RESET enable_seqscan; DROP INDEX wowidx; CREATE INDEX wowidx ON test_tsvector USING gin (a); *************** SELECT count(*) FROM test_tsvector WHERE *** 188,193 **** --- 194,205 ---- 494 (1 row) + SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count + ------- + 158 + (1 row) + RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH'); SELECT * FROM ts_stat('SELECT a FROM test_tsvector') ORDER BY ndoc DESC, nentry DESC, word LIMIT 10; diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql new file mode 100644 index d261da2..9fd1207 *** a/src/test/regress/sql/tsearch.sql --- b/src/test/regress/sql/tsearch.sql *************** SELECT count(*) FROM test_tsvector WHERE *** 60,65 **** --- 60,66 ---- SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; + SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; *************** SELECT count(*) FROM test_tsvector WHERE *** 76,81 **** --- 77,83 ---- SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; + SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); RESET enable_seqscan; INSERT INTO test_tsvector VALUES ('???', 'DFG:1A,2B,6C,10 FGH');
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers