Re: Use extended statistics to estimate (Var op Var) clauses

Mark Dilger Mon, 09 Aug 2021 12:20:17 -0700


> On Jul 20, 2021, at 11:28 AM, Tomas Vondra <tomas.von...@enterprisedb.com> 
> wrote:
> 
> Tomas Vondra
> EnterpriseDB: http://www.enterprisedb.com
> The Enterprise PostgreSQL Company
> <0001-Handling-Expr-op-Expr-clauses-in-extended-stats-20210720.patch>


Hi Tomas,

I tested this patch against master looking for types of clauses that uniformly 
get worse with the patch applied.  I found some.

The tests are too large to attach, but the scripts that generate them are not.  
To perform the tests:

        git checkout master
        perl ./gentest.pl > src/test/regress/sql/gentest.sql
        cat /dev/null > src/test/regress/expected/gentest.out
        echo "test: gentest" >> src/test/regress/parallel_schedule
        ./configure && make && make check
        cp src/test/regress/results/gentest.out 
src/test/regress/expected/gentest.out
        patch -p 1 < 
0001-Handling-Expr-op-Expr-clauses-in-extended-stats-20210720.patch
        make check
        cat src/test/regress/regression.diffs | perl ./check.pl

This shows patterns of conditions that get worse, such as:

better:0, worse:80:  A < B and A <> A or not A < A
better:0, worse:80:  A < B and not A <= A or A <= A
better:0, worse:80:  A < B or A = A
better:0, worse:80:  A < B or A = A or not A >= A
better:0, worse:80:  A < B or A >= A
better:0, worse:80:  A < B or A >= A and not A <> A
better:0, worse:80:  A < B or not A < A
better:0, worse:80:  A < B or not A <> A
better:0, worse:80:  A < B or not A <> A or A <= A
better:0, worse:80:  A < B or not A >= A or not A < A

It seems things get worse when the conditions contain a column compared against 
itself.  I suspect that is being handled incorrectly.

#!/usr/bin/perl

use strict;
use warnings;

my ($query, $where, $before, $after, @before, @after);
my $better = 0;
my $worse = 0;
my %where = ();
my %gripe;
while (<>)
{
 	if (/from check_estimated_rows\('(.*)'\)/)
	{
		$query = $1;
		if ($query =~ m/where (.*)$/)
		{
			$where = $1;
			my %columns = map { $_ => 1 } ($where =~ m/(column_\d+)/g);
			my @normal = ('A'..'Z');
			my @columns = sort keys %columns;
			for my $i (0..$#columns)
			{
				my $old = $columns[$i];
				my $new = $normal[$i];
				$where =~ s/\b$old\b/$new/g;
			}
		}
	}
	elsif (m/^-(\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\d+))\s*$/)
	{
		($before, @before) = ($1, $2, $3, $4);
	}
	elsif (m/^\+(\s*(\d+)\s*\|\s*(\d+)\s*\|\s*(\d+))\s*$/)
	{
		($after, @after) = ($1, $2, $3, $4);

		$where{$where}->{better} ||= 0;
		$where{$where}->{worse} ||= 0;

		# Don't count the difference as meaningful unless we're more than 5 better or worse than before
		if ($after[2] > 5 + $before[2])
		{
			$worse++;
			$where{$where}->{worse}++;
		}
		elsif ($after[2] + 5 < $before[2])
		{
			$better++;
			$where{$where}->{better}++;
		}

		if (!exists $gripe{$where} && $where{$where}->{better} == 0 && $where{$where}->{worse} > 50)
		{
			print "TERRIBLE:\n";
			print "\tQUERY: $query\n";
			print "\tBEFORE: $before\n";
			print "\tAFTER:  $after\n\n";
			$gripe{$query} = 1;
		}
	}
}

foreach my $where (sort keys %where)
{
	if ($where{$where}->{better} < $where{$where}->{worse})
	{
		print("better:", $where{$where}->{better}, ", worse:", $where{$where}->{worse}, ":  $where\n");
	}
}
print("\n\nTOTAL:\n");
print("\tbetter: $better\n");
print("\tworse: $worse\n");

#!/usr/bin/perl

use strict;
use warnings;

our ($tblnum, $tblname, $colnum, $colname);

# Generate the where clauses to be used on all tables
our (%wherepattern1, %wherepattern2, %wherepattern3);
my @ops = (">", "<", ">=", "<=", "=", "<>");
my @conj = ("and", "or", "and not", "or not");
for (1..100)
{
	my $op1 = $ops[int(rand(@ops))];
	my $op2 = $ops[int(rand(@ops))];
	my $op3 = $ops[int(rand(@ops))];
	my $conj1 = $conj[int(rand(@conj))];
	my $conj2 = $conj[int(rand(@conj))];

	$wherepattern1{"\%s $op1 \%s"} = 1;
	$wherepattern2{"\%s $op1 \%s $conj1 \%s $op2 \%s"} = 1;
	$wherepattern3{"\%s $op1 \%s $conj1 \%s $op2 \%s $conj2 \%s $op3 \%s"} = 1;
}

sub next_table {
	$tblnum++;
	$tblname = "table_$tblnum";
	$colnum = 0;
	$colname = "column_$colnum";
}

sub next_column {
	$colnum++;
	$colname = "column_$colnum";
}

for my $colcnt (2..10)
{
	next_table();
	print("CREATE TABLE $tblname (\n");
	for (1..$colcnt-1)
	{
		next_column();
		print("\t$colname INTEGER,\n");
	}
	next_column();
	print("\t$colname INTEGER\n);\n");
	print("INSERT INTO $tblname (SELECT ",
		  join(", ", map { "gs-$_" } (1..$colcnt)),
		  " FROM generate_series(1,100) gs);\n");
	print("VACUUM FREEZE $tblname;\n");
	for my $colmax (2..$colcnt)
	{
		print("CREATE STATISTICS ${tblname}_stats_${colmax} ON ",
			  join(", ", map { "column_$_" } (1..$colmax)),
			  " FROM $tblname;\n");
	}
	print("ANALYZE $tblname;\n");
}

# Restart the table sequence
$tblnum = 0;

for my $colcnt (2..10)
{
	next_table();

	for (1..100)
	{
		my $a = sprintf("column_%d", 1+int(rand($colcnt)));
		my $b = sprintf("column_%d", 1+int(rand($colcnt)));
		my $c = sprintf("column_%d", 1+int(rand($colcnt)));

		foreach my $where1 (keys %wherepattern1)
		{
			my $whereclause1 = sprintf($where1, $a, $b);
			print("
select actual, estimated, abs(actual - estimated) AS misestimate
	from check_estimated_rows('select * from $tblname where $whereclause1');");
		}
		foreach my $where2 (keys %wherepattern2)
		{
			my $whereclause1 = sprintf($where2, $a, $b, $a, $c);
			print("
select actual, estimated, abs(actual - estimated) AS misestimate
	from check_estimated_rows('select * from $tblname where $whereclause1');");
		}
		foreach my $where3 (keys %wherepattern3)
		{
			my $whereclause1 = sprintf($where3, $a, $b, $a, $c, $c, $a);
			print("
select actual, estimated, abs(actual - estimated) AS misestimate
	from check_estimated_rows('select * from $tblname where $whereclause1');");
		}
	}
}


—
Mark Dilger
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

Re: Use extended statistics to estimate (Var op Var) clauses

Reply via email to