Correct oversight in hashjoin cost estimation: nodeHash sizes its hash
table for an average of NTUP_PER_BUCKET tuples/bucket, but cost_hashjoin
was assuming a target load of one tuple/bucket.  This was causing a
noticeable underestimate of hashjoin costs.
Tom Lane 2000-04-18 05:43:02 +00:00
parent 24864d048e
commit 25442d8d2f
3 changed files with 16 additions and 8 deletions
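
For context, a minimal standalone sketch of the corrected cost term (a hypothetical helper, not the actual cost_hashjoin signature; the function and parameter names here are assumptions):

#include <math.h>

#define NTUP_PER_BUCKET 10		/* target tuples per bucket, from nodeHash.h */

/*
 * Hypothetical sketch of the tuple-comparison charge after this commit.
 * Before the fix the NTUP_PER_BUCKET factor was missing, which implicitly
 * assumed an average load of one tuple per bucket.
 */
static double
hash_compare_cost(double cpu_operator_cost, double outer_rows,
				  double inner_rows, double inner_disbursion)
{
	return cpu_operator_cost * outer_rows *
		NTUP_PER_BUCKET * ceil(inner_rows * inner_disbursion);
}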

src/backend/executor/nodeHash.c

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
*
-* $Id: nodeHash.c,v 1.44 2000/01/26 05:56:22 momjian Exp $
+* $Id: nodeHash.c,v 1.45 2000/04/18 05:43:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -221,7 +221,6 @@ ExecEndHash(Hash *node)
* create a hashtable in shared memory for hashjoin.
* ----------------------------------------------------------------
*/
-#define NTUP_PER_BUCKET 10
#define FUDGE_FAC 2.0
HashJoinTable

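These are the sizing constants the commit message refers to; a rough paraphrase (an assumption, not the exact nodeHash.c code; `totalbuckets` and `ntuples` are illustrative names) of how the executor picks its bucket count from them:

/* Paraphrased sketch: aim for NTUP_PER_BUCKET tuples per bucket, with
 * FUDGE_FAC headroom against row-count estimation error. */
totalbuckets = (int) ceil((double) ntuples * FUDGE_FAC / NTUP_PER_BUCKET);
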
src/backend/optimizer/path/costsize.c

@@ -42,7 +42,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.57 2000/04/12 17:15:19 momjian Exp $
+* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.58 2000/04/18 05:43:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -51,6 +51,7 @@
#include <math.h>
#include "executor/nodeHash.h"
#include "miscadmin.h"
#include "nodes/plannodes.h"
#include "optimizer/clauses.h"
@@ -604,12 +605,17 @@ cost_hashjoin(Path *path,
run_cost += cpu_operator_cost * outer_path->parent->rows;
/*
-* the number of tuple comparisons needed is the number of outer
-* tuples times the typical hash bucket size, which we estimate
-* conservatively as the inner disbursion times the inner tuple count.
+* The number of tuple comparisons needed is the number of outer
+* tuples times the typical hash bucket size. nodeHash.c tries for
+* average bucket loading of NTUP_PER_BUCKET, but that goal will
+* be reached only if data values are uniformly distributed among
+* the buckets. To be conservative, we scale up the target bucket
+* size by the number of inner rows times inner disbursion, giving
+* an estimate of the typical number of duplicates of each value.
+* We then charge one cpu_operator_cost per tuple comparison.
*/
run_cost += cpu_operator_cost * outer_path->parent->rows *
-ceil(inner_path->parent->rows * innerdisbursion);
+NTUP_PER_BUCKET * ceil(inner_path->parent->rows * innerdisbursion);
/*
* Estimate the number of tuples that get through the hashing filter

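A hypothetical worked example of the change's magnitude (row counts and disbursion are assumed numbers; 0.0025 is the planner's default cpu_operator_cost):

/* 100000 outer rows, 10000 inner rows, inner disbursion 0.0001 */
double old_charge = 0.0025 * 100000 * ceil(10000 * 0.0001);			/* = 250 */
double new_charge = 0.0025 * 100000 * NTUP_PER_BUCKET * ceil(10000 * 0.0001);	/* = 2500 */

With these inputs the pre-fix formula charged one tenth of the comparison cost that the executor's actual bucket loading implies.
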
src/include/executor/nodeHash.h

@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
-* $Id: nodeHash.h,v 1.15 2000/01/26 05:58:05 momjian Exp $
+* $Id: nodeHash.h,v 1.16 2000/04/18 05:43:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -16,6 +16,9 @@
#include "nodes/plannodes.h"
+/* NTUP_PER_BUCKET is exported because planner wants to see it */
+#define NTUP_PER_BUCKET 10
extern TupleTableSlot *ExecHash(Hash *node);
extern bool ExecInitHash(Hash *node, EState *estate, Plan *parent);
extern int ExecCountSlotsHash(Hash *node);