Modify index-opening code to guarantee that the indexes of a relation

are opened in a consistent order by different backends (I ordered them by index OID because that's easy, but any other consistent order would do as well). This avoids potential deadlocks for index types on which we acquire exclusive locks, i.e., rtree.
2000-06-19 23:40:48 +00:00 · 2000-06-19 23:40:48 +00:00 · a1dfaef6c6
parent 1f75cdd5ed
commit a1dfaef6c6
2 changed files with 71 additions and 16 deletions
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execUtils.c,v 1.60 2000/06/17 21:48:49 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execUtils.c,v 1.61 2000/06/19 23:40:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -799,15 +799,20 @@ ExecOpenIndices(RelationInfo *resultRelationInfo)
 		/* ----------------
 		 * Open (and lock, if necessary) the index relation
 		 *
-		 * Hack for not btree and hash indices: they use relation
-		 * level exclusive locking on update (i.e. - they are not
-		 * ready for MVCC) and so we have to exclusively lock
-		 * indices here to prevent deadlocks if we will scan them
-		 * - index_beginscan places AccessShareLock, indices
-		 * update methods don't use locks at all. We release this
-		 * lock in ExecCloseIndices. Note, that hashes use page
-		 * level locking - i.e. are not deadlock-free, - let's
-		 * them be on their way -:)) vadim 03-12-1998
+		 * Hack for not btree and hash indices: they use relation level
+		 * exclusive locking on update (i.e. - they are not ready for MVCC)
+		 * and so we have to exclusively lock indices here to prevent
+		 * deadlocks if we will scan them - index_beginscan places
+		 * AccessShareLock, indices update methods don't use locks at all.
+		 * We release this lock in ExecCloseIndices. Note, that hashes use
+		 * page level locking - i.e. are not deadlock-free - let's them be
+		 * on their way -:)) vadim 03-12-1998
+		 *
+		 * If there are multiple not-btree-or-hash indices, all backends must
+		 * lock the indices in the same order or we will get deadlocks here
+		 * during concurrent updates.  This is now guaranteed by
+		 * RelationGetIndexList(), which promises to return the index list
+		 * in OID order.  tgl 06-19-2000
 		 * ----------------
 		 */
 		indexDesc = index_open(indexOid);
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.102 2000/06/18 22:44:17 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.103 2000/06/19 23:40:48 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -230,6 +230,7 @@ static Relation RelationBuildDesc(RelationBuildDescInfo buildinfo,
 static void IndexedAccessMethodInitialize(Relation relation);
 static void AttrDefaultFetch(Relation relation);
 static void RelCheckFetch(Relation relation);
+static List *insert_ordered_oid(List *list, Oid datum);

 static bool criticalRelcacheBuild = false;

@ -2078,6 +2079,12 @@ RelCheckFetch(Relation relation)
 * so that we must recompute the index list on next request.  This handles
 * creation or deletion of an index.
 *
+ * The returned list is guaranteed to be sorted in order by OID.  This is
+ * needed by the executor, since for index types that we obtain exclusive
+ * locks on when updating the index, all backends must lock the indexes in
+ * the same order or we will get deadlocks (see ExecOpenIndices()).  Any
+ * consistent ordering would do, but ordering by OID is easy.
+ *
 * Since shared cache inval causes the relcache's copy of the list to go away,
 * we return a copy of the list palloc'd in the caller's context.  The caller
 * may freeList() the returned list after scanning it.  This is necessary
@ -2163,7 +2170,7 @@ RelationGetIndexList(Relation relation)

 		index = (Form_pg_index) GETSTRUCT(htup);

-		result = lappendi(result, index->indexrelid);
+		result = insert_ordered_oid(result, index->indexrelid);

 		if (hasindex)
 			ReleaseBuffer(buffer);
@ -2178,7 +2185,7 @@ RelationGetIndexList(Relation relation)
 		heap_endscan(hscan);
 	heap_close(indrel, AccessShareLock);

-	/* Now we can save the completed list in the relcache entry. */
+	/* Now save a copy of the completed list in the relcache entry. */
 	oldcxt = MemoryContextSwitchTo((MemoryContext) CacheCxt);
 	relation->rd_indexlist = listCopy(result);
 	relation->rd_indexfound = true;
@ -2187,6 +2194,39 @@ RelationGetIndexList(Relation relation)
 	return result;
 }

+/*
+ * insert_ordered_oid
+ *		Insert a new Oid into a sorted list of Oids, preserving ordering
+ *
+ * Parameters:
+ *		list	- integer list of Oids, already in ascending order; may be
+ *				  NIL (the empty list).
+ *		datum	- the Oid to add.
+ *
+ * Returns the head of the updated list.  When datum belongs at the front
+ * (or the list is empty) the result is a new head cell built by lconsi();
+ * otherwise a new cell is spliced into the existing list in place and the
+ * original head is returned.  Callers must therefore always use the return
+ * value, as RelationGetIndexList() does.
+ *
+ * The comparisons use strict "<", so a datum equal to an existing entry is
+ * placed after that entry; duplicates are not removed.
+ *
+ * Building the ordered list this way is O(N^2), but with a pretty small
+ * constant, so for the number of entries we expect it will probably be
+ * faster than trying to apply qsort().  Most tables don't have very many
+ * indexes...
+ */
+static List *
+insert_ordered_oid(List *list, Oid datum)
+{
+	List	   *l;
+
+	/*
+	 * Does the datum belong at the front?  This also covers the empty-list
+	 * case.  The (Oid) cast makes the comparison happen in the Oid type --
+	 * presumably lfirsti() yields a plain int; confirm against pg_list.h.
+	 */
+	if (list == NIL || datum < (Oid) lfirsti(list))
+		return lconsi(datum, list);
+	/*
+	 * No, so find the entry it belongs after.  The loop leaves l pointing
+	 * at the last cell whose value is <= datum; list is known non-empty
+	 * here, so l is never NIL.
+	 */
+	l = list;
+	for (;;)
+	{
+		List	   *n = lnext(l);
+
+		if (n == NIL || datum < (Oid) lfirsti(n))
+			break;				/* it belongs before n */
+		l = n;
+	}
+	/*
+	 * Insert datum into list after item l: the new cell takes over l's old
+	 * successor, and l is relinked to point at the new cell.
+	 */
+	lnext(l) = lconsi(datum, lnext(l));
+	return list;
+}
+
+
 /*
 *	init_irels(), write_irels() -- handle special-case initialization of
 *								   index relation descriptors.
@ -2412,7 +2452,14 @@ write_irels(void)

 	fd = PathNameOpenFile(tempfilename, O_WRONLY | O_CREAT | O_TRUNC | PG_BINARY, 0600);
 	if (fd < 0)
-		elog(FATAL, "cannot create init file %s", tempfilename);
+	{
+		/*
+		 * We used to consider this a fatal error, but we might as well
+		 * continue with backend startup ...
+		 */
+		elog(NOTICE, "Cannot create init file %s: %m\n\tContinuing anyway, but there's something wrong.", tempfilename);
+		return;
+	}

 	FileSeek(fd, 0L, SEEK_SET);

@ -2540,7 +2587,10 @@ write_irels(void)

 	/*
 	 * And rename the temp file to its final name, deleting any
-	 * previously- existing init file.
+	 * previously-existing init file.
 	 */
-	rename(tempfilename, finalfilename);
+	if (rename(tempfilename, finalfilename) < 0)
+	{
+		elog(NOTICE, "Cannot rename init file %s to %s: %m\n\tContinuing anyway, but there's something wrong.", tempfilename, finalfilename);
+	}
 }