Beginnings of new free page manager.
author: Robert Haas <rhaas@postgresql.org>
Wed, 12 Feb 2014 18:33:26 +0000 (13:33 -0500)
committer: Robert Haas <rhaas@postgresql.org>
Wed, 12 Feb 2014 18:33:26 +0000 (13:33 -0500)
src/backend/utils/mmgr/Makefile
src/backend/utils/mmgr/freepage.c [new file with mode: 0644]
src/include/utils/freepage.h [new file with mode: 0644]
src/include/utils/relptr.h [new file with mode: 0644]

index b2403e186f5a5fe773b4c0dff74e9cf2f9a76789..20973af3ca9758c905e8e16ccb4cc51de8799511 100644 (file)
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o mcxt.o portalmem.o
+OBJS = aset.o freepage.o mcxt.o portalmem.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c
new file mode 100644 (file)
index 0000000..7acb434
--- /dev/null
@@ -0,0 +1,404 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ *       Management of free memory pages.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/utils/mmgr/freepage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "utils/freepage.h"
+
+/*
+ * Magic numbers to identify various page types.  Each one is stored in the
+ * 'magic' field that begins the corresponding structure (FreePageSpanLeader
+ * or FreePageBtreeHeader), and the btree code asserts on it before touching
+ * the rest of a page.
+ */
+#define FREE_PAGE_SPAN_LEADER_MAGIC            0xea4020f0
+#define FREE_PAGE_LEAF_MAGIC            0x98eae728
+#define FREE_PAGE_INTERNAL_MAGIC        0x19aa32c9
+
+/*
+ * Doubly linked list of spans of free pages; stored in first page of span.
+ * The head of each list is kept in the freelist array of FreePageManager.
+ */
+struct FreePageSpanLeader
+{
+       int             magic;                          /* always FREE_PAGE_SPAN_LEADER_MAGIC */
+       Size    npages;                         /* number of pages in span */
+       relptr(FreePageSpanLeader)      prev;   /* previous span on list; null if first */
+       relptr(FreePageSpanLeader)      next;   /* next span on list; null if last */
+};
+
+/*
+ * Common header for btree leaf and internal pages.  The magic number tells
+ * the two kinds of page apart.
+ */
+typedef struct FreePageBtreeHeader
+{
+       int             magic;          /* FREE_PAGE_LEAF_MAGIC or FREE_PAGE_INTERNAL_MAGIC */
+       Size    nused;          /* number of items used */
+       relptr(FreePageBtree) parent;   /* uplink; null on the root page */
+} FreePageBtreeHeader;
+
+/*
+ * Internal key; points to next level of btree.  A search descends into the
+ * child selected by comparing the target page number against first_page.
+ */
+typedef struct FreePageBtreeInternalKey
+{
+       Size    first_page;                             /* low bound for keys on child page */
+       relptr(FreePageBtree) child;    /* downlink */
+} FreePageBtreeInternalKey;
+
+/*
+ * Leaf key; no payload data.  Unlike an internal key, a leaf key has no
+ * child page: it is looked up by the first page number of a free span.
+ */
+typedef struct FreePageBtreeLeafKey
+{
+       Size    first_page;                             /* first page of the span; the search key */
+       Size    last_page;                              /* NOTE(review): presumably the last page of the span; nothing reads it yet -- confirm */
+} FreePageBtreeLeafKey;
+
+/*
+ * Work out how many keys will fit on a page: the space left in an
+ * FPM_PAGE_SIZE page after the common header, divided by the size of one
+ * key of the relevant kind.
+ */
+#define FPM_ITEMS_PER_INTERNAL_PAGE \
+       ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+               sizeof(FreePageBtreeInternalKey))
+#define FPM_ITEMS_PER_LEAF_PAGE \
+       ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+               sizeof(FreePageBtreeLeafKey))
+
+/*
+ * A btree page of either sort.  hdr.magic says which member of the union is
+ * valid; only the first hdr.nused entries of that array are in use.
+ */
+struct FreePageBtree
+{
+       FreePageBtreeHeader     hdr;
+       union
+       {
+               FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
+               FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
+       } u;
+};
+
+/*
+ * Results of a btree search, as filled in by FreePageBtreeSearch.  Each
+ * page/index pair names a leaf page and a slot within it; the index is
+ * meaningful only when the matching page pointer is not NULL.
+ */
+typedef struct FreePageBtreeSearchResult
+{
+       FreePageBtree  *page_exact;             /* leaf holding an exact match, or NULL */
+       Size                    index_exact;    /* slot of the exact match */
+       FreePageBtree  *page_next;              /* leaf holding the following key, or NULL */
+       Size                    index_next;             /* slot of the following key */
+       FreePageBtree  *page_prev;              /* leaf holding the preceding key, or NULL */
+       Size                    index_prev;             /* slot of the preceding key */
+} FreePageBtreeSearchResult;
+
+/* Prototypes for helper functions private to this file */
+static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
+                                       Size index);
+static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+                                       FreePageBtreeSearchResult *result);
+static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
+static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
+
+/*
+ * Initialize a new, empty free page manager.
+ *
+ * 'fpm' should reference caller-provided memory large enough to contain a
+ * FreePageManager.  We'll initialize it here.
+ *
+ * 'base' is the address to which all pointers are relative.  When managing
+ * a dynamic shared memory segment, it should normally be the base of the
+ * segment.  When managing backend-private memory, it can be either NULL or,
+ * if managing a single contiguous extent of memory, the start of that extent.
+ *
+ * 'lock' is the lock to be used to synchronize access to this FreePageManager.
+ * It can be NULL if synchronization is not required, either because we're
+ * managing backend-private memory or because we're managing shared memory but
+ * synchronization is caller-provided or not required.  (For example, if only
+ * one process is allocating and freeing memory, locking isn't needed.)
+ *
+ * 'lock_address_is_fixed' should be false if the LWLock to be used for
+ * synchronization is stored in the same dynamic shared memory segment as
+ * the managed region, and true if it is stored in the main shared memory
+ * segment.  Storing the LWLock in some other dynamic shared memory segment
+ * isn't supported.  This is ignored when lock is NULL.
+ */
+void
+FreePageManagerInitialize(FreePageManager *fpm, char *base, LWLock *lock,
+                                                 bool lock_address_is_fixed)
+{
+       char   *lock_base;
+       Size    f;
+
+       /* fpm_segment_base() later recovers 'base' from this self-pointer. */
+       relptr_store(base, fpm->self, fpm);
+
+       /*
+        * fpm_lock() resolves the lock relative to NULL (that is, as an absolute
+        * address) when lock_address_is_fixed is set, and relative to the
+        * segment base otherwise.  Store it with the same base, or the lookup
+        * would produce a bogus address for a fixed lock whenever base is not
+        * NULL.
+        */
+       lock_base = lock_address_is_fixed ? NULL : base;
+       relptr_store(lock_base, fpm->lock, lock);
+       fpm->lock_address_is_fixed = lock_address_is_fixed;
+
+       /* The btree and every freelist start out empty. */
+       relptr_store(base, fpm->root, (FreePageBtree *) NULL);
+       for (f = 0; f < FPM_NUM_FREELISTS; f++)
+               relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
+}
+
+/*
+ * Allocate a run of pages of the given length from the free page manager.
+ * The return value indicates whether we were able to satisfy the request;
+ * if true, the first page of the allocation is stored in *first_page.
+ * On failure, *first_page is left untouched.
+ *
+ * The manager's lock, if it has one, is held in exclusive mode while the
+ * freelists and btree are examined and modified.
+ */
+bool
+FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
+{
+       LWLock *lock = fpm_lock(fpm);
+       char *base = fpm_segment_base(fpm);
+       FreePageSpanLeader *victim = NULL;
+       Size    victim_page = 0;                /* placate compiler */
+       Size    f;
+
+       /* Acquire lock (if there is one). */
+       if (lock != NULL)
+               LWLockAcquire(lock, LW_EXCLUSIVE);
+
+       /*
+        * Search for a free span.
+        *
+        * Right now, we use a simple best-fit policy here, but it's possible for
+        * this to result in memory fragmentation if we're repeatedly asked to
+        * allocate chunks just a little smaller than what we have available.
+        * Hopefully, this is unlikely, because we expect most requests to be
+        * single pages (for the bootstrap allocator) or superblock-sized chunks
+        * (for the superblock allocator, and for address space map memory),
+        * but no policy can be optimal under all circumstances unless it has
+        * knowledge of future allocation patterns.
+        *
+        * Freelist f holds spans of exactly f + 1 pages, except that the final
+        * list holds everything larger.  So the search starts at npages - 1,
+        * clamped to the final list for big requests.  (This must be Min, not
+        * Max: with Max the starting index is never below the final list, and
+        * the exact-size lists would never be consulted.)
+        */
+       for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
+       {
+               /* Skip empty freelists. */
+               if (relptr_is_null(fpm->freelist[f]))
+                       continue;
+
+               /*
+                * All of the freelists except the last one contain only items of a
+                * single size, so we just take the first one.  But the final free
+                * list contains everything too big for any of the other lists, so
+                * we need to search the list.
+                */
+               if (f < FPM_NUM_FREELISTS - 1)
+                       victim = relptr_access(base, fpm->freelist[f]);
+               else
+               {
+                       FreePageSpanLeader *candidate;
+
+                       candidate = relptr_access(base, fpm->freelist[f]);
+                       do
+                       {
+                               /* Keep the smallest span that is still big enough. */
+                               if (candidate->npages >= npages && (victim == NULL ||
+                                       victim->npages > candidate->npages))
+                               {
+                                       victim = candidate;
+                                       /* An exact fit cannot be improved upon. */
+                                       if (victim->npages == npages)
+                                               break;
+                               }
+                               candidate = relptr_access(base, candidate->next);
+                       } while (candidate != NULL);
+               }
+
+               /* The first non-empty list decides the outcome either way. */
+               break;
+       }
+
+       /*
+        * If we found a victim, remove it from the freelist and btree.
+        *
+        * NOTE(review): the whole span is handed out even when it is larger
+        * than npages; nothing here returns the surplus pages to the manager.
+        */
+       if (victim != NULL)
+       {
+               FreePageSpanLeader *prev = relptr_access(base, victim->prev);
+               FreePageSpanLeader *next = relptr_access(base, victim->next);
+               FreePageBtreeSearchResult result;
+
+               /* Unlink from the doubly linked freelist; f still names its list. */
+               if (prev != NULL)
+                       relptr_copy(prev->next, victim->next);
+               else
+                       relptr_copy(fpm->freelist[f], victim->next);
+               if (next != NULL)
+                       relptr_copy(next->prev, victim->prev);
+
+               /* The btree is keyed by the span's first page number. */
+               victim_page = fpm_pointer_to_page(base, victim);
+               FreePageBtreeSearch(fpm, victim_page, &result);
+               Assert(result.page_exact != NULL);
+               FreePageBtreeRemove(fpm, result.page_exact, result.index_exact);
+       }
+
+       /* Release lock (if there is one). */
+       if (lock != NULL)
+               LWLockRelease(lock);
+
+       /* Return results to caller. */
+       if (victim == NULL)
+               return false;
+       *first_page = victim_page;
+       return true;
+}
+
+/*
+ * Remove from the btree the item in the given position on the given page.
+ *
+ * 'btp' must be a leaf page and 'index' must name a slot that is in use.
+ * 'fpm' is not used yet; see the XXX comment below for the intended
+ * follow-up work.
+ */
+static void
+FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
+{
+       Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+       Assert(index < btp->hdr.nused);
+
+       /*
+        * Shuffle remaining keys.  After the decrement, nused - index keys
+        * follow the removed slot, so that many whole keys must move down by
+        * one.  The count is parenthesized so that the subtraction happens
+        * before the multiplication by the key size.
+        */
+       --btp->hdr.nused;
+       if (index < btp->hdr.nused)
+               memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
+                               sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+
+       /*
+        * XXX.  At this point, the key is gone, but the node may be empty or
+        * may contain very few keys.   If we're the root page, that's life.
+        * Otherwise, unlink this page from the parent.  Alternatively, if this
+        * page is non-empty but less than half full, try to merge it with a
+        * sibling, which will likewise delete one key from the parent.
+        *
+        * Either way, we free up a page; make that into a free span.  Attempt
+        * a "no-split" insertion of that span into the btree: that is, succeed if
+        * the page can be combined with an existing span with which it is
+        * contiguous or if the btree page into which it needs to be inserted
+        * isn't full.  Otherwise, skip the insertion, which will lose the ability
+        * to consolidate that span with adjacent spans, but that's life sometimes.
+        * Put the span on the free list, setting a flag to indicate whether or not
+        * we managed to insert it into the btree.
+        *
+        * After freeing the page, repeat this process for our parent, which may
+        * now be empty or underfull.
+        */
+}
+
+/*
+ * Search the btree for an entry for the given first page and initialize
+ * *result with the results of the search.  If an exact match is found,
+ * result->page_exact and result->index_exact will be set to the page and
+ * slot where it is located. Otherwise, result->page_exact will be NULL;
+ * result->page_next and result->index_next will indicate the location of
+ * the following key (unless the proposed first_page would follow everything
+ * currently in the tree, in which case result->page_next will be NULL); and
+ * result->page_prev and result->index_prev will indicate the preceding
+ * key (unless the proposed first_page would precede everything currently
+ * in the tree, in which case result->page_prev will be NULL).  Except
+ * as described above, the contents of fields in the result object are
+ * undefined on return.
+ *
+ * The per-page search helpers return the slot of the last key that is less
+ * than or equal to first_page (or slot 0 if every key is greater), so on a
+ * miss the slot found on the leaf is normally the *preceding* key.
+ */
+static void
+FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+                                       FreePageBtreeSearchResult *result)
+{
+       char *base = fpm_segment_base(fpm);
+       FreePageBtree *btp = relptr_access(base, fpm->root);
+       Size    index;
+
+       /* Until we learn otherwise, there is no match and no neighbor. */
+       result->page_exact = NULL;
+       result->page_next = NULL;
+       result->page_prev = NULL;
+
+       /* If the btree is empty, then this would be the only item. */
+       if (btp == NULL)
+               return;
+
+       /* Descend until we hit a leaf. */
+       while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+       {
+               index = FreePageBtreeSearchInternal(btp, first_page);
+               btp = relptr_access(base, btp->u.internal_key[index].child);
+       }
+
+       /*
+        * A leaf with no keys has nothing to match and nothing to search.
+        * NOTE(review): this is expected only for a root page emptied by
+        * FreePageBtreeRemove; neighbors on other pages are not looked for.
+        */
+       if (btp->hdr.nused == 0)
+               return;
+
+       /* Search leaf page. */
+       index = FreePageBtreeSearchLeaf(btp, first_page);
+
+       /* Did we get an exact match?  If so, return it. */
+       if (first_page == btp->u.leaf_key[index].first_page)
+       {
+               result->page_exact = btp;
+               result->index_exact = index;
+               return;
+       }
+
+       if (first_page > btp->u.leaf_key[index].first_page)
+       {
+               /* The slot we found holds the preceding key. */
+               result->page_prev = btp;
+               result->index_prev = index;
+
+               /* Following key on same page? */
+               if (index + 1 < btp->hdr.nused)
+               {
+                       result->page_next = btp;
+                       result->index_next = index + 1;
+                       return;
+               }
+
+               /* Walk up tree until we can move right. */
+               do
+               {
+                       btp = relptr_access(base, btp->hdr.parent);
+                       if (btp == NULL)
+                       {
+                               /* We'd be the last key in the btree. */
+                               return;
+                       }
+                       /* Same search as on the way down, so same downlink. */
+                       index = FreePageBtreeSearchInternal(btp, first_page);
+               } while (index + 1 >= btp->hdr.nused);
+
+               /* Move right. */
+               btp = relptr_access(base, btp->u.internal_key[index + 1].child);
+
+               /* Descend left. */
+               while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+                       btp = relptr_access(base, btp->u.internal_key[0].child);
+
+               /* Get leftmost key on page. */
+               result->page_next = btp;
+               result->index_next = 0;
+       }
+       else
+       {
+               /* The slot we found holds the following key. */
+               result->page_next = btp;
+               result->index_next = index;
+
+               /* Find the previous key. */
+               if (index > 0)
+               {
+                       /* Previous key on same page. */
+                       result->page_prev = btp;
+                       result->index_prev = index - 1;
+                       return;
+               }
+
+               /* Walk up tree until we can move left. */
+               do
+               {
+                       btp = relptr_access(base, btp->hdr.parent);
+                       if (btp == NULL)
+                       {
+                               /* We'd be the first key in the btree. */
+                               return;
+                       }
+                       index = FreePageBtreeSearchInternal(btp, first_page);
+               } while (index == 0);
+
+               /* Move left. */
+               btp = relptr_access(base, btp->u.internal_key[index - 1].child);
+
+               /* Descend right. */
+               while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+               {
+                       Size    nused = btp->hdr.nused;
+
+                       btp = relptr_access(base, btp->u.internal_key[nused - 1].child);
+               }
+
+               /* Get rightmost key on page. */
+               result->page_prev = btp;
+               result->index_prev = btp->hdr.nused - 1;
+       }
+}
+
+/*
+ * Search an internal page for the last key less than or equal to a given
+ * page number; that key's downlink is the child to descend into.  If every
+ * key on the page is greater than the given page number, returns 0.
+ *
+ * The page must contain at least one key.
+ */
+static Size
+FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
+{
+       Size    low = 0;
+       Size    high;
+
+       Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+
+       /*
+        * Check nused itself rather than nused - 1: the latter is unsigned, and
+        * demanding that it be positive would wrongly reject a one-key page.
+        */
+       Assert(btp->hdr.nused > 0 &&
+                  btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+       high = btp->hdr.nused - 1;
+
+       /* Invariant: the answer lies in [low, high]. */
+       while (low < high)
+       {
+               /* Round up so that 'low = mid' always makes progress. */
+               Size    mid = (low + high + 1) / 2;
+
+               if (first_page < btp->u.internal_key[mid].first_page)
+                       high = mid - 1;
+               else
+                       low = mid;
+       }
+
+       return low;
+}
+
+/*
+ * Search a leaf page for the last key less than or equal to a given page
+ * number.  If every key on the page is greater than the given page number,
+ * returns 0.  Callers must compare the key in the returned slot against
+ * the page number to tell an exact match from a neighboring key.
+ *
+ * The page must contain at least one key.
+ */
+static Size
+FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
+{
+       Size    low = 0;
+       Size    high;
+
+       Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+
+       /*
+        * Check nused itself rather than nused - 1: a leaf holding a single key
+        * is perfectly valid (a tree with one free span looks like that), and
+        * requiring nused - 1 to be positive would reject it.
+        */
+       Assert(btp->hdr.nused > 0 &&
+                  btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+       high = btp->hdr.nused - 1;
+
+       /* Invariant: the answer lies in [low, high]. */
+       while (low < high)
+       {
+               /* Round up so that 'low = mid' always makes progress. */
+               Size    mid = (low + high + 1) / 2;
+
+               if (first_page < btp->u.leaf_key[mid].first_page)
+                       high = mid - 1;
+               else
+                       low = mid;
+       }
+
+       return low;
+}
diff --git a/src/include/utils/freepage.h b/src/include/utils/freepage.h
new file mode 100644 (file)
index 0000000..84ee0d3
--- /dev/null
@@ -0,0 +1,87 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.h
+ *       Management of page-organized free memory.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/freepage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef FREEPAGE_H 
+#define FREEPAGE_H
+
+#include "storage/lwlock.h"
+#include "utils/relptr.h"
+
+/*
+ * Forward declarations.  FreePageManager is defined below; the other two
+ * structures are defined in freepage.c.
+ */
+typedef struct FreePageSpanLeader FreePageSpanLeader;
+typedef struct FreePageBtree FreePageBtree;
+typedef struct FreePageManager FreePageManager;
+
+/*
+ * PostgreSQL normally uses 8kB pages for most things, but many common
+ * architecture/operating system pairings use a 4kB page size for memory
+ * allocation, so we do that here also.  We assume that a large allocation
+ * is likely to begin on a page boundary; if not, we'll discard bytes from
+ * the beginning and end of the object and use only the middle portion that
+ * is properly aligned.  This works, but is not ideal, so it's best to keep
+ * this conservatively small.  There don't seem to be any common architectures
+ * where the page size is less than 4kB, so this should be good enough; also,
+ * making it smaller would increase the space consumed by the address space
+ * map, which also uses this page size.
+ *
+ * The value is in bytes: the page/pointer conversion macros below scale
+ * page numbers by it when offsetting a char pointer.
+ */
+#define FPM_PAGE_SIZE                  4096
+
+/*
+ * Each freelist except for the last contains only spans of one particular
+ * size.  Everything larger goes on the last one.  In some sense this seems
+ * like a waste since most allocations are in a few common sizes, but it
+ * means that small allocations can simply pop the head of the relevant list
+ * without needing to worry about whether the object we find there is of
+ * precisely the correct size (because we know it must be).
+ *
+ * NOTE(review): presumably list f holds spans of exactly f + 1 pages, which
+ * would make the last list (index 128) hold spans of 129 pages or more --
+ * confirm against the code that populates the lists.
+ */
+#define FPM_NUM_FREELISTS              129
+
+/*
+ * Everything we need in order to manage free pages (see freepage.c).
+ * All members except the flag are relative pointers; see utils/relptr.h.
+ */
+struct FreePageManager
+{
+    relptr(FreePageManager)  self;     /* our own offset; fpm_segment_base() uses it to recover the base */
+    relptr(LWLock)  lock;              /* lock protecting this manager; null if none */
+       bool                    lock_address_is_fixed;  /* selects the base fpm_lock() uses to resolve 'lock' */
+    relptr(FreePageBtree)   root;      /* root of the btree of free spans; null if empty */
+    relptr(FreePageSpanLeader)  freelist[FPM_NUM_FREELISTS];   /* heads of the per-size span lists */
+};
+
+/*
+ * Macros to convert between page numbers (expressed as Size) and pointers.
+ * 'base' is the char * address that page 0 corresponds to.
+ */
+#define fpm_page_to_pointer(base, page)        \
+       (AssertVariableIsOfTypeMacro(page, Size), \
+        (base) + FPM_PAGE_SIZE * (page))
+#define fpm_pointer_to_page(base, ptr)         \
+       (((Size) (((char *) (ptr)) - (base))) / FPM_PAGE_SIZE)
+
+/* Macros to check alignment of absolute and relative pointers. */
+#define fpm_pointer_is_page_aligned(base, ptr)         \
+       (((Size) (((char *) (ptr)) - (base))) % FPM_PAGE_SIZE == 0)
+#define fpm_relptr_is_page_aligned(base, relptr)               \
+       ((relptr).relptr_off % FPM_PAGE_SIZE == 0)
+
+/*
+ * Macro to find base address of the segment containing a FreePageManager.
+ * The argument is parenthesized so that any pointer expression may be
+ * passed, not just a plain variable name.
+ */
+#define fpm_segment_base(fpm)  \
+       (((char *) (fpm)) - (fpm)->self.relptr_off)
+
+/*
+ * Macro to find the lwlock for the FreePageManager.  A fixed-address lock
+ * is resolved relative to NULL, anything else relative to the segment base.
+ * The conditional is wrapped in its own parentheses so that it remains a
+ * single operand wherever relptr_access places its 'base' argument.
+ */
+#define fpm_lock(fpm) \
+       (relptr_access(((fpm)->lock_address_is_fixed ? (char *) NULL : \
+               fpm_segment_base(fpm)), (fpm)->lock))
+
+/*
+ * Functions to manipulate the free page map.  FreePageManagerInitialize
+ * sets up an empty manager; FreePageManagerGet tries to allocate a run of
+ * npages pages and reports the first page number through *first_page.
+ */
+extern void FreePageManagerInitialize(FreePageManager *fpm, char *base,
+                                                 LWLock *lock, bool lock_address_is_fixed);
+extern bool FreePageManagerGet(FreePageManager *fpm, Size npages,
+                                               Size *first_page);
+
+#endif   /* FREEPAGE_H */
diff --git a/src/include/utils/relptr.h b/src/include/utils/relptr.h
new file mode 100644 (file)
index 0000000..46281cf
--- /dev/null
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * relptr.h
+ *       This file contains basic declarations for relative pointers.
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/relptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef RELPTR_H
+#define RELPTR_H
+
+/*
+ * Relative pointers are intended to be used when storing an address that may
+ * be relative either to the base of the process's address space or some
+ * dynamic shared memory segment mapped therein.
+ *
+ * The idea here is that you declare a relative pointer as relptr(type)
+ * and then use relptr_access to dereference it and relptr_store to change
+ * it.  The use of a union here is a hack, because what's stored in the
+ * relptr is always a Size, never an actual pointer.  But including a pointer
+ * in the union allows us to use stupid macro tricks to provide some measure
+ * of type-safety.
+ *
+ * An offset of zero represents a null pointer, so the address 'base' itself
+ * cannot be told apart from NULL once stored.
+ *
+ * 'base' must be an expression of type char *.  It is parenthesized wherever
+ * it is used in arithmetic, so a compound expression (for example, a
+ * conditional) may be passed safely.
+ */
+#define relptr(type)     union { type *relptr_type; Size relptr_off; }
+#define relptr_access(base, rp) \
+       (AssertVariableIsOfTypeMacro(base, char *), \
+        (__typeof__((rp).relptr_type)) ((rp).relptr_off == 0 ? NULL : \
+               ((base) + (rp).relptr_off)))
+#define relptr_is_null(rp) \
+       ((rp).relptr_off == 0)
+#define relptr_store(base, rp, val) \
+       (AssertVariableIsOfTypeMacro(base, char *), \
+        AssertVariableIsOfTypeMacro(val, __typeof__((rp).relptr_type)), \
+        (rp).relptr_off = ((val) == NULL ? 0 : ((char *) (val)) - (base)))
+#define relptr_copy(rp1, rp2) \
+       ((rp1).relptr_off = (rp2).relptr_off)
+
+#endif   /* RELPTR_H */