commit 74615f69cf6bf95c73443331733677c61ee387b5
Author: Mike Kelly <mike@weatherwax.co.uk>
Date:   Tue Mar 10 21:28:12 2026 +0000

    Rebalance segments for privileged page allocation
    
    It's possible to allocate all pages in a segment (and higher-priority
    ones) to privileged threads. This can result in a panic even though
    memory is available in lower-priority segments that haven't yet been
    rebalanced by the pageout daemon. Rebalancing is now done actively
    during privileged page allocation.
    Message-ID: <20260310212847.153812-2-mike@weatherwax.co.uk>

diff --git a/vm/vm_page.c b/vm/vm_page.c
index 5cdf0c7b..a656aa01 100644
--- a/vm/vm_page.c
+++ b/vm/vm_page.c
@@ -990,7 +990,8 @@ vm_page_seg_double_unlock(struct vm_page_seg *seg1, struct vm_page_seg *seg2)
  */
 static boolean_t
 vm_page_seg_balance_page(struct vm_page_seg *seg,
-                         struct vm_page_seg *remote_seg)
+                         struct vm_page_seg *remote_seg,
+                         boolean_t priv_alloc)
 {
     struct vm_page *src, *dest;
     vm_object_t object;
@@ -1002,7 +1003,9 @@ vm_page_seg_balance_page(struct vm_page_seg *seg,
     vm_page_seg_double_lock(seg, remote_seg);
 
     if (vm_page_seg_usable(seg)
-        || !vm_page_seg_page_available(remote_seg)) {
+        || (priv_alloc
+            ? remote_seg->nr_free_pages == 0
+            : !vm_page_seg_page_available(remote_seg))) {
         goto error;
     }
 
@@ -1082,7 +1085,7 @@ error:
 }
 
 static boolean_t
-vm_page_seg_balance(struct vm_page_seg *seg)
+vm_page_seg_balance(struct vm_page_seg *seg, boolean_t priv_alloc)
 {
     struct vm_page_seg *remote_seg;
     unsigned int i;
@@ -1100,7 +1103,7 @@ vm_page_seg_balance(struct vm_page_seg *seg)
             continue;
         }
 
-        balanced = vm_page_seg_balance_page(seg, remote_seg);
+        balanced = vm_page_seg_balance_page(seg, remote_seg, priv_alloc);
 
         if (balanced) {
             return TRUE;
@@ -1611,16 +1614,28 @@ vm_page_alloc_pa(unsigned int order, unsigned int selector, unsigned short type)
     struct vm_page *page;
     unsigned int i;
 
-    for (i = vm_page_select_alloc_seg(selector); i < vm_page_segs_size; i--) {
+    const unsigned int seg_index = vm_page_select_alloc_seg(selector);
+
+retry:
+    simple_lock(&vm_page_queue_free_lock);
+
+    for (i = seg_index; i < vm_page_segs_size; i--) {
         page = vm_page_seg_alloc(&vm_page_segs[i], order, type);
 
         if (page != NULL)
             return page;
     }
 
-    /* FIXME: rebalance segments? */
     if (!current_thread() || current_thread()->vm_privilege)
-        panic("vm_page: privileged thread unable to allocate page");
+      {
+	simple_unlock(&vm_page_queue_free_lock);
+
+	for (i = seg_index; i < vm_page_segs_size; i--)
+	  if (vm_page_seg_balance(vm_page_seg_get(i), TRUE))
+	    goto retry;
+
+	panic("vm_page: privileged thread unable to allocate page");
+      }
 
     return NULL;
 }
@@ -1989,7 +2004,7 @@ vm_page_balance_once(void)
      */
 
     for (i = 0; i < vm_page_segs_size; i++) {
-        balanced = vm_page_seg_balance(vm_page_seg_get(i));
+        balanced = vm_page_seg_balance(vm_page_seg_get(i), FALSE);
 
         if (balanced) {
             return TRUE;
diff --git a/vm/vm_page.h b/vm/vm_page.h
index 9e110209..49b5e602 100644
--- a/vm/vm_page.h
+++ b/vm/vm_page.h
@@ -461,6 +461,9 @@ struct vm_page * vm_page_lookup_pa(phys_addr_t pa);
  * The selector is used to determine the segments from which allocation can
  * be attempted.
  *
+ * vm_page_queue_free_lock must be unlocked when this function is
+ * called; it is always held (locked) when the function returns.
+ *
  * This function should only be used by the vm_resident module.
  */
 struct vm_page * vm_page_alloc_pa(unsigned int order, unsigned int selector,
diff --git a/vm/vm_resident.c b/vm/vm_resident.c
index a6a90026..aaf5fc8b 100644
--- a/vm/vm_resident.c
+++ b/vm/vm_resident.c
@@ -808,8 +808,6 @@ vm_page_t vm_page_grab(unsigned flags)
 	else
 		selector = VM_PAGE_SEL_DMA;
 
-	simple_lock(&vm_page_queue_free_lock);
-
 	/*
 	 * XXX Mach has many modules that merely assume memory is
 	 * directly mapped in kernel space. Instead of updating all
@@ -901,8 +899,6 @@ vm_page_t vm_page_grab_contig(
 	order = vm_page_order(size);
 	nr_pages = 1 << order;
 
-	simple_lock(&vm_page_queue_free_lock);
-
 	/* TODO Allow caller to pass type */
 	mem = vm_page_alloc_pa(order, selector, VM_PT_KERNEL);
 
