guix-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Ludovic Courtès
Date: Sat, 19 Oct 2024 16:04:48 -0400 (EDT)

branch: main
commit 79190e4352caf38eeb39af23d40fa278bbd2afbf
Author: Ludovic Courtès <ludo@gnu.org>
AuthorDate: Fri Oct 18 17:19:57 2024 +0200

    remote-server: Move back to rescheduling builds asynchronously.
    
    This undoes 5e5912717957d294de83e5dcea894214ad97ac75 and
    6cd1608e644a755dbbfd2937489061ed31eb8bbf.
    
    That approach was bogus for two reasons: first because the logic in
    ‘db-update-build-status!’ was wrong since we cannot tell whether the
    previous attempt of the *same* build failed.  Second because the “update
    status of dependent builds” operation is too expensive when the ‘Builds’
    table has lots of rows.
    
    * src/cuirass/database.scm (db-reschedule-dependent-builds): Remove.
    (db-reschedule-builds-without-failed-dependencies): New procedure; this
    is ‘db-update-resumable-builds!’ as it was removed in
    6cd1608e644a755dbbfd2937489061ed31eb8bbf.
    (db-update-build-status!): Remove call to
    ‘db-reschedule-dependent-builds’.
    * src/cuirass/scripts/remote-server.scm (serve-build-requests): Send
    'failed-dependency to BUILD-MAINTAINER.
    * src/cuirass/base.scm (build-maintainer): Remove ‘&timeout’.  Expect a
    symbol on CHANNEL and honor it.  Change default #:period to (* 10 60).
    (spawn-build-maintainer): Change default #:period to (* 10 60).
    * tests/database.scm ("status of dependent builds"): Add call to
    ‘db-reschedule-builds-without-failed-dependencies’.
---
 src/cuirass/base.scm                  | 27 +++++++-------
 src/cuirass/database.scm              | 68 ++++++++++++++---------------------
 src/cuirass/scripts/remote-server.scm |  6 ++--
 tests/database.scm                    |  1 +
 4 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/src/cuirass/base.scm b/src/cuirass/base.scm
index ade04fa..30a90e5 100644
--- a/src/cuirass/base.scm
+++ b/src/cuirass/base.scm
@@ -998,16 +998,13 @@ information about derivations that are still needed by 
queued builds."
 ;;; Database maintenance.
 ;;;
 
-(define* (build-maintainer channel #:key (period (* 15 60)))
+(define* (build-maintainer channel #:key (period (* 10 60)))
   (lambda ()
     (define max-queue-length
       ;; Maximum number of queued maintenance requests.  This is necessary to
       ;; avoid queuing too many maintenance fibers, which could be expensive.
       3)
 
-    (define &timeout
-      (list 'timeout))
-
     (define pool
       (make-resource-pool '(x) 'build-maintainer))
 
@@ -1015,7 +1012,7 @@ information about derivations that are still needed by 
queued builds."
       (make-atomic-box 0))
 
     (log-info "build maintainer will run every ~as" period)
-    (let loop ()
+    (let loop ((type 'all))
       (if (> (atomic-box-fetch-and-increment! queued) max-queue-length)
           (atomic-box-fetch-and-decrement! queued)
           (spawn-fiber
@@ -1023,16 +1020,22 @@ information about derivations that are still needed by 
queued builds."
              ;; Since these are expensive queries, run at most one at a given
              ;; point in time and do not block anyone writing to CHANNEL.
              (with-resource-from-pool pool token
-               (db-reschedule-stale-builds)
-               (db-mark-failed-dependency-builds))
+               (log-info "running build maintenance (~a)" type)
+               (when (memq type '(all stale))
+                 (db-reschedule-stale-builds))
+               (when (memq type '(all resume))
+                 (db-reschedule-builds-without-failed-dependencies))
+               (when (memq type '(all failed-dependency))
+                 (db-mark-failed-dependency-builds)))
+
              (atomic-box-fetch-and-decrement! queued))))
-      (unless (eq? (get-message* channel period &timeout) &timeout)
-        (log-info "build maintenance triggered"))
-      (loop))))
+      (loop (get-message* channel period 'all)))))
 
-(define* (spawn-build-maintainer #:key (period (* 15 60)))
+(define* (spawn-build-maintainer #:key (period (* 10 60)))
   "Start a fiber periodically running expensive database queries.  Those
-operations can be triggered by sending a message on the returned channel."
+operations can be triggered by sending a message on the returned channel with
+one of the following symbols denoting the specific maintenance task requested:
+'stale, 'resume, 'failed-dependency, or 'all."
   (let ((channel (make-channel)))
     (spawn-fiber (build-maintainer channel #:period period))
     channel))
diff --git a/src/cuirass/database.scm b/src/cuirass/database.scm
index 015478f..5b54ef1 100644
--- a/src/cuirass/database.scm
+++ b/src/cuirass/database.scm
@@ -215,6 +215,7 @@
             db-get-workers
             db-worker-current-builds
             db-reschedule-stale-builds
+            db-reschedule-builds-without-failed-dependencies
             db-mark-failed-dependency-builds
             db-remove-workers
             db-clear-workers
@@ -1147,38 +1148,6 @@ WHERE dep.source = " build))
 
 (define build-dependencies/id (compose db-get-build-dependencies build-id))
 
-(define (db-reschedule-dependent-builds build)
-  "Reschedule builds that depend on BUILD that only have succeeding
-dependencies.
-
-Note: This is an expensive query but is usually rarely needed."
-  (let ((rescheduled
-         (with-db-connection db
-           (with-timing-check (format #f "rescheduling dependents of build ~a"
-                                      (build-id build))
-             (exec-query/bind db "
-UPDATE Builds SET status = " (build-status scheduled) "
-FROM
-  -- Select the dependents of this build with exactly one failing
-  -- dependency.
-  (SELECT dependents.id
-   FROM
-     -- Get all the dependents of this build.
-     (SELECT Builds.id, Builds.derivation FROM Builds
-      LEFT JOIN BuildDependencies as bd ON bd.source = Builds.id
-      WHERE bd.target = " (build-id build) " GROUP BY Builds.id)
-   AS dependents
-   LEFT JOIN BuildDependencies AS bd ON bd.source = dependents.id
-   LEFT JOIN Builds AS dependencies
-          ON dependencies.id = bd.target AND dependencies.status != 0
-   GROUP BY dependents.id HAVING count(dependencies.id) = 0)
-AS relevantdependents
-WHERE Builds.id = relevantdependents.id;")
-             #:threshold 10))))
-    (log-info "rescheduled ~a dependent builds of build ~a (~a)"
-              rescheduled (build-id build) (build-derivation build))
-    rescheduled))
-
 (define (db-mark-as-failed-if-dependencies-failed drv)
   "Mark the build of DRV as 'failed-dependency' if it has one or more failed
 dependencies."
@@ -1225,6 +1194,27 @@ WHERE Builds.status = " (build-status scheduled)
         (log-info "marked ~a builds as 'failed-dependency'"
                   marked)))))
 
+(define (db-reschedule-builds-without-failed-dependencies)
+  "Update the build status of the failed-dependency builds which all
+dependencies are successful to scheduled."
+  (with-db-connection db
+    (let ((rescheduled
+           (with-timing-check "rescheduling builds without failed dependencies"
+             (exec-query/bind db "
+UPDATE Builds
+   SET status = " (build-status scheduled) "
+FROM
+   (SELECT Builds.id, count(dep.id) AS deps FROM Builds
+    LEFT JOIN BuildDependencies AS bd ON bd.source = Builds.id
+    LEFT JOIN Builds AS dep ON bd.target = dep.id AND dep.status != 0
+    WHERE Builds.status = " (build-status failed-dependency) "
+    GROUP BY Builds.id HAVING count(dep.id) = 0)
+AS deps
+WHERE deps.id = Builds.id"))))
+      (unless (zero? rescheduled)
+        (log-info "rescheduled ~a builds with zero failed dependencies~%"
+                  rescheduled)))))
+
 (define (db-register-builds builds specification)
   (define (previous-build outputs)
     ;; Return the previous build producing OUTPUTS or #f if there is none.
@@ -1332,16 +1322,10 @@ UPDATE Builds SET stoptime =" now
           (when (positive? rows)
             ;; Note: 'db-mark-failed-dependency-builds' is not called from
             ;; here because it takes too long when there are many 'Builds'
-            ;; rows.  Instead, that procedure is called periodically by the
-            ;; "build maintainer", which is good enough.
-
-            (when (and (= status (build-status succeeded))
-                       (build-failure? (build-current-status build)))
-              ;; Transitioning from "failed" to "succeeded", for instance
-              ;; because the build was restarted, so reschedule every build 
that
-              ;; depends on this one.
-              (db-reschedule-dependent-builds build))
-
+            ;; rows; likewise for
+            ;; 'db-reschedule-builds-without-failed-dependencies'.  Instead,
+            ;; that procedure is called periodically by the "build
+            ;; maintainer", which is good enough.
             (let* ((spec (build-specification-name build))
                    (specification (db-get-specification spec))
                    (notifications
diff --git a/src/cuirass/scripts/remote-server.scm 
b/src/cuirass/scripts/remote-server.scm
index f1208ad..caeb8ed 100644
--- a/src/cuirass/scripts/remote-server.scm
+++ b/src/cuirass/scripts/remote-server.scm
@@ -510,18 +510,20 @@ Use WORKER-DIRECTORY to maintain the list of active 
workers."
             (`(build-succeeded (drv ,drv) ,_ ,...)
              (log-debug "fetching required for ~a (success)" drv)
              (put-message fetch-worker command)
+             ;; TODO: Tell BUILD-MAINTAINER to resume failed-dependency when
+             ;; DRV is known to have failed to build on a previous attempt.
              #t)
             (`(build-failed (drv ,drv) ,_ ,...)
              (log-debug "fetching required for ~a (fail)" drv)
              (put-message fetch-worker command)
-             (put-message build-maintainer #t) ;mark 'failed-dependency' builds
+             (put-message build-maintainer 'failed-dependency) ;mark 
'failed-dependency' builds
              #t)
             (('build-dependency-failed ('drv drv) ('url url)
                                        ('dependency dependency) _ ...)
              (log-info "build failed: dependency '~a' of '~a'"
                        dependency drv)
              (db-update-build-status! drv (build-status failed-dependency))
-             (put-message build-maintainer #t) ;mark 'failed-dependency' builds
+             (put-message build-maintainer 'failed-dependency) ;mark 
'failed-dependency' builds
              (let ((parent (db-get-build drv)))
                (when parent
                  ;; DEPENDENCY presumably lacks a corresponding 'Builds' row
diff --git a/tests/database.scm b/tests/database.scm
index 9cbf67e..f836d4f 100644
--- a/tests/database.scm
+++ b/tests/database.scm
@@ -1037,6 +1037,7 @@ timestamp, checkouttime, evaltime) VALUES ('guix', 0, 0, 
0, 0);")
           ;; no other failed dependency.
           (db-update-build-status! "/dep-1.drv" (build-status succeeded))
           (db-mark-failed-dependency-builds)
+          (db-reschedule-builds-without-failed-dependencies)
 
           `((initial ,@initial)
             (final ,@(map status '("/primary-drv-1.drv"



reply via email to

[Prev in Thread] Current Thread [Next in Thread]