[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[no subject]
From: |
Ludovic Courtès |
Date: |
Sat, 19 Oct 2024 16:04:48 -0400 (EDT) |
branch: main
commit 79190e4352caf38eeb39af23d40fa278bbd2afbf
Author: Ludovic Courtès <ludo@gnu.org>
AuthorDate: Fri Oct 18 17:19:57 2024 +0200
remote-server: Move back to rescheduling builds asynchronously.
This undoes 5e5912717957d294de83e5dcea894214ad97ac75 and
6cd1608e644a755dbbfd2937489061ed31eb8bbf.
That approach was bogus for two reasons: first because the logic in
‘db-update-build-status!’ was wrong since we cannot tell whether the
previous attempt of the *same* build failed. Second because the “update
status of dependent builds” operation is too expensive when the ‘Builds’
table has lots of rows.
* src/cuirass/database.scm (db-reschedule-dependent-builds): Remove.
(db-reschedule-builds-without-failed-dependencies): New procedure; this
is ‘db-update-resumable-builds!’ as it was removed in
6cd1608e644a755dbbfd2937489061ed31eb8bbf.
(db-update-build-status!): Remove call to
‘db-reschedule-dependent-builds’.
* src/cuirass/scripts/remote-server.scm (serve-build-requests): Send
'failed-dependency to BUILD-MAINTAINER.
* src/cuirass/base.scm (build-maintainer): Remove ‘&timeout’. Expect a
symbol on CHANNEL and honor it. Change default #:period to (* 10 60).
(spawn-build-maintainer): Change default #:period to (* 10 60).
* tests/database.scm ("status of dependent builds"): Add call to
‘db-reschedule-builds-without-failed-dependencies’.
---
src/cuirass/base.scm | 27 +++++++-------
src/cuirass/database.scm | 68 ++++++++++++++---------------------
src/cuirass/scripts/remote-server.scm | 6 ++--
tests/database.scm | 1 +
4 files changed, 46 insertions(+), 56 deletions(-)
diff --git a/src/cuirass/base.scm b/src/cuirass/base.scm
index ade04fa..30a90e5 100644
--- a/src/cuirass/base.scm
+++ b/src/cuirass/base.scm
@@ -998,16 +998,13 @@ information about derivations that are still needed by
queued builds."
;;; Database maintenance.
;;;
-(define* (build-maintainer channel #:key (period (* 15 60)))
+(define* (build-maintainer channel #:key (period (* 10 60)))
(lambda ()
(define max-queue-length
;; Maximum number of queued maintenance requests. This is necessary to
;; avoid queuing too many maintenance fibers, which could be expensive.
3)
- (define &timeout
- (list 'timeout))
-
(define pool
(make-resource-pool '(x) 'build-maintainer))
@@ -1015,7 +1012,7 @@ information about derivations that are still needed by
queued builds."
(make-atomic-box 0))
(log-info "build maintainer will run every ~as" period)
- (let loop ()
+ (let loop ((type 'all))
(if (> (atomic-box-fetch-and-increment! queued) max-queue-length)
(atomic-box-fetch-and-decrement! queued)
(spawn-fiber
@@ -1023,16 +1020,22 @@ information about derivations that are still needed by
queued builds."
;; Since these are expensive queries, run at most one at a given
;; point in time and do not block anyone writing to CHANNEL.
(with-resource-from-pool pool token
- (db-reschedule-stale-builds)
- (db-mark-failed-dependency-builds))
+ (log-info "running build maintenance (~a)" type)
+ (when (memq type '(all stale))
+ (db-reschedule-stale-builds))
+ (when (memq type '(all resume))
+ (db-reschedule-builds-without-failed-dependencies))
+ (when (memq type '(all failed-dependency))
+ (db-mark-failed-dependency-builds)))
+
(atomic-box-fetch-and-decrement! queued))))
- (unless (eq? (get-message* channel period &timeout) &timeout)
- (log-info "build maintenance triggered"))
- (loop))))
+ (loop (get-message* channel period 'all)))))
-(define* (spawn-build-maintainer #:key (period (* 15 60)))
+(define* (spawn-build-maintainer #:key (period (* 10 60)))
"Start a fiber periodically running expensive database queries. Those
-operations can be triggered by sending a message on the returned channel."
+operations can be triggered by sending a message on the returned channel with
+one of the following symbols denoting the specific maintenance task requested:
+'stale, 'resume, 'failed-dependency, or 'all."
(let ((channel (make-channel)))
(spawn-fiber (build-maintainer channel #:period period))
channel))
diff --git a/src/cuirass/database.scm b/src/cuirass/database.scm
index 015478f..5b54ef1 100644
--- a/src/cuirass/database.scm
+++ b/src/cuirass/database.scm
@@ -215,6 +215,7 @@
db-get-workers
db-worker-current-builds
db-reschedule-stale-builds
+ db-reschedule-builds-without-failed-dependencies
db-mark-failed-dependency-builds
db-remove-workers
db-clear-workers
@@ -1147,38 +1148,6 @@ WHERE dep.source = " build))
(define build-dependencies/id (compose db-get-build-dependencies build-id))
-(define (db-reschedule-dependent-builds build)
- "Reschedule builds that depend on BUILD that only have succeeding
-dependencies.
-
-Note: This is an expensive query but is usually rarely needed."
- (let ((rescheduled
- (with-db-connection db
- (with-timing-check (format #f "rescheduling dependents of build ~a"
- (build-id build))
- (exec-query/bind db "
-UPDATE Builds SET status = " (build-status scheduled) "
-FROM
- -- Select the dependents of this build with exactly one failing
- -- dependency.
- (SELECT dependents.id
- FROM
- -- Get all the dependents of this build.
- (SELECT Builds.id, Builds.derivation FROM Builds
- LEFT JOIN BuildDependencies as bd ON bd.source = Builds.id
- WHERE bd.target = " (build-id build) " GROUP BY Builds.id)
- AS dependents
- LEFT JOIN BuildDependencies AS bd ON bd.source = dependents.id
- LEFT JOIN Builds AS dependencies
- ON dependencies.id = bd.target AND dependencies.status != 0
- GROUP BY dependents.id HAVING count(dependencies.id) = 0)
-AS relevantdependents
-WHERE Builds.id = relevantdependents.id;")
- #:threshold 10))))
- (log-info "rescheduled ~a dependent builds of build ~a (~a)"
- rescheduled (build-id build) (build-derivation build))
- rescheduled))
-
(define (db-mark-as-failed-if-dependencies-failed drv)
"Mark the build of DRV as 'failed-dependency' if it has one or more failed
dependencies."
@@ -1225,6 +1194,27 @@ WHERE Builds.status = " (build-status scheduled)
(log-info "marked ~a builds as 'failed-dependency'"
marked)))))
+(define (db-reschedule-builds-without-failed-dependencies)
+ "Update the build status of the failed-dependency builds which all
+dependencies are successful to scheduled."
+ (with-db-connection db
+ (let ((rescheduled
+ (with-timing-check "rescheduling builds without failed dependencies"
+ (exec-query/bind db "
+UPDATE Builds
+ SET status = " (build-status scheduled) "
+FROM
+ (SELECT Builds.id, count(dep.id) AS deps FROM Builds
+ LEFT JOIN BuildDependencies AS bd ON bd.source = Builds.id
+ LEFT JOIN Builds AS dep ON bd.target = dep.id AND dep.status != 0
+ WHERE Builds.status = " (build-status failed-dependency) "
+ GROUP BY Builds.id HAVING count(dep.id) = 0)
+AS deps
+WHERE deps.id = Builds.id"))))
+ (unless (zero? rescheduled)
+ (log-info "rescheduled ~a builds with zero failed dependencies~%"
+ rescheduled)))))
+
(define (db-register-builds builds specification)
(define (previous-build outputs)
;; Return the previous build producing OUTPUTS or #f if there is none.
@@ -1332,16 +1322,10 @@ UPDATE Builds SET stoptime =" now
(when (positive? rows)
;; Note: 'db-mark-failed-dependency-builds' is not called from
;; here because it takes too long when there are many 'Builds'
- ;; rows. Instead, that procedure is called periodically by the
- ;; "build maintainer", which is good enough.
-
- (when (and (= status (build-status succeeded))
- (build-failure? (build-current-status build)))
- ;; Transitioning from "failed" to "succeeded", for instance
- ;; because the build was restarted, so reschedule every build
that
- ;; depends on this one.
- (db-reschedule-dependent-builds build))
-
+ ;; rows; likewise for
+ ;; 'db-reschedule-builds-without-failed-dependencies'. Instead,
+ ;; that procedure is called periodically by the "build
+ ;; maintainer", which is good enough.
(let* ((spec (build-specification-name build))
(specification (db-get-specification spec))
(notifications
diff --git a/src/cuirass/scripts/remote-server.scm
b/src/cuirass/scripts/remote-server.scm
index f1208ad..caeb8ed 100644
--- a/src/cuirass/scripts/remote-server.scm
+++ b/src/cuirass/scripts/remote-server.scm
@@ -510,18 +510,20 @@ Use WORKER-DIRECTORY to maintain the list of active
workers."
(`(build-succeeded (drv ,drv) ,_ ,...)
(log-debug "fetching required for ~a (success)" drv)
(put-message fetch-worker command)
+ ;; TODO: Tell BUILD-MAINTAINER to resume failed-dependency when
+ ;; DRV is known to have failed to build on a previous attempt.
#t)
(`(build-failed (drv ,drv) ,_ ,...)
(log-debug "fetching required for ~a (fail)" drv)
(put-message fetch-worker command)
- (put-message build-maintainer #t) ;mark 'failed-dependency' builds
+ (put-message build-maintainer 'failed-dependency) ;mark
'failed-dependency' builds
#t)
(('build-dependency-failed ('drv drv) ('url url)
('dependency dependency) _ ...)
(log-info "build failed: dependency '~a' of '~a'"
dependency drv)
(db-update-build-status! drv (build-status failed-dependency))
- (put-message build-maintainer #t) ;mark 'failed-dependency' builds
+ (put-message build-maintainer 'failed-dependency) ;mark
'failed-dependency' builds
(let ((parent (db-get-build drv)))
(when parent
;; DEPENDENCY presumably lacks a corresponding 'Builds' row
diff --git a/tests/database.scm b/tests/database.scm
index 9cbf67e..f836d4f 100644
--- a/tests/database.scm
+++ b/tests/database.scm
@@ -1037,6 +1037,7 @@ timestamp, checkouttime, evaltime) VALUES ('guix', 0, 0,
0, 0);")
;; no other failed dependency.
(db-update-build-status! "/dep-1.drv" (build-status succeeded))
(db-mark-failed-dependency-builds)
+ (db-reschedule-builds-without-failed-dependencies)
`((initial ,@initial)
(final ,@(map status '("/primary-drv-1.drv"