guix-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Ludovic Courtès
Date: Thu, 5 Apr 2018 16:26:33 -0400 (EDT)

branch: master
commit 074b9d02f1ca01007f39adbc019763027a51d9bd
Author: Ludovic Courtès <address@hidden>
Date:   Thu Apr 5 22:17:45 2018 +0200

    base: Let sqlite handle deduplication of the list of pending derivations.
    
    Previously we would make a SQL query that would return many build jobs,
    and then call 'delete-duplicates' on that.  This was extremely wasteful
    because the list of returned by the query was huge leading to a heap of
    several tens of GiB on a big database, and 'delete-duplicates' would
    lead to more GC and it would take ages.
    
    Furthermore, since 'delete-duplicates' is written in C as of Guile
    2.2.3, it is uninterruptible from Fiber's viewpoint.  Consequently, the
    kernel thread running the 'restart-builds' fiber would never schedule
    other fibers, which could lead to deadlocks--e.g., since fibers are
    scheduled on a circular shuffled list of kernel threads, once every N
    times, a web server fiber would be sent to that kernel thread and not be
    serviced.
    
    * src/cuirass/base.scm (shuffle-jobs): Remove.
    (shuffle-derivations): New procedure.
    (spawn-builds): Take a list of derivations instead of a list of jobs.
    (restart-builds): Remove 'builds' parameter.  Remove 'delete-duplicates'
    call.  Remove done/remaining partitioning.
    (build-packages): Adjust to pass 'spawn-builds' a list of derivations.
    * bin/cuirass.in (main): Remove computation of PENDING.  Remove second
    parameter in call to 'restart-builds'.
---
 bin/cuirass.in       | 11 +++----
 src/cuirass/base.scm | 82 ++++++++++++++++++----------------------------------
 2 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/bin/cuirass.in b/bin/cuirass.in
index fa0d6af..d27167c 100644
--- a/bin/cuirass.in
+++ b/bin/cuirass.in
@@ -128,12 +128,9 @@ exec ${GUILE:address@hidden@} --no-auto-compile -e main -s 
"$0" "$@"
                                 new-specs)))
                (if one-shot?
                    (process-specs db (db-get-specifications db))
-                   (let ((exit-channel (make-channel))
-                         (pending
-                          (begin
-                            (clear-build-queue db)
-                            (log-message "retrieving list of pending 
builds...")
-                            (db-get-builds db '((status pending))))))
+                   (let ((exit-channel (make-channel)))
+
+                     (clear-build-queue db)
 
                      ;; First off, restart builds that had not completed or
                      ;; were not even started on a previous run.
@@ -142,7 +139,7 @@ exec ${GUILE:address@hidden@} --no-auto-compile -e main -s 
"$0" "$@"
                        'restart-builds exit-channel
                        (lambda ()
                          (with-database db
-                           (restart-builds db pending)))))
+                           (restart-builds db)))))
 
                      (spawn-fiber
                       (essential-task
diff --git a/src/cuirass/base.scm b/src/cuirass/base.scm
index a96f640..c9c5ec1 100644
--- a/src/cuirass/base.scm
+++ b/src/cuirass/base.scm
@@ -362,15 +362,10 @@ Essentially this procedure inverts the 
inversion-of-control that
 ;;; Building packages.
 ;;;
 
-(define (shuffle-jobs jobs)
-  "Shuffle JOBS, a list of job alists."
+(define (shuffle-derivations drv)
+  "Shuffle DRV, a list of derivation file names."
   ;; Our shuffling algorithm is simple: we sort by .drv file name.  :-)
-  (define (job<? job1 job2)
-    (let ((drv1 (assq-ref job1 #:derivation))
-          (drv2 (assq-ref job2 #:derivation)))
-      (string<? drv1 drv2)))
-
-  (sort jobs job<?))
+  (sort drv string<?))
 
 (define (update-build-statuses! store db lst)
   "Update the build status of the derivations listed in LST, which have just
@@ -397,11 +392,10 @@ and returns the values RESULTS."
        (print-exception (current-error-port) frame key args)
        (apply values results)))))
 
-(define* (spawn-builds store db jobs
+(define* (spawn-builds store db drv
                        #:key (max-batch-size 200))
-  "Build the derivations associated with JOBS, a list of job alists, updating
-DB as builds complete.  Derivations are submitted in batches of at most
-MAX-BATCH-SIZE items."
+  "Build the derivations listed in DRV, updating DB as builds complete.
+Derivations are submitted in batches of at most MAX-BATCH-SIZE items."
   ;; XXX: We want to pass 'build-derivations' as many derivations at once so
   ;; we benefit from as much parallelism as possible (we must be using
   ;; #:keep-going? #t).
@@ -419,31 +413,27 @@ MAX-BATCH-SIZE items."
   ;; This code works around it by submitting derivations in batches of at most
   ;; MAX-BATCH-SIZE.
 
-  (define total (length jobs))
+  (define total (length drv))
 
   (log-message "building ~a derivations in batches of ~a"
                total max-batch-size)
 
-  ;; Shuffle jobs so that we don't build sequentially i686/x86_64/aarch64,
+  ;; Shuffle DRV so that we don't build sequentially i686/x86_64/aarch64,
   ;; master/core-updates, etc., which would be suboptimal.
-  (let loop ((jobs  (shuffle-jobs jobs))
+  (let loop ((drv   (shuffle-derivations drv))
              (count total))
     (if (zero? count)
         (log-message "done with ~a derivations" total)
         (let*-values (((batch rest)
                        (if (> count max-batch-size)
-                           (split-at jobs max-batch-size)
-                           (values jobs '())))
-                      ((drv)
-                       (map (lambda (job)
-                              (assq-ref job #:derivation))
-                            batch)))
+                           (split-at drv max-batch-size)
+                           (values drv '()))))
           (guard (c ((nix-protocol-error? c)
                      (log-message "batch of builds (partially) failed:\
 ~a (status: ~a)"
                                   (nix-protocol-error-message c)
                                   (nix-protocol-error-status c))))
-            (log-message "building batch of ~a jobs (~a/~a)"
+            (log-message "building batch of ~a derivations (~a/~a)"
                          max-batch-size (- total count) total)
             (let-values (((port finish)
                           (build-derivations& store drv)))
@@ -526,43 +516,26 @@ procedure is meant to be called at startup."
                (- (time-second (current-time time-utc)) age)
                ";"))
 
-(define (restart-builds db builds)
+(define (restart-builds db)
   "Restart builds whose status in DB is \"pending\" (scheduled or started)."
   (with-store store
-    (let*-values (((builds)
-                   (delete-duplicates builds build-derivation=?))
-                  ((valid stale)
-                   (partition (lambda (build)
-                                (let ((drv (assq-ref build #:derivation)))
-                                  (valid-path? store drv)))
-                              builds)))
+    ;; Note: On a big database, 'db-get-pending-derivations' can take a couple
+    ;; of minutes, hence 'non-blocking'.
+    (log-message "retrieving list of pending builds...")
+    (let*-values (((valid stale)
+                   (partition (cut valid-path? store <>)
+                              (non-blocking (db-get-pending-derivations db)))))
       ;; We cannot restart builds listed in STALE, so mark them as canceled.
       (log-message "canceling ~a stale builds" (length stale))
-      (for-each (lambda (build)
-                  (db-update-build-status! db (assq-ref build #:derivation)
-                                           (build-status canceled)))
+      (for-each (lambda (drv)
+                  (db-update-build-status! db drv (build-status canceled)))
                 stale)
 
-      ;; Those in VALID can be restarted, but some of them may actually be
-      ;; done already--either because our database is outdated, or because it
-      ;; was not built by Cuirass.
-      (let-values (((done remaining)
-                    (partition (lambda (build)
-                                 (match (assq-ref build #:outputs)
-                                   (((name ((#:path . item))) _ ...)
-                                    (valid-path? store item))
-                                   (_ #f)))
-                               valid)))
-        (log-message "~a of the pending builds had actually completed"
-                     (length done))
-        (for-each (lambda (build)
-                    (db-update-build-status! db (assq-ref build #:derivation)
-                                             (build-status succeeded)))
-                  done)
-
-        (log-message "restarting ~a pending builds" (length remaining))
-        (spawn-builds store db remaining)
-        (log-message "done with restarted builds")))))
+      ;; Those in VALID can be restarted.  If some of them were built in the
+      ;; meantime behind our back, that's fine: 'spawn-builds' will DTRT.
+      (log-message "restarting ~a pending builds" (length valid))
+      (spawn-builds store db valid)
+      (log-message "done with restarted builds"))))
 
 (define (build-packages store db jobs)
   "Build JOBS and return a list of Build results."
@@ -595,7 +568,8 @@ procedure is meant to be called at startup."
   (define build-ids
     (map register jobs))
 
-  (spawn-builds store db jobs)
+  (spawn-builds store db
+                (map (cut assq-ref <> #:derivation) jobs))
 
   (let* ((results (filter-map (cut db-get-build db <>) build-ids))
          (status (map (cut assq-ref <> #:status) results))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]