guix-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug#31618] [PATCH 4/4] Add (guix store deduplication).


From: Ludovic Courtès
Subject: [bug#31618] [PATCH 4/4] Add (guix store deduplication).
Date: Mon, 28 May 2018 12:36:15 +0200

From: Caleb Ristvedt <address@hidden>

* guix/store/database.scm (register-path): Add #:deduplicate? and call
'deduplicate' when it's true.
(counting-wrapper-port, nar-sha256): Move to...
* guix/store/deduplication.scm: ... here.  New file.
* tests/store-deduplication.scm: New file.
* Makefile.am (STORE_MODULES): Add deduplication.scm.
(SCM_TESTS) [HAVE_GUILE_SQLITE3]: Add store-deduplication.scm.

Co-authored-by: Ludovic Courtès <address@hidden>
---
 Makefile.am                   |   6 +-
 guix/store/database.scm       |  43 ++--------
 guix/store/deduplication.scm  | 148 ++++++++++++++++++++++++++++++++++
 tests/store-deduplication.scm |  64 +++++++++++++++
 4 files changed, 222 insertions(+), 39 deletions(-)
 create mode 100644 guix/store/deduplication.scm
 create mode 100644 tests/store-deduplication.scm

diff --git a/Makefile.am b/Makefile.am
index d81fce558..474575c9f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -259,7 +259,8 @@ endif BUILD_DAEMON_OFFLOAD
 
 # Scheme implementation of the build daemon and related functionality.
 STORE_MODULES =                                        \
-  guix/store/database.scm
+  guix/store/database.scm                      \
+  guix/store/deduplication.scm
 
 if HAVE_GUILE_SQLITE3
 MODULES += $(STORE_MODULES)
@@ -392,7 +393,8 @@ endif
 if HAVE_GUILE_SQLITE3
 
 SCM_TESTS +=                                   \
-  tests/store-database.scm
+  tests/store-database.scm                     \
+  tests/store-deduplication.scm
 
 endif
 
diff --git a/guix/store/database.scm b/guix/store/database.scm
index b9745dbe1..3623c0e7a 100644
--- a/guix/store/database.scm
+++ b/guix/store/database.scm
@@ -21,10 +21,9 @@
   #:use-module (sqlite3)
   #:use-module (guix config)
   #:use-module (guix serialization)
+  #:use-module (guix store deduplication)
   #:use-module (guix base16)
-  #:use-module (guix hash)
   #:use-module (guix build syscalls)
-  #:use-module (rnrs io ports)
   #:use-module (srfi srfi-11)
   #:use-module (srfi srfi-19)
   #:use-module (ice-9 match)
@@ -140,39 +139,6 @@ bytes of the store item denoted by PATH after being converted to nar form."
 ;;; High-level interface.
 ;;;
 
-;; XXX: Would it be better to just make WRITE-FILE give size as well? I question
-;; the general utility of this approach.
-(define (counting-wrapper-port output-port)
-  "Some custom ports don't implement GET-POSITION at all. But if we want to
-figure out how many bytes are being written, we will want to use that. So this
-makes a wrapper around a port which implements GET-POSITION."
-  (let ((byte-count 0))
-    (make-custom-binary-output-port "counting-wrapper"
-                                    (lambda (bytes offset count)
-                                      (set! byte-count
-                                        (+ byte-count count))
-                                      (put-bytevector output-port bytes
-                                                      offset count)
-                                      count)
-                                    (lambda ()
-                                      byte-count)
-                                    #f
-                                    (lambda ()
-                                      (close-port output-port)))))
-
-
-(define (nar-sha256 file)
-  "Gives the sha256 hash of a file and the size of the file in nar form."
-  (let-values (((port get-hash) (open-sha256-port)))
-    (let ((wrapper (counting-wrapper-port port)))
-      (write-file file wrapper)
-      (force-output wrapper)
-      (force-output port)
-      (let ((hash (get-hash))
-            (size (port-position wrapper)))
-        (close-port wrapper)
-        (values hash size)))))
-
 ;; TODO: Factorize with that in (gnu build install).
 (define (reset-timestamps file)
   "Reset the modification time on FILE and on all the files it contains, if
@@ -211,7 +177,7 @@ it's a directory."
 
 (define* (register-path path
                         #:key (references '()) deriver prefix
-                        state-directory)
+                        state-directory (deduplicate? #t))
   ;; Priority for options: first what is given, then environment variables,
   ;; then defaults. %state-directory, %store-directory, and
   ;; %store-database-directory already handle the "environment variables /
@@ -262,4 +228,7 @@ be used internally by the daemon's build hook."
        #:deriver deriver
        #:hash (string-append "sha256:"
                              (bytevector->base16-string hash))
-       #:nar-size nar-size))))
+       #:nar-size nar-size)
+
+      (when deduplicate?
+        (deduplicate real-path hash #:store store-dir)))))
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
new file mode 100644
index 000000000..4b4ac01f6
--- /dev/null
+++ b/guix/store/deduplication.scm
@@ -0,0 +1,148 @@
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2017 Caleb Ristvedt <address@hidden>
+;;; Copyright © 2018 Ludovic Courtès <address@hidden>
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; This houses stuff we do to files when they arrive at the store - resetting
+;;; timestamps, deduplicating, etc.
+
+(define-module (guix store deduplication)
+  #:use-module (guix hash)
+  #:use-module (guix build utils)
+  #:use-module (guix base16)
+  #:use-module (srfi srfi-11)
+  #:use-module (rnrs io ports)
+  #:use-module (ice-9 ftw)
+  #:use-module (guix serialization)
+  #:export (nar-sha256
+            deduplicate))
+
+;; Would it be better to just make WRITE-FILE give size as well? I question
+;; the general utility of this approach.
+(define (counting-wrapper-port output-port)
+  "Some custom ports don't implement GET-POSITION at all. But if we want to
+figure out how many bytes are being written, we will want to use that. So this
+makes a wrapper around a port which implements GET-POSITION."
+  (let ((byte-count 0))                 ;running total of bytes written
+    (make-custom-binary-output-port "counting-wrapper"
+                                    (lambda (bytes offset count) ;write!
+                                      (set! byte-count
+                                        (+ byte-count count))
+                                      (put-bytevector output-port bytes
+                                                      offset count)
+                                      count) ;report all COUNT bytes written
+                                    (lambda () ;get-position
+                                      byte-count)
+                                    #f  ;no set-position! procedure
+                                    (lambda () ;close: closes OUTPUT-PORT too
+                                      (close-port output-port)))))
+
+(define (nar-sha256 file)
+  "Gives the sha256 hash of a file and the size of the file in nar form."
+  (let-values (((port get-hash) (open-sha256-port)))
+    (let ((wrapper (counting-wrapper-port port))) ;counts bytes sent to PORT
+      (write-file file wrapper)
+      (force-output wrapper)            ;flush WRAPPER into PORT first...
+      (force-output port)               ;...then PORT, before asking for hash
+      (let ((hash (get-hash))
+            (size (port-position wrapper))) ;byte count kept by WRAPPER
+        (close-port wrapper)            ;WRAPPER's close also closes PORT
+        (values hash size)))))          ;two values: the hash, then the size
+
+(define (tempname-in directory)
+  "Gives an unused temporary name under DIRECTORY. Not guaranteed to still be
+unused by the time you create anything with that name, but a good shot."
+  (let ((const-part (string-append directory "/.tmp-link-"
+                                   (number->string (getpid))))) ;per-process
+    (let try ((guess-part
+               (number->string (random most-positive-fixnum) 16))) ;hex
+      (if (file-exists? (string-append const-part "-" guess-part))
+          (try (number->string (random most-positive-fixnum) 16)) ;retry
+          (string-append const-part "-" guess-part)))))
+
+(define* (get-temp-link target #:optional (link-prefix (dirname target)))
+  "Like mkstemp!, but instead of creating a new file and giving you the name,
+it creates a new hardlink to TARGET and gives you the name. Since
+cross-filesystem hardlinks don't work, the temp link must be created on the
+same filesystem - where in that filesystem it is can be controlled by
+LINK-PREFIX."
+  (let try ((tempname (tempname-in link-prefix)))
+    (catch 'system-error
+      (lambda ()
+        (link target tempname)
+        tempname)
+      (lambda args                      ;handler gets the key plus throw args
+        (if (= (system-error-errno args) EEXIST)
+            (try (tempname-in link-prefix)) ;name got taken: pick another
+            (apply throw args))))))     ;re-throw the original error as is
+
+;; There are 3 main kinds of errors we can get from hardlinking: "Too many
+;; things link to this" (EMLINK), "this link already exists" (EEXIST), and
+;; "can't fit more stuff in this directory" (ENOSPC).
+
+(define (replace-with-link target to-replace)
+  "Atomically replace the file TO-REPLACE with a link to TARGET.  Note: TARGET
+and TO-REPLACE must be on the same file system."
+  (let ((temp-link (get-temp-link target (dirname to-replace)))) ;same dir
+    (rename-file temp-link to-replace))) ;XXX: TEMP-LINK is left if this fails
+
+(define-syntax-rule (false-if-system-error (errors ...) exp ...)
+  "Given ERRORS, a list of system error codes to ignore, evaluates EXP... and
+return #f if any of the system error codes in the given list are thrown."
+  (catch 'system-error
+    (lambda ()
+      exp ...)
+    (lambda args                        ;the key followed by the throw args
+      (if (member (system-error-errno args) (list errors ...))
+          #f                            ;one of ERRORS: swallow it
+          (apply throw args)))))        ;anything else: re-throw unchanged
+
+(define* (deduplicate path hash #:key (store (%store-directory)))
+  "Check if a store item with sha256 hash HASH already exists.  If so,
+replace PATH with a hardlink to the already-existing one.  If not, register
+PATH so that future duplicates can hardlink to it.  PATH is assumed to be
+under STORE, which defaults to the result of (%store-directory)."
+  (let* ((links-directory (string-append store "/.links"))
+         (link-file       (string-append links-directory "/"
+                                         (bytevector->base16-string hash))))
+    (mkdir-p links-directory)
+    (if (file-is-directory? path)
+        ;; Can't hardlink directories, so hardlink their atoms.  'scandir'
+        ;; returns bare entry names, so prepend PATH before recursing.
+        (for-each (lambda (entry)
+                    (unless (member entry '("." ".."))
+                      (let ((file (string-append path "/" entry)))
+                        (deduplicate file (nar-sha256 file) #:store store))))
+                  (scandir path))
+        (if (file-exists? link-file)
+            (false-if-system-error (EMLINK)
+                                   (replace-with-link link-file path))
+            (catch 'system-error
+              (lambda ()
+                (link path link-file))
+              (lambda args
+                (let ((errno (system-error-errno args)))
+                  (cond ((= errno EEXIST)
+                         ;; Someone else put an entry for PATH in
+                         ;; LINKS-DIRECTORY before we could.  Let's use it.
+                         (false-if-system-error (EMLINK)
+                           (replace-with-link link-file path)))
+                        ((= errno ENOSPC)
+                         ;; No room left in the directory index for more
+                         ;; '.links' entries, but that's fine: just stop.
+                         #f)
+                        (else (apply throw args))))))))))
diff --git a/tests/store-deduplication.scm b/tests/store-deduplication.scm
new file mode 100644
index 000000000..04817a193
--- /dev/null
+++ b/tests/store-deduplication.scm
@@ -0,0 +1,64 @@
+;;; GNU Guix --- Functional package management for GNU
+;;; Copyright © 2018 Ludovic Courtès <address@hidden>
+;;;
+;;; This file is part of GNU Guix.
+;;;
+;;; GNU Guix is free software; you can redistribute it and/or modify it
+;;; under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 3 of the License, or (at
+;;; your option) any later version.
+;;;
+;;; GNU Guix is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (test-store-deduplication)
+  #:use-module (guix tests)
+  #:use-module (guix store deduplication)
+  #:use-module (guix hash)
+  #:use-module ((guix utils) #:select (call-with-temporary-directory))
+  #:use-module (guix build utils)
+  #:use-module (rnrs bytevectors)
+  #:use-module (ice-9 binary-ports)
+  #:use-module (srfi srfi-1)
+  #:use-module (srfi srfi-64))
+
+(test-begin "store-deduplication")
+
+(test-equal "deduplicate"
+  (cons* #t #f                                    ;inode comparisons
+         2 (make-list 5 6))                       ;'nlink': file(s) + '.links'
+
+  (call-with-temporary-directory
+   (lambda (store)
+     (let ((data      (string->utf8 "Hello, world!"))
+           (identical (map (lambda (n)
+                             (string-append store "/" (number->string n)))
+                           (iota 5)))             ;five files, same contents
+           (unique    (string-append store "/unique")))
+       (for-each (lambda (file)
+                   (call-with-output-file file
+                     (lambda (port)
+                       (put-bytevector port data))))
+                 identical)
+       (call-with-output-file unique
+         (lambda (port)
+           (put-bytevector port (string->utf8 "This is unique."))))
+
+       (for-each (lambda (file)
+                   (deduplicate file (sha256 data) #:store store)) ;same key
+                 identical)
+       (deduplicate unique (nar-sha256 unique) #:store store)
+
+       ;; Debugging aid: (system (string-append "ls -lRia " store))
+       (cons* (apply = (map (compose stat:ino stat) identical)) ;all one inode
+              (= (stat:ino (stat unique))         ;UNIQUE keeps its own inode
+                 (stat:ino (stat (car identical))))
+              (stat:nlink (stat unique))
+              (map (compose stat:nlink stat) identical))))))
+
+(test-end "store-deduplication")
-- 
2.17.0






reply via email to

[Prev in Thread] Current Thread [Next in Thread]