diff hgext/lfs/blobstore.py @ 35480:417e8e040102

lfs: verify lfs object content when transferring to and from the remote store This avoids inserting corrupt files into the usercache, and local and remote stores. One down side is that the bad file won't be available locally for forensic purposes after a remote download. I'm thinking about adding an 'incoming' directory to the local lfs store to handle the download, and then move it to the 'objects' directory after it passes verification. That would have the additional benefit of not concatenating each transfer chunk in memory until the full file is transferred. Verification isn't needed when the data is passed back through the revlog interface or when the oid was just calculated, but otherwise it is on by default. The additional overhead should be well worth avoiding problems with file based remote stores, or buggy lfs servers. Having two different verify functions is a little sad, but the full data of the blob is mostly passed around in memory, because that's what the revlog interface wants. The upload function, however, chunks up the data. It would be ideal if that was how the content is always handled, but that's probably a huge project. I don't really like printing the long hash, but `hg debugdata` isn't a public interface, and is the only way to get it. The filelog and revision info is nowhere near this area, so recommending `hg verify` is the easiest thing to do.
author Matt Harbison <matt_harbison@yahoo.com>
date Fri, 17 Nov 2017 00:06:45 -0500
parents b0c01a5ee35c
children bb6a80fc969a
line wrap: on
line diff
--- a/hgext/lfs/blobstore.py	Mon Dec 04 21:41:04 2017 -0500
+++ b/hgext/lfs/blobstore.py	Fri Nov 17 00:06:45 2017 -0500
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import
 
+import hashlib
 import json
 import os
 import re
@@ -99,8 +100,11 @@
         self.cachevfs = lfsvfs(usercache)
         self.ui = repo.ui
 
-    def write(self, oid, data):
+    def write(self, oid, data, verify=True):
         """Write blob to local blobstore."""
+        if verify:
+            _verify(oid, data)
+
         with self.vfs(oid, 'wb', atomictemp=True) as fp:
             fp.write(data)
 
@@ -110,14 +114,23 @@
             self.ui.note(_('lfs: adding %s to the usercache\n') % oid)
             lfutil.link(self.vfs.join(oid), self.cachevfs.join(oid))
 
-    def read(self, oid):
+    def read(self, oid, verify=True):
         """Read blob from local blobstore."""
         if not self.vfs.exists(oid):
+            blob = self._read(self.cachevfs, oid, verify)
+            self.ui.note(_('lfs: found %s in the usercache\n') % oid)
             lfutil.link(self.cachevfs.join(oid), self.vfs.join(oid))
-            self.ui.note(_('lfs: found %s in the usercache\n') % oid)
         else:
             self.ui.note(_('lfs: found %s in the local lfs store\n') % oid)
-        return self.vfs.read(oid)
+            blob = self._read(self.vfs, oid, verify)
+        return blob
+
+    def _read(self, vfs, oid, verify):
+        """Read blob (after verifying) from the given store"""
+        blob = vfs.read(oid)
+        if verify:
+            _verify(oid, blob)
+        return blob
 
     def has(self, oid):
         """Returns True if the local blobstore contains the requested blob,
@@ -233,6 +246,8 @@
         request = util.urlreq.request(href)
         if action == 'upload':
             # If uploading blobs, read data from local blobstore.
+            with localstore.vfs(oid) as fp:
+                _verifyfile(oid, fp)
             request.data = filewithprogress(localstore.vfs(oid), None)
             request.get_method = lambda: 'PUT'
 
@@ -253,7 +268,7 @@
 
         if action == 'download':
             # If downloading blobs, store downloaded data to local blobstore
-            localstore.write(oid, response)
+            localstore.write(oid, response, verify=True)
 
     def _batch(self, pointers, localstore, action):
         if action not in ['upload', 'download']:
@@ -324,14 +339,14 @@
 
     def writebatch(self, pointers, fromstore):
         for p in pointers:
-            content = fromstore.read(p.oid())
+            content = fromstore.read(p.oid(), verify=True)
             with self.vfs(p.oid(), 'wb', atomictemp=True) as fp:
                 fp.write(content)
 
     def readbatch(self, pointers, tostore):
         for p in pointers:
             content = self.vfs.read(p.oid())
-            tostore.write(p.oid(), content)
+            tostore.write(p.oid(), content, verify=True)
 
 class _nullremote(object):
     """Null store storing blobs to /dev/null."""
@@ -368,6 +383,24 @@
     None: _promptremote,
 }
 
+def _verify(oid, content):
+    realoid = hashlib.sha256(content).hexdigest()
+    if realoid != oid:
+        raise error.Abort(_('detected corrupt lfs object: %s') % oid,
+                          hint=_('run hg verify'))
+
+def _verifyfile(oid, fp):
+    sha256 = hashlib.sha256()
+    while True:
+        data = fp.read(1024 * 1024)
+        if not data:
+            break
+        sha256.update(data)
+    realoid = sha256.hexdigest()
+    if realoid != oid:
+        raise error.Abort(_('detected corrupt lfs object: %s') % oid,
+                          hint=_('run hg verify'))
+
 def remote(repo):
     """remotestore factory. return a store in _storemap depending on config"""
     defaulturl = ''