verify.js 6.67 KB
'use strict'

const BB = require('bluebird')

const contentPath = require('./content/path')
const figgyPudding = require('figgy-pudding')
const finished = BB.promisify(require('mississippi').finished)
const fixOwner = require('./util/fix-owner')
const fs = require('graceful-fs')
const glob = BB.promisify(require('glob'))
const index = require('./entry-index')
const path = require('path')
const rimraf = BB.promisify(require('rimraf'))
const ssri = require('ssri')

BB.promisifyAll(fs)

const VerifyOpts = figgyPudding({
  concurrency: {
    default: 20
  },
  filter: {},
  log: {
    default: { silly () {} }
  }
})

module.exports = verify
function verify (cache, opts) {
  opts = VerifyOpts(opts)
  opts.log.silly('verify', 'verifying cache at', cache)
  return BB.reduce([
    markStartTime,
    fixPerms,
    garbageCollect,
    rebuildIndex,
    cleanTmp,
    writeVerifile,
    markEndTime
  ], (stats, step, i) => {
    const label = step.name || `step #${i}`
    const start = new Date()
    return BB.resolve(step(cache, opts)).then(s => {
      s && Object.keys(s).forEach(k => {
        stats[k] = s[k]
      })
      const end = new Date()
      if (!stats.runTime) { stats.runTime = {} }
      stats.runTime[label] = end - start
      return stats
    })
  }, {}).tap(stats => {
    stats.runTime.total = stats.endTime - stats.startTime
    opts.log.silly('verify', 'verification finished for', cache, 'in', `${stats.runTime.total}ms`)
  })
}

function markStartTime (cache, opts) {
  return { startTime: new Date() }
}

function markEndTime (cache, opts) {
  return { endTime: new Date() }
}

function fixPerms (cache, opts) {
  opts.log.silly('verify', 'fixing cache permissions')
  return fixOwner.mkdirfix(cache, cache).then(() => {
    // TODO - fix file permissions too
    return fixOwner.chownr(cache, cache)
  }).then(() => null)
}

// Implements a naive mark-and-sweep tracing garbage collector.
//
// The algorithm is basically as follows:
// 1. Read (and filter) all index entries ("pointers")
// 2. Mark each integrity value as "live"
// 3. Read entire filesystem tree in `content-vX/` dir
// 4. If content is live, verify its checksum and delete it if it fails
// 5. If content is not marked as live, rimraf it.
//
function garbageCollect (cache, opts) {
  opts.log.silly('verify', 'garbage collecting content')
  const indexStream = index.lsStream(cache)
  const liveContent = new Set()
  indexStream.on('data', entry => {
    if (opts.filter && !opts.filter(entry)) { return }
    liveContent.add(entry.integrity.toString())
  })
  return finished(indexStream).then(() => {
    const contentDir = contentPath._contentDir(cache)
    return glob(path.join(contentDir, '**'), {
      follow: false,
      nodir: true,
      nosort: true
    }).then(files => {
      return BB.resolve({
        verifiedContent: 0,
        reclaimedCount: 0,
        reclaimedSize: 0,
        badContentCount: 0,
        keptSize: 0
      }).tap((stats) => BB.map(files, (f) => {
        const split = f.split(/[/\\]/)
        const digest = split.slice(split.length - 3).join('')
        const algo = split[split.length - 4]
        const integrity = ssri.fromHex(digest, algo)
        if (liveContent.has(integrity.toString())) {
          return verifyContent(f, integrity).then(info => {
            if (!info.valid) {
              stats.reclaimedCount++
              stats.badContentCount++
              stats.reclaimedSize += info.size
            } else {
              stats.verifiedContent++
              stats.keptSize += info.size
            }
            return stats
          })
        } else {
          // No entries refer to this content. We can delete.
          stats.reclaimedCount++
          return fs.statAsync(f).then(s => {
            return rimraf(f).then(() => {
              stats.reclaimedSize += s.size
              return stats
            })
          })
        }
      }, { concurrency: opts.concurrency }))
    })
  })
}

function verifyContent (filepath, sri) {
  return fs.statAsync(filepath).then(stat => {
    const contentInfo = {
      size: stat.size,
      valid: true
    }
    return ssri.checkStream(
      fs.createReadStream(filepath),
      sri
    ).catch(err => {
      if (err.code !== 'EINTEGRITY') { throw err }
      return rimraf(filepath).then(() => {
        contentInfo.valid = false
      })
    }).then(() => contentInfo)
  }).catch({ code: 'ENOENT' }, () => ({ size: 0, valid: false }))
}

function rebuildIndex (cache, opts) {
  opts.log.silly('verify', 'rebuilding index')
  return index.ls(cache).then(entries => {
    const stats = {
      missingContent: 0,
      rejectedEntries: 0,
      totalEntries: 0
    }
    const buckets = {}
    for (let k in entries) {
      if (entries.hasOwnProperty(k)) {
        const hashed = index._hashKey(k)
        const entry = entries[k]
        const excluded = opts.filter && !opts.filter(entry)
        excluded && stats.rejectedEntries++
        if (buckets[hashed] && !excluded) {
          buckets[hashed].push(entry)
        } else if (buckets[hashed] && excluded) {
          // skip
        } else if (excluded) {
          buckets[hashed] = []
          buckets[hashed]._path = index._bucketPath(cache, k)
        } else {
          buckets[hashed] = [entry]
          buckets[hashed]._path = index._bucketPath(cache, k)
        }
      }
    }
    return BB.map(Object.keys(buckets), key => {
      return rebuildBucket(cache, buckets[key], stats, opts)
    }, { concurrency: opts.concurrency }).then(() => stats)
  })
}

function rebuildBucket (cache, bucket, stats, opts) {
  return fs.truncateAsync(bucket._path).then(() => {
    // This needs to be serialized because cacache explicitly
    // lets very racy bucket conflicts clobber each other.
    return BB.mapSeries(bucket, entry => {
      const content = contentPath(cache, entry.integrity)
      return fs.statAsync(content).then(() => {
        return index.insert(cache, entry.key, entry.integrity, {
          metadata: entry.metadata,
          size: entry.size
        }).then(() => { stats.totalEntries++ })
      }).catch({ code: 'ENOENT' }, () => {
        stats.rejectedEntries++
        stats.missingContent++
      })
    })
  })
}

function cleanTmp (cache, opts) {
  opts.log.silly('verify', 'cleaning tmp directory')
  return rimraf(path.join(cache, 'tmp'))
}

function writeVerifile (cache, opts) {
  const verifile = path.join(cache, '_lastverified')
  opts.log.silly('verify', 'writing verifile to ' + verifile)
  try {
    return fs.writeFileAsync(verifile, '' + (+(new Date())))
  } finally {
    fixOwner.chownr.sync(cache, verifile)
  }
}

module.exports.lastRun = lastRun
function lastRun (cache) {
  return fs.readFileAsync(
    path.join(cache, '_lastverified'), 'utf8'
  ).then(data => new Date(+data))
}