// verify.js
'use strict'

const util = require('util')

const pMap = require('p-map')
const contentPath = require('./content/path')
const fixOwner = require('./util/fix-owner')
const fs = require('fs')
const fsm = require('fs-minipass')
const glob = util.promisify(require('glob'))
const index = require('./entry-index')
const path = require('path')
const rimraf = util.promisify(require('rimraf'))
const ssri = require('ssri')

// Own-property test that borrows the method from Object.prototype, so it
// still works when `obj` has no prototype or shadows `hasOwnProperty`.
const hasOwnProperty = (obj, key) => {
  const { hasOwnProperty: ownCheck } = Object.prototype
  return ownCheck.call(obj, key)
}

// Promise-returning versions of the callback-style fs functions used below.
const [stat, truncate, writeFile, readFile] = [
  fs.stat,
  fs.truncate,
  fs.writeFile,
  fs.readFile,
].map((fn) => util.promisify(fn))

// Builds the effective options object: caller-supplied `opts` layered over
// the defaults (20 concurrent operations, a logger whose `silly` is a no-op).
// A fresh object is returned each time; `opts` itself is not modified.
const verifyOpts = (opts) => {
  const defaults = {
    concurrency: 20,
    log: { silly () {} },
  }
  return { ...defaults, ...opts }
}

// The module's main export is the `verify` function itself. The function
// declaration further down is hoisted, so it is already defined here.
module.exports = verify

// Runs the full verification pass over the cache rooted at `cache`.
//
// The steps run strictly one after another, in this order: markStartTime,
// fixPerms, garbageCollect, rebuildIndex, cleanTmp, writeVerifile,
// markEndTime. Whatever object a step resolves to (if any) is merged into a
// single stats object.
//
// Parameters:
//   cache - path of the cache directory, passed to every step
//   opts  - optional settings, layered over the defaults by `verifyOpts`
//           (`concurrency`, `log`; steps also read `opts.filter`)
//
// Resolves to the merged stats object. In addition to the per-step fields it
// carries `runTime`, mapping each step's function name to that step's own
// duration in milliseconds, plus `runTime.total` (endTime - startTime).
// Rejects with the first error raised by any step.
function verify (cache, opts) {
  opts = verifyOpts(opts)
  opts.log.silly('verify', 'verifying cache at', cache)

  const steps = [
    markStartTime,
    fixPerms,
    garbageCollect,
    rebuildIndex,
    cleanTmp,
    writeVerifile,
    markEndTime,
  ]

  return steps
    .reduce((promise, step) => {
      const label = step.name
      return promise.then((stats) => {
        // The clock must start here, once the previous step has finished.
        // The reduce callback itself runs synchronously for every step up
        // front, so a timestamp taken out there would make each entry the
        // cumulative elapsed time instead of the step's own duration.
        const start = new Date()
        return step(cache, opts).then((s) => {
          if (s)
            Object.assign(stats, s)

          const end = new Date()
          if (!stats.runTime)
            stats.runTime = {}

          stats.runTime[label] = end - start
          return stats
        })
      })
    }, Promise.resolve({}))
    .then((stats) => {
      stats.runTime.total = stats.endTime - stats.startTime
      opts.log.silly(
        'verify',
        'verification finished for',
        cache,
        'in',
        `${stats.runTime.total}ms`
      )
      return stats
    })
}

// Verification step: resolves to `{ startTime }` holding the current time.
// Neither argument is used; they exist so all steps share one signature.
function markStartTime (cache, opts) {
  const startTime = new Date()
  return Promise.resolve({ startTime })
}

// Verification step: resolves to `{ endTime }` holding the current time.
// Neither argument is used; they exist so all steps share one signature.
function markEndTime (cache, opts) {
  const endTime = new Date()
  return Promise.resolve({ endTime })
}

// Verification step: calls `fixOwner.mkdirfix` and then `fixOwner.chownr` on
// the cache root, one after the other, and resolves to null so that nothing
// is merged into the verification stats.
// NOTE(review): both helpers live in ./util/fix-owner; presumably they create
// the directory and align its ownership — confirm there.
function fixPerms (cache, opts) {
  opts.log.silly('verify', 'fixing cache permissions')
  // TODO - fix file permissions too
  const chownCache = () => fixOwner.chownr(cache, cache)
  const discardResult = () => null
  return fixOwner
    .mkdirfix(cache, cache)
    .then(chownCache)
    .then(discardResult)
}

// Mark-and-sweep garbage collection over the cache's content directory.
//
// How it works:
// 1. Stream every index entry, skipping the ones `opts.filter` rejects
//    (when a filter is supplied).
// 2. Record the integrity string of each remaining entry as "live".
// 3. Glob every file (directories excluded) under the content directory.
// 4. Live content goes through `verifyContent`; files it reports as invalid
//    are counted as bad and reclaimed.
// 5. Content that no entry refers to is removed with rimraf.
//
// Resolves to the counters verifiedContent, reclaimedCount, reclaimedSize,
// badContentCount and keptSize.
//
function garbageCollect (cache, opts) {
  opts.log.silly('verify', 'garbage collecting content')
  const entryStream = index.lsStream(cache)
  const live = new Set()
  entryStream.on('data', (entry) => {
    const wanted = !opts.filter || opts.filter(entry)
    if (wanted)
      live.add(entry.integrity.toString())
  })
  const indexDrained = new Promise((resolve, reject) => {
    entryStream.on('end', resolve).on('error', reject)
  })

  const listContentFiles = () => {
    const pattern = path.join(contentPath.contentDir(cache), '**')
    return glob(pattern, {
      follow: false,
      nodir: true,
      nosort: true,
    })
  }

  const sweep = (files) => {
    const totals = {
      verifiedContent: 0,
      reclaimedCount: 0,
      reclaimedSize: 0,
      badContentCount: 0,
      keptSize: 0,
    }

    const visit = (file) => {
      // The trailing path segments are <algorithm>/<3 digest pieces>.
      const parts = file.split(/[/\\]/)
      const algo = parts[parts.length - 4]
      const digest = parts.slice(parts.length - 3).join('')
      const integrity = ssri.fromHex(digest, algo)

      if (!live.has(integrity.toString())) {
        // No entries refer to this content. We can delete.
        totals.reclaimedCount++
        return stat(file).then((s) =>
          rimraf(file).then(() => {
            totals.reclaimedSize += s.size
            return totals
          })
        )
      }

      return verifyContent(file, integrity).then((info) => {
        if (info.valid) {
          totals.verifiedContent++
          totals.keptSize += info.size
        } else {
          totals.reclaimedCount++
          totals.badContentCount++
          totals.reclaimedSize += info.size
        }
        return totals
      })
    }

    return pMap(files, visit, { concurrency: opts.concurrency }).then(
      () => totals
    )
  }

  return indexDrained.then(listContentFiles).then(sweep)
}

// Checks one content file against its expected integrity value.
//
// Resolves to `{ size, valid }`:
//   - `valid: true` with the file's size when the checksum matches;
//   - `valid: false` with the file's size when the check fails with
//     EINTEGRITY, in which case the file is removed first;
//   - `{ size: 0, valid: false }` when any stage reports ENOENT.
// Every other error is rethrown.
async function verifyContent (filepath, sri) {
  try {
    const { size } = await stat(filepath)
    const contentInfo = { size, valid: true }
    try {
      await ssri.checkStream(new fsm.ReadStream(filepath), sri)
    } catch (err) {
      if (err.code !== 'EINTEGRITY')
        throw err

      // Corrupt content: delete it and report it as invalid.
      await rimraf(filepath)
      contentInfo.valid = false
    }
    return contentInfo
  } catch (err) {
    if (err.code !== 'ENOENT')
      throw err

    return { size: 0, valid: false }
  }
}

// Verification step: rewrites every index bucket from the entries currently
// listed by `index.ls`.
//
// Entries are grouped by hashed key. Each group is an array of the entries to
// keep, carrying the bucket file's path on a `_path` property. Entries that
// `opts.filter` rejects are counted in `rejectedEntries` and left out, but
// their bucket is still registered (possibly empty) so it gets rewritten.
//
// Resolves to `{ missingContent, rejectedEntries, totalEntries }`; the
// counters are also updated by `rebuildBucket`.
function rebuildIndex (cache, opts) {
  opts.log.silly('verify', 'rebuilding index')
  return index.ls(cache).then((entries) => {
    const stats = {
      missingContent: 0,
      rejectedEntries: 0,
      totalEntries: 0,
    }
    const buckets = {}
    for (const key in entries) {
      /* istanbul ignore if */
      if (!hasOwnProperty(entries, key))
        continue

      const hashed = index.hashKey(key)
      const entry = entries[key]
      const rejected = opts.filter && !opts.filter(entry)
      if (rejected)
        stats.rejectedEntries++

      // First time this bucket is seen: register it along with its path.
      if (!buckets[hashed]) {
        buckets[hashed] = []
        buckets[hashed]._path = index.bucketPath(cache, key)
      }

      if (!rejected)
        buckets[hashed].push(entry)
    }

    const rebuildOne = (hashed) =>
      rebuildBucket(cache, buckets[hashed], stats, opts)

    return pMap(Object.keys(buckets), rebuildOne, {
      concurrency: opts.concurrency,
    }).then(() => stats)
  })
}

// Empties the bucket file at `bucket._path`, then re-inserts the bucket's
// entries one at a time.
//
// An entry is re-inserted only if its content file can be stat'ed; on success
// `stats.totalEntries` is incremented. An ENOENT raised while checking or
// inserting bumps `stats.rejectedEntries` and `stats.missingContent` instead.
// Any other error rejects the returned promise. Resolves to undefined.
async function rebuildBucket (cache, bucket, stats, opts) {
  await truncate(bucket._path)
  // This needs to be serialized because cacache explicitly
  // lets very racy bucket conflicts clobber each other.
  for (const entry of bucket) {
    const content = contentPath(cache, entry.integrity)
    try {
      await stat(content)
      await index.insert(cache, entry.key, entry.integrity, {
        metadata: entry.metadata,
        size: entry.size,
      })
      stats.totalEntries++
    } catch (err) {
      if (err.code !== 'ENOENT')
        throw err

      stats.rejectedEntries++
      stats.missingContent++
    }
  }
}

// Verification step: removes the `tmp` directory inside the cache with
// rimraf and returns rimraf's promise.
function cleanTmp (cache, opts) {
  opts.log.silly('verify', 'cleaning tmp directory')
  const tmpDir = path.join(cache, 'tmp')
  return rimraf(tmpDir)
}

// Verification step: records the current time (milliseconds since the epoch,
// as text) in `<cache>/_lastverified`, then runs `fixOwner.chownr.sync` on
// that file.
//
// The write is awaited inside the `try` so the `finally` block runs only
// after the write has settled. Returning the bare promise from `try` would
// run the ownership fix immediately, before the file had been written.
// The ownership fix runs whether the write succeeded or failed.
//
// Resolves to undefined; rejects with the write error (or with an error
// thrown by the ownership fix).
async function writeVerifile (cache, opts) {
  const verifile = path.join(cache, '_lastverified')
  opts.log.silly('verify', 'writing verifile to ' + verifile)
  try {
    return await writeFile(verifile, '' + +new Date())
  } finally {
    fixOwner.chownr.sync(cache, verifile)
  }
}

// Also expose `lastRun` as a property on the module's exports.
module.exports.lastRun = lastRun

// Reads `<cache>/_lastverified` as UTF-8 text and resolves to a Date built
// from its numeric contents. Rejects with the read error if the file cannot
// be read (for example when it does not exist).
function lastRun (cache) {
  const verifile = path.join(cache, '_lastverified')
  const toDate = (stamp) => new Date(Number(stamp))
  return readFile(verifile, 'utf8').then(toDate)
}