llvm_checksum.py 6.06 KB
#!/usr/bin/env python
""" A small program to compute checksums of LLVM checkout.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import logging
import re
import sys
from argparse import ArgumentParser
from project_tree import *

# Matches an expanded svn keyword of the form "$Date...$" or
# "$LastChangedDate...$": the keyword name, then at least one character that
# is not "$", then a closing "$". Group 1 captures the keyword name.
# ComputeLLVMChecksums uses it to rewrite such spans before hashing a file.
SVN_DATES_REGEX = re.compile(r"\$(Date|LastChangedDate)[^\$]+\$")


def main():
  """Command-line entry point: print or verify checksums of an LLVM checkout.

  Without --check, the computed checksums are written to stdout and the
  process exits with status 0. With --check, the computed checksums are
  compared against the ones read from the reference file: on a match a short
  confirmation is printed, otherwise both sets of checksums are printed and
  the process exits with status 1.
  """
  arg_parser = ArgumentParser()
  arg_parser.add_argument(
      "-v", "--verbose", help="enable debug logging", action="store_true")
  arg_parser.add_argument(
      "-c",
      "--check",
      metavar="reference_file",
      help=("read checksums from reference_file and "
            "check they match checksums of llvm_path."))
  arg_parser.add_argument(
      "--partial",
      action="store_true",
      help=("ignore projects from reference_file "
            "that are not checked out in llvm_path."))
  arg_parser.add_argument(
      "--multi_dir",
      action="store_true",
      help=("indicates llvm_path contains llvm, checked out "
            "into multiple directories, as opposed to a "
            "typical single source tree checkout."))
  arg_parser.add_argument("llvm_path")
  options = arg_parser.parse_args()

  # Load the reference checksums first, so a bad reference path fails before
  # any hashing work is done.
  expected = None
  if options.check is not None:
    with open(options.check, "r") as reference_file:
      expected = ReadLLVMChecksums(reference_file)

  if options.verbose:
    logging.basicConfig(level=logging.DEBUG)

  single_source_tree = not options.multi_dir
  project_list = CreateLLVMProjects(single_source_tree)
  actual = ComputeLLVMChecksums(options.llvm_path, project_list)

  # No reference file given: just report what was computed.
  if expected is None:
    WriteLLVMChecksums(actual, sys.stdout)
    sys.exit(0)

  if ValidateChecksums(expected, actual, options.partial):
    sys.stdout.write("Checksums match.")
    return

  # Mismatch: show both sides so the difference can be inspected.
  sys.stdout.write("Checksums differ.\nNew checksums:\n")
  WriteLLVMChecksums(actual, sys.stdout)
  sys.stdout.write("Reference checksums:\n")
  WriteLLVMChecksums(expected, sys.stdout)
  sys.exit(1)


def ComputeLLVMChecksums(root_path, projects):
  """Compute checksums for LLVM sources checked out using svn.

  For each project, every path that WalkProjectFiles hands to the callback is
  hashed with SHA-256 (after svn date keyword expansions in the file contents
  are collapsed). The project checksum is the SHA-256 over the sequence of
  (path relative to the project root, file digest) pairs, ordered by path.
  Projects whose directory does not exist under root_path are skipped.

  All data is converted to bytes before hashing, so the function works on
  both Python 2 and Python 3 and yields the same digests on either.

  Args:
    root_path: a directory of llvm checkout.
    projects: a list of LLVMProject instances, which describe checkout paths,
      relative to root_path.

  Returns:
    A dict mapping from project name to project checksum.
  """
  hash_algo = hashlib.sha256

  def to_bytes(value):
    # hashlib accepts only bytes. On Python 2 str already is bytes, so the
    # hashed data is unchanged there; on Python 3 text is encoded as UTF-8.
    if isinstance(value, bytes):
      return value
    return value.encode("utf-8")

  def collapse_svn_substitutions(contents):
    # Replace svn substitutions for $Date$ and $LastChangedDate$.
    # Unfortunately, these are locale-specific.
    #
    # contents is bytes (files are read in binary mode), so a bytes version of
    # SVN_DATES_REGEX is needed; a str pattern raises TypeError on Python 3.
    # re.compile() caches compiled patterns, so doing this per file is cheap.
    bytes_regex = re.compile(SVN_DATES_REGEX.pattern.encode("ascii"))
    # NOTE: the replacement has always been the non-raw literal "$\1$", i.e.
    # "$" + chr(1) + "$", not a backreference to the keyword name. It is
    # spelled out explicitly and kept as is on purpose: changing it would
    # alter every digest and invalidate existing reference checksum files.
    return bytes_regex.sub(b"$\x01$", contents)

  def read_and_collapse_svn_substitutions(file_path):
    with open(file_path, "rb") as f:
      contents = f.read()
    new_contents = collapse_svn_substitutions(contents)
    if contents != new_contents:
      logging.debug("Replaced svn keyword substitutions in %s", file_path)
      logging.debug("\n\tBefore\n%s\n\tAfter\n%s", contents, new_contents)
    return new_contents

  project_checksums = dict()
  # Hash each project.
  for proj in projects:
    project_root = os.path.join(root_path, proj.relpath)
    if not os.path.exists(project_root):
      logging.info("Folder %s doesn't exist, skipping project %s", proj.relpath,
                   proj.name)
      continue

    files = list()

    def add_file_hash(file_path):
      # A dangling symlink has no contents to read; hash its target instead.
      if os.path.islink(file_path) and not os.path.exists(file_path):
        content = to_bytes(os.readlink(file_path))
      else:
        content = read_and_collapse_svn_substitutions(file_path)
      hasher = hash_algo()
      hasher.update(content)
      file_digest = hasher.hexdigest()
      logging.debug("Checksum %s for file %s", file_digest, file_path)
      files.append((file_path, file_digest))

    logging.info("Computing checksum for %s", proj.name)
    WalkProjectFiles(root_path, projects, proj, add_file_hash)

    # Compute final checksum. Sorting makes the result independent of the
    # order in which the files were visited.
    files.sort(key=lambda x: x[0])
    hasher = hash_algo()
    for file_path, file_digest in files:
      # Hash project-relative paths so the checkout location doesn't matter.
      file_path = os.path.relpath(file_path, project_root)
      hasher.update(to_bytes(file_path))
      hasher.update(to_bytes(file_digest))
    project_checksums[proj.name] = hasher.hexdigest()
  return project_checksums


def WriteLLVMChecksums(checksums, f):
  """Dumps checksums as text, one "<checksum> <project>" line per project.

  Lines are emitted in sorted project-name order.

  Args:
    checksums: a dict mapping from project name to project checksum (result of
      ComputeLLVMChecksums).
    f: a file object to write into.
  """
  project_names = list(checksums.keys())
  project_names.sort()
  for name in project_names:
    line = "{} {}\n".format(checksums[name], name)
    f.write(line)


def ReadLLVMChecksums(f):
  """Reads checksums from a text file, produced by WriteLLVMChecksums.

  Each non-blank line must consist of exactly two whitespace-separated
  fields: the checksum followed by the project name. Blank and
  whitespace-only lines (e.g. a trailing empty line) are ignored.

  Args:
    f: a file object to read from.

  Returns:
    A dict, mapping from project name to project checksum.

  Raises:
    ValueError: if a non-blank line does not contain exactly two fields.
  """
  checksums = {}
  for line_number, line in enumerate(f, 1):
    fields = line.split()
    if not fields:
      # Tolerate blank lines instead of failing on the tuple unpacking below.
      continue
    if len(fields) != 2:
      raise ValueError(
          "Malformed checksum line {}: expected '<checksum> <project>', "
          "got {!r}".format(line_number, line))
    checksum, proj = fields
    checksums[proj] = checksum
  return checksums


def ValidateChecksums(reference_checksums,
                      new_checksums,
                      allow_missing_projects=False):
  """Checks new_checksums against reference_checksums.

  Args:
    reference_checksums: a dict of reference checksums, mapping from a project
      name to a project checksum.
    new_checksums: a dict of checksums to be checked, mapping from a project
      name to a project checksum.
    allow_missing_projects:
      When True, projects that appear only in reference_checksums are
        ignored, so reference_checksums may be a superset of new_checksums.
      When False, both dicts must cover the same set of projects; a project
        present in reference_checksums but absent from new_checksums makes
        the validation fail.

  Returns:
    True, if checksums match with regards to allow_missing_projects flag value.
    False, otherwise.
  """
  # With equal sizes, the per-project scan below implies identical key sets.
  sizes_differ = len(new_checksums) != len(reference_checksums)
  if sizes_differ and not allow_missing_projects:
    return False

  # A project fails if the reference has no entry for it or if its checksum
  # differs from the reference one.
  has_mismatch = any(
      name not in reference_checksums or reference_checksums[name] != digest
      for name, digest in new_checksums.items())
  return not has_mismatch


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
  main()