gen_std.py 4.69 KB
#!/usr/bin/env python
#===- gen_std.py -  ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#

"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archieved HTML
files from cppreference.

Caveats and FIXMEs:
  - only symbols directly in "std" namespace are added, we should also add std's
    subnamespace symbols (e.g. chrono).
  - symbols with multiple variants or defined in multiple headers aren't added,
    e.g. std::move, std::swap

Usage:
  1. Install BeautifulSoup dependency, see instruction:
       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
  2. Download cppreference offline HTML files (e.g. html_book_20181028.zip) at
       https://en.cppreference.com/w/Cppreference:Archives
  3. Unzip the zip file from step 2 to directory </cppreference>, you should
     get a "reference" directory in </cppreference>
  4. Run the command:
       // Generate C++ symbols
       gen_std.py -cppreference </cppreference/reference> -language=cpp > StdSymbolMap.inc
       // Generate C symbols
       gen_std.py -cppreference </cppreference/reference> -language=c > CSymbolMap.inc
"""


import cppreference_parser
import argparse
import datetime
import os
import sys

CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// Automatically generated file, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""

def ParseArg():
  parser = argparse.ArgumentParser(description='Generate StdGen file')
  parser.add_argument('-cppreference', metavar='PATH',
                      default='',
                      help='path to the cppreference offline HTML directory',
                      required=True
                     )
  parser.add_argument('-language',
                      default='cpp',
                      help='Generate c or cpp symbols',
                      required=True)
  return parser.parse_args()


def main():
  args = ParseArg()
  if args.language == 'cpp':
    page_root = os.path.join(args.cppreference, "en", "cpp")
    symbol_index_root = os.path.join(page_root, "symbol_index")
    parse_pages =  [
      (page_root, "symbol_index.html", "std::"),
      # std sub-namespace symbols have separated pages.
      # We don't index std literal operators (e.g.
      # std::literals::chrono_literals::operator""d), these symbols can't be
      # accessed by std::<symbol_name>.
      # FIXME: index std::placeholders symbols, placeholders.html page is
      # different (which contains one entry for _1, _2, ..., _N), we need special
      # handling.
      (symbol_index_root, "chrono.html", "std::chrono::"),
      (symbol_index_root, "filesystem.html", "std::filesystem::"),
      (symbol_index_root, "pmr.html", "std::pmr::"),
      (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
      (symbol_index_root, "this_thread.html", "std::this_thread::"),
    ]
  elif args.language == 'c':
    page_root = os.path.join(args.cppreference, "en", "c")
    symbol_index_root = page_root
    parse_pages = [(page_root, "index.html", None)]

  if not os.path.exists(symbol_index_root):
    exit("Path %s doesn't exist!" % symbol_index_root)

  symbols = cppreference_parser.GetSymbols(parse_pages)

  # We don't have version information from the unzipped offline HTML files.
  # so we use the modified time of the symbol_index.html as the version.
  index_page_path = os.path.join(page_root, "index.html")
  cppreference_modified_date = datetime.datetime.fromtimestamp(
    os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
  print CODE_PREFIX % (args.language.upper(), cppreference_modified_date)
  for symbol in symbols:
    if len(symbol.headers) == 1:
      # SYMBOL(unqualified_name, namespace, header)
      print "SYMBOL(%s, %s, %s)" % (symbol.name, symbol.namespace,
                                    symbol.headers[0])
    elif len(symbol.headers) == 0:
      sys.stderr.write("No header found for symbol %s\n" % symbol.name)
    else:
      # FIXME: support symbols with multiple headers (e.g. std::move).
      sys.stderr.write("Ambiguous header for symbol %s: %s\n" % (
          symbol.name, ', '.join(symbol.headers)))


if __name__ == '__main__':
  main()